diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 415c0fe9bef9eab89e670d8b3f6f7c330b316ed8..45a76fdc1f1a2aab66e7f4972eecbbec03af941a 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -36,7 +36,7 @@ ENDIF() if(NOT DEFINED XPU_BASE_URL) SET(XPU_BASE_URL_WITHOUT_DATE "https://baidu-kunlun-product.cdn.bcebos.com/KL-SDK/klsdk-dev") - SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220219") + SET(XPU_BASE_URL "${XPU_BASE_URL_WITHOUT_DATE}/20220228") else() SET(XPU_BASE_URL "${XPU_BASE_URL}") endif() diff --git a/cmake/generic.cmake b/cmake/generic.cmake index f7c17bd7cfe7e099e0afeaf623724e12387aff44..51ed537ce5db1cad1ea7b6d1921855c1c378e641 100644 --- a/cmake/generic.cmake +++ b/cmake/generic.cmake @@ -667,6 +667,7 @@ function(xpu_library TARGET_NAME) else() xpu_add_library(${TARGET_NAME} STATIC ${xpu_library_SRCS} DEPENDS ${xpu_library_DEPS}) find_fluid_modules(${TARGET_NAME}) + find_phi_modules(${TARGET_NAME}) endif() if (xpu_library_DEPS) add_dependencies(${TARGET_NAME} ${xpu_library_DEPS}) diff --git a/cmake/phi.cmake b/cmake/phi.cmake index d9132b84455e7309713b99f9e574bfceb83c7b6c..f6e15758379ada165a9dc0e31273a533b06ad2df 100644 --- a/cmake/phi.cmake +++ b/cmake/phi.cmake @@ -83,6 +83,8 @@ function(kernel_declare TARGET_LIST) file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, XPU, ALL_LAYOUT);\n") elseif (${kernel_path} MATCHES "./gpudnn\/") file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, GPUDNN, ALL_LAYOUT);\n") + elseif (${kernel_path} MATCHES "./kps\/") + file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, KPS, ALL_LAYOUT);\n") else () # deal with device independent kernel, now we use CPU temporaary file(APPEND ${kernel_declare_file} "PD_DECLARE_KERNEL(${kernel_name}, CPU, ALL_LAYOUT);\n") @@ -97,6 +99,7 @@ function(kernel_library TARGET) set(gpu_srcs) set(xpu_srcs) set(gpudnn_srcs) + set(kps_srcs) set(selected_rows_srcs) # parse and save the deps kerenl targets set(all_srcs) @@ -128,6 +131,9 @@ function(kernel_library TARGET) if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpu/${TARGET}.cu.cc) endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + list(APPEND gpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + endif() if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) list(APPEND gpudnn_srcs ${CMAKE_CURRENT_SOURCE_DIR}/gpudnn/${TARGET}_gpudnn.cu) endif() @@ -137,6 +143,15 @@ function(kernel_library TARGET) list(APPEND xpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/xpu/${TARGET}.cc) endif() endif() + if (WITH_XPU_KP) + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu) + # Change XPU2 file suffix + # NOTE(chenweihang): If we can be sure that the *.kps suffix is no longer used, it can be copied directly to *.xpu + file(COPY ${CMAKE_CURRENT_SOURCE_DIR}/kps/${TARGET}.cu DESTINATION ${CMAKE_CURRENT_BINARY_DIR}/kps) + file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps) + list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps) + endif() + endif() else() # TODO(chenweihang): impl compile by source later endif() @@ -150,6 +165,7 @@ function(kernel_library TARGET) list(APPEND all_srcs ${gpu_srcs}) list(APPEND all_srcs ${xpu_srcs}) list(APPEND all_srcs ${gpudnn_srcs}) + list(APPEND all_srcs ${kps_srcs}) foreach(src ${all_srcs}) file(READ ${src} target_content) string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/[a-z0-9_]+_kernel.h\"" 
include_kernels ${target_content}) @@ -159,11 +175,11 @@ function(kernel_library TARGET) string(REGEX MATCHALL "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/[a-z0-9_]+_kernel.h\"" include_kernels ${target_content}) endif() foreach(include_kernel ${include_kernels}) - if ("${kernel_library_SUB_DIR}" STREQUAL "") - string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel}) - else() - string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel}) - endif() + if ("${kernel_library_SUB_DIR}" STREQUAL "") + string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/" "" kernel_name ${include_kernel}) + else() + string(REGEX REPLACE "#include \"paddle\/phi\/kernels\/${kernel_library_SUB_DIR}\/" "" kernel_name ${include_kernel}) + endif() string(REGEX REPLACE ".h\"" "" kernel_name ${kernel_name}) list(APPEND kernel_deps ${kernel_name}) endforeach() @@ -176,11 +192,20 @@ function(kernel_library TARGET) list(LENGTH gpu_srcs gpu_srcs_len) list(LENGTH xpu_srcs xpu_srcs_len) list(LENGTH gpudnn_srcs gpudnn_srcs_len) + list(LENGTH kps_srcs kps_srcs_len) list(LENGTH selected_rows_srcs selected_rows_srcs_len) + # kernel source file level + # level 1: base device kernel + # - cpu_srcs / gpu_srcs / xpu_srcs / kps_srcs + # level 2: device-independent kernel + # - common_srcs + # level 3: Kernel implemented by reusing device-independent kernel + # - selected_rows_srcs + # Build Target according different src organization if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR - ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) AND + ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) AND (${common_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0)) # If the common_srcs/selected_rows_srcs depends on specific device srcs, build target using this rule. if (WITH_GPU) @@ -193,6 +218,11 @@ function(kernel_library TARGET) hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) hip_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) endif() + elseif (WITH_XPU_KP) + if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) + xpu_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library(${TARGET} SRCS ${common_srcs} ${selected_rows_srcs} DEPS ${TARGET}_part) + endif() else() if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) @@ -200,7 +230,7 @@ function(kernel_library TARGET) endif() endif() # If there are only specific device srcs, build target using this rule. 
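Illustrative aside, not part of the patch: with the new kps/ branch added to kernel_declare above, a kernel that ships a paddle/phi/kernels/kps/*.cu source gets declared under the KPS backend in the generated declarations header, roughly as follows (the kernel name is a hypothetical example):

    // appended by kernel_declare for a kps/ source
    PD_DECLARE_KERNEL(elementwise_add, KPS, ALL_LAYOUT);

When WITH_XPU_KP is set, the same kps/*.cu file is additionally copied into the build tree and renamed to *.kps so that xpu_library can compile it, as handled in the hunk above.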
- elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) + elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) if (WITH_GPU) if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) @@ -209,6 +239,10 @@ function(kernel_library TARGET) if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0) hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} ${gpudnn_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() + elseif (WITH_XPU_KP) + if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0) + xpu_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} ${kps_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + endif() else() if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0) cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) @@ -222,6 +256,9 @@ function(kernel_library TARGET) elseif (WITH_ROCM) hip_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) + elseif (WITH_XPU_KP) + xpu_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) else() cc_library(${TARGET}_part SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${TARGET}_part) @@ -232,6 +269,8 @@ function(kernel_library TARGET) nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + elseif (WITH_XPU_KP) + xpu_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) else() cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() @@ -240,6 +279,8 @@ function(kernel_library TARGET) nv_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) elseif (WITH_ROCM) hip_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) + elseif (WITH_XPU_KP) + xpu_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) else() cc_library(${TARGET} SRCS ${selected_rows_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps}) endif() @@ -249,7 +290,7 @@ function(kernel_library TARGET) if (${target_build_flag} EQUAL 1) if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR - ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR + ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0 OR ${kps_srcs_len} GREATER 0 OR ${gpudnn_srcs_len} GREATER 0 OR ${selected_rows_srcs_len} GREATER 0) # append target into PHI_KERNELS property get_property(phi_kernels GLOBAL PROPERTY PHI_KERNELS) @@ -275,6 +316,9 @@ function(kernel_library TARGET) if (${gpudnn_srcs_len} GREATER 0) kernel_declare(${gpudnn_srcs}) endif() + if (${kps_srcs_len} GREATER 0) + kernel_declare(${kps_srcs}) + endif() if (${selected_rows_srcs_len} GREATER 0) kernel_declare(${selected_rows_srcs}) endif() diff --git a/paddle/fluid/distributed/collective/CMakeLists.txt b/paddle/fluid/distributed/collective/CMakeLists.txt index 
41652f8b6ed6f717ad8a571be8e7a16408b34504..a5b40f8aa07d77e803f2cad36155b7de1bd03719 100644 --- a/paddle/fluid/distributed/collective/CMakeLists.txt +++ b/paddle/fluid/distributed/collective/CMakeLists.txt @@ -1,4 +1,5 @@ cc_library(processgroup SRCS ProcessGroup.cc DEPS phi phi_api eager_api) +cc_library(eager_reducer SRCS reducer.cc DEPS eager_api processgroup) if(WITH_NCCL) cc_library(processgroup_nccl SRCS ProcessGroupNCCL.cc DEPS place cuda_stream enforce collective_helper device_context phi phi_api eager_api) diff --git a/paddle/fluid/distributed/collective/reducer.cc b/paddle/fluid/distributed/collective/reducer.cc new file mode 100644 index 0000000000000000000000000000000000000000..59f3ea3b0a7d85651e7780b4b11875f19b70931e --- /dev/null +++ b/paddle/fluid/distributed/collective/reducer.cc @@ -0,0 +1,131 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/distributed/collective/reducer.h" +#include "paddle/phi/common/data_type.h" + +namespace paddle { +namespace distributed { + +std::vector> Eager_AssignGroupBySize( + const std::vector tensors, + const std::vector &is_sparse_gradient, + const std::vector &group_size_limits, + const std::vector &tensor_indices) { + PADDLE_ENFORCE_EQ( + tensors.size(), is_sparse_gradient.size(), + platform::errors::PreconditionNotMet( + "tensors len must be equal to is_sparse_gradient len, but " + "[%lu] != [%lu]", + tensors.size(), is_sparse_gradient.size())); + auto check_perm = [](const std::vector &x) -> bool { + size_t len = x.size(); + std::vector cnt(len, 0); + for (size_t i = 0; i < len; ++i) { + if (x[i] >= static_cast(len) || x[i] < 0 || cnt[x[i]]) { + return false; + } + cnt[x[i]]++; + } + return true; + }; + + PADDLE_ENFORCE_EQ(true, check_perm(tensor_indices), + platform::errors::PreconditionNotMet( + "tensor_indices must be a permutation from 0 to %lu", + tensor_indices.size())); + // the return vector + std::vector> res; + + // Key: the var type + // Value: should use which index in group_size_limits for group size limit + std::map group_limit_index; + + // Key: the var type + // Value: + std::map, size_t>> + next_group; + + for (size_t i = 0; i < tensors.size(); ++i) { + const auto &var = tensors[i]; + + size_t tensor_real_index = i; + if (!tensor_indices.empty()) { + tensor_real_index = tensor_indices[i]; + } + + if (is_sparse_gradient[tensor_real_index]) { + // we keep sparse var a single group + res.push_back({tensor_real_index}); + continue; + } + + const auto &var_dtype = var.dtype(); + VLOG(3) << "var[" << var.name() << "] 's type is " << var_dtype; + auto &group_info = next_group[var_dtype]; + + int64_t var_size = -1; + + if (var.is_dense_tensor()) { + var_size = + std::dynamic_pointer_cast(var.impl())->numel(); + } else { + VLOG(3) << "var " << var.name() + << " is not tensor or selected_rows, so skip it"; + continue; + } + + group_info.first.push_back(tensor_real_index); + group_info.second += experimental::SizeOf(var_dtype) * 
var_size; + // group_info.second += framework::SizeOfType(var_dtype) * var_size; + + if (group_limit_index.find(var_dtype) == group_limit_index.end()) { + // means it is the first var of var_dtype + group_limit_index[var_dtype] = 0; + } + auto &cur_limit_index = group_limit_index[var_dtype]; + if (group_info.second >= group_size_limits[cur_limit_index]) { + // exceed group capacity and create a new group + res.emplace_back(std::move(group_info.first)); + group_info = std::pair, size_t>(); + cur_limit_index = + (std::min)(cur_limit_index + 1, group_size_limits.size() - 1); + } + } + + // add the final groups + for (auto &e : next_group) { + auto &group_info = e.second; + if (!group_info.first.empty()) { + res.emplace_back(std::move(group_info.first)); + } + } + + for (const auto &group_index : res) { + PADDLE_ENFORCE_NE( + group_index.empty(), true, + platform::errors::PreconditionNotMet( + "AssignGroupBySize construct empty group, please check.")); + } + if (tensor_indices.empty()) { + std::sort(res.begin(), res.end(), + [](const std::vector &x, const std::vector &y) { + return x.front() < y.front(); + }); + } + return res; +} + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/distributed/collective/reducer.h b/paddle/fluid/distributed/collective/reducer.h new file mode 100644 index 0000000000000000000000000000000000000000..f8c75385ef8bd6891df8eda6faa93c73091c37f5 --- /dev/null +++ b/paddle/fluid/distributed/collective/reducer.h @@ -0,0 +1,32 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
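A minimal usage sketch, not part of the patch, for the grouping helper implemented above; the helper name CollectGradTensors, the tensor list, and the 25 MB bucket limit are made-up examples, and the signature is assumed to match the declaration in reducer.h below:

    #include "paddle/fluid/distributed/collective/reducer.h"

    // Bucket dense gradients into roughly 25 MB groups before fused communication.
    std::vector<paddle::experimental::Tensor> grads = CollectGradTensors();  // hypothetical helper
    std::vector<bool> is_sparse(grads.size(), false);   // all dense in this example
    std::vector<size_t> limits = {25 * 1024 * 1024};    // per-group byte limit
    auto groups =
        paddle::distributed::Eager_AssignGroupBySize(grads, is_sparse, limits);
    // groups[i] holds the indices of the tensors fused into bucket i; sparse
    // gradients each form a single-element group, and groups are sorted by
    // their first tensor index when no tensor_indices are supplied.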
+ +#pragma once + +#include +#include +#include "paddle/fluid/distributed/collective/ProcessGroup.h" +#include "paddle/fluid/eager/api/utils/tensor_utils.h" + +namespace paddle { +namespace distributed { +using Tensor = paddle::experimental::Tensor; + +std::vector> Eager_AssignGroupBySize( + const std::vector, const std::vector& is_sparse_gradient, + const std::vector& group_size_limits, + const std::vector& tensor_indices = {}); + +} // namespace distributed +} // namespace paddle diff --git a/paddle/fluid/framework/infershape_utils.cc b/paddle/fluid/framework/infershape_utils.cc index e14b91d935d05c12442f3d0205c1e97df9697ec3..d9287b9a624d39c40cd63071ab08257a8526ce17 100644 --- a/paddle/fluid/framework/infershape_utils.cc +++ b/paddle/fluid/framework/infershape_utils.cc @@ -308,22 +308,25 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, // TODO(chenweihang): support multiple inputs and outputs later phi::InferMetaContext infer_mete_context; for (auto& in_name : input_names) { - if (ctx->HasInput(in_name)) { - infer_meta_context.EmplaceBackInput(std::make_shared( - ctx->GetInputVarPtrs(in_name)[0], ctx->IsRuntime())); + if (ctx->HasInputs(in_name)) { + auto input_var = ctx->GetInputVarPtrs(in_name); + if (input_var.size() == 1) { + infer_meta_context.EmplaceBackInput( + std::make_shared(input_var[0], ctx->IsRuntime())); + } else { + paddle::SmallVector> inputs; + inputs.reserve(input_var.size()); + for (const auto& in : input_var) { + inputs.push_back( + std::make_shared(in, ctx->IsRuntime())); + } + infer_meta_context.EmplaceBackInputs(std::move(inputs)); + } } else { infer_meta_context.EmplaceBackInput({nullptr}); } } - for (auto& out_name : output_names) { - if (ctx->HasOutput(out_name)) { - infer_meta_context.EmplaceBackOutput(std::make_shared( - ctx->GetOutputVarPtrs(out_name)[0], ctx->IsRuntime())); - } else { - infer_meta_context.EmplaceBackOutput({nullptr}); - } - } auto attr_reader = ctx->Attrs(); for (size_t i = 0; i < attr_names.size(); ++i) { auto attr_name = attr_names[i]; @@ -348,13 +351,13 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } } else { // If is not in runtime, we will set default value(-1) for ScalarArray - int64_t num_ele = 0; std::vector vars; vars.reserve(infershape_inputs.size()); - for (size_t i = 0; i < infershape_inputs.size(); i++) { + for (size_t i = 0; i < infershape_inputs.size(); ++i) { vars.push_back(BOOST_GET_CONST(VarDesc*, infershape_inputs[i])); } + int64_t num_ele = 0; if (vars.size() == 1) { num_ele = 1; const auto& tensor_dims = vars[0]->GetShape(); @@ -362,16 +365,7 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, num_ele *= tensor_dims[i]; } } else { - for (auto& var : vars) { - const auto& tensor_dims = var->GetShape(); - PADDLE_ENFORCE_EQ(tensor_dims.size(), 1, - platform::errors::InvalidArgument( - "The shape is constructed by multi-tensor, " - "every tensor's dims should be 1. 
But your " - "shape has tensor that dims is %s.", - tensor_dims.size())); - num_ele += tensor_dims[0]; - } + num_ele = vars.size(); } phi::ScalarArray tensor_attr(std::vector(num_ele, -1)); tensor_attr.SetFromTensor(true); @@ -383,10 +377,14 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, std::type_index(typeid(std::vector))) { infer_meta_context.EmplaceBackAttr(std::move( phi::ScalarArray(BOOST_GET_CONST(std::vector, attr)))); + } else if (std::type_index(attr.type()) == + std::type_index(typeid(int))) { + infer_meta_context.EmplaceBackAttr( + phi::ScalarArray({BOOST_GET_CONST(int, attr)})); } else { PADDLE_THROW(platform::errors::Unimplemented( "Unsupported cast op attribute `%s` to ScalarArray when " - "construct KernelContext.", + "construct InferMetaContext.", attr_name)); } } @@ -414,7 +412,6 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, } } else if (ctx->HasInput(attr_name)) { const auto& infershape_input = ctx->GetInputVarPtrs(attr_name); - if (infershape_input.size() == 1) { if (ctx->IsRuntime()) { Variable* var = BOOST_GET_CONST(Variable*, infershape_input[0]); @@ -490,6 +487,28 @@ phi::InferMetaContext BuildInferMetaContext(InferShapeContext* ctx, "Unsupported attribute type is received when call " "InferShapeFunctor.")); } + } else { + // do nothing + } + } + + for (auto& out_name : output_names) { + if (ctx->HasOutputs(out_name)) { + auto output_var = ctx->GetOutputVarPtrs(out_name); + if (output_var.size() == 1) { + infer_meta_context.EmplaceBackOutput(std::make_shared( + output_var[0], ctx->IsRuntime())); + } else { + paddle::SmallVector> outputs; + outputs.reserve(output_var.size()); + for (const auto& out : output_var) { + outputs.emplace_back( + std::make_shared(out, ctx->IsRuntime())); + } + infer_meta_context.EmplaceBackOutputs(std::move(outputs)); + } + } else { + infer_meta_context.EmplaceBackOutput({nullptr}); } } diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index dad5358590cb1497453681ce940898314a1d06eb..0d53a54ff822ae4dde9fcba7c2559569c7e1bd4f 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -78,7 +78,6 @@ pass_library(is_test_pass base) pass_library(conv_elementwise_add_act_fuse_pass inference) pass_library(conv_elementwise_add2_act_fuse_pass inference) pass_library(conv_elementwise_add_fuse_pass inference) -pass_library(conv_affine_channel_fuse_pass inference) pass_library(transpose_flatten_concat_fuse_pass inference) pass_library(identity_scale_op_clean_pass base) pass_library(sync_batch_norm_pass base) diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc deleted file mode 100644 index f28c9988bd858ad00a5c5a532b7b484315557d8f..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.cc +++ /dev/null @@ -1,420 +0,0 @@ -// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h" - -#include - -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_version_registry.h" - -namespace phi { -class DenseTensor; -} // namespace phi - -namespace paddle { -namespace framework { -class Scope; -} // namespace framework -} // namespace paddle - -namespace paddle { -namespace framework { -namespace ir { - -class Node; - -#define GET_CONV_BN_NODES(pattern_name) \ - /* OPERATORS */ \ - GET_IR_NODE_FROM_SUBGRAPH(conv, conv, pattern_name); \ - GET_IR_NODE_FROM_SUBGRAPH(affine_channel, affine_channel, pattern_name); \ - /* CONV inputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(conv_weight, conv_weight, pattern_name); \ - /* CONV outputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(conv_out, conv_out, pattern_name); \ - /* Affine Channel inputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(ac_scale, ac_scale, pattern_name); \ - GET_IR_NODE_FROM_SUBGRAPH(ac_bias, ac_bias, pattern_name); \ - /* Affine channel outputs */ \ - GET_IR_NODE_FROM_SUBGRAPH(ac_out, ac_out, pattern_name); /* Out */ - -void recompute_bias_and_weights(const Scope* scope, ir::Node* conv_weight, - const ir::Node& ac_scale, - const LoDTensor& ac_bias_tensor, - LoDTensor* eltwise_y_in_tensor) { - using EigenVectorArrayMap = - Eigen::Map>; - using ConstEigenVectorArrayMap = - Eigen::Map>; - using EigenMatrixArrayMap = Eigen::Map< - Eigen::Array>; - - // Re-compute bias of conv2d from AffineChannel - PADDLE_ENFORCE_EQ( - eltwise_y_in_tensor->dims(), ac_bias_tensor.dims(), - platform::errors::InvalidArgument( - "Tensor elementwise y(%d) and activation bias(%d) must have same " - "dimension.", - eltwise_y_in_tensor->dims().size(), ac_bias_tensor.dims().size())); - - auto* scale_tensor = scope->FindVar(ac_scale.Name())->GetMutable(); - - ConstEigenVectorArrayMap scale_array(scale_tensor->data(), - scale_tensor->numel(), 1); - ConstEigenVectorArrayMap ac_bias_array(ac_bias_tensor.data(), - ac_bias_tensor.numel(), 1); - - EigenVectorArrayMap eltwise_y_in_array( - eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), - eltwise_y_in_tensor->numel(), 1); - - eltwise_y_in_array = (eltwise_y_in_array * scale_array) + ac_bias_array; - - // Re-compute weight of conv2d from AffineChannel - auto* weights = scope->FindVar(conv_weight->Name())->GetMutable(); - auto weights_shape = weights->dims(); - auto weights_shape_2d = phi::flatten_to_2d(weights_shape, 1); - auto* weights_data = weights->mutable_data(platform::CPUPlace()); - - EigenMatrixArrayMap weights_array_2d(weights_data, weights_shape_2d[0], - weights_shape_2d[1]); - - weights_array_2d.colwise() *= scale_array; - - // Check for subnormal values that slows down convolution execution - for (int i = 0; i < weights->numel(); ++i) { - if (std::fpclassify(weights_data[i]) == FP_SUBNORMAL) weights_data[i] = 0; - } -} - -ConvAffineChannelFusePass::ConvAffineChannelFusePass() { - AddOpCompat(OpCompat("conv2d")) - .AddInput("Input") - .IsTensor() - .End() - .AddInput("Filter") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddInput("ResidualData") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Output") - .IsTensor() - .End() - .AddAttr("strides") - .IsType>() - .End() - .AddAttr("paddings") - .IsType>() - .End() - .AddAttr("padding_algorithm") - .IsOptional() - .IsStringIn({"EXPLICIT", "SAME", "VALID"}) - .End() - .AddAttr("groups") - .IsNumGE(1) - .End() - 
.AddAttr("dilations") - .IsType>() - .End() - .AddAttr("data_format") - .IsStringIn({"NCHW", "AnyLayout"}) - .End(); - - AddOpCompat(OpCompat("affine_channel")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Scale") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("data_layout") - .IsStringIn({"NCHW", "AnyLayout"}) - .End(); - - AddOpCompat(OpCompat("elementwise_add")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("axis") - .IsNumEQ(1) - .End(); -} - -void ConvAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - FusePassBase::Init(name_scope_, graph); - - auto* scope = param_scope(); - PADDLE_ENFORCE_NOT_NULL( - scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); - - GraphPatternDetector gpd; - auto* conv_input = - gpd.mutable_pattern() - ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) - ->AsInput() - ->assert_is_op_input("conv2d", "Input"); - patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), - name_scope_); - conv_ac_pattern(conv_input, false /*with_eltwise_add*/); - - int found_conv_ac_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - if (!IsCompat(subgraph, g)) { - LOG(WARNING) << "ConvAffineChannelFusePass in op compat failed."; - return; - } - - VLOG(4) << "handle ConvAffineChannel fuse"; - - GET_CONV_BN_NODES(conv_ac_pattern); - - auto data_format = conv->Op()->GetAttrIfExists("data_format"); - if (data_format == "AnyLayout") { - LOG_FIRST_N(WARNING, 1) << "conv_affine_channel_fuse_pass is enabled, " - "it's wrong if data_format of conv is not " - "NCHW."; - } - - // Get affine_channel bias for resizing eltwise_y! - auto* ac_bias_tensor = - scope->FindVar(ac_bias->Name())->GetMutable(); - - // Create eltwise_y (conv bias) variable - VarDesc eltwise_y_in_desc( - patterns::PDNodeName(name_scope_, "eltwise_y_in")); - // Set shape && datatype manually - eltwise_y_in_desc.SetShape(phi::vectorize(ac_bias_tensor->dims())); - eltwise_y_in_desc.SetDataType( - framework::TransToProtoVarType(ac_bias_tensor->dtype())); - eltwise_y_in_desc.SetLoDLevel(ac_bias->Var()->GetLoDLevel()); - eltwise_y_in_desc.SetPersistable(true); - - // Initialize eltwise_y - auto* eltwise_y_in_node = g->CreateVarNode(&eltwise_y_in_desc); - auto* eltwise_y_in_tensor = - scope->Var(eltwise_y_in_node->Name())->GetMutable(); - eltwise_y_in_tensor->Resize(ac_bias_tensor->dims()); - std::fill_n(eltwise_y_in_tensor->mutable_data(platform::CPUPlace()), - eltwise_y_in_tensor->numel(), 0.0f); - - // update weights and biases - recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, - eltwise_y_in_tensor); - - // create an elementwise add node. - OpDesc desc; - desc.SetInput("X", std::vector({conv_out->Name()})); - desc.SetInput("Y", std::vector({eltwise_y_in_node->Name()})); - desc.SetOutput("Out", std::vector({ac_out->Name()})); - desc.SetType("elementwise_add"); - desc.SetAttr("axis", 1); - desc.SetAttr("use_mkldnn", conv->Op()->GetAttrIfExists("use_mkldnn")); - - auto eltwise_op = g->CreateOpNode(&desc); // OpDesc will be copied. 
- - GraphSafeRemoveNodes(graph, {ac_scale, ac_bias, affine_channel}); - - IR_NODE_LINK_TO(conv_out, eltwise_op); - IR_NODE_LINK_TO(eltwise_y_in_node, eltwise_op); - IR_NODE_LINK_TO(eltwise_op, ac_out); - found_conv_ac_count++; - }; - - gpd(graph, handler); - - AddStatis(found_conv_ac_count); -} - -ConvEltwiseAddAffineChannelFusePass::ConvEltwiseAddAffineChannelFusePass() { - AddOpCompat(OpCompat("conv2d")) - .AddInput("Input") - .IsTensor() - .End() - .AddInput("Filter") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddInput("ResidualData") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Output") - .IsTensor() - .End() - .AddAttr("strides") - .IsType>() - .End() - .AddAttr("paddings") - .IsType>() - .End() - .AddAttr("padding_algorithm") - .IsOptional() - .IsStringIn({"EXPLICIT", "SAME", "VALID"}) - .End() - .AddAttr("groups") - .IsNumGE(1) - .End() - .AddAttr("dilations") - .IsType>() - .End() - .AddAttr("data_format") - .IsStringIn({"NCHW", "AnyLayout"}) - .End(); - AddOpCompat(OpCompat("affine_channel")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Scale") - .IsTensor() - .End() - .AddInput("Bias") - .IsTensor() - .IsOptional() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("data_layout") - .IsStringIn({"NCHW", "AnyLayout"}) - .End(); - AddOpCompat(OpCompat("elementwise_add")) - .AddInput("X") - .IsTensor() - .End() - .AddInput("Y") - .IsTensor() - .End() - .AddOutput("Out") - .IsTensor() - .End() - .AddAttr("axis") - .IsNumEQ(1) - .End(); -} - -void ConvEltwiseAddAffineChannelFusePass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE_NOT_NULL( - graph, platform::errors::InvalidArgument("Graph cannot be nullptr.")); - FusePassBase::Init(name_scope_, graph); - - auto* scope = param_scope(); - PADDLE_ENFORCE_NOT_NULL( - scope, platform::errors::InvalidArgument("Scope cannot be nullptr.")); - - GraphPatternDetector gpd; - auto* conv_input = - gpd.mutable_pattern() - ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) - ->AsInput() - ->assert_is_op_input("conv2d", "Input"); - patterns::ConvAffineChannel conv_ac_pattern(gpd.mutable_pattern(), - name_scope_); - conv_ac_pattern(conv_input, true /*with_eltwise_add*/); - - int found_conv_ac_count = 0; - auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, - Graph* g) { - if (!IsCompat(subgraph, g)) { - LOG(WARNING) - << "ConvEltwiseAddAffineChannelFusePass in op compat failed."; - return; - } - - VLOG(4) << "handle ConvBN fuse"; - - GET_CONV_BN_NODES(conv_ac_pattern); - auto data_format = conv->Op()->GetAttrIfExists("data_format"); - if (data_format == "AnyLayout") { - LOG_FIRST_N(WARNING, 1) << "conv_eltwiseadd_affine_channel_fuse_pass is " - "enabled, it's wrong if data_format of conv " - "is not NCHW."; - } - // OPERATORS - GET_IR_NODE_FROM_SUBGRAPH(eltwise, eltwise, conv_ac_pattern); - // BIAS inputs - GET_IR_NODE_FROM_SUBGRAPH(eltwise_y_in, eltwise_y_in, conv_ac_pattern); - // BIAS outputs - GET_IR_NODE_FROM_SUBGRAPH(eltwise_out, eltwise_out, conv_ac_pattern); - - // Get eltwise_y (conv bias) variable - auto* eltwise_y_in_tensor = - scope->FindVar(eltwise_y_in->Name())->GetMutable(); - - // Get batch norm bias - auto* ac_bias_tensor = - scope->FindVar(ac_bias->Name())->GetMutable(); - - recompute_bias_and_weights(scope, conv_weight, *ac_scale, *ac_bias_tensor, - eltwise_y_in_tensor); - - // Update the elementwise_add node - eltwise->Op()->SetAttr("axis", 1); - eltwise->Op()->SetOutput("Out", std::vector({ac_out->Name()})); - - 
GraphSafeRemoveNodes(graph, - {ac_scale, ac_bias, affine_channel, eltwise_out}); - - IR_NODE_LINK_TO(eltwise, ac_out); - - found_conv_ac_count++; - }; - - gpd(graph, handler); - AddStatis(found_conv_ac_count); -} - -} // namespace ir -} // namespace framework -} // namespace paddle - -REGISTER_PASS(conv_affine_channel_fuse_pass, - paddle::framework::ir::ConvAffineChannelFusePass); -REGISTER_PASS(conv_eltwiseadd_affine_channel_fuse_pass, - paddle::framework::ir::ConvEltwiseAddAffineChannelFusePass); -REGISTER_PASS_CAPABILITY(conv_affine_channel_fuse_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .LE("conv2d", 1) - .EQ("affine_channel", 0)); -REGISTER_PASS_CAPABILITY(conv_eltwiseadd_affine_channel_fuse_pass) - .AddCombination( - paddle::framework::compatible::OpVersionComparatorCombination() - .LE("conv2d", 1) - .LE("elementwise_add", 1) - .EQ("affine_channel", 0)); diff --git a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h b/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h deleted file mode 100644 index 8cfaf5c6a89f06b453dbbc94b5a7fe8b83e5c111..0000000000000000000000000000000000000000 --- a/paddle/fluid/framework/ir/conv_affine_channel_fuse_pass.h +++ /dev/null @@ -1,54 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once - -#include - -#include "paddle/fluid/framework/ir/fuse_pass_base.h" -#include "paddle/fluid/framework/ir/graph.h" -#include "paddle/fluid/framework/ir/graph_pattern_detector.h" - -namespace paddle { -namespace framework { -namespace ir { - -/* - * Fuse the Conv and ConvAffineChannel. 
- */ -class Graph; - -class ConvAffineChannelFusePass : public FusePassBase { - public: - ConvAffineChannelFusePass(); - virtual ~ConvAffineChannelFusePass() {} - - protected: - void ApplyImpl(ir::Graph*) const override; - const std::string name_scope_{"conv_affine_channel_fuse"}; -}; - -class ConvEltwiseAddAffineChannelFusePass : public FusePassBase { - public: - ConvEltwiseAddAffineChannelFusePass(); - virtual ~ConvEltwiseAddAffineChannelFusePass() {} - - protected: - void ApplyImpl(ir::Graph*) const override; - const std::string name_scope_{"conv_eltwiseadd_affine_channel_fuse"}; -}; - -} // namespace ir -} // namespace framework -} // namespace paddle diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index d33791f70c4d2f759bcd4f6443a5a1f244673d4f..b12ad552aba6e6e599689c05c23ae306110aa78f 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2074,6 +2074,7 @@ void OperatorWithKernel::BuildPhiKernelContext( } pt_kernel_context->AssignInputRange(std::make_pair(start_idx, end_idx), i); } + VLOG(4) << "Done inputs"; for (size_t i = 0; i < output_names.size(); ++i) { auto it = ctx.outputs.find(output_names[i]); @@ -2107,17 +2108,12 @@ void OperatorWithKernel::BuildPhiKernelContext( "Unsupported output `%s` type when call pt kernel.", framework::ToTypeName(var->Type()))); } - - experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out, - output_defs.at(i)); - SetAllocationForOutputTenosr( - tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend)); - pt_kernel_context->EmplaceBackOutputWithoutSetRange(tensor_out); } pt_kernel_context->AssignOutputRange(std::make_pair(start_idx, end_idx), i); } + VLOG(4) << "Done outputs"; for (size_t i = 0; i < attr_names.size(); ++i) { if (attr_defs[i].type_index == std::type_index(typeid(phi::ScalarArray))) { @@ -2226,6 +2222,7 @@ void OperatorWithKernel::BuildPhiKernelContext( } } } + VLOG(4) << "Done attributes"; } } // namespace framework diff --git a/paddle/fluid/framework/phi_utils.cc b/paddle/fluid/framework/phi_utils.cc index 355291beb60f949b52b681592d42b7da4e80186b..93bc2c02d57cb7b57cf48d6f5c34a27a97637377 100644 --- a/paddle/fluid/framework/phi_utils.cc +++ b/paddle/fluid/framework/phi_utils.cc @@ -68,6 +68,8 @@ OpKernelType TransPhiKernelKeyToOpKernelType(const phi::KernelKey& kernel_key) { library_type = LibraryType::kMKLDNN; } else if (kernel_key.backend() == phi::Backend::GPUDNN) { library_type = LibraryType::kCUDNN; + } else if (kernel_key.backend() == phi::Backend::KPS) { + library_type = LibraryType::kKP; } else { // do nothing } @@ -82,6 +84,8 @@ phi::KernelKey TransOpKernelTypeToPhiKernelKey( backend = phi::Backend::MKLDNN; } else if (kernel_type.library_type_ == LibraryType::kCUDNN) { backend = phi::Backend::GPUDNN; + } else if (kernel_type.library_type_ == LibraryType::kKP) { + backend = phi::Backend::KPS; } else { // do } @@ -229,26 +233,5 @@ static void SetAllocationForUninitializedDenseTensor( dense_tensor->ResetHolder(shared_allocation); } -void SetAllocationForOutputTenosr(phi::TensorBase* tensor, - const platform::Place& place) { - if (phi::DenseTensor::classof(tensor)) { - auto* dense_tensor = static_cast(tensor); - if (!dense_tensor->IsInitialized() || !(dense_tensor->place() == place)) { - SetAllocationForUninitializedDenseTensor(dense_tensor, place); - } - } else if (phi::SelectedRows::classof(tensor)) { - auto* selected_rows = static_cast(tensor); - if (!selected_rows->value().IsInitialized() || - !(selected_rows->place() == place)) { - 
SetAllocationForUninitializedDenseTensor(selected_rows->mutable_value(), - place); - } - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported tensor type is received when setting allocation for " - "output tensor.")); - } -} - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/phi_utils.h b/paddle/fluid/framework/phi_utils.h index 1a1f79d82770058ae4010b7a3a3162280ceb1537..a17578816921b2337a76d1a0a69a6c8adbc51c4d 100644 --- a/paddle/fluid/framework/phi_utils.h +++ b/paddle/fluid/framework/phi_utils.h @@ -62,9 +62,6 @@ class KernelArgsNameMaker { void InitDefaultKernelSignatureMap(); -void SetAllocationForOutputTenosr(phi::TensorBase* tensor, - const platform::Place& place); - // TODO(Wilber): support others device context. template struct ConvertToPhiContext { diff --git a/paddle/fluid/imperative/prepared_operator.h b/paddle/fluid/imperative/prepared_operator.h index 8e1e2fbe9a12da672a633075ed4c41d3d62cd7e1..3b5762720e7fb4a9eb0be157f6dabf07aa9353c2 100644 --- a/paddle/fluid/imperative/prepared_operator.h +++ b/paddle/fluid/imperative/prepared_operator.h @@ -323,12 +323,6 @@ void BuildDygraphPhiKernelContext( "Unsupported output `%s` type when call pt kernel.", framework::ToTypeName(var->Type()))); } - - experimental::ResetTensorDtypeAndLayoutByArgDef(tensor_out, - output_defs.at(i)); - framework::SetAllocationForOutputTenosr( - tensor_out, phi::TransToPhiPlace(output_defs.at(i).backend)); - kernel_ctx->EmplaceBackOutputWithoutSetRange(tensor_out); } kernel_ctx->AssignOutputRange(std::make_pair(start_idx, end_idx), i); diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 313e1f2faea553809cb6fce66ca9a751bace8d75..f5f36d805b43ea0815683e3b65bf157fe5beb2de 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -75,13 +75,11 @@ void PaddlePassBuilder::AppendAnalysisPass(const std::string &pass) { void PaddlePassBuilder::ClearPasses() { passes_.clear(); } const std::vector kTRTSubgraphPasses({ - "conv_affine_channel_fuse_pass", // - "adaptive_pool2d_convert_global_pass", - "conv_eltwiseadd_affine_channel_fuse_pass", // - "shuffle_channel_detect_pass", // - "quant_conv2d_dequant_fuse_pass", // - "delete_quant_dequant_op_pass", // - "delete_quant_dequant_filter_op_pass", // + "adaptive_pool2d_convert_global_pass", + "shuffle_channel_detect_pass", // + "quant_conv2d_dequant_fuse_pass", // + "delete_quant_dequant_op_pass", // + "delete_quant_dequant_filter_op_pass", // // "fc_fuse_pass", // "simplify_with_basic_ops_pass", // "embedding_eltwise_layernorm_fuse_pass", // @@ -134,22 +132,20 @@ const std::vector kLiteSubgraphPasses({ GpuPassStrategy::GpuPassStrategy() : PassStrategy({}) { passes_.assign({ // "identity_scale_op_clean_pass", // - "is_test_pass", // - "simplify_with_basic_ops_pass", // - "conv_affine_channel_fuse_pass", // - "conv_eltwiseadd_affine_channel_fuse_pass", // - "conv_bn_fuse_pass", // - "conv_eltwiseadd_bn_fuse_pass", // - "embedding_eltwise_layernorm_fuse_pass", // - "multihead_matmul_fuse_pass_v2", // - "gpu_cpu_squeeze2_matmul_fuse_pass", // - "gpu_cpu_reshape2_matmul_fuse_pass", // - "gpu_cpu_flatten2_matmul_fuse_pass", // - "gpu_cpu_map_matmul_v2_to_mul_pass", // - "gpu_cpu_map_matmul_v2_to_matmul_pass", // - "gpu_cpu_map_matmul_to_mul_pass", // - "fc_fuse_pass", // - "fc_elementwise_layernorm_fuse_pass", // + "is_test_pass", // + "simplify_with_basic_ops_pass", // + "conv_bn_fuse_pass", // + 
"conv_eltwiseadd_bn_fuse_pass", // + "embedding_eltwise_layernorm_fuse_pass", // + "multihead_matmul_fuse_pass_v2", // + "gpu_cpu_squeeze2_matmul_fuse_pass", // + "gpu_cpu_reshape2_matmul_fuse_pass", // + "gpu_cpu_flatten2_matmul_fuse_pass", // + "gpu_cpu_map_matmul_v2_to_mul_pass", // + "gpu_cpu_map_matmul_v2_to_matmul_pass", // + "gpu_cpu_map_matmul_to_mul_pass", // + "fc_fuse_pass", // + "fc_elementwise_layernorm_fuse_pass", // #if CUDNN_VERSION >= 7100 // To run conv_fusion, the version of cudnn must be // guaranteed at least v7 // cudnn8.0 has memory leak problem in conv + eltwise + act, so we @@ -236,14 +232,12 @@ void CpuPassStrategy::EnableMKLDNN() { passes_.insert(passes_.begin(), "mkldnn_placement_pass"); for (auto &pass : std::vector({ - "depthwise_conv_mkldnn_pass", // - "conv_bn_fuse_pass", // Execute BN passes again to - "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order - "conv_affine_channel_fuse_pass", // - "conv_eltwiseadd_affine_channel_fuse_pass", // - "conv_transpose_bn_fuse_pass", // - "conv_transpose_eltwiseadd_bn_fuse_pass", // - "conv_bias_mkldnn_fuse_pass", // + "depthwise_conv_mkldnn_pass", // + "conv_bn_fuse_pass", // Execute BN passes again to + "conv_eltwiseadd_bn_fuse_pass", // preserve correct pass order + "conv_transpose_bn_fuse_pass", // + "conv_transpose_eltwiseadd_bn_fuse_pass", // + "conv_bias_mkldnn_fuse_pass", // "conv_transpose_bias_mkldnn_fuse_pass", // TODO(baoachun): Need to support 5-dimensional input. // "conv3d_bias_mkldnn_fuse_pass", // diff --git a/paddle/fluid/operators/cast_op.cu b/paddle/fluid/operators/cast_op.cu index 5c7dd0e2561fa41313b2e65a443a9e4913a39961..eb51215790bbcdbc9e7d0c3adad482d9a69324b9 100644 --- a/paddle/fluid/operators/cast_op.cu +++ b/paddle/fluid/operators/cast_op.cu @@ -29,9 +29,5 @@ using CUDA = paddle::platform::CUDADeviceContext; ops::CastOpKernel>, \ ops::CastOpKernel>, ##__VA_ARGS__); -#if !defined(PADDLE_WITH_HIP) // See [ why register transfer_dtype_op alias with cast_op? ] in cast_op.cc REGISTER_CAST_CUDA_BASE(transfer_dtype, ops::CastOpKernel) -#else -REGISTER_CAST_CUDA_BASE(transfer_dtype) -#endif diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 55de4087f579460fa6080733f3e2f02bb082b015..1da7798ea2696516759ac49b8ce459459e74066b 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -18,7 +18,9 @@ limitations under the License. */ #include #include #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/multiary.h" #include "paddle/phi/kernels/funcs/concat_funcs.h" #ifdef PADDLE_WITH_MKLDNN @@ -33,41 +35,6 @@ class ConcatOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - OP_INOUT_CHECK(ctx->HasInputs("X"), "Input", "X", "Concat"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Concat"); - - auto inputs_dims = ctx->GetInputsDim("X"); - - const size_t inputs_num = inputs_dims.size(); - PADDLE_ENFORCE_GT( - inputs_num, static_cast(0), - platform::errors::InvalidArgument( - "The number of input tensors in concat op should > 0. 
But " - "received inputs' length is 0.")); - if (inputs_num == 1) { - VLOG(3) << "Warning: concat op have only one input, may waste memory"; - } - - if (ctx->HasInput("AxisTensor")) { - auto out_dims = - phi::make_ddim(std::vector(inputs_dims[0].size(), -1)); - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } else { - size_t axis = - ComputeAxis(static_cast(ctx->Attrs().Get("axis")), - static_cast(inputs_dims[0].size())); - framework::DDim out_dims = - phi::funcs::ComputeAndCheckShape(ctx->IsRuntime(), inputs_dims, axis); - if (out_dims[axis] < 0) { - out_dims[axis] = -1; - } - ctx->SetOutputDim("Out", out_dims); - ctx->ShareLoD("X", /*->*/ "Out"); - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -237,9 +204,14 @@ class ConcatDoubleGradOpMaker : public framework::SingleGradOpMaker { } // namespace paddle namespace ops = paddle::operators; + +DELCARE_INFER_SHAPE_FUNCTOR(concat, ConcatInferShapeFunctor, + PT_INFER_META(phi::ConcatInferMeta)); + REGISTER_OPERATOR(concat, ops::ConcatOp, ops::ConcatOpMaker, ops::ConcatGradOpMaker, - ops::ConcatGradOpMaker); + ops::ConcatGradOpMaker, + ConcatInferShapeFunctor); REGISTER_OPERATOR(concat_grad, ops::ConcatOpGrad, ops::ConcatDoubleGradOpMaker, ops::ConcatDoubleGradOpMaker, diff --git a/paddle/fluid/operators/controlflow/CMakeLists.txt b/paddle/fluid/operators/controlflow/CMakeLists.txt index 1a2df2a0c7ba34f67ecb7c2ade002fcb4475229f..a974f2ec335487e0fbc12a578c0d80d6856e418e 100644 --- a/paddle/fluid/operators/controlflow/CMakeLists.txt +++ b/paddle/fluid/operators/controlflow/CMakeLists.txt @@ -20,5 +20,5 @@ else() endif() file(APPEND ${pybind_file} "USE_OP(less_than);\nUSE_OP(equal_all);\nUSE_NO_KERNEL_OP(read_from_array);\n") -file(APPEND ${pybind_file} "USE_OP(logical_and);\nUSE_OP(logical_or);\nUSE_OP(logical_xor);\nUSE_OP(logical_not);\n") +file(APPEND ${pybind_file} "USE_OP_ITSELF(logical_and);\nUSE_OP_ITSELF(logical_or);\nUSE_OP_ITSELF(logical_xor);\nUSE_OP_ITSELF(logical_not);\n") file(APPEND ${pybind_file} "USE_OP(bitwise_and);\nUSE_OP(bitwise_or);\nUSE_OP(bitwise_xor);\nUSE_OP(bitwise_not);\n") diff --git a/paddle/fluid/operators/controlflow/logical_op.cc b/paddle/fluid/operators/controlflow/logical_op.cc index a4262d405435ae31c2a5ad681ab443889ec5d393..4d11cb5ff74e69e991271d2a566dbc9344d35da2 100644 --- a/paddle/fluid/operators/controlflow/logical_op.cc +++ b/paddle/fluid/operators/controlflow/logical_op.cc @@ -9,11 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#include "paddle/fluid/operators/controlflow/logical_op.h" #include #include #include #include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" namespace paddle { namespace operators { @@ -145,15 +145,7 @@ class BinaryLogicalOp : public LogicalOp { ::paddle::framework::EmptyGradOpMaker); REGISTER_BINARY_LOGICAL_OP(logical_and, "$$Out = X \\&\\& Y$$"); -REGISTER_BINARY_LOGICAL_KERNEL(logical_and, CPU, - paddle::operators::LogicalAndFunctor); REGISTER_BINARY_LOGICAL_OP(logical_or, "$$Out = X || Y$$"); -REGISTER_BINARY_LOGICAL_KERNEL(logical_or, CPU, - paddle::operators::LogicalOrFunctor); REGISTER_UNARY_LOGICAL_OP(logical_not, "$$Out = !X$$"); -REGISTER_UNARY_LOGICAL_KERNEL(logical_not, CPU, - paddle::operators::LogicalNotFunctor); REGISTER_BINARY_LOGICAL_OP(logical_xor, "$$Out = (X || Y) \\&\\& !(X \\&\\& Y)$$"); -REGISTER_BINARY_LOGICAL_KERNEL(logical_xor, CPU, - paddle::operators::LogicalXorFunctor); diff --git a/paddle/fluid/operators/controlflow/logical_op.cu b/paddle/fluid/operators/controlflow/logical_op.cu deleted file mode 100644 index d88658607ed275808d64dddf4a60d52d4f995e73..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/logical_op.cu +++ /dev/null @@ -1,69 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/controlflow/logical_op.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" - -namespace paddle { -namespace operators { - -template -class BinaryLogicalOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - using InT = typename Functor::ELEMENT_TYPE; - using OutT = bool; - - auto functor = Functor(); - std::vector ins; - std::vector outs; - const auto& cuda_ctx = - ctx.template device_context(); - int axis = PackTensorsIntoVector(ctx, &ins, &outs); - - if (ins.size() == 1) { - paddle::operators::LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, functor); - } else { - paddle::operators::LaunchElementwiseCudaKernel( - cuda_ctx, ins, &outs, axis, functor); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -namespace plat = paddle::platform; - -#define REGISTER_LOGICAL_CUDA_KERNEL(op_name, func) \ - REGISTER_OP_CUDA_KERNEL( \ - op_name, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>, \ - ops::BinaryLogicalOpKernel>); - -REGISTER_LOGICAL_CUDA_KERNEL(logical_or, LogicalOrFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_and, LogicalAndFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, LogicalXorFunctor) -REGISTER_LOGICAL_CUDA_KERNEL(logical_not, LogicalNotFunctor) -#undef REGISTER_LOGICAL_CUDA_KERNEL diff --git a/paddle/fluid/operators/controlflow/logical_op.h b/paddle/fluid/operators/controlflow/logical_op.h deleted file mode 100644 index 15cd643a858cc018e3007fa90ec479900cd243be..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/controlflow/logical_op.h +++ /dev/null @@ -1,111 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once -#include -#include -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/platform/transform.h" - -namespace paddle { -namespace operators { - -#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ - template \ - struct func_name { \ - using ELEMENT_TYPE = T; \ - HOSTDEVICE bool operator()(const T a, const T b) const { \ - return static_cast(a) op static_cast(b); \ - } \ - }; - -LOGICAL_BINARY_FUNCTOR(LogicalOrFunctor, ||) -LOGICAL_BINARY_FUNCTOR(LogicalAndFunctor, &&) -LOGICAL_BINARY_FUNCTOR(LogicalXorFunctor, ^) -#undef LOGICAL_BINARY_FUNCTOR - -template -struct LogicalNotFunctor { - using ELEMENT_TYPE = T; - HOSTDEVICE bool operator()(const T a) const { return !a; } -}; - -template -class BinaryLogicalOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEMENT_TYPE; - auto* x = context.Input("X"); - auto* y = context.Input("Y"); - auto* out = context.Output("Out"); - Functor binary_func; - ElementwiseComputeEx(context, x, y, -1, - binary_func, out); - } -}; - -template -class UnaryLogicalOpKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - using T = typename Functor::ELEMENT_TYPE; - auto* x = context.Input("X"); - auto* out = context.Output("Out"); - Functor unary_func; - platform::Transform trans; - trans(context.template device_context(), x->data(), - x->data() + x->numel(), - out->mutable_data(context.GetPlace()), unary_func); - } -}; - -} // namespace operators -} // namespace paddle - -#define REGISTER_BINARY_LOGICAL_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::BinaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); - -#define REGISTER_UNARY_LOGICAL_KERNEL(op_type, dev, functor) \ - REGISTER_OP_##dev##_KERNEL( \ - op_type, ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>, \ - ::paddle::operators::UnaryLogicalOpKernel< \ - ::paddle::platform::dev##DeviceContext, functor>); diff --git a/paddle/fluid/operators/controlflow/logical_op_npu.cc b/paddle/fluid/operators/controlflow/logical_op_npu.cc index 02f95254035d6041ef64dd746faa924abb053165..c3d7df8d0274371a4c5a482624c75b36677778a9 100644 
--- a/paddle/fluid/operators/controlflow/logical_op_npu.cc +++ b/paddle/fluid/operators/controlflow/logical_op_npu.cc @@ -9,7 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/controlflow/logical_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/dot_op.cc b/paddle/fluid/operators/dot_op.cc index ed2b09796eeeb8ce18fdc47be58347d85e6e1a80..a86a3bb35927d53d20bef91a0bf36695a268c348 100644 --- a/paddle/fluid/operators/dot_op.cc +++ b/paddle/fluid/operators/dot_op.cc @@ -14,6 +14,10 @@ #include "paddle/fluid/operators/dot_op.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" + namespace paddle { namespace operators { @@ -21,51 +25,6 @@ class DotOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(true, ctx->HasInput("X"), - platform::errors::PreconditionNotMet( - "Input(X) of DotOp should not be null.")); - PADDLE_ENFORCE_EQ(true, ctx->HasInput("Y"), - platform::errors::PreconditionNotMet( - "Input(Y) of DotOp should not be null.")); - PADDLE_ENFORCE_EQ(true, ctx->HasOutput("Out"), - platform::errors::PreconditionNotMet( - "Output(Out) of DotOp should not be null.")); - - auto x_dims = ctx->GetInputDim("X"); - auto x_rank = static_cast(x_dims.size()); - PADDLE_ENFORCE_EQ(true, 1 == x_rank || 2 == x_rank, - platform::errors::PreconditionNotMet( - "ShapeError: The dimensions of input tensor X (%s) " - "should be 1 or 2", - x_dims.to_str())); - - auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ( - true, x_rank == (size_t)y_dims.size(), - platform::errors::PreconditionNotMet( - "ShapeError: The shape of input tensor Y: %s should match with " - "input tenosr X: %s", - y_dims.to_str(), x_dims.to_str())); - bool shape_match = true; - for (size_t i = 0; i < x_rank; ++i) { - if (x_dims[i] != y_dims[i]) { - shape_match = false; - break; - } - } - - PADDLE_ENFORCE_EQ(true, shape_match, - platform::errors::PreconditionNotMet( - "ShapeError: The shape of input tensor X: %s should " - "be exactly the same " - "with input tensor Y: %s", - x_dims.to_str(), y_dims.to_str())); - auto dims = vectorize(x_dims); - dims[dims.size() - 1] = 1; - ctx->SetOutputDim("Out", phi::make_ddim(dims)); - } - framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { return framework::OpKernelType( @@ -142,9 +101,13 @@ class DotOpGradMaker : public framework::SingleGradOpMaker { namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(dot, DotInferShapeFunctor, + PT_INFER_META(phi::DotInferMeta)); + REGISTER_OPERATOR(dot, ops::DotOp, ops::DotOpMaker, ops::DotOpGradMaker, - ops::DotOpGradMaker); + ops::DotOpGradMaker, + DotInferShapeFunctor); REGISTER_OPERATOR(dot_grad, ops::DotGradOp); diff --git a/paddle/fluid/operators/gather_op.cc b/paddle/fluid/operators/gather_op.cc index cf4d7b1d670b8add6ff5a138851c6a23ee54169e..8a405cc6fc1baefe997fb5b6133a56d6a2fc0438 100644 --- a/paddle/fluid/operators/gather_op.cc +++ b/paddle/fluid/operators/gather_op.cc @@ -201,12 +201,14 @@ REGISTER_OPERATOR(gather_grad, ops::GatherGradOp, REGISTER_OP_CPU_KERNEL(gather, 
ops::GatherOpKernel, ops::GatherOpKernel, ops::GatherOpKernel, ops::GatherOpKernel, - ops::GatherOpKernel); + ops::GatherOpKernel, + ops::GatherOpKernel); REGISTER_OP_CPU_KERNEL(gather_grad, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, ops::GatherGradientOpKernel, - ops::GatherGradientOpKernel); + ops::GatherGradientOpKernel, + ops::GatherGradientOpKernel); REGISTER_OP_VERSION(gather) .AddCheckpoint(R"ROC(upgrad gather, add a new input [Axis])ROC", paddle::framework::compatible::OpVersionDesc().NewInput( diff --git a/paddle/fluid/operators/gather_op.cu b/paddle/fluid/operators/gather_op.cu index 19568835a6e96080bb1c0af642bf9cb19c346bf9..a502a13040949a34e88a4d585327a58ffe92562c 100644 --- a/paddle/fluid/operators/gather_op.cu +++ b/paddle/fluid/operators/gather_op.cu @@ -130,9 +130,11 @@ REGISTER_OP_CUDA_KERNEL(gather, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, ops::GatherOpCUDAKernel, - ops::GatherOpCUDAKernel); + ops::GatherOpCUDAKernel, + ops::GatherOpCUDAKernel); REGISTER_OP_CUDA_KERNEL(gather_grad, ops::GatherGradOpCUDAKernel, ops::GatherGradOpCUDAKernel, ops::GatherGradOpCUDAKernel, ops::GatherGradOpCUDAKernel, - ops::GatherGradOpCUDAKernel); + ops::GatherGradOpCUDAKernel, + ops::GatherGradOpCUDAKernel); diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 72a90d17998d84f0d0d4e081543acae94756e635..b376334f1e93cc3be9e716d808525011edb29b94 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -29,6 +29,7 @@ namespace operators { using DataLayout = framework::DataLayout; enum GroupNormKernelFlags { kHasScale = 1, kHasBias = 2 }; +#define ALIGN_BYTES 16 #define CHECK_CASE(i, flags, kernel_name, ...) 
\ if (i == flags) { \ @@ -56,8 +57,7 @@ __device__ __inline__ void CudaAtomicAddWithWarp(T* sum, T value) { template __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, int imsize, int groups, - int group_size, T* mean, T* var, - const DataLayout data_layout) { + int group_size, T* mean, T* var) { int gid = blockIdx.y; int cid = blockIdx.x; int bid = blockIdx.z; @@ -68,13 +68,10 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, T x_mean = 0, x_var = 0; for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { T val; - if (data_layout == DataLayout::kNCHW) { - val = x[(bid * C + ccid) * imsize + imid]; - } else { - int hid = imid / W; - int wid = imid % W; - val = x[(bid * H + hid) * W * C + wid * C + ccid]; - } + int hid = imid / W; + int wid = imid % W; + val = x[(bid * H + hid) * W * C + wid * C + ccid]; + x_mean += val; x_var += val * val; } @@ -84,6 +81,85 @@ __global__ void GroupNormForwardGetMeanAndVar(const T* x, int N, int C, int W, CudaAtomicAddWithWarp(&var[bid * groups + gid], x_var); } +template +__device__ __forceinline__ void ThreadReduce(const T* input, int size, + const int offset, AccT* mean, + AccT* var) { + using VecT = kps::details::VectorType; + int tid = threadIdx.x; + if (offset > 0) { + input -= offset; + size += offset; + if (tid >= offset) { + AccT temp = input[tid]; + *mean += temp; + *var += temp * temp; + } + size -= blockDim.x; + input += blockDim.x; + } + int remain = size % (VecSize * blockDim.x); + + T ins[VecSize]; + VecT* ins_vec = reinterpret_cast(&ins); + + // vector part + for (; VecSize * tid < (size - remain); tid += blockDim.x) { + *ins_vec = reinterpret_cast(input)[tid]; + +#pragma unroll + for (int i = 0; i < VecSize; ++i) { + AccT temp = ins[i]; + *mean += temp; + *var += temp * temp; + } + } + + // scalar part + tid = size - remain + threadIdx.x; + for (; tid < size; tid += blockDim.x) { + AccT temp = input[tid]; + *mean += temp; + *var += temp * temp; + } +} + +template +__global__ void ScalarGetMeanAndVarNCHW(const T* x, T* mean, T* var, int size) { + int i = blockIdx.x; + T x_mean = 0, x_var = 0; + for (int j = threadIdx.x; j < size; j += blockDim.x) { + T val; + val = x[i * size + j]; + x_mean += val; + x_var += val * val; + } + x_mean /= size; + x_var /= size; + CudaAtomicAddWithWarp(&mean[i], x_mean); + CudaAtomicAddWithWarp(&var[i], x_var); +} + +template +__global__ void VectorizedGetMeanAndVarNCHW(const T* x, T* mean, T* var, + int size) { + int i = blockIdx.x; + AccT x_mean = static_cast(0); + AccT x_var = static_cast(0); + const int input_offset = ((uint64_t)x) % ALIGN_BYTES / sizeof(T); + x += i * size; + ThreadReduce(x, size, input_offset, &x_mean, &x_var); + x_mean = kps::details::BlockXReduce>( + x_mean, kps::AddFunctor()); + x_var = kps::details::BlockXReduce>( + x_var, kps::AddFunctor()); + __syncthreads(); + if (threadIdx.x == 0) { + mean[i] = static_cast(x_mean / size); + var[i] = static_cast(x_var / size); + } +} + template __global__ void GroupNormForward(const T* x, const T* mean, const T* var, const T* scale, const T* bias, int N, int C, @@ -96,26 +172,34 @@ __global__ void GroupNormForward(const T* x, const T* mean, const T* var, int H = imsize / W; int ccid = gid * group_size + cid; if (ccid >= C) return; - T x_mean = mean[bid * groups + gid]; - T x_var = var[bid * groups + gid]; + auto ng = bid * groups + gid; + T x_mean = mean[ng]; + T x_var = var[ng]; x_var = x_var - x_mean * x_mean; - T var_inv = 1.0 / sqrt(x_var + epsilon); - if (cid == 0 && 
threadIdx.x == 0) real_var[bid * groups + gid] = x_var; + T var_inv = rsqrt(x_var + epsilon); + if (cid == 0 && threadIdx.x == 0) { + real_var[ng] = x_var; + } for (int imid = threadIdx.x; imid < imsize; imid += blockDim.x) { T val; int hid, wid; + int index = (bid * C + ccid) * imsize + imid; if (data_layout == DataLayout::kNCHW) { - val = x[(bid * C + ccid) * imsize + imid]; + val = x[index]; } else { hid = imid / W; wid = imid % W; val = x[(bid * H + hid) * W * C + wid * C + ccid]; } val = (val - x_mean) * var_inv; - if (flags & kHasScale) val *= scale[gid * group_size + cid]; - if (flags & kHasBias) val += bias[gid * group_size + cid]; + if (flags & kHasScale) { + val *= scale[ccid]; + } + if (flags & kHasBias) { + val += bias[ccid]; + } if (data_layout == DataLayout::kNCHW) { - y[(bid * C + ccid) * imsize + imid] = val; + y[index] = val; } else { y[(bid * H + hid) * W * C + wid * C + ccid] = val; } @@ -182,16 +266,41 @@ class GroupNormKernel imsize *= x_dims[i]; } } + #ifdef __HIPCC__ int block_size = std::max(std::min(256, imsize), 64); #else int block_size = std::min(1024, imsize); #endif + dim3 grid(group_size, groups, x_dims[0]); dim3 threads(block_size, 1, 1); - GroupNormForwardGetMeanAndVar<<>>( - x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data, - temp_var_data, data_layout); + if (data_layout == DataLayout::kNCHW) { + using AccT = typename details::MPTypeTrait::Type; + constexpr int vec_size = sizeof(float4) / sizeof(T); + int size = group_size * imsize; + const int max_num_threads = 1024; + int max_block_size = std::min(size / vec_size, max_num_threads); + int block_size_nchw = 1; + while (block_size_nchw < max_block_size) { + block_size_nchw *= 2; + } + block_size_nchw = std::max(block_size_nchw, kps::details::kWarpSize); + dim3 grids(x_dims[0] * groups); + dim3 blocks(block_size_nchw); + if (size < vec_size) { + ScalarGetMeanAndVarNCHW<<>>( + x_data, mean_data, temp_var_data, size); + } else { + VectorizedGetMeanAndVarNCHW< + T, AccT, vec_size><<>>( + x_data, mean_data, temp_var_data, size); + } + } else { + GroupNormForwardGetMeanAndVar<<>>( + x_data, x_dims[0], C, W, imsize, groups, group_size, mean_data, + temp_var_data); + } int flags = (scale_data != nullptr) * kHasScale + (bias_data != nullptr) * kHasBias; UNROLL_ALL_CASES(flags, GroupNormForward, x_data, mean_data, temp_var_data, diff --git a/paddle/fluid/operators/index_sample_op.cc b/paddle/fluid/operators/index_sample_op.cc index 2d97797cfec21ed50f0999fa13f8bb1ae9618b71..68d002fceea70fd032d7613802d095770d3d4754 100644 --- a/paddle/fluid/operators/index_sample_op.cc +++ b/paddle/fluid/operators/index_sample_op.cc @@ -12,12 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
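
For clarity, the NCHW fast path added above (ThreadReduce plus VectorizedGetMeanAndVarNCHW) accumulates, per (batch, group), the mean of x and the mean of x*x; GroupNormForward later recovers the variance as E[x^2] - E[x]^2. A scalar CPU reference of that accumulation, with illustrative names, is:

// Reference of what the vectorized NCHW kernel accumulates per (batch, group):
// mean = E[x], and the "var" buffer initially holds E[x^2].
#include <cstddef>

template <typename T, typename AccT>
void GetMeanAndSquareMeanNCHWRef(const T* x, std::size_t size,
                                 AccT* mean, AccT* sq_mean) {
  AccT m = 0, s = 0;
  for (std::size_t i = 0; i < size; ++i) {
    AccT v = static_cast<AccT>(x[i]);
    m += v;
    s += v * v;
  }
  *mean = m / static_cast<AccT>(size);
  *sq_mean = s / static_cast<AccT>(size);
}

The CUDA version only changes how this sum is produced: each thread reads VecSize elements at a time (after handling the misaligned prefix), and a block reduction combines the per-thread partial sums.
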
*/ -#include "paddle/fluid/operators/index_sample_op.h" #include #include "paddle/fluid/framework/no_need_buffer_vars_inference.h" -#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/binary.h" namespace paddle { namespace operators { class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker { @@ -42,44 +44,6 @@ class IndexSampleOpMaker : public framework::OpProtoAndCheckerMaker { class IndexSampleOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Inputs(Input) of FindByIndex should not be null.")); - PADDLE_ENFORCE_EQ(ctx->HasInput("Index"), true, - platform::errors::InvalidArgument( - "Inputs(Index) of FindByIndex should not be null.")); - - auto input_dims = ctx->GetInputDim("X"); - PADDLE_ENFORCE_EQ( - input_dims.size(), 2, - platform::errors::InvalidArgument( - "Inputs(X) shape of IndexSample op should be 2-D, but " - "got X's shape = [%s], please check X shape.", - input_dims)); - - auto index_dims = ctx->GetInputDim("Index"); - PADDLE_ENFORCE_EQ( - input_dims.size(), 2, - platform::errors::InvalidArgument( - "Inputs(Index) shape of IndexSample op should be 2-D, but " - "got Index's shape [%s] , please check index shape.", - input_dims)); - if (ctx->IsRuntime()) { - PADDLE_ENFORCE_EQ(input_dims[0], index_dims[0], - platform::errors::InvalidArgument( - "Inputs(X)'s value of dimension 0 must same with " - "Inputs(Index)'s value of dimension 0, but " - "got %d of Inputs(X), and got %d of Inputs(Index), " - "please check Inputs shape.", - input_dims[0], index_dims[0])); - } - ctx->SetOutputDim("Out", index_dims); - auto type = ctx->GetInputsVarType("Index")[0]; - if (type == framework::proto::VarType::LOD_TENSOR) { - ctx->ShareLoD("Index", /*->*/ "Out"); - } - } protected: framework::OpKernelType GetExpectedKernelType( @@ -136,20 +100,11 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSampleGradNoNeedBufferVarInferer, "X"); } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(index_sample, IndexSampleInferShapeFunctor, + PT_INFER_META(phi::IndexSampleInferMeta)); REGISTER_OPERATOR(index_sample, ops::IndexSampleOp, ops::IndexSampleOpMaker, ops::IndexSampleGradMaker, - ops::IndexSampleGradMaker); + ops::IndexSampleGradMaker, + IndexSampleInferShapeFunctor); REGISTER_OPERATOR(index_sample_grad, ops::IndexSampleGradOp, ops::IndexSampleGradNoNeedBufferVarInferer); -REGISTER_OP_CPU_KERNEL( - index_sample, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel); -REGISTER_OP_CPU_KERNEL( - index_sample_grad, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel); diff --git a/paddle/fluid/operators/index_sample_op.cu b/paddle/fluid/operators/index_sample_op.cu deleted file mode 100644 index e8acbfb8be990a422e5a16e8871d47f55af6620c..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/index_sample_op.cu +++ /dev/null @@ -1,215 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
-// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/index_sample_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" -#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -#define PREDEFINED_BLOCK_SIZE_X 512 -#define PREDEFINED_BLOCK_SIZE 1024 -#define MIN(a, b) ((a) < (b) ? (a) : (b)) - -namespace paddle { -namespace operators { - -namespace { -void LimitGridDim(const framework::ExecutionContext& ctx, dim3* grid_dim) { - auto max_grid_dim = ctx.template device_context() - .GetCUDAMaxGridDimSize(); - grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; - grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; -} -} - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; - -template -__global__ void IndexSampleForward(const IndexT* index, const T* in_data, - T* out_data, size_t index_length, - size_t input_length, size_t batch_size) { - unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; - for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { - index_i = blockDim.x * blockIdx.x + threadIdx.x; - for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { - unsigned int index_idx = index_j * index_length + index_i; - unsigned int in_idx = index_j * input_length + index_i; - IndexT sample_idx = index[index_idx]; - out_data[index_idx] = in_data[in_idx - index_i + sample_idx]; - } - } -} - -template -__global__ void IndexSampleGrad(const IndexT* index, T* in_grad, - const T* out_grad, size_t index_length, - size_t input_length, size_t batch_size, - bool same_data_in_row = true) { - unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; - unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; - - for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { - index_i = blockDim.x * blockIdx.x + threadIdx.x; - for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { - unsigned int index_idx = index_j * index_length + index_i; - unsigned int in_idx = index_j * input_length + index_i; - IndexT sample_idx = index[index_idx]; - if (same_data_in_row) { - platform::CudaAtomicAdd(&(in_grad[in_idx - index_i + sample_idx]), - out_grad[sample_idx]); - } else { - in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx]; - } - } - } -} - -template -class IndexSampleKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* input = ctx.Input("X"); - auto* index = ctx.Input("Index"); - auto* output = ctx.Output("Out"); - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - 
platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - const auto* in_data = input->data(); - auto* out_data = output->mutable_data(ctx.GetPlace()); - auto stream = - ctx.template device_context().stream(); - - auto input_dim = input->dims(); - auto index_dim = index->dims(); - size_t batch_size = input_dim[0]; - size_t input_length = input_dim[1]; - size_t index_length = index_dim[1]; - - auto block_width = platform::RoundToPowerOfTwo(index_length); - block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); - int block_height = - platform::RoundToPowerOfTwo(index_length * batch_size) / block_width; - block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); - dim3 block_dim(block_width, block_height); - dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, - (batch_size + block_dim.y - 1) / block_dim.y); - LimitGridDim(ctx, &grid_dim); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - IndexSampleForward<<>>( - index_data, in_data, out_data, index_length, input_length, - batch_size); - } else if (index_type == framework::proto::VarType::INT32) { - const int* index_data = index->data(); - IndexSampleForward<<>>( - index_data, in_data, out_data, index_length, input_length, - batch_size); - } - } -}; - -template -class IndexSampleGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& ctx) const override { - auto* output_grad = ctx.Input(framework::GradVarName("Out")); - auto* input_grad = ctx.Output(framework::GradVarName("X")); - auto* index = ctx.Input("Index"); - - const auto* output_grad_data = output_grad->data(); - auto* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); - - const auto& index_type = framework::TransToProtoVarType(index->dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT64 || - index_type == framework::proto::VarType::INT32; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - - auto stream = - ctx.template device_context().stream(); - auto input_num = input_grad->numel(); - auto input_dim = input_grad->dims(); - auto index_dim = index->dims(); - size_t batch_size = index_dim[0]; - size_t input_length = input_dim[1]; - size_t index_length = index_dim[1]; - bool same_data_in_index_row = index_length == 1 ? 
false : true; - - auto block_width = platform::RoundToPowerOfTwo(index_length); - block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); - auto block_height = - platform::RoundToPowerOfTwo(index_length * batch_size) / block_width; - block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); - dim3 block_dim(block_width, block_height); - dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, - (batch_size + block_dim.y - 1) / block_dim.y); - LimitGridDim(ctx, &grid_dim); - - phi::funcs::SetConstant set_zero; - auto& dev_ctx = ctx.template device_context(); - set_zero(dev_ctx, input_grad, static_cast(0)); - - if (index_type == framework::proto::VarType::INT64) { - const int64_t* index_data = index->data(); - IndexSampleGrad<<>>( - index_data, input_grad_data, output_grad_data, index_length, - input_length, batch_size, same_data_in_index_row); - } else if (index_type == framework::proto::VarType::INT32) { - const int* index_data = index->data(); - IndexSampleGrad<<>>( - index_data, input_grad_data, output_grad_data, index_length, - input_length, batch_size, same_data_in_index_row); - } - } -}; - -} // namespace operators -} // namespace paddle - -namespace ops = paddle::operators; -REGISTER_OP_CUDA_KERNEL( - index_sample, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel, - ops::IndexSampleKernel); -REGISTER_OP_CUDA_KERNEL( - index_sample_grad, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel, - ops::IndexSampleGradKernel); diff --git a/paddle/fluid/operators/index_sample_op.h b/paddle/fluid/operators/index_sample_op.h deleted file mode 100644 index 6cc8ff04c544554e805c605783c9bedf1b9fcb7b..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/index_sample_op.h +++ /dev/null @@ -1,198 +0,0 @@ -/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#pragma once - -#include -#include -#include -#include -#include -#include -#include "gflags/gflags.h" -#include "paddle/fluid/framework/convert_utils.h" -#include "paddle/fluid/framework/op_registry.h" - -namespace paddle { -namespace operators { - -using Tensor = framework::Tensor; -using LoDTensor = framework::LoDTensor; -using DDim = framework::DDim; - -template -void IndexSampleInner(const framework::ExecutionContext &context, - const LoDTensor &input, const LoDTensor &index, - LoDTensor *output) { - auto input_dims = input.dims(); - auto index_dims = index.dims(); - - int batch_size = input_dims[0]; - auto value_length = input_dims[1]; - auto index_length = index_dims[1]; - int index_ids_num = index.numel(); - - std::vector input_vec; - std::vector index_vec; - paddle::framework::TensorToVector(input, context.device_context(), - &input_vec); - paddle::framework::TensorToVector(index, context.device_context(), - &index_vec); - - std::vector res(index_ids_num); - for (int i = 0; i < index_ids_num; i++) { - int b = floor(i / index_length); - PADDLE_ENFORCE_GE( - index_vec[i], 0, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - PADDLE_ENFORCE_LT( - index_vec[i], value_length, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - - int v_i = b * value_length + static_cast(index_vec[i]); - T v = input_vec[v_i]; - VLOG(4) << "Index Sample: batch = " << b << " index = " << v_i - << " value = " << v; - res[i] = v; - } - - auto ddim = phi::make_ddim({batch_size, index_length}); - output->mutable_data(context.GetPlace()); - framework::TensorFromVector(res, context.device_context(), output); - output->Resize(ddim); -} - -template -class IndexSampleKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &ctx) const override { - auto *input_var = ctx.InputVar("X"); - auto *index_var = ctx.InputVar("Index"); - - auto &input_tensor = input_var->Get(); - auto &index_tensor = index_var->Get(); - - auto *out_var = ctx.OutputVar("Out"); - auto *out_tensor = out_var->GetMutable(); - - const auto &index_type = - framework::TransToProtoVarType(index_tensor.dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - IndexSampleInner(ctx, input_tensor, index_tensor, out_tensor); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSampleInner(ctx, input_tensor, index_tensor, out_tensor); - } - } -}; - -template -void IndexSampleGradInner(const framework::ExecutionContext &context, - const LoDTensor &out_grad, const LoDTensor &index, - LoDTensor *x_grad) { - std::vector out_grad_vec; - std::vector index_vec; - paddle::framework::TensorToVector(out_grad, context.device_context(), - &out_grad_vec); - paddle::framework::TensorToVector(index, context.device_context(), - 
&index_vec); - - auto index_dims = index.dims(); - auto x_grad_dims = x_grad->dims(); - - auto value_length = x_grad_dims[1]; - auto index_length = index_dims[1]; - int index_ids_num = index.numel(); - - std::vector x_grad_vec(x_grad->numel(), 0); - - for (int i = 0; i < index_ids_num; i++) { - int b = floor(i / index_length); - PADDLE_ENFORCE_GE( - index_vec[i], 0, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample_grad) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - PADDLE_ENFORCE_LT( - index_vec[i], value_length, - platform::errors::InvalidArgument( - "Variable value (index) of OP(index_sample_grad) " - "expected >= 0 and < %ld, but got %ld. Please check input " - "value.", - value_length, index_vec[i])); - int v_i = b * value_length + static_cast(index_vec[i]); - x_grad_vec[v_i] += out_grad_vec[i]; - } - x_grad->mutable_data(context.GetPlace()); - framework::TensorFromVector(x_grad_vec, context.device_context(), x_grad); - x_grad->Resize(x_grad_dims); -} - -template -class IndexSampleGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext &context) const override { - auto *index_var = context.InputVar("Index"); - auto *x_grad_var = context.OutputVar(framework::GradVarName("X")); - auto *out_grad_var = context.InputVar(framework::GradVarName("Out")); - - auto &index_tensor = index_var->Get(); - auto &out_grad_tensor = out_grad_var->Get(); - auto *x_grad_tensor = x_grad_var->GetMutable(); - - const auto &index_type = - framework::TransToProtoVarType(index_tensor.dtype()); - bool index_type_match = index_type == framework::proto::VarType::INT32 || - index_type == framework::proto::VarType::INT64; - PADDLE_ENFORCE_EQ(index_type_match, true, - platform::errors::InvalidArgument( - "Input(Index) holds the wrong type, it holds %s, but " - "desires to be %s or %s", - paddle::framework::DataTypeToString(index_type), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT32), - paddle::framework::DataTypeToString( - framework::proto::VarType::INT64))); - if (index_type == framework::proto::VarType::INT32) { - IndexSampleGradInner(context, out_grad_tensor, index_tensor, - x_grad_tensor); - } else if (index_type == framework::proto::VarType::INT64) { - IndexSampleGradInner(context, out_grad_tensor, index_tensor, - x_grad_tensor); - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/index_sample_op_npu.cc b/paddle/fluid/operators/index_sample_op_npu.cc index f460d0622bccc2e71b1e147c0c9add688c3b11c4..38eb5b4514993412fa3a6c96ccc92e899c57b205 100644 --- a/paddle/fluid/operators/index_sample_op_npu.cc +++ b/paddle/fluid/operators/index_sample_op_npu.cc @@ -12,8 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
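
The index_sample CPU and CUDA kernels deleted above are being migrated to the phi kernel library; their semantics, written as a host-side reference with hypothetical names, are a per-row gather:

// Reference semantics of index_sample: out[b][i] = x[b][ index[b][i] ].
#include <cstddef>
#include <vector>

template <typename T, typename IndexT>
std::vector<T> IndexSampleRef(const std::vector<T>& x,
                              const std::vector<IndexT>& index,
                              int batch_size, int input_length,
                              int index_length) {
  std::vector<T> out(static_cast<std::size_t>(batch_size) * index_length);
  for (int b = 0; b < batch_size; ++b) {
    for (int i = 0; i < index_length; ++i) {
      // Each index value is expected to lie in [0, input_length).
      IndexT idx = index[b * index_length + i];
      out[b * index_length + i] = x[b * input_length + idx];
    }
  }
  return out;
}
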
*/ -#include "paddle/fluid/operators/index_sample_op.h" - +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/layer_norm_kernel.cu.h b/paddle/fluid/operators/layer_norm_kernel.cu.h index b31c7a1cde0f18edb00435805ce4b2a089f9eb1a..62c21dd2eee401e5f8a526870015c18cf13ee873 100644 --- a/paddle/fluid/operators/layer_norm_kernel.cu.h +++ b/paddle/fluid/operators/layer_norm_kernel.cu.h @@ -474,11 +474,11 @@ __global__ __launch_bounds__(THREADS_PER_CTA) void fused_ln_bwd_1024_kernel( for (int it = 0; it < LDGS; it++) { #pragma unroll for (int jt = 0; jt < VecSize; jt++) { - U x_tmp = x[it][jt]; + U x_tmp = static_cast(x[it][jt]); U y_tmp = var_cur_row * (x_tmp - mean_cur_row); U dy_tmp = static_cast(gamma[it][jt]) * - static_cast(dout[it][jt]); // scale * dy - U dout_tmp = dout[it][jt]; // dy + static_cast(dout[it][jt]); // scale * dy + U dout_tmp = static_cast(dout[it][jt]); // dy // used for get dx (row reduction) sum_loss1 += dy_tmp; // scale * dy, sum_1 diff --git a/paddle/fluid/operators/layer_norm_op.cu b/paddle/fluid/operators/layer_norm_op.cu index d439b3220d96ecd1107d6c29850d3d5356a01e09..dfe73d3727132ae9b8f71e2a415ef5193f303493 100644 --- a/paddle/fluid/operators/layer_norm_op.cu +++ b/paddle/fluid/operators/layer_norm_op.cu @@ -259,6 +259,21 @@ REGISTER_OP_CUDA_KERNEL( ops::LayerNormGradKernel, ops::LayerNormGradKernel); +#elif CUDNN_VERSION_MIN(8, 1, 0) +REGISTER_OP_CUDA_KERNEL( + layer_norm, + ops::LayerNormKernel, + ops::LayerNormKernel, + ops::LayerNormKernel, + ops::LayerNormKernel); +REGISTER_OP_CUDA_KERNEL( + layer_norm_grad, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel, + ops::LayerNormGradKernel); #else REGISTER_OP_CUDA_KERNEL( layer_norm, diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index fcd5c06a6f310f8a23608a77f2d6b9098e99b33a..5ac39953462b5078aa663a7f39f5eb95c96bae7a 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/device/device_wrapper.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/operators/mkldnn/axpy_handler.h" @@ -502,32 +503,29 @@ struct MergeAdd { out.mutable_value()->mutable_data( phi::make_ddim({static_cast(merge_rows.size()), input_width}), context.GetPlace()); - int r = - xpu::constant(context.x_context(), out.mutable_value()->data(), - merge_rows.size() * input_width, static_cast(0.f)); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU constant op return" - " wrong value[%d %s].", - r, XPUAPIErrorMsg[r])); std::unordered_map rows_to_id; for (size_t i = 0; i < merge_rows.size(); ++i) { rows_to_id[merge_rows[i]] = i; } - auto* out_data = out.mutable_value()->data(); - auto* input_data = input.value().data(); + auto* y_data = out.mutable_value()->data(); + auto* x_data = input.value().data(); + int xm = input_rows.size(); + int ym = merge_rows.size(); int n = input_width; - for (size_t i = 0; i < input_rows.size(); i++) { - size_t out_i = rows_to_id[input_rows[i]]; - auto r = xpu::add(context.x_context(), &input_data[i * input_width], - &out_data[out_i * input_width], - &out_data[out_i * input_width], n); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU API return wrong value[%d %s], ", r, - XPUAPIErrorMsg[r])); - } + + xpu::ctx_guard RAII_GUARD(context.x_context()); + int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm(xm); + int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm(ym); + memory::Copy(context.GetPlace(), y_rows_data, platform::CPUPlace(), + merge_rows.data(), ym * sizeof(int64_t)); + memory::Copy(context.GetPlace(), x_rows_data, platform::CPUPlace(), + input_rows.data(), xm * sizeof(int64_t)); + int r = + xpu::merge_dup_rows(context.x_context(), x_data, y_data, + x_rows_data, y_rows_data, xm, n, ym); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "merge_dup_rows"); } void operator()(const platform::XPUDeviceContext& context, @@ -582,15 +580,7 @@ struct MergeAdd { {static_cast(merged_row_set.size()), input_width}), context.GetPlace()); - int r = - xpu::constant(context.x_context(), out.mutable_value()->data(), - merge_rows.size() * input_width, static_cast(0.f)); - PADDLE_ENFORCE_EQ(r, xpu::Error_t::SUCCESS, - platform::errors::External("XPU constant op return" - " wrong value[%d %s].", - r, XPUAPIErrorMsg[r])); - - float* out_data = reinterpret_cast(out.mutable_value()->data()); + float* y_data = reinterpret_cast(out.mutable_value()->data()); std::unordered_map rows_to_id; for (size_t i = 0; i < merge_rows.size(); ++i) { @@ -603,17 +593,22 @@ struct MergeAdd { } auto& input_rows = input->rows(); + auto* x_data = input->value().data(); + int xm = input_rows.size(); + int ym = merge_rows.size(); int n = input_width; - for (size_t i = 0; i < input_rows.size(); i++) { - size_t out_i = rows_to_id[input_rows[i]]; - auto r = xpu::add( - context.x_context(), input->value().data() + i * input_width, - &out_data[out_i * input_width], &out_data[out_i * input_width], n); - PADDLE_ENFORCE_EQ( - r, XPU_SUCCESS, - platform::errors::External("XPU API return wrong value[%d %s], ", r, - XPUAPIErrorMsg[r])); - } + + xpu::ctx_guard RAII_GUARD(context.x_context()); + int64_t* x_rows_data = RAII_GUARD.alloc_l3_or_gm(xm); + int64_t* y_rows_data = RAII_GUARD.alloc_l3_or_gm(ym); + memory::Copy(context.GetPlace(), y_rows_data, platform::CPUPlace(), + merge_rows.data(), ym * sizeof(int64_t)); + memory::Copy(context.GetPlace(), x_rows_data, platform::CPUPlace(), + 
input_rows.data(), xm * sizeof(int64_t)); + int r = + xpu::merge_dup_rows(context.x_context(), x_data, y_data, + x_rows_data, y_rows_data, xm, n, ym); + PADDLE_ENFORCE_XDNN_SUCCESS(r, "merge_dup_rows"); } } }; diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 8563d8b05b186c025ecc4c970a400765adeb0c5d..a4678550cf7bd0d4aa2759d4887dddabed5f9ba4 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -16,6 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" #include "paddle/phi/kernels/funcs/math_function.h" @@ -445,6 +446,7 @@ template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; template struct MergeAdd; +template struct MergeAdd; template struct MergeAdd>; template struct MergeAdd>; diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc index 28c6efef14178535d7f9473c2310552037952c9f..efec50efa92ea68cb68934bde32e1f56570b0868 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cc @@ -61,30 +61,31 @@ class DistributedFusedLambInitOpMaker "The fp32 beta1 power accumulator tensor. Its shape is [1]."); AddOutput("Beta2Pow", "The fp32 beta2 power accumulator tensor. Its shape is [1]."); - AddOutput("FusedIndices", - "The param index of each element in FP32FusedParam. Its shape is " - "[M1+M2]. It is like [0,0,0,1,1,1,1,2,2,...]."); AddOutput( "FusedParamOffsets", "The numel offset of each parameter inside the FP32FusedParam. Its " "shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 " - "+ n_2, ...]."); - AddOutput("FP32ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp32_local_param_num + 1]."); - AddOutput("FP16ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp16_local_param_num + 1]."); + "+ n_2, ...]. It should be in CPUPlace."); AddOutput( - "WeightDecay", - "The sharded fp32 weight decay tensor. Its shape is [(M1+M2)/N]."); + "FP32ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp32_local_param_num + 1]. It should be in CPUPlace."); + AddOutput( + "FP16ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp16_local_param_num + 1]. It should be in CPUPlace."); AddOutput("ParamInfo", "The param info. It should be in CPUPlace, and its shape is [6]" - "CPUPlace, and its shape is [6]. It is " + "CPUPlace, and its shape is [8]. It is " "[fp32_shard_param_start_idx, fp32_local_param_num, " - "fp32_global_param_num, fp16_shard_param_start_idx, " - "fp16_local_param_num, fp16_global_param_num]."); - + "fp32_global_param_num, fp32_weight_decay_end_idx, " + "fp16_shard_param_start_idx, " + "fp16_local_param_num, fp16_global_param_num, " + "fp16_weight_decay_end_idx]."); + AddOutput("ParamOrder", + "The reordered parameter order. 
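
The xpu::merge_dup_rows call introduced above replaces the previous constant-then-add loop; its effect, sketched device-independently with illustrative names, is to accumulate input rows that share a row id into a single output row of width n:

// Device-independent sketch of MergeAdd / merge_dup_rows behaviour.
#include <cstddef>
#include <cstdint>
#include <map>
#include <vector>

template <typename T>
void MergeDupRowsRef(const std::vector<int64_t>& x_rows, const T* x_data,
                     int n, std::vector<int64_t>* y_rows,
                     std::vector<T>* y_data) {
  std::map<int64_t, int> row_to_out;  // row id -> output row index
  for (int64_t row : x_rows) {
    if (row_to_out.emplace(row, static_cast<int>(row_to_out.size())).second) {
      y_rows->push_back(row);  // first occurrence defines the output order
    }
  }
  y_data->assign(y_rows->size() * static_cast<std::size_t>(n),
                 static_cast<T>(0));
  for (std::size_t i = 0; i < x_rows.size(); ++i) {
    int out_i = row_to_out[x_rows[i]];
    for (int j = 0; j < n; ++j) {
      (*y_data)[out_i * n + j] += x_data[i * n + j];
    }
  }
}
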
Inside this op, " + "the parameter would be reordered by data type and weight decay " + "value."); AddOutput("ParamOut", "The output parameter list.").AsDuplicable(); AddOutput("MasterParamOut", "The output master parameter list. It would share the memory of " @@ -96,10 +97,8 @@ class DistributedFusedLambInitOpMaker AddAttr("beta1", "The initial value of Beta1Pow."); AddAttr("beta2", "The initial value of Beta2Pow."); - AddAttr>( - "weight_decay", - "The weight decay for each parameter. Its " - "shape is equal to the global parameter number."); + AddAttr>("apply_weight_decay", + "Whether to apply weight decay."); AddAttr("alignment", "The alignment in bytes for the fused tensors."); AddAttr("rank", "The global rank of the current process."); AddAttr("nranks", "The global world size."); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu index 3445e9b658becda84aa678e9c1f03b3436d63b70..7d8a7186d58b402e208fc749524d996b351abeef 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_init_op.cu @@ -258,32 +258,6 @@ static void ShareBufferForNonInitedTensor(framework::Tensor *origin, << ") , dtype = " << fused_out->dtype(); } -template -static __global__ void LambFillFusedIndicesCUDAKernel(const OffsetT *offsets, - IndexT *out, - int offset_num, - int out_num) { - CUDA_KERNEL_LOOP_TYPE(i, out_num, int) { - auto idx = phi::funcs::LowerBound(offsets, offset_num, i); - if (idx == offset_num || offsets[idx] != i) { - --idx; - } - out[i] = idx; - } -} - -template -static void CopyVectorToTensor(const std::vector &src, - framework::Tensor *dst, - const platform::Place &place, - gpuStream_t stream) { - dst->Resize({static_cast(src.size())}); - T *dst_ptr = dst->mutable_data(place); - const T *src_ptr = src.data(); - auto nbytes = src.size() * sizeof(T); - memory::Copy(place, dst_ptr, platform::CPUPlace(), src_ptr, nbytes, stream); -} - template static void CopyVectorToCPUTensor(const std::vector &src, framework::Tensor *dst) { @@ -294,6 +268,42 @@ static void CopyVectorToCPUTensor(const std::vector &src, std::memcpy(dst_ptr, src_ptr, nbytes); } +static size_t ReorderParamGradInfoList(const std::vector &flags, + std::vector *infos) { + size_t n = infos->size(); + std::vector cur_flags; + cur_flags.reserve(n); + for (size_t i = 0; i < n; ++i) { + auto idx = (*infos)[i].idx; + cur_flags.push_back(flags[idx]); + } + + auto origin_infos = *infos; + size_t j = 0; + for (size_t i = 0; i < n; ++i) { + if (cur_flags[i]) { + (*infos)[j] = origin_infos[i]; + ++j; + } + } + size_t ret_idx = j; + + for (size_t i = 0; i < n; ++i) { + if (!cur_flags[i]) { + (*infos)[j] = origin_infos[i]; + ++j; + } + } + return ret_idx; +} + +template +static T ClipByBound(T x, T low_value, T high_value) { + if (x < low_value) return low_value; + if (x > high_value) return high_value; + return x; +} + template class DistributedFusedLambInitOpKernel : public framework::OpKernel { @@ -404,6 +414,24 @@ class DistributedFusedLambInitOpKernel info->numel_offset = 0; // not determined yet } } + const auto &apply_weight_decay = + ctx.Attr>("apply_weight_decay"); + size_t fp32_wd_end_idx = + ReorderParamGradInfoList(apply_weight_decay, &fp32_infos); + size_t fp16_wd_end_idx = + ReorderParamGradInfoList(apply_weight_decay, &fp16_infos); + + auto *param_order_t = ctx.Output("ParamOrder"); + auto param_num = fp32_infos.size() + fp16_infos.size(); + 
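
ReorderParamGradInfoList added above is a stable partition: parameters whose apply_weight_decay flag is set are moved to the front (keeping their relative order), and the return value marks where the decayed range ends. A simplified sketch follows; indexing flags directly by position rather than through info.idx is an assumption made for brevity:

// Stable partition by flag; returns the number of flagged entries,
// i.e. the "weight decay end index" used later in ParamInfo.
#include <cstddef>
#include <vector>

template <typename Info>
std::size_t StablePartitionByFlag(const std::vector<int>& flags,
                                  std::vector<Info>* infos) {
  std::vector<Info> origin = *infos;
  std::size_t j = 0;
  for (std::size_t i = 0; i < origin.size(); ++i) {
    if (flags[i]) (*infos)[j++] = origin[i];   // flagged entries first
  }
  std::size_t ret = j;
  for (std::size_t i = 0; i < origin.size(); ++i) {
    if (!flags[i]) (*infos)[j++] = origin[i];  // then the rest, order kept
  }
  return ret;
}
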
param_order_t->Resize({static_cast(param_num)}); + auto *param_order = param_order_t->mutable_data(platform::CPUPlace()); + for (size_t i = 0; i < fp32_infos.size(); ++i) { + param_order[i] = static_cast(fp32_infos[i].idx); + } + for (size_t i = 0; i < fp16_infos.size(); ++i) { + param_order[i + fp32_infos.size()] = static_cast(fp16_infos[i].idx); + } + VLOG(10) << "Fill ParamGradInfo ends"; // Step 2: determine the numel_with_padding and numel_offset @@ -568,45 +596,29 @@ class DistributedFusedLambInitOpKernel VLOG(10) << "Found the sharding arguments"; auto *param_info_t = ctx.Output("ParamInfo"); - param_info_t->Resize({6}); + param_info_t->Resize({8}); auto *param_info = param_info_t->mutable_data(platform::CPUPlace()); param_info[0] = static_cast(fp32_start_idx); param_info[1] = static_cast(fp32_local_param_num); param_info[2] = static_cast(fp32_infos.size()); - param_info[3] = static_cast(fp16_start_idx + fp32_infos.size()); - param_info[4] = static_cast(fp16_local_param_num); - param_info[5] = static_cast(fp16_infos.size()); + param_info[3] = ClipByBound(fp32_wd_end_idx, fp32_start_idx, + fp32_start_idx + fp32_local_param_num) - + static_cast(fp32_start_idx); + param_info[4] = static_cast(fp16_start_idx + fp32_infos.size()); + param_info[5] = static_cast(fp16_local_param_num); + param_info[6] = static_cast(fp16_infos.size()); + param_info[7] = ClipByBound(fp16_wd_end_idx, fp16_start_idx, + fp16_start_idx + fp16_local_param_num) - + static_cast(fp16_start_idx); VLOG(10) << "Start FP32 idx: " << param_info[0]; VLOG(10) << "Local FP32 param num: " << param_info[1]; VLOG(10) << "Global FP32 param num: " << param_info[2]; - VLOG(10) << "Start FP16 idx: " << param_info[3]; - VLOG(10) << "Local FP16 param num: " << param_info[4]; - VLOG(10) << "Global FP16 param num: " << param_info[5]; + VLOG(10) << "Start FP16 idx: " << param_info[4]; + VLOG(10) << "Local FP16 param num: " << param_info[5]; + VLOG(10) << "Global FP16 param num: " << param_info[6]; - // For WeightDecay, shard and perform H2D copy - const auto &origin_weight_decay = - ctx.Attr>("weight_decay"); - PADDLE_ENFORCE_EQ(params.size(), origin_weight_decay.size(), - platform::errors::InvalidArgument( - "The attr(weight_decay) should have the " - "same length with Input(Param).")); - std::vector shard_weight_decay; - shard_weight_decay.reserve(total_local_param_num); - for (size_t i = 0; i < fp32_local_param_num; ++i) { - shard_weight_decay.push_back( - origin_weight_decay[fp32_infos[i + fp32_start_idx].idx]); - } - for (size_t i = 0; i < fp16_local_param_num; ++i) { - shard_weight_decay.push_back( - origin_weight_decay[fp16_infos[i + fp16_start_idx].idx]); - } - - // For FusedIndices, launch CUDA kernel to do binary search - auto *fused_indices_t = ctx.Output("FusedIndices"); - fused_indices_t->Resize({static_cast(total_numel)}); - auto *fused_indices = fused_indices_t->mutable_data(place); std::vector numel_offsets; numel_offsets.reserve(params.size() + 1); for (const auto &info : fp32_infos) { @@ -621,21 +633,6 @@ class DistributedFusedLambInitOpKernel "The numel_offsets number must be one larger than " "the parameter number.")); VLOG(10) << "Total numel offset: " << FlattenToString(numel_offsets); - auto *fused_param_offset_t = - ctx.Output("FusedParamOffsets"); - fused_param_offset_t->Resize({static_cast(numel_offsets.size())}); - auto *fused_param_offset = fused_param_offset_t->mutable_data(place); - memory::Copy(place, fused_param_offset, platform::CPUPlace(), - numel_offsets.data(), - numel_offsets.size() * 
sizeof(numel_offsets[0]), stream); - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, total_numel); - LambFillFusedIndicesCUDAKernel<<>>( - fused_param_offset, fused_indices, numel_offsets.size() - 1, - total_numel); - - std::vector lengths; - lengths.reserve(fp32_local_param_num + fp16_local_param_num); std::vector fp32_partial_numel_offsets; fp32_partial_numel_offsets.reserve(fp32_local_param_num + 1); @@ -659,9 +656,9 @@ class DistributedFusedLambInitOpKernel VLOG(10) << "FP32 Partial numel = [" << valid_start_n + fp32_infos[i].numel << "," << end_n + fp32_infos[i].numel; - lengths.push_back(end_n - valid_start_n); + auto len = end_n - valid_start_n; fp32_partial_numel_offsets.push_back(fp32_partial_numel_offsets.back() + - lengths.back()); + len); } std::vector fp16_partial_numel_offsets; @@ -682,9 +679,9 @@ class DistributedFusedLambInitOpKernel PADDLE_ENFORCE_NE(valid_start_n, end_n, platform::errors::InvalidArgument( "Indices sharding error. This may be a bug.")); - lengths.push_back(end_n - valid_start_n); + auto len = end_n - valid_start_n; fp16_partial_numel_offsets.push_back(fp16_partial_numel_offsets.back() + - lengths.back()); + len); } CopyVectorToCPUTensor(numel_offsets, @@ -696,23 +693,6 @@ class DistributedFusedLambInitOpKernel fp16_partial_numel_offsets, ctx.Output("FP16ShardFusedParamOffsets")); - // Fill the weight decay tensor - PADDLE_ENFORCE_EQ(lengths.size(), shard_weight_decay.size(), - platform::errors::InvalidArgument( - "Invalid weight decay sharding. This may be a bug.")); - std::vector wd_cpu; - for (size_t i = 0; i < shard_weight_decay.size(); ++i) { - int len = lengths[i]; - for (int j = 0; j < len; ++j) { - wd_cpu.push_back(shard_weight_decay[i]); - } - } - PADDLE_ENFORCE_EQ(wd_cpu.size() * nranks, fp32_numel + fp16_numel, - platform::errors::InvalidArgument( - "Invalid weight decay sharding. This may be a bug.")); - CopyVectorToTensor(wd_cpu, ctx.Output("WeightDecay"), - place, stream); - auto *global_scale = ctx.Output("GlobalScale"); if (!global_scale->IsInitialized()) { TensorFillConstant(dev_ctx, global_scale, {1}, 1.0f); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc index e5b27446eb330aeb08e134332a5366c6c6ed2908..8f7c87912e93aa1bb3178d37afa641047e15a82b 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cc @@ -66,28 +66,31 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { "The fp32 beta1 power accumulator tensor. Its shape is [1]."); AddInput("Beta2Pow", "The fp32 beta2 power accumulator tensor. Its shape is [1]."); - AddInput("FusedIndices", - "The param index of each element in FP32FusedParam. Its shape is " - "[M1+M2]. It is like [0,0,0,1,1,1,1,2,2,...]."); AddInput( "FusedParamOffsets", "The numel offset of each parameter inside the FP32FusedParam. Its " "shape is [param_num + 1]. It is like [0, n_0, n_0 + n_1, n_0 + n_1 " - "+ n_2, ...]."); - AddInput("FP32ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp32_local_param_num + 1]."); - AddInput("FP16ShardFusedParamOffsets", - "The sharded numel offset of each parameter in the local rank. " - "Its shape is [fp16_local_param_num + 1]."); - AddInput("WeightDecay", - "The sharded fp32 weight decay tensor. Its shape is [(M1+M2)/N]."); + "+ n_2, ...]. 
It should be in CPUPlace."); + AddInput( + "FP32ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp32_local_param_num + 1]. It should be in CPUPlace."); + AddInput( + "FP16ShardFusedParamOffsets", + "The sharded numel offset of each parameter in the local rank. " + "Its shape is [fp16_local_param_num + 1]. It should be in CPUPlace."); AddInput("ParamInfo", "The param info. It should be in CPUPlace, and its shape is [6]" - "CPUPlace, and its shape is [6]. It is " + "CPUPlace, and its shape is [8]. It is " "[fp32_shard_param_start_idx, fp32_local_param_num, " - "fp32_global_param_num, fp16_shard_param_start_idx, " - "fp16_local_param_num, fp16_global_param_num]."); + "fp32_global_param_num, fp32_weight_decay_end_idx, " + "fp16_shard_param_start_idx, " + "fp16_local_param_num, fp16_global_param_num, " + "fp16_weight_decay_end_idx]."); + AddInput("ParamOrder", + "The reordered parameter order. Inside this op, " + "the parameter would be reordered by data type and weight decay " + "value."); AddInput("LearningRate", "The fp32 learning rate tensor. Its shape is [1]."); @@ -116,6 +119,7 @@ class DistributedFusedLambOpMaker : public framework::OpProtoAndCheckerMaker { "max_global_grad_norm", "The maximum global gradient l2-norm value for clipping. If " "max_global_grad_norm <= 0, no clipping would be performed."); + AddAttr("weight_decay", "The weight decay value."); AddAttr("clip_after_allreduce", "Whether to clip before allreduce, only valid when the " "world size is larger than 1."); diff --git a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu index 3f90140f77282983f42ef03f736c35960239dd75..ca0828a6f6ab71a010ae35318fed23a8072aa69d 100644 --- a/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu +++ b/paddle/fluid/operators/optimizers/distributed_fused_lamb_op.cu @@ -87,7 +87,7 @@ struct L2NormFunctor { } }; -template +template static __global__ void MultiTensorL2NormReduceAgainCUDAKernel( const InT *x, OutT *y, int max_chunk_num) { int tensor_id = blockIdx.x; @@ -100,11 +100,7 @@ static __global__ void MultiTensorL2NormReduceAgainCUDAKernel( } sum = BlockReduce(storage).Reduce(sum, cub::Sum()); if (threadIdx.x == 0) { - if (NeedSqrt) { - y[blockIdx.x] = static_cast(sqrtf(sum)); - } else { - y[blockIdx.x] = static_cast(sum); - } + y[blockIdx.x] = static_cast(sum); } } @@ -118,6 +114,7 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) { constexpr int vec8 = alignof(platform::AlignedVector); constexpr int vec4 = alignof(platform::AlignedVector); constexpr int vec2 = alignof(platform::AlignedVector); + chunk_size *= sizeof(T); if (address % vec8 == 0 && chunk_size % vec8 == 0) { return std::min(8, valid_vec_size); } else if (address % vec4 == 0 && chunk_size % vec4 == 0) { @@ -129,27 +126,26 @@ static int GetChunkedVecSize(const T *ptr, int chunk_size) { } } -#define PD_VEC_MULTI_TENSOR_APPLY_CASE(__vec_size, ...) \ - case __vec_size: { \ - constexpr int kVecSize = __vec_size; \ - __VA_ARGS__; \ - break; \ +#define PD_VEC_LAUNCH_KERNEL_CASE(__vec_size, ...) \ + case __vec_size: { \ + constexpr int kVecSize = __vec_size; \ + __VA_ARGS__; \ + break; \ } -#define PD_VEC_MULTI_TENSOR_APPLY(__vec_size, ...) 
\ - do { \ - switch (__vec_size) { \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(8, __VA_ARGS__); \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(4, __VA_ARGS__); \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(2, __VA_ARGS__); \ - PD_VEC_MULTI_TENSOR_APPLY_CASE(1, __VA_ARGS__); \ - } \ +#define PD_VEC_LAUNCH_KERNEL(__vec_size, ...) \ + do { \ + switch (__vec_size) { \ + PD_VEC_LAUNCH_KERNEL_CASE(8, __VA_ARGS__); \ + PD_VEC_LAUNCH_KERNEL_CASE(4, __VA_ARGS__); \ + PD_VEC_LAUNCH_KERNEL_CASE(2, __VA_ARGS__); \ + PD_VEC_LAUNCH_KERNEL_CASE(1, __VA_ARGS__); \ + } \ } while (0) // TODO(zengjinle): which chunk_size is better? -template +template static void MultiTensorL2Norm(const platform::CUDAPlace &place, gpuStream_t stream, const InT *x, const int *offsets, int n, OutT *y, @@ -158,7 +154,7 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place, constexpr int kNumTensor = MaxTensorNumPerLaunch; constexpr int kNumChunk = MaxChunkNumPerLaunch; - constexpr int kBlockDim = BlockDim; + constexpr int kBlockDim = 512; int max_chunk_num = -1; int vec_size = 8; @@ -181,22 +177,22 @@ static void MultiTensorL2Norm(const platform::CUDAPlace &place, auto *tmp_out_ptr = tmp_out.Alloc(n * max_chunk_num); FillZeroWithPtr(tmp_out_ptr, n * max_chunk_num, stream); -#define PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL \ - do { \ - using FunctorT = L2NormFunctor; \ - VLOG(10) << __func__ << " " << typeid(InT).name() \ - << " VecSize = " << kVecSize; \ - MultiTensorApply( \ - FunctorT(), stream, offsets, n, chunk_size, x, tmp_out_ptr, \ - max_chunk_num); \ +#define PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL \ + do { \ + using FunctorT = L2NormFunctor; \ + VLOG(10) << __func__ << " " << typeid(InT).name() \ + << " VecSize = " << kVecSize; \ + MultiTensorApply( \ + FunctorT(), stream, offsets, n, chunk_size, kBlockDim, x, tmp_out_ptr, \ + max_chunk_num); \ } while (0) - PD_VEC_MULTI_TENSOR_APPLY(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL); -#undef PD_LAUNCH_MULTI_TENSOR_APPLY_KERNEL + PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL); +#undef PD_LAUNCH_MULTI_TENSOR_APPLY_L2_NORM_KERNEL - MultiTensorL2NormReduceAgainCUDAKernel<<>>( - tmp_out_ptr, y, max_chunk_num); + MultiTensorL2NormReduceAgainCUDAKernel< + MT, OutT, kBlockDim><<>>(tmp_out_ptr, y, + max_chunk_num); } template @@ -208,34 +204,17 @@ static void LogParamAndTrustRatioDivSquareNorm( auto tensors = ctx.MultiInput("Param"); if (tensors.empty()) return; + const auto *order = ctx.Input("ParamOrder")->data(); + size_t n = tensors.size(); auto place = tensors[0]->place(); auto pn_vec = ToVector(param_square_norm, n, place); auto tn_vec = ToVector(trust_ratio_div_square_norm, n, place); - std::vector fp32_indices, fp16_indices; - fp32_indices.reserve(n); - fp16_indices.reserve(n); - for (size_t i = 0; i < n; ++i) { - const auto *t = tensors[i]; - if (t->dtype() == phi::DataType::FLOAT32) { - fp32_indices.push_back(i); - } else if (t->dtype() == phi::DataType::FLOAT16) { - fp16_indices.push_back(i); - } else { - PADDLE_THROW(platform::errors::InvalidArgument( - "Unsupported data type %s.", t->dtype())); - } - } - - for (auto idx : fp16_indices) { - fp32_indices.push_back(idx); - } - const auto &names = ctx.GetOp().Inputs("Param"); - for (size_t i = 0; i < fp32_indices.size(); ++i) { - auto idx = fp32_indices[i]; + for (size_t i = 0; i < n; ++i) { + auto idx = order[i]; VLOG(LogLevel) << "Param " << tensors[idx]->dtype() << " " << names[idx] << " pn = " << pn_vec[i] << " , tn = " << tn_vec[i]; } @@ -353,7 +332,7 @@ static __global__ void 
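
The vector size used by MultiTensorL2Norm comes from GetChunkedVecSize, which this diff fixes to compare the chunk size in bytes (the added chunk_size *= sizeof(T)). A simplified host-side sketch of that selection, assuming the alignment of AlignedVector<T, N> is N * sizeof(T) and omitting the valid_vec_size cap:

// Pick the widest vector width whose byte alignment divides both the
// pointer address and the chunk size in bytes.
#include <cstdint>
#include <initializer_list>

template <typename T>
int ChooseVecSizeRef(const T* ptr, int chunk_size_in_elems) {
  std::uintptr_t address = reinterpret_cast<std::uintptr_t>(ptr);
  int chunk_bytes = chunk_size_in_elems * static_cast<int>(sizeof(T));
  for (int vec : {8, 4, 2}) {
    int bytes = vec * static_cast<int>(sizeof(T));
    if (address % bytes == 0 && chunk_bytes % bytes == 0) return vec;
  }
  return 1;
}
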
CalcGradNormClipBeforeAllReduceScale( const T1 *__restrict__ global_scale, T1 max_global_grad_norm, const T1 *__restrict__ square_grad_norm, T1 *__restrict__ out1, T2 *__restrict__ out2, T1 clip_rescale_grad) { - T1 grad_norm = static_cast(sqrt(*square_grad_norm)) * clip_rescale_grad; + T1 grad_norm = static_cast(sqrtf(*square_grad_norm)) * clip_rescale_grad; T1 scale = global_scale[0] * max_global_grad_norm / (1e-6 + grad_norm); bool found_nan_inf = !isfinite(scale); if (scale >= 1 || found_nan_inf) { @@ -380,19 +359,24 @@ static __global__ void SetNanInfValueCUDAKernelTwoFlag(const bool *in_flag_p_1, ((*in_flag_p_1) || (*in_flag_p_2)) ? __int_as_float(0x7fffffffU) : 0.0f; } -// TODO(zengjinle): Vectorize this function -// NOTE: this method does not update Beta1Pow and Beta2Pow! -template -static __global__ void UpdateLambMoment( +template +static __global__ void UpdateLambMomentAndTrustRatioDivCUDAKernel( const T *__restrict__ param_p, const GradT *__restrict__ grad_p, const T *__restrict__ square_grad_norm_p, - const T *__restrict__ global_scale, const IndexT *__restrict__ indices, - const T *__restrict__ weight_decay_p, const T *__restrict__ beta1pow_p, + const T *__restrict__ global_scale, const T *__restrict__ beta1pow_p, const T *__restrict__ beta2pow_p, T *__restrict__ mom1_p, - T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p, T beta1, T beta2, - T epsilon, T max_global_grad_norm, int num, T rescale_grad) { + T *__restrict__ mom2_p, T *__restrict__ trust_ratio_div_p, bool *found_inf, + T weight_decay, int weight_decay_end_numel, T beta1, T beta2, T epsilon, + T max_global_grad_norm, int num, T rescale_grad) { T square_grad_norm = *square_grad_norm_p; - if (!isfinite(square_grad_norm)) return; + bool need_update_found_inf = + (found_inf && threadIdx.x == 0 && blockIdx.x == 0); + if (!isfinite(square_grad_norm)) { + if (need_update_found_inf) *found_inf = true; + return; + } else if (need_update_found_inf) { + *found_inf = false; + } T scale = rescale_grad / global_scale[0]; if (max_global_grad_norm > 0) { @@ -406,27 +390,112 @@ static __global__ void UpdateLambMoment( T one_minus_beta1pow = 1 - beta1pow_p[0]; T one_minus_beta2pow = 1 - beta2pow_p[0]; - CUDA_KERNEL_LOOP(i, num) { - T p = param_p[i]; - T g = static_cast(grad_p[i]) * scale; - T weight_decay = weight_decay_p[i]; - T mom1 = mom1_p[i]; - T mom2 = mom2_p[i]; - - mom1 = beta1 * mom1 + (1 - beta1) * g; - mom2 = beta2 * mom2 + (1 - beta2) * g * g; - - T mom1_unbiased = mom1 / one_minus_beta1pow; - T mom2_unbiased = mom2 / one_minus_beta2pow; - T trust_ratio_div = - mom1_unbiased / (sqrtf(mom2_unbiased) + epsilon) + weight_decay * p; - - mom1_p[i] = mom1; - mom2_p[i] = mom2; - trust_ratio_div_p[i] = trust_ratio_div; + int i = (threadIdx.x + blockIdx.x * blockDim.x) * VecSize; + int stride = blockDim.x * gridDim.x * VecSize; + + for (; i + VecSize <= num; i += stride) { + platform::AlignedVector param_vec; + platform::AlignedVector grad_vec; + platform::AlignedVector weight_decay_vec; + platform::AlignedVector mom1_vec; + platform::AlignedVector mom2_vec; + platform::AlignedVector trust_ratio_div_vec; + + T cur_weight_decay = (i < weight_decay_end_numel) * weight_decay; + if (cur_weight_decay != static_cast(0.0)) { + platform::Load(param_p + i, ¶m_vec); + } else { +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + param_vec[j] = static_cast(0); + } + } + platform::Load(grad_p + i, &grad_vec); + platform::Load(mom1_p + i, &mom1_vec); + platform::Load(mom2_p + i, &mom2_vec); + +#define 
PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(__param, __grad, __mom1, __mom2, \ + __trust_ratio_div, __idx) \ + T p = __param[__idx]; \ + T g = static_cast(__grad[__idx]) * scale; \ + T mom1 = __mom1[__idx]; \ + T mom2 = __mom2[__idx]; \ + mom1 = beta1 * mom1 + (1 - beta1) * g; \ + mom2 = beta2 * mom2 + (1 - beta2) * g * g; \ + T mom1_unbiased = mom1 / one_minus_beta1pow; \ + T mom2_unbiased = mom2 / one_minus_beta2pow; \ + __trust_ratio_div[__idx] = \ + mom1_unbiased / (sqrtf(mom2_unbiased) + epsilon) + cur_weight_decay * p; \ + __mom1[__idx] = mom1; \ + __mom2[__idx] = mom2; + +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(param_vec, grad_vec, mom1_vec, + mom2_vec, trust_ratio_div_vec, j); + } + + platform::Store(mom1_vec, mom1_p + i); + platform::Store(mom2_vec, mom2_p + i); + platform::Store(trust_ratio_div_vec, trust_ratio_div_p + i); + } + + for (; i < num; ++i) { + T cur_weight_decay = (i < weight_decay_end_numel) * weight_decay; + PD_LAMB_MOM_TRUST_RATIO_DIV_UPDATE(param_p, grad_p, mom1_p, mom2_p, + trust_ratio_div_p, i); } } +template +static void MultiTensorUpdateLambMomentAndTrustRatioDiv( + const platform::CUDADeviceContext &dev_ctx, const int *offsets, int n, + const T *param_p, const GradT *grad_p, const T *square_grad_norm_p, + const T *global_scale, const T *beta1pow_p, const T *beta2pow_p, T *mom1_p, + T *mom2_p, T *trust_ratio_div_p, bool *found_inf_p, T weight_decay, + int weight_decay_end_idx, T beta1, T beta2, T epsilon, + T max_global_grad_norm, T rescale_grad) { + if (n <= 0) return; + int numel = offsets[n] - offsets[0]; + PADDLE_ENFORCE_GE(weight_decay_end_idx, 0, + platform::errors::InvalidArgument( + "The weight decay end index should be >= 0.")); + PADDLE_ENFORCE_LE(weight_decay_end_idx, n, + platform::errors::InvalidArgument( + "The weight decay end index should be < %d.", n)); + auto weight_decay_end_numel = offsets[weight_decay_end_idx] - offsets[0]; + + int vec_size = GetChunkedVecSize(param_p, 0); + vec_size = std::min(vec_size, GetChunkedVecSize(grad_p, 0)); + vec_size = std::min(vec_size, GetChunkedVecSize(mom1_p, 0)); + vec_size = std::min(vec_size, GetChunkedVecSize(mom2_p, 0)); + vec_size = std::min(vec_size, GetChunkedVecSize(trust_ratio_div_p, 0)); + for (int i = 0; i < n; ++i) { + auto length = offsets[i + 1] - offsets[i]; + while (length % vec_size != 0) { + vec_size /= 2; + } + } + + VLOG(1) << __func__ << " VecSize = " << vec_size; + + auto stream = dev_ctx.stream(); + auto config = platform::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); + +#define PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL \ + do { \ + UpdateLambMomentAndTrustRatioDivCUDAKernel<<< \ + config.block_per_grid, config.thread_per_block, 0, stream>>>( \ + param_p, grad_p, square_grad_norm_p, global_scale, beta1pow_p, \ + beta2pow_p, mom1_p, mom2_p, trust_ratio_div_p, found_inf_p, \ + weight_decay, weight_decay_end_numel, beta1, beta2, epsilon, \ + max_global_grad_norm, numel, rescale_grad); \ + } while (0) + + PD_VEC_LAUNCH_KERNEL(vec_size, PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL); +#undef PD_LAUNCH_LAMB_MOM_TRUST_RATIO_DIV_KERNEL +} + template struct LambBetaPowUpdateOnceHelper { LambBetaPowUpdateOnceHelper(T *beta1pow, T *beta2pow, T beta1, T beta2) { @@ -468,33 +537,6 @@ struct LambBetaPowUpdateOnceHelper { HOSTDEVICE void UpdateBetaPows() const {} }; -template -struct LambFoundInfHelper { - public: - explicit LambFoundInfHelper(bool *found_inf) : found_inf_(found_inf) { - PADDLE_ENFORCE_NOT_NULL(found_inf, - platform::errors::InvalidArgument( - 
"The found_inf should not be nullptr.")); - } - - HOSTDEVICE void UpdateFoundInf(bool value) { *found_inf_ = value; } - - private: - bool *__restrict__ found_inf_; -}; - -template <> -struct LambFoundInfHelper { - public: - explicit LambFoundInfHelper(bool *found_inf) { - PADDLE_ENFORCE_EQ( - found_inf, nullptr, - platform::errors::InvalidArgument("The found_inf should be nullptr.")); - } - - HOSTDEVICE void UpdateFoundInf(bool) {} -}; - template struct LambParamHelper { LambParamHelper(T *param, MasterT *master_param) { @@ -509,12 +551,9 @@ struct LambParamHelper { master_param_ = master_param; } - HOSTDEVICE void SetParam(int i, MasterT updated_p) { - param_[i] = static_cast(updated_p); - master_param_[i] = updated_p; - } + HOSTDEVICE T *__restrict__ ParamPtr() { return param_; } - HOSTDEVICE MasterT GetParam(int i) { return master_param_[i]; } + HOSTDEVICE MasterT *__restrict__ MasterParamPtr() { return master_param_; } private: T *__restrict__ param_; @@ -538,158 +577,169 @@ struct LambParamHelper { param_ = param; } - HOSTDEVICE void SetParam(int i, MasterT updated_p) { - param_[i] = static_cast(updated_p); - } + HOSTDEVICE T *__restrict__ ParamPtr() { return param_; } - HOSTDEVICE MasterT GetParam(int i) { - return static_cast>(param_[i]); - } + HOSTDEVICE constexpr MasterT *MasterParamPtr() { return nullptr; } private: T *__restrict__ param_; }; -template -struct LambParamAndBetaPowsUpdateHelper - : public LambParamHelper, - public LambBetaPowUpdateOnceHelper, NeedUpdateBetaPow>, - public LambFoundInfHelper { - LambParamAndBetaPowsUpdateHelper( - ParamT *param, MasterT *master_param, MasterT *beta1pow, - MasterT *beta2pow, MasterT beta1, MasterT beta2, - bool *found_inf, const MasterT *trust_ratio_div, - const MasterT *lr, const IndexT *index, +template +struct LambUpdateParamAndBetaPowsFunctor { + DEVICE void operator()( + int tensor_id, int chunk_id, int offset, int size, + LambParamHelper param_helper, + const MasterT *trust_ratio_div, const MasterT *lr, const MasterT *param_square_norm, - const MasterT *trust_ratio_div_square_norm, - const MasterT *update_flag) - : LambParamHelper(param, master_param), - LambBetaPowUpdateOnceHelper, NeedUpdateBetaPow>( - beta1pow, beta2pow, beta1, beta2), - LambFoundInfHelper(found_inf), - trust_ratio_div(trust_ratio_div), - lr(lr), - index(index), - param_square_norm(param_square_norm), - trust_ratio_div_square_norm(trust_ratio_div_square_norm), - update_flag(update_flag) {} - - const MasterT *__restrict__ trust_ratio_div; - const MasterT *__restrict__ lr; - const IndexT *__restrict__ index; - const MasterT *__restrict__ param_square_norm; - const MasterT *__restrict__ trust_ratio_div_square_norm; - const MasterT *__restrict__ update_flag; -}; + const MasterT *trust_ratio_div_square_norm, const bool *found_inf, + LambBetaPowUpdateOnceHelper, NeedUpdateBetaPow> + betapow_helper) const { + if (*found_inf) return; + + using MT = MasterT; -template -static __global__ void LambUpdateParamAndBetaPowsCUDAKernel( - LambParamAndBetaPowsUpdateHelper - args, - int num) { - auto should_update = *args.update_flag; - if (!isfinite(should_update)) { - if (HasFoundInf && threadIdx.x == 0 && blockIdx.x == 0) { - args.UpdateFoundInf(true); + MT p_square_norm = param_square_norm[tensor_id]; + MT t_square_norm = trust_ratio_div_square_norm[tensor_id]; + MT lr_value = *lr; + MT ratio = (p_square_norm != static_cast(0) && + t_square_norm != static_cast(0) + ? 
lr_value * sqrtf(p_square_norm / t_square_norm) + : lr_value); + + int i; + int stride = blockDim.x * VecSize; + + ParamT *param = param_helper.ParamPtr() + offset; + MT *master_param = HasMasterParam ? param_helper.MasterParamPtr() + offset + : param_helper.MasterParamPtr(); + trust_ratio_div += offset; + + for (i = threadIdx.x * VecSize; i + VecSize <= size; i += stride) { + platform::AlignedVector trust_ratio_div_vec; + platform::Load(trust_ratio_div + i, &trust_ratio_div_vec); + if (HasMasterParam) { + platform::AlignedVector master_param_vec; + platform::Load(master_param + i, &master_param_vec); + platform::AlignedVector param_vec; +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + MT p = master_param_vec[j] - ratio * trust_ratio_div_vec[j]; + master_param_vec[j] = p; + param_vec[j] = static_cast(p); + } + platform::Store(master_param_vec, master_param + i); + platform::Store(param_vec, param + i); + } else { + platform::AlignedVector param_vec; + platform::Load(param + i, ¶m_vec); +#pragma unroll + for (int j = 0; j < VecSize; ++j) { + MT p = static_cast(param_vec[j]) - ratio * trust_ratio_div_vec[j]; + param_vec[j] = static_cast(p); + } + platform::Store(param_vec, param + i); + } + } + + for (; i < size; ++i) { + if (HasMasterParam) { + MT p = master_param[i] - ratio * trust_ratio_div[i]; + master_param[i] = p; + param[i] = static_cast(p); + } else { + MT p = static_cast(param[i]) - ratio * trust_ratio_div[i]; + param[i] = static_cast(p); + } + } + + if (NeedUpdateBetaPow && threadIdx.x == 0 && blockIdx.x == 0) { + betapow_helper.UpdateBetaPows(); } - return; - } else if (HasFoundInf && threadIdx.x == 0 && blockIdx.x == 0) { - args.UpdateFoundInf(false); } +}; - if (NeedUpdateBetaPow && threadIdx.x == 0 && blockIdx.x == 0) { - args.UpdateBetaPows(); +// TODO(zengjinle): which block_dim and chunk_size would be better? 
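// ----------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): the per-tensor math that the
// vectorized LambUpdateParamAndBetaPowsFunctor above implements. The helper
// name LambUpdateReference is hypothetical and only spells out the rule:
// ratio = lr * sqrt(||param||^2 / ||trust_ratio_div||^2) when both norms are
// non-zero (otherwise plain lr), then param[i] -= ratio * trust_ratio_div[i].
// ----------------------------------------------------------------------------
#include <cmath>
#include <cstddef>

template <typename T>
void LambUpdateReference(T *param, const T *trust_ratio_div, std::size_t n,
                         T lr, T param_square_norm,
                         T trust_ratio_div_square_norm) {
  // Per-tensor trust ratio, guarded against zero norms as in the functor.
  T ratio =
      (param_square_norm != T(0) && trust_ratio_div_square_norm != T(0))
          ? lr * std::sqrt(param_square_norm / trust_ratio_div_square_norm)
          : lr;
  for (std::size_t i = 0; i < n; ++i) {
    param[i] -= ratio * trust_ratio_div[i];
  }
}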
+template +static void MultiTensorUpdateLambParamAndBetaPows( + const platform::CUDADeviceContext &dev_ctx, const int *offsets, int n, + const MasterT *trust_ratio_div, const MasterT *lr, + const MasterT *param_square_norm, + const MasterT *trust_ratio_div_square_norm, const bool *found_inf, + ParamT *param, MasterT *master_param, MasterT *beta1pow, + MasterT *beta2pow, MasterT beta1, MasterT beta2, + int chunk_size = 65536) { + constexpr bool kHasMasterParam = + !(std::is_same>::value); + + bool has_beta_pow = (beta1pow != nullptr); + if (has_beta_pow) { + PADDLE_ENFORCE_NOT_NULL(beta2pow, platform::errors::InvalidArgument( + "Beta2Pow should not be nullptr.")); + } else { + PADDLE_ENFORCE_EQ(beta2pow, nullptr, platform::errors::InvalidArgument( + "Beta2Pow should be nullptr.")); } - using MT = MasterT; + const int block_dim = 512; - MT lr_value = *args.lr; - CUDA_KERNEL_LOOP(i, num) { - MT p = args.GetParam(i); - MT t = args.trust_ratio_div[i]; - auto norm_idx = args.index[i]; - MT p_square_norm = args.param_square_norm[norm_idx]; - MT t_square_norm = args.trust_ratio_div_square_norm[norm_idx]; + int vec_size = 8; + for (int i = 0; i < n; ++i) { + int offset = offsets[i] - offsets[0]; + vec_size = + std::min(vec_size, GetChunkedVecSize(param + offset, chunk_size)); + if (kHasMasterParam) { + vec_size = std::min(vec_size, + GetChunkedVecSize(master_param + offset, chunk_size)); + } + vec_size = std::min( + vec_size, GetChunkedVecSize(trust_ratio_div + offset, chunk_size)); + } - MT p_norm = static_cast(sqrtf(p_square_norm)); - MT t_norm = static_cast(sqrtf(t_square_norm)); + VLOG(1) << __func__ << " VecSize = " << vec_size; - auto update = (p_norm != static_cast(0) && t_norm != static_cast(0)) - ? p_norm / t_norm - : static_cast(1); + constexpr auto kNumTensor = MaxTensorNumPerLaunch; + constexpr auto kNumChunk = MaxChunkNumPerLaunch; - MT updated_p = p - lr_value * update * t; - args.SetParam(i, updated_p); - } -} + auto stream = dev_ctx.stream(); +#define PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(__has_beta_pow) \ + do { \ + using FunctorT = \ + LambUpdateParamAndBetaPowsFunctor; \ + LambParamHelper param_helper(param, \ + master_param); \ + LambBetaPowUpdateOnceHelper, __has_beta_pow> \ + betapow_helper(beta1pow, beta2pow, beta1, beta2); \ + launcher.Launch(FunctorT(), param_helper, trust_ratio_div, lr, \ + param_square_norm, trust_ratio_div_square_norm, found_inf, \ + betapow_helper); \ + } while (0) -template -static void LambUpdateParamAndBetaPows( - const platform::CUDADeviceContext &dev_ctx, - const MasterT *trust_ratio_div, const MasterT *lr, - const IndexT *index, const MasterT *param_square_norm, - const MasterT *trust_ratio_div_square_norm, - const MasterT *update_flag, MasterT **beta1pow, - MasterT **beta2pow, bool **found_inf, MasterT beta1, - MasterT beta2, int num, ParamT *param, - MasterT *master_param, gpuStream_t stream) { - if (num == 0) return; - - bool has_master_param = !(std::is_same>::value); - auto has_beta_pow = (*beta1pow) != nullptr && (*beta2pow) != nullptr; - auto has_found_inf = (*found_inf) != nullptr; - -#define PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL( \ - __has_master_param, __has_beta_pow, __has_found_inf) \ - do { \ - LambParamAndBetaPowsUpdateHelper \ - helper(param, master_param, *beta1pow, *beta2pow, beta1, beta2, \ - *found_inf, trust_ratio_div, lr, index, param_square_norm, \ - trust_ratio_div_square_norm, update_flag); \ - auto config = platform::GetGpuLaunchConfig1D(dev_ctx, num); \ - LambUpdateParamAndBetaPowsCUDAKernel<<< \ - 
config.block_per_grid, config.thread_per_block, 0, stream>>>(helper, \ - num); \ +#define PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE \ + do { \ + auto callback = [&]( \ + const MultiTensorLauncher &launcher, \ + int launch_n) { \ + if (has_beta_pow && launch_n == 0) { \ + PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(true); \ + beta1pow = nullptr; \ + beta2pow = nullptr; \ + } else { \ + PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW(false); \ + } \ + }; \ + MultiTensorApplyWithCallback( \ + stream, offsets, n, chunk_size, block_dim, callback); \ } while (0) - if (has_master_param) { - if (has_beta_pow) { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, true, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, true, false); - } - } else { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, false, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(true, false, false); - } - } - } else { - if (has_beta_pow) { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, true, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, true, false); - } - } else { - if (has_found_inf) { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, false, true); - } else { - PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL(false, false, false); - } - } - } + PD_VEC_LAUNCH_KERNEL(vec_size, + PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE); - *beta1pow = nullptr; - *beta2pow = nullptr; - *found_inf = nullptr; -#undef PADDLE_LAUNCH_LAMB_UPDATE_PARAM_KERNEL +#undef PD_LAUNCH_MULTI_TENSOR_UPDATE_PARAM_BETAPOW +#undef PD_LAUNCH_VEC_MULTI_TENSOR_UPDATE_PARAM_BETAPOW_CASE } #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) @@ -1005,15 +1055,16 @@ class DistributedFusedLambOpKernel "Too many parameter number. 
Only <= %d is supported.", std::numeric_limits::max())); - // Step 3: Get FusedIndices, ParamInfo - const auto *indices = GetInputTensorPtr(ctx, "FusedIndices"); + // Step 3: Get ParamInfo const auto *param_info_tensor = GetInputTensorPtr(ctx, "ParamInfo"); auto fp32_local_start_idx = param_info_tensor[0]; auto fp32_local_param_num = param_info_tensor[1]; auto fp32_global_param_num = param_info_tensor[2]; - auto fp16_local_start_idx = param_info_tensor[3]; - auto fp16_local_param_num = param_info_tensor[4]; - auto fp16_global_param_num = param_info_tensor[5]; + auto fp32_weight_decay_end_idx = param_info_tensor[3]; + auto fp16_local_start_idx = param_info_tensor[4]; + auto fp16_local_param_num = param_info_tensor[5]; + auto fp16_global_param_num = param_info_tensor[6]; + auto fp16_weight_decay_end_idx = param_info_tensor[7]; auto local_param_num = fp32_local_param_num + fp16_local_param_num; auto param_num = fp32_global_param_num + fp16_global_param_num; @@ -1031,7 +1082,7 @@ class DistributedFusedLambOpKernel << " , fp16_global_param_num = " << fp16_global_param_num; // Step 4: Get LearningRate, Moment1, Moment2, Beta1Pow, Beta2Pow, - // WeightDecay, GlobalScale, FoundInf + // GlobalScale, FoundInf const auto *global_scale = GetInputTensorPtr(ctx, "GlobalScale"); const auto *lr = GetInputTensorPtr(ctx, "LearningRate"); int64_t partial_numel = 0; @@ -1065,14 +1116,15 @@ class DistributedFusedLambOpKernel GetSameInOutTensorPtr(ctx, place, "Beta1Pow", "Beta1PowOut"); auto *beta2pow = GetSameInOutTensorPtr(ctx, place, "Beta2Pow", "Beta2PowOut"); - const float *weight_decay = GetInputTensorPtr(ctx, "WeightDecay"); auto *found_inf_t = ctx.Output("FoundInf"); found_inf_t->Resize({1}); auto *found_inf = found_inf_t->mutable_data(place); - // Step 5: Get attributes beta1, beta2, epsilon, max_grad_norm, ring_id, + // Step 5: Get attributes weight_decay, beta1, beta2, epsilon, + // max_grad_norm, ring_id, // use_master_param_norm, is_grad_scaled_by_nranks + auto weight_decay = ctx.Attr("weight_decay"); auto beta1 = ctx.Attr("beta1"); auto beta2 = ctx.Attr("beta2"); auto epsilon = ctx.Attr("epsilon"); @@ -1105,7 +1157,8 @@ class DistributedFusedLambOpKernel platform::float16 *fp16_sum_grad; auto fp32_numel_each_device = fp32_numel / num_devices; auto fp16_numel_each_device = fp16_numel / num_devices; - if (num_devices > 1) { + if (num_devices > 1 || + (max_global_grad_norm > 0 && !clip_after_allreduce)) { auto ptr = sum_grad_buffer.Alloc( fp32_numel_each_device * sizeof(float) + fp16_numel_each_device * sizeof(platform::float16)); @@ -1181,7 +1234,11 @@ class DistributedFusedLambOpKernel float, platform::float16><<<1, 1, 0, stream>>>( global_scale, max_global_grad_norm, fp32_square_grad_norm, fp32_scale, fp16_scale, clip_scale); - VLOG(1) << "Grad scale: " << FlattenToString(fp32_scale, 1, place); + if (fp32_scale) { + VLOG(1) << "Grad scale: " << FlattenToString(fp32_scale, 1, place); + } else { + VLOG(1) << "Grad scale: " << FlattenToString(fp16_scale, 1, place); + } if (num_devices > 1) { PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllReduce( fp32_square_grad_norm, fp32_square_grad_norm, 1, ncclFloat32, @@ -1218,36 +1275,56 @@ class DistributedFusedLambOpKernel VLOG(10) << "ReduceScatter done"; // Step 7: update the moment1, moment2. 
Calcuate the trust_ratio_div + auto *fused_offsets_t = ctx.Input("FusedParamOffsets"); + auto *fused_offsets = fused_offsets_t->data(); + auto *fp32_partial_fused_offsets_t = + ctx.Input("FP32ShardFusedParamOffsets"); + const auto *fp32_partial_fused_offsets = + fp32_partial_fused_offsets_t->data(); + auto *fp16_partial_fused_offsets_t = + ctx.Input("FP16ShardFusedParamOffsets"); + const auto *fp16_partial_fused_offsets = + fp16_partial_fused_offsets_t->data(); + + VLOG(1) << "FusedParamOffsets: " + << FlattenToString(fused_offsets, fused_offsets_t->numel(), + fused_offsets_t->place()); + VLOG(1) << "FP32ShardFusedParamOffsets: " + << FlattenToString(fp32_partial_fused_offsets, + fp32_partial_fused_offsets_t->numel(), + fp32_partial_fused_offsets_t->place()); + VLOG(1) << "FP16ShardFusedParamOffsets: " + << FlattenToString(fp16_partial_fused_offsets, + fp16_partial_fused_offsets_t->numel(), + fp16_partial_fused_offsets_t->place()); + memory::Buffer trust_ratio_div_buffer(place); auto *trust_ratio_div = trust_ratio_div_buffer.Alloc(partial_numel); auto fp32_offset = rank * fp32_numel_each_device; auto fp16_offset = rank * fp16_numel_each_device; if (has_fp32_param) { - auto config = - platform::GetGpuLaunchConfig1D(dev_ctx, fp32_numel_each_device); VLOG(10) << "Update FP32 Moment and TrustRatioDiv starts"; - UpdateLambMoment<<>>( + MultiTensorUpdateLambMomentAndTrustRatioDiv( + dev_ctx, fp32_partial_fused_offsets, fp32_local_param_num, fp32_param + fp32_offset, fp32_sum_grad, fp32_square_grad_norm, - global_scale, indices + fp32_offset, weight_decay, beta1pow, beta2pow, - moment1, moment2, trust_ratio_div, beta1, beta2, epsilon, - max_global_grad_norm, fp32_numel_each_device, rescale_grad); + global_scale, beta1pow, beta2pow, moment1, moment2, trust_ratio_div, + found_inf, weight_decay, fp32_weight_decay_end_idx, beta1, beta2, + epsilon, max_global_grad_norm, rescale_grad); VLOG(10) << "Update FP32 Moment and TrustRatioDiv done"; } float *master_param = nullptr; if (has_fp16_param) { master_param = fp32_param + fp32_numel; - auto config = - platform::GetGpuLaunchConfig1D(dev_ctx, fp16_numel_each_device); VLOG(10) << "Update FP16 Moment and TrustRatioDiv starts"; - UpdateLambMoment<<>>( + auto tmp_found_inf = has_fp32_param ? 
nullptr : found_inf; + MultiTensorUpdateLambMomentAndTrustRatioDiv( + dev_ctx, fp16_partial_fused_offsets, fp16_local_param_num, master_param + fp16_offset, fp16_sum_grad, fp32_square_grad_norm, - global_scale, indices + fp32_numel + fp16_offset, weight_decay, - beta1pow, beta2pow, moment1 + fp32_numel_each_device, + global_scale, beta1pow, beta2pow, moment1 + fp32_numel_each_device, moment2 + fp32_numel_each_device, - trust_ratio_div + fp32_numel_each_device, beta1, beta2, epsilon, - max_global_grad_norm, fp16_numel_each_device, rescale_grad); + trust_ratio_div + fp32_numel_each_device, tmp_found_inf, weight_decay, + fp16_weight_decay_end_idx, beta1, beta2, epsilon, + max_global_grad_norm, rescale_grad); VLOG(10) << "Update FP16 Moment and TrustRatioDiv done"; } @@ -1257,30 +1334,6 @@ class DistributedFusedLambOpKernel memory::Buffer square_norm_buffer(place); auto *param_square_norm = square_norm_buffer.Alloc(2 * param_num); auto *trust_ratio_div_square_norm = param_square_norm + param_num; - - auto *fused_offsets_t = ctx.Input("FusedParamOffsets"); - auto *fused_offsets = fused_offsets_t->data(); - auto *fp32_partial_fused_offsets_t = - ctx.Input("FP32ShardFusedParamOffsets"); - const auto *fp32_partial_fused_offsets = - fp32_partial_fused_offsets_t->data(); - auto *fp16_partial_fused_offsets_t = - ctx.Input("FP16ShardFusedParamOffsets"); - const auto *fp16_partial_fused_offsets = - fp16_partial_fused_offsets_t->data(); - - VLOG(1) << "FusedParamOffsets: " - << FlattenToString(fused_offsets, fused_offsets_t->numel(), - fused_offsets_t->place()); - VLOG(1) << "FP32ShardFusedParamOffsets: " - << FlattenToString(fp32_partial_fused_offsets, - fp32_partial_fused_offsets_t->numel(), - fp32_partial_fused_offsets_t->place()); - VLOG(1) << "FP16ShardFusedParamOffsets: " - << FlattenToString(fp16_partial_fused_offsets, - fp16_partial_fused_offsets_t->numel(), - fp16_partial_fused_offsets_t->place()); - if (num_devices > 1) { if (use_master_param_norm) { FillZeroWithPtr(param_square_norm + fp32_global_param_num, @@ -1296,11 +1349,11 @@ class DistributedFusedLambOpKernel fp16_partial_fused_offsets, fp16_local_param_num, param_square_norm + fp16_local_start_idx); } else { - // NOTE: extra computation is performed. We can improve this performance - // if needed in the future. MultiTensorL2Norm( - place, stream, fp16_param, fused_offsets + fp32_global_param_num, - fp16_global_param_num, param_square_norm + fp32_global_param_num); + place, stream, fp16_param + fused_offsets[fp16_local_start_idx] - + fused_offsets[fp32_global_param_num], + fused_offsets + fp16_local_start_idx, fp16_local_param_num, + param_square_norm + fp16_local_start_idx); } MultiTensorL2Norm(place, stream, trust_ratio_div, @@ -1333,26 +1386,29 @@ class DistributedFusedLambOpKernel // Step 9: update parameter, beta1pow, beta2pow. All gather parameters. 
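// ----------------------------------------------------------------------------
// Illustrative sketch (not part of the patch): how a single vector width is
// chosen for a fused buffer described by an offsets array, mirroring the
// "while (length % vec_size != 0) vec_size /= 2;" loops above. The helper
// name PickCommonVecSize and the float element type are assumptions; the real
// code additionally folds in GetChunkedVecSize for every pointer involved.
// ----------------------------------------------------------------------------
#include <cstdint>

int PickCommonVecSize(const float *base_ptr, const int *offsets, int n) {
  int vec_size = 8;  // widest candidate width
  // Shrink until the base address is aligned for vectorized loads/stores.
  auto addr = reinterpret_cast<std::uintptr_t>(base_ptr);
  while (vec_size > 1 && addr % (vec_size * sizeof(float)) != 0) {
    vec_size /= 2;
  }
  // Shrink further until it divides every tensor length in the fused buffer,
  // so no per-tensor slice needs a scalar tail in the vectorized kernel.
  for (int i = 0; i < n; ++i) {
    const int length = offsets[i + 1] - offsets[i];
    while (vec_size > 1 && length % vec_size != 0) {
      vec_size /= 2;
    }
  }
  return vec_size;
}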
if (has_fp32_param) { - LambUpdateParamAndBetaPows( - dev_ctx, trust_ratio_div, lr, indices + fp32_offset, - param_square_norm, trust_ratio_div_square_norm, fp32_square_grad_norm, - &beta1pow, &beta2pow, &found_inf, beta1, beta2, - fp32_numel_each_device, fp32_param + fp32_offset, nullptr, stream); + MultiTensorUpdateLambParamAndBetaPows( + dev_ctx, fp32_partial_fused_offsets, fp32_local_param_num, + trust_ratio_div, lr, param_square_norm + fp32_local_start_idx, + trust_ratio_div_square_norm + fp32_local_start_idx, found_inf, + fp32_param + fp32_offset, nullptr, beta1pow, beta2pow, beta1, beta2); if (num_devices > 1) { // ncclAllGather PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( fp32_param + fp32_offset, fp32_param, fp32_numel_each_device, ncclFloat32, comm, stream)); } + + beta1pow = nullptr; + beta2pow = nullptr; } if (has_fp16_param) { - LambUpdateParamAndBetaPows( - dev_ctx, trust_ratio_div + fp32_numel_each_device, lr, - indices + fp32_numel + fp16_offset, param_square_norm, - trust_ratio_div_square_norm, fp32_square_grad_norm, &beta1pow, - &beta2pow, &found_inf, beta1, beta2, fp16_numel_each_device, - fp16_param + fp16_offset, master_param + fp16_offset, stream); - + MultiTensorUpdateLambParamAndBetaPows( + dev_ctx, fp16_partial_fused_offsets, fp16_local_param_num, + trust_ratio_div + fp32_numel_each_device, lr, + param_square_norm + fp16_local_start_idx, + trust_ratio_div_square_norm + fp16_local_start_idx, found_inf, + fp16_param + fp16_offset, master_param + fp16_offset, beta1pow, + beta2pow, beta1, beta2); if (num_devices > 1) { // ncclAllGather PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather( diff --git a/paddle/fluid/operators/optimizers/multi_tensor_apply.h b/paddle/fluid/operators/optimizers/multi_tensor_apply.h index 5d8d03c733dae210e8a41a8ad78a258df558b341..179e8f452545c437e373e42d59d18f524f260cd5 100644 --- a/paddle/fluid/operators/optimizers/multi_tensor_apply.h +++ b/paddle/fluid/operators/optimizers/multi_tensor_apply.h @@ -94,11 +94,40 @@ static __global__ void MultiTensorApplyCUDAKernel( args...); } -template -static void MultiTensorApply(Functor functor, gpuStream_t stream, - const int *offsets, int n, int chunk_size, - Args... args) { +template +class MultiTensorLauncher { + public: + MultiTensorLauncher( + const TensorMetaList &meta, + const int &chunk_id, const int &chunk_size, const int &block_dim, + const gpuStream_t &stream) + : meta_(meta), + chunk_id_(chunk_id), + chunk_size_(chunk_size), + block_dim_(block_dim), + stream_(stream) {} + + template + void Launch(Functor &&functor, Args &&... 
args) const { + MultiTensorApplyCUDAKernel< + Functor, MaxTensorNumPerLaunch, + MaxChunkNumPerLaunch><<>>( + functor, meta_, chunk_size_, args...); + } + + private: + const TensorMetaList &meta_; + const int &chunk_id_; + const int &chunk_size_; + const int &block_dim_; + const gpuStream_t &stream_; +}; + +template +static void MultiTensorApplyWithCallback(gpuStream_t stream, const int *offsets, + int n, int chunk_size, int block_dim, + Callback &&callback) { if (n == 0) return; constexpr auto NumTensor = MaxTensorNumPerLaunch; @@ -110,6 +139,11 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream, int numel_offset = 0; metas.start_tensor_id = 0; metas.start_chunk_id = 0; + int launch_num = 0; + + MultiTensorLauncher launcher( + metas, chunk_id, chunk_size, block_dim, stream); + for (int i = 0; i < n; ++i) { auto length = offsets[i + 1] - offsets[i]; if (tensor_id == 0) { @@ -132,9 +166,8 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream, bool last_chunk = (i + 1 == n && j + 1 == chunk_num); if (tensor_full || block_full || last_chunk) { - MultiTensorApplyCUDAKernel<<>>( - functor, metas, chunk_size, args...); + callback(launcher, launch_num); + ++launch_num; chunk_id = 0; if (j + 1 == chunk_num) { // chunk for the current tensor is full metas.start_chunk_id = 0; @@ -152,5 +185,17 @@ static void MultiTensorApply(Functor functor, gpuStream_t stream, } } +template +static void MultiTensorApply(Functor functor, gpuStream_t stream, + const int *offsets, int n, int chunk_size, + int block_dim, Args &&... args) { + auto callback = [&](const MultiTensorLauncher &launcher, + int i) { launcher.Launch(functor, args...); }; + MultiTensorApplyWithCallback( + stream, offsets, n, chunk_size, block_dim, callback); +} + } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index f2cb427a0a5b139e1ccdf960afeb6db4bcb8b5a5..d0b78b9b0643d6c5dc5b4bfeac2cf792ac349194 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -39,6 +39,11 @@ __device__ __forceinline__ int sgn(T val) { __device__ __forceinline__ platform::float16 inline_abs(platform::float16 x) { return static_cast(abs(static_cast(x))); } + +__device__ __forceinline__ platform::bfloat16 inline_abs(platform::bfloat16 x) { + return static_cast(abs(static_cast(x))); +} + __device__ __forceinline__ float inline_abs(float x) { return abs(x); } __device__ __forceinline__ double inline_abs(double x) { return abs(x); } @@ -53,6 +58,11 @@ __device__ __forceinline__ platform::float16 inline_pow( return static_cast( pow(static_cast(base), static_cast(exponent))); } +__device__ __forceinline__ platform::bfloat16 inline_pow( + platform::bfloat16 base, platform::bfloat16 exponent) { + return static_cast( + pow(static_cast(base), static_cast(exponent))); +} __device__ __forceinline__ float inline_pow(float base, float exponent) { return pow(base, exponent); } @@ -202,9 +212,11 @@ using CUDA = paddle::platform::CUDADeviceContext; REGISTER_OP_CUDA_KERNEL(p_norm, ops::PnormCUDAKernel, + ops::PnormCUDAKernel, ops::PnormCUDAKernel, ops::PnormCUDAKernel); REGISTER_OP_CUDA_KERNEL( p_norm_grad, ops::PnormGradCUDAKernel, + ops::PnormGradCUDAKernel, ops::PnormGradCUDAKernel, ops::PnormGradCUDAKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu index c3d3e0cf6ecd51f3bb2baa063878f80444db3563..2f6bf127518090916c4b947daf1d1f202fdd5960 100644 --- 
a/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.part.cu @@ -23,6 +23,7 @@ REGISTER_OP_CUDA_KERNEL( reduce_sum_grad, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, + CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel, CUDAReduceSumGradKernel>, CUDAReduceSumGradKernel>); diff --git a/paddle/fluid/operators/split_op.cc b/paddle/fluid/operators/split_op.cc index a8f05d94563e57a20cc41ba1edd68872d869d00e..6678320f9ffa61e3e6c51fd806569c2571d63d69 100644 --- a/paddle/fluid/operators/split_op.cc +++ b/paddle/fluid/operators/split_op.cc @@ -15,6 +15,9 @@ limitations under the License. */ #include "paddle/fluid/operators/split_op.h" #include +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/phi/infermeta/unary.h" + namespace paddle { namespace operators { using framework::Tensor; @@ -23,52 +26,6 @@ class SplitOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext *ctx) const override { - PADDLE_ENFORCE_EQ(ctx->HasInput("X"), true, - platform::errors::InvalidArgument( - "Input(X) of SplitOp should not be null.")); - PADDLE_ENFORCE_GE(ctx->Outputs("Out").size(), 1UL, - platform::errors::InvalidArgument( - "Outputs(Out) of SplitOp should not be empty.")); - auto in_dims = ctx->GetInputDim("X"); - auto outs_names = ctx->Outputs("Out"); - size_t axis = static_cast(ctx->Attrs().Get("axis")); - size_t num = static_cast(ctx->Attrs().Get("num")); - std::vector sections = static_cast>( - ctx->Attrs().Get>("sections")); - const size_t outs_number = outs_names.size(); - - if (sections.size() > 0) { - PADDLE_ENFORCE_EQ( - sections.size(), outs_number, - platform::errors::InvalidArgument("tensor split sections size " - "should be equal to output size.")); - } - - if (ctx->HasInput("AxisTensor")) { - auto out_dims = phi::make_ddim(std::vector(in_dims.size(), -1)); - std::vector outs_dims(outs_number, out_dims); - ctx->SetOutputsDim("Out", outs_dims); - for (size_t i = 0; i < outs_number; ++i) { - ctx->ShareLoD("X", "Out", 0, i); - } - return; - } - - bool each_section_is_known = - (sections.size() > 0 && !ctx->HasInputs("SectionsTensorList")); - - auto outs_dims = UpdateOutsDims(ctx->IsRuntime(), each_section_is_known, - in_dims, num, sections, axis, outs_number); - ctx->SetOutputsDim("Out", outs_dims); - if (axis != 0) { - // Only pass LoD when not spliting along the first dim. 
- for (size_t i = 0; i < outs_number; ++i) { - ctx->ShareLoD("X", "Out", 0, i); - } - } - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext &ctx) const override { @@ -168,6 +125,10 @@ Example: namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(split, SplitInferShapeFunctor, + PT_INFER_META(phi::SplitInferMeta)); + REGISTER_OPERATOR(split, ops::SplitOp, ops::SplitOpMaker, ops::SplitGradMaker, - ops::SplitGradMaker); + ops::SplitGradMaker, + SplitInferShapeFunctor); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 3e2d2a5495b3428ce0fad9d61431d53b44eea330..33590c1d7cca04e215e55abb26fb2aa3c3b61bec 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -258,4 +258,5 @@ REGISTER_OP_CUDA_KERNEL( ops::SumKernel, ops::SumKernel, ops::SumKernel, - ops::SumKernel); + ops::SumKernel, + ops::SumKernel); diff --git a/paddle/fluid/operators/uniform_random_op.cc b/paddle/fluid/operators/uniform_random_op.cc index 353d653f48141b2e68db6143c1ca0859a9ecc13f..1c22e60fa87aa73246806e4f5bc70e49a3b0f958 100644 --- a/paddle/fluid/operators/uniform_random_op.cc +++ b/paddle/fluid/operators/uniform_random_op.cc @@ -281,10 +281,6 @@ REGISTER_OPERATOR( paddle::framework::EmptyGradOpMaker, paddle::operators::UniformRandomOpVarTypeInference); -REGISTER_OP_CPU_KERNEL( - uniform_random, paddle::operators::CPUUniformRandomKernel, - paddle::operators::CPUUniformRandomKernel, - paddle::operators::CPUUniformRandomKernel); REGISTER_OP_CPU_KERNEL( uniform_random_batch_size_like, paddle::operators::CPUUniformRandomKernel, diff --git a/paddle/fluid/operators/uniform_random_op.cu b/paddle/fluid/operators/uniform_random_op.cu index fb38a6aded4cf173bb4c0dd96d131ff520b6701e..2ceb8a68d863dfe71458c67deeac7f54df0a662b 100644 --- a/paddle/fluid/operators/uniform_random_op.cu +++ b/paddle/fluid/operators/uniform_random_op.cu @@ -58,9 +58,6 @@ class GPUUniformRandomKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_CUDA_KERNEL(uniform_random, - paddle::operators::GPUUniformRandomKernel, - paddle::operators::GPUUniformRandomKernel); REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like, paddle::operators::GPUUniformRandomKernel, paddle::operators::GPUUniformRandomKernel); diff --git a/paddle/fluid/operators/where_op.cc b/paddle/fluid/operators/where_op.cc index 92ed2bbdc33f55315b3dddf8dc106b7716e97a6f..0f10efefa137b698b59db23b67122df990cfa366 100644 --- a/paddle/fluid/operators/where_op.cc +++ b/paddle/fluid/operators/where_op.cc @@ -12,8 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#include "paddle/fluid/operators/where_op.h" - +#include "paddle/fluid/framework/infershape_utils.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/phi/core/infermeta_utils.h" +#include "paddle/phi/infermeta/multiary.h" namespace paddle { namespace operators { @@ -21,31 +23,6 @@ class WhereOp : public framework::OperatorWithKernel { public: using framework::OperatorWithKernel::OperatorWithKernel; - void InferShape(framework::InferShapeContext* ctx) const override { - OP_INOUT_CHECK(ctx->HasInput("Condition"), "Input", "Condition", "Where"); - OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Where"); - OP_INOUT_CHECK(ctx->HasInput("Y"), "Input", "Y", "Where"); - OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Where"); - - auto cond_dims = ctx->GetInputDim("Condition"); - auto x_dims = ctx->GetInputDim("X"); - auto y_dims = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_EQ( - cond_dims, x_dims, - platform::errors::InvalidArgument( - "The dims of Inputs(Condition) and Inputs(X) should be same. " - "But received Condition's shape is [%s], X's shape is [%s]", - cond_dims, x_dims)); - PADDLE_ENFORCE_EQ(x_dims, y_dims, - platform::errors::InvalidArgument( - "The dims of Inputs(X) and Inputs(Y) should be same. " - "But received X's shape is [%s], Y's shape is [%s]", - x_dims, y_dims)); - - ctx->SetOutputDim("Out", ctx->GetInputDim("X")); - ctx->ShareLoD("X", /*->*/ "Out"); - } - protected: framework::OpKernelType GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { @@ -140,19 +117,12 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(WhereGradNoNeedBufferVarsInferer, "X", "Y"); } // namespace paddle namespace ops = paddle::operators; +DELCARE_INFER_SHAPE_FUNCTOR(where, WhereInferShapeFunctor, + PT_INFER_META(phi::WhereInferMeta)); REGISTER_OPERATOR(where, ops::WhereOp, ops::WhereOpMaker, ops::WhereOpGradMaker, - ops::WhereOpGradMaker); + ops::WhereOpGradMaker, + WhereInferShapeFunctor); REGISTER_OPERATOR(where_grad, ops::WhereGradOp, ops::WhereGradNoNeedBufferVarsInferer); -REGISTER_OP_CPU_KERNEL( - where, ops::WhereKernel, - ops::WhereKernel, - ops::WhereKernel, - ops::WhereKernel); -REGISTER_OP_CPU_KERNEL( - where_grad, ops::WhereGradKernel, - ops::WhereGradKernel, - ops::WhereGradKernel, - ops::WhereGradKernel); diff --git a/paddle/fluid/operators/where_op.cu b/paddle/fluid/operators/where_op.cu deleted file mode 100644 index 61a1691e4fe265035917ed2407d5e3e24aa6bd88..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/where_op.cu +++ /dev/null @@ -1,126 +0,0 @@ -// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
- -#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h" -#include "paddle/fluid/operators/where_op.h" -#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" - -namespace platform = paddle::platform; - -namespace paddle { -namespace operators { - -template -struct CondFunctor { - HOSTDEVICE inline CondFunctor() {} - - HOSTDEVICE inline T operator()(const bool cond, const T x, const T y) const { - return cond ? x : y; - } -}; - -template -__global__ void WhereCUDAKernel(const int N, const bool* cond, const T* x, - const T* y, T* out) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < N; idx += blockDim.x * gridDim.x) { - out[idx] = cond[idx] ? x[idx] : y[idx]; - } -} - -template -__global__ void WhereGradCUDAKernel(const int N, const T* dout, - const bool* cond, T* dx, T* dy) { - int idx = blockDim.x * blockIdx.x + threadIdx.x; - for (; idx < N; idx += blockDim.x * gridDim.x) { - if (dx != nullptr) { - dx[idx] = cond[idx] ? dout[idx] : 0.; - } - if (dy != nullptr) { - dy[idx] = cond[idx] ? 0. : dout[idx]; - } - } -} - -template -class WhereKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - auto* X = context.Input("X"); - auto* Y = context.Input("Y"); - auto* out = context.Output("Out"); - auto numel = condition->numel(); - - // TODO(GaaoWei8): Input of where can be broadcast - const bool* cond_data = condition->data(); - const T* x_data = X->data(); - const T* y_data = Y->data(); - T* out_data = out->mutable_data(context.GetPlace()); - - auto stream = context.cuda_device_context().stream(); - auto& dev_ctx = - context.template device_context(); - auto functor = CondFunctor(); - std::vector ins = {condition, X, Y}; - std::vector outs = {out}; - paddle::operators::LaunchSameDimsElementwiseCudaKernel(dev_ctx, ins, - &outs, functor); - } -}; - -template -class WhereGradKernel - : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - const bool* cond_data = condition->data(); - auto numel = condition->numel(); - - auto* dout_t = - context.Input(framework::GradVarName("Out")); - auto* dx_t = context.Output(framework::GradVarName("X")); - auto* dy_t = context.Output(framework::GradVarName("Y")); - auto* dout = dout_t->data(); - T* dx = - (dx_t != nullptr) ? dx_t->mutable_data(context.GetPlace()) : nullptr; - T* dy = - (dy_t != nullptr) ? 
dy_t->mutable_data(context.GetPlace()) : nullptr; - - auto stream = context.cuda_device_context().stream(); - auto& dev_ctx = - context.template device_context(); - auto config = GetGpuLaunchConfig1D(dev_ctx, condition->numel()); - WhereGradCUDAKernel< - T><<>>( - numel, dout, cond_data, dx, dy); - } -}; - -} // namespace operators -} // namespace paddle - -REGISTER_OP_CUDA_KERNEL( - where, paddle::operators::WhereKernel, - paddle::operators::WhereKernel, - paddle::operators::WhereKernel, - paddle::operators::WhereKernel); -REGISTER_OP_CUDA_KERNEL( - where_grad, - paddle::operators::WhereGradKernel, - paddle::operators::WhereGradKernel, - paddle::operators::WhereGradKernel, - paddle::operators::WhereGradKernel); diff --git a/paddle/fluid/operators/where_op.h b/paddle/fluid/operators/where_op.h deleted file mode 100644 index 5398ee024a2890e38e88fc981721872e1ba34d60..0000000000000000000000000000000000000000 --- a/paddle/fluid/operators/where_op.h +++ /dev/null @@ -1,73 +0,0 @@ -// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/framework/op_registry.h" -#include "paddle/phi/kernels/funcs/math_function.h" - -namespace paddle { -namespace operators { - -template -class WhereKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - auto* X = context.Input("X"); - auto* Y = context.Input("Y"); - auto* out = context.Output("Out"); - - const bool* cond_data = condition->data(); - const T* x_data = X->data(); - const T* y_data = Y->data(); - T* out_data = out->mutable_data(context.GetPlace()); - - auto x_numel = X->numel(); - for (int i = 0; i < x_numel; i++) { - out_data[i] = cond_data[i] ? x_data[i] : y_data[i]; - } - } -}; - -template -class WhereGradKernel : public framework::OpKernel { - public: - void Compute(const framework::ExecutionContext& context) const override { - auto* condition = context.Input("Condition"); - const auto* cond_data = condition->data(); - auto numel = condition->numel(); - - auto* dout_t = - context.Input(framework::GradVarName("Out")); - auto* dx_t = context.Output(framework::GradVarName("X")); - auto* dy_t = context.Output(framework::GradVarName("Y")); - - auto* dout = dout_t->data(); - if (dx_t != nullptr) { - auto* dx = dx_t->mutable_data(context.GetPlace()); - for (int i = 0; i < numel; i++) { - dx[i] = dout[i] * (cond_data[i] ? 1. : 0.); - } - } - if (dy_t != nullptr) { - auto* dy = dy_t->mutable_data(context.GetPlace()); - for (int i = 0; i < numel; i++) { - dy[i] = dout[i] * (cond_data[i] ? 0. 
: 1.); - } - } - } -}; - -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/where_op_npu.cc b/paddle/fluid/operators/where_op_npu.cc index d4294393daa34612aae815b0ebfab7d55f0b9f46..35508950941783753734a916aa7c2dcff7731181 100755 --- a/paddle/fluid/operators/where_op_npu.cc +++ b/paddle/fluid/operators/where_op_npu.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/where_op.h" +#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" namespace paddle { diff --git a/paddle/fluid/operators/where_op_xpu.cc b/paddle/fluid/operators/where_op_xpu.cc index 3a4875c07005119e90f5d5cb448a63bcf62a09a4..41232c8b5e8d88564e59e0343a26a4ae98d5ed90 100644 --- a/paddle/fluid/operators/where_op_xpu.cc +++ b/paddle/fluid/operators/where_op_xpu.cc @@ -14,7 +14,7 @@ #ifdef PADDLE_WITH_XPU -#include "paddle/fluid/operators/where_op.h" +#include "paddle/fluid/framework/op_registry.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/platform/device/gpu/gpu_primitives.h b/paddle/fluid/platform/device/gpu/gpu_primitives.h index 8aec8e840f33273a3130355c751e635e4a3f6736..803674779e756f000005d106f950659ea765c5ce 100644 --- a/paddle/fluid/platform/device/gpu/gpu_primitives.h +++ b/paddle/fluid/platform/device/gpu/gpu_primitives.h @@ -20,6 +20,7 @@ limitations under the License. */ #include #endif #include +#include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/float16.h" @@ -244,6 +245,72 @@ __device__ __forceinline__ void VectorizedAtomicAddPerBlock( #endif #endif +// NOTE(zhangbo): cuda do not have atomicCAS for __nv_bfloat16. +inline static __device__ uint32_t bf16_add_to_low_half(uint32_t val, float x) { + bfloat16 low_half; + // the bfloat16 in lower 16bits + low_half.x = static_cast(val & 0xFFFFu); + low_half = static_cast(static_cast(low_half) + x); + return (val & 0xFFFF0000u) | low_half.x; +} + +inline static __device__ uint32_t bf16_add_to_high_half(uint32_t val, float x) { + bfloat16 high_half; + // the bfloat16 in higher 16bits + high_half.x = static_cast(val >> 16); + high_half = static_cast(static_cast(high_half) + x); + return (val & 0xFFFFu) | (static_cast(high_half.x) << 16); +} + +#if CUDA_VERSION >= 11000 && defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 800 +static __device__ __forceinline__ bfloat16 CUDABF16ToPDBF16(__nv_bfloat16 x) { + return *reinterpret_cast(&x); +} + +static __device__ __forceinline__ __nv_bfloat16 PDBF16ToCUDABF16(bfloat16 x) { + return *reinterpret_cast<__nv_bfloat16 *>(&x); +} + +CUDA_ATOMIC_WRAPPER(Add, bfloat16) { + return CUDABF16ToPDBF16(atomicAdd(reinterpret_cast<__nv_bfloat16 *>(address), + PDBF16ToCUDABF16(val))); +} +#else +CUDA_ATOMIC_WRAPPER(Add, bfloat16) { + // concrete packed bfloat16 value may exsits in lower or higher 16bits + // of the 32bits address. + uint32_t *address_as_ui = reinterpret_cast( + reinterpret_cast(address) - + (reinterpret_cast(address) & 0x02)); + float val_f = static_cast(val); + uint32_t old = *address_as_ui; + uint32_t sum; + uint32_t newval; + uint32_t assumed; + if (((uintptr_t)address & 0x02) == 0) { + // the bfloat16 value stay at lower 16 bits of the address. 
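    // The do/while blocks below are the standard atomicCAS emulation for
    // sub-word atomics: read the aligned 32-bit word that contains the
    // bfloat16, splice the incremented half back in via
    // bf16_add_to_low_half / bf16_add_to_high_half, and retry until no other
    // thread modified the word between the read and the compare-and-swap.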
+ do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, + bf16_add_to_low_half(assumed, val_f)); + } while (old != assumed); + bfloat16 ret; + ret.x = old & 0xFFFFu; + return ret; + } else { + // the bfloat16 value stay at higher 16 bits of the address. + do { + assumed = old; + old = atomicCAS(address_as_ui, assumed, + bf16_add_to_high_half(assumed, val_f)); + } while (old != assumed); + bfloat16 ret; + ret.x = old >> 16; + return ret; + } +} +#endif + CUDA_ATOMIC_WRAPPER(Add, complex) { float *real = reinterpret_cast(address); float *imag = real + 1; diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index 1f06eda8a2ee5dc8322b5e16e1f7eb2e0703f9a8..c61e8212b0257cc5ccffaa27971b959472a71f06 100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -81,7 +81,7 @@ set(PYBIND_SRCS cuda_streams_py.cc) if(NOT ON_INFER) - set (PYBIND_DEPS ${PYBIND_DEPS} processgroup) + set (PYBIND_DEPS ${PYBIND_DEPS} processgroup eager_reducer) if (WITH_NCCL) set (PYBIND_DEPS ${PYBIND_DEPS} processgroup_nccl) endif() diff --git a/paddle/fluid/pybind/distributed_py.cc b/paddle/fluid/pybind/distributed_py.cc index e057fb53ccecc7193fd52b8beda2c4f2880560e8..7b59188a9f3cdae2d0e9df329b969395b50177b0 100644 --- a/paddle/fluid/pybind/distributed_py.cc +++ b/paddle/fluid/pybind/distributed_py.cc @@ -23,6 +23,7 @@ limitations under the License. */ #include "paddle/fluid/distributed/collective/ProcessGroup.h" #include "paddle/fluid/distributed/collective/Types.h" +#include "paddle/fluid/distributed/collective/reducer.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/imperative/layer.h" @@ -143,6 +144,19 @@ void BindDistributed(py::module *m) { [](distributed::ProcessGroupStrategy &self, int nrings) { self.nrings_ = nrings; }); + + m->def("eager_assign_group_by_size", + [](py::handle py_tensors, std::vector is_sparse_gradient, + std::vector group_size_limits, + std::vector tensor_indices) { + auto tensors = CastPyArg2VectorOfTensor(py_tensors.ptr(), 0); + return distributed::Eager_AssignGroupBySize( + tensors, is_sparse_gradient, group_size_limits, tensor_indices); + }, + py::arg("tensors"), py::arg("is_sparse_gradient"), + py::arg("group_size_limits") = std::vector{25 * 1024 * 1024}, + py::arg("tensor_indices") = std::vector{}, + py::call_guard()); } } // end namespace pybind diff --git a/paddle/fluid/pybind/op_function_generator.h b/paddle/fluid/pybind/op_function_generator.h index 2b07a439d33b4a96a10a893a95e0dd26f83dd8c7..d23b3dd64ab05cf10d8096a84e317645972211d1 100644 --- a/paddle/fluid/pybind/op_function_generator.h +++ b/paddle/fluid/pybind/op_function_generator.h @@ -60,7 +60,8 @@ std::map> op_ins_map = { {"momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}}, {"merged_momentum", {"Param", "Grad", "Velocity", "LearningRate", "MasterParam"}}, - {"sparse_momentum", {"Param", "Grad", "Velocity", "Index", "LearningRate"}}, + {"sparse_momentum", + {"Param", "Grad", "Velocity", "Index", "LearningRate", "MasterParam"}}, {"rnn", {"Input", "PreState", "WeightList", "SequenceLength"}}, {"run_program", {"X", "Params"}}, {"fused_feedforward", @@ -124,7 +125,7 @@ std::map> op_outs_map = { {"generate_proposals_v2", {"RpnRois", "RpnRoiProbs", "RpnRoisNum"}}, {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, - {"sparse_momentum", {"ParamOut", "VelocityOut"}}, + {"sparse_momentum", {"ParamOut", 
"VelocityOut", "MasterParamOut"}}, {"rnn", {"DropoutState", "Reserve", "Out", "State"}}, {"run_program", {"DOut"}}, {"adam", @@ -181,7 +182,7 @@ std::map> op_passing_outs_map = { "out_old_num_accumulates", "out_num_updates"}}, {"momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"merged_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, - {"sparse_momentum", {"ParamOut", "VelocityOut"}}, + {"sparse_momentum", {"ParamOut", "VelocityOut", "MasterParamOut"}}, {"batch_norm", {"MeanOut", "VarianceOut"}}, {"sync_batch_norm", {"MeanOut", "VarianceOut"}}, {"accuracy", {"Correct", "Total"}}, diff --git a/paddle/infrt/dialect/infrt/infrt_ops_base.td b/paddle/infrt/dialect/infrt/infrt_ops_base.td index 81d3d028a66bea29dd9a373e1905ac02468251fd..978b126d754169e4f57fdd3b79fe49855c5d3359 100644 --- a/paddle/infrt/dialect/infrt/infrt_ops_base.td +++ b/paddle/infrt/dialect/infrt/infrt_ops_base.td @@ -40,6 +40,12 @@ def DenseTensor : Infrt_Type<"DenseTensor"> { ); } +// Type Constrait for concrete DenseTensor type. +class DenseTensor : + Type, + "!infrt.DenseTensor<"#target#","#precision#","#layout#">", + "::infrt::DenseTensorType">; + // Base class for infrt dialect attributes. class Infrt_Attr traits = [], string baseCppClass = "::mlir::Attribute"> diff --git a/paddle/infrt/dialect/init_infrt_dialects.cc b/paddle/infrt/dialect/init_infrt_dialects.cc index b5b8de7a20d0866802b8ce72e12dd7ed35dccbd1..c5c81b4b0f22dd369d7b63d34f45c41897052185 100644 --- a/paddle/infrt/dialect/init_infrt_dialects.cc +++ b/paddle/infrt/dialect/init_infrt_dialects.cc @@ -21,8 +21,8 @@ #include "paddle/infrt/dialect/infrt/infrt_dialect.h" #include "paddle/infrt/dialect/infrt_base.h" #include "paddle/infrt/dialect/pd_ops.h" -#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h" -#include "paddle/infrt/dialect/phi/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" #include "paddle/infrt/dialect/tensor_shape.h" namespace infrt { diff --git a/paddle/infrt/dialect/phi/CMakeLists.txt b/paddle/infrt/dialect/phi/CMakeLists.txt index 626b02c1f790d0a7f38887be33dace1c773a2cb1..d477b6b9bdc278b2408794fa4235d9c8bca5850a 100644 --- a/paddle/infrt/dialect/phi/CMakeLists.txt +++ b/paddle/infrt/dialect/phi/CMakeLists.txt @@ -2,16 +2,7 @@ if (NOT INFRT_WITH_PHI) return() endif() -#mlir_tablegen_on(infrt_phi_base DIALECT phi) -add_mlir_dialect(infrt_phi_base phi) -add_mlir_dialect(infrt_phi_tensor phi_dt) -add_mlir_dialect(infrt_phi_kernel phi_kernel) -#mlir_tablegen_on(infrt_phi_tensor) - -gather_srcs(infrt_src SRCS - phi_base.cc infrt_phi_tensor.cc - infrt_phi_tensor.cc) - +add_subdirectory(ir) add_subdirectory(pass) add_executable(phi-exec phi_exec.cc) diff --git a/paddle/infrt/dialect/phi/ir/CMakeLists.txt b/paddle/infrt/dialect/phi/ir/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..8c1d75629d09c210f813cd994199da77ca48a3b8 --- /dev/null +++ b/paddle/infrt/dialect/phi/ir/CMakeLists.txt @@ -0,0 +1,9 @@ +#mlir_tablegen_on(infrt_phi_base DIALECT phi) +add_mlir_dialect(infrt_phi_base phi) +add_mlir_dialect(infrt_phi_tensor phi_dt) +add_mlir_dialect(infrt_phi_kernel phi_kernel) +#mlir_tablegen_on(infrt_phi_tensor) + +gather_srcs(infrt_src SRCS + phi_base.cc + infrt_phi_tensor.cc) diff --git a/paddle/infrt/dialect/phi/infrt_phi_base.td b/paddle/infrt/dialect/phi/ir/infrt_phi_base.td similarity index 100% rename from paddle/infrt/dialect/phi/infrt_phi_base.td rename to paddle/infrt/dialect/phi/ir/infrt_phi_base.td diff --git 
a/paddle/infrt/dialect/phi/infrt_phi_kernel.td b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td similarity index 92% rename from paddle/infrt/dialect/phi/infrt_phi_kernel.td rename to paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td index 879994907cc0d951bde838b23fd129e865a360f2..37bf0b5ef213d76613162aa9bb3d2f9b6324340e 100644 --- a/paddle/infrt/dialect/phi/infrt_phi_kernel.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_kernel.td @@ -4,7 +4,7 @@ include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/infrt_base.td" -include "paddle/infrt/dialect/phi/infrt_phi_base.td" +include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" def PHI_KernelDialect : Dialect { let name = "phi_kernel"; diff --git a/paddle/infrt/dialect/phi/infrt_phi_tensor.cc b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cc similarity index 71% rename from paddle/infrt/dialect/phi/infrt_phi_tensor.cc rename to paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cc index 9df1a47031b1f726578291f628cda7d12900bcb7..64780294be92b86bcf29d3cb2045434cc6479517 100644 --- a/paddle/infrt/dialect/phi/infrt_phi_tensor.cc +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cc @@ -12,12 +12,12 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h" #include -#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.cpp.inc" -#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.cpp.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorDialect.cpp.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorTypes.cpp.inc" namespace infrt { namespace phi { @@ -25,7 +25,7 @@ namespace phi { void PHIDenseTensorDialect::initialize() { #define GET_OP_LIST addOperations< -#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cpp.inc" >(); } @@ -33,4 +33,4 @@ void PHIDenseTensorDialect::initialize() { } // namespace infrt #define GET_OP_CLASSES -#include "paddle/infrt/dialect/phi/infrt_phi_tensor.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/phi/infrt_phi_tensor.h b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h similarity index 83% rename from paddle/infrt/dialect/phi/infrt_phi_tensor.h rename to paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h index 2780f9759185ef45bc19f43fc621f46eabbe7a66..9a92558daab0376d430fe04b853a810cf42b6e85 100644 --- a/paddle/infrt/dialect/phi/infrt_phi_tensor.h +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h @@ -29,11 +29,11 @@ #include #include -#include "paddle/infrt/dialect/phi/infrt_phi_tensorDialect.h.inc" -#include "paddle/infrt/dialect/phi/infrt_phi_tensorTypes.h.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorDialect.h.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensorTypes.h.inc" #include "paddle/infrt/dialect/dense_tensor.h" -#include "paddle/infrt/dialect/phi/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" // NOLINT #define GET_OP_CLASSES -#include "paddle/infrt/dialect/phi/infrt_phi_tensor.h.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_tensor.h.inc" diff --git a/paddle/infrt/dialect/phi/infrt_phi_tensor.td b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td similarity index 97% rename from paddle/infrt/dialect/phi/infrt_phi_tensor.td rename to paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td index 
b7b3b061fdbe42909ac503d9d387cb8aed6bdc1a..dc3a4b340d767a371bc411c0a58d1fc7c72ca83e 100644 --- a/paddle/infrt/dialect/phi/infrt_phi_tensor.td +++ b/paddle/infrt/dialect/phi/ir/infrt_phi_tensor.td @@ -2,7 +2,7 @@ #else #define PHI_TENSOR -include "paddle/infrt/dialect/phi/infrt_phi_base.td" +include "paddle/infrt/dialect/phi/ir/infrt_phi_base.td" include "mlir/Interfaces/SideEffectInterfaces.td" include "mlir/IR/OpBase.td" include "paddle/infrt/dialect/infrt_base.td" diff --git a/paddle/infrt/dialect/phi/phi_base.cc b/paddle/infrt/dialect/phi/ir/phi_base.cc similarity index 84% rename from paddle/infrt/dialect/phi/phi_base.cc rename to paddle/infrt/dialect/phi/ir/phi_base.cc index a1caa40f6383b5016a9e237733a0b3ef016cbc97..7a6b3f3f0a404043f49a6df3e5bdcb873dd442c9 100644 --- a/paddle/infrt/dialect/phi/phi_base.cc +++ b/paddle/infrt/dialect/phi/ir/phi_base.cc @@ -12,7 +12,7 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/infrt/dialect/phi/phi_base.h" +#include "paddle/infrt/dialect/phi/ir/phi_base.h" #include #include @@ -21,8 +21,8 @@ #include #include #include "paddle/infrt/common/global.h" -#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc" -#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.cpp.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.cpp.inc" namespace infrt { namespace phi { @@ -51,11 +51,11 @@ void PHIDialect::printType(::mlir::Type type, void PHIDialect::initialize() { addOperations< #define GET_OP_LIST -#include "paddle/infrt/dialect/phi/infrt_phi_base.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.cpp.inc" // NOLINT >(); addTypes< #define GET_TYPEDEF_LIST -#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.cpp.inc" // NOLINT >(); } @@ -81,4 +81,4 @@ mlir::Type PHIDialect::parseType(mlir::DialectAsmParser& parser) const { } // namespace infrt #define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.cpp.inc" // NOLINT +#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.cpp.inc" // NOLINT diff --git a/paddle/infrt/dialect/phi/phi_base.h b/paddle/infrt/dialect/phi/ir/phi_base.h similarity index 84% rename from paddle/infrt/dialect/phi/phi_base.h rename to paddle/infrt/dialect/phi/ir/phi_base.h index 11174290f92bd18fdc91588d7eba89f61bb05413..a08d8229fccf53225311b451e941f99e8a3d0e8a 100644 --- a/paddle/infrt/dialect/phi/phi_base.h +++ b/paddle/infrt/dialect/phi/ir/phi_base.h @@ -19,11 +19,13 @@ #include -#include "paddle/infrt/dialect/phi/infrt_phi_base.h.inc" -#include "paddle/infrt/dialect/phi/infrt_phi_baseDialect.h.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseDialect.h.inc" #define GET_TYPEDEF_CLASSES -#include "paddle/infrt/dialect/phi/infrt_phi_baseTypes.h.inc" +#include "paddle/infrt/dialect/phi/ir/infrt_phi_baseTypes.h.inc" + +#define GET_OP_CLASSES +#include "paddle/infrt/dialect/phi/ir/infrt_phi_base.h.inc" namespace mlir { namespace OpTrait { diff --git a/paddle/infrt/host_context/value.h b/paddle/infrt/host_context/value.h index eb9a2092657aa079ee6a4007d7ded9f8896e93aa..7e7d77d3af741443d490dcfdd5b9ee6677b557ef 100644 --- a/paddle/infrt/host_context/value.h +++ b/paddle/infrt/host_context/value.h @@ -73,7 +73,7 @@ using ValueVariantType = std::vector, paddle::experimental::ScalarBase, paddle::experimental::ScalarArrayBase, - std::vector, + 
std::vector, phi::MetaConfig, paddle::experimental::Backend, paddle::experimental::DataLayout, diff --git a/paddle/phi/api/lib/api_custom_impl.cc b/paddle/phi/api/lib/api_custom_impl.cc index c7400b93fcdc18314318fae9482e1e5e5bfb8aef..19b113838eab5403aca00d9d97b278646228c512 100644 --- a/paddle/phi/api/lib/api_custom_impl.cc +++ b/paddle/phi/api/lib/api_custom_impl.cc @@ -94,12 +94,16 @@ std::vector split_impl(const Tensor& x, std::vector out; auto dense_outs = SetKernelOutput(out_number, kernel_backend, &out); std::vector meta_outs; + meta_outs.reserve(out_number); + std::vector meta_out_ptrs; + meta_out_ptrs.reserve(out_number); for (size_t i = 0; i < out_number; ++i) { meta_outs.push_back(dense_outs[i]); + meta_out_ptrs.push_back(&meta_outs.back()); } phi::SplitInferMeta( - MakeMetaTensor(*dense_x), num_or_sections, axis, &meta_outs); + MakeMetaTensor(*dense_x), num_or_sections, axis, meta_out_ptrs); using kernel_signature = void (*)(const platform::DeviceContext&, const phi::DenseTensor&, diff --git a/paddle/phi/api/lib/utils/tensor_utils.cc b/paddle/phi/api/lib/utils/tensor_utils.cc index 31325e22afae31e55a3a2d939739d6745ccd3d36..1c9f7c3a8683daaf26cb87b23e50284d0329c4a8 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.cc +++ b/paddle/phi/api/lib/utils/tensor_utils.cc @@ -136,26 +136,5 @@ phi::ScalarArray MakePhiScalarArrayFromVarList( return result; } -void ResetTensorDtypeAndLayoutByArgDef(phi::TensorBase* dst, - const phi::TensorArgDef& arg_def) { - VLOG(5) << "ResetTensor by TensorArgDef."; - if (phi::DenseTensor::classof(dst)) { - auto* dense_t = static_cast(dst); - auto* meta = phi::DenseTensorUtils::GetMutableMeta(dense_t); - meta->dtype = arg_def.dtype; - meta->layout = arg_def.layout; - } else if (phi::SelectedRows::classof(dst)) { - auto* selected_rows = static_cast(dst); - auto* meta = - phi::DenseTensorUtils::GetMutableMeta(selected_rows->mutable_value()); - meta->dtype = arg_def.dtype; - meta->layout = arg_def.layout; - } else { - PADDLE_THROW(phi::errors::Unimplemented( - "Unsupported tensor type is received when reseting tensor dtype and " - "layout by argument definition.")); - } -} - } // namespace experimental } // namespace paddle diff --git a/paddle/phi/api/lib/utils/tensor_utils.h b/paddle/phi/api/lib/utils/tensor_utils.h index 8b30d5421ab943d568a046ca0fe4698849780ffd..64df59c1a2a2de3f72ce46874fe07df70d33599e 100644 --- a/paddle/phi/api/lib/utils/tensor_utils.h +++ b/paddle/phi/api/lib/utils/tensor_utils.h @@ -42,8 +42,5 @@ phi::ScalarArray MakePhiScalarArrayFromVar(const framework::Variable& variable); phi::ScalarArray MakePhiScalarArrayFromVarList( const std::vector& variable_list); -void ResetTensorDtypeAndLayoutByArgDef(phi::TensorBase* dst, - const phi::TensorArgDef& arg_def); - } // namespace experimental } // namespace paddle diff --git a/paddle/phi/backends/gpu/gpu_context.h b/paddle/phi/backends/gpu/gpu_context.h index 603ce0817c4ebdcb17bb97b14dd0700badcf2385..b9d843982dc5ebb8312a4912ebfa96c73e22b6c5 100644 --- a/paddle/phi/backends/gpu/gpu_context.h +++ b/paddle/phi/backends/gpu/gpu_context.h @@ -227,4 +227,12 @@ class GPUContext : public DeviceContext { // must use different function name for cudnn kernel using GPUDNNContext = GPUContext; +// KPS (Kernel PrimitiveS API) needs to exist as a kind of backend, +// because we want to implement a KPS-based kernel and make it run +// on GPU and XPU at the same time, so we need KPSContext when registering +// KPS Kernel. Note: XPU and GPU cannot be compiled at the same time! 
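// Illustrative sketch of the KPSContext aliasing described in the note above
// (and defined just below here and in xpu_context.h): a KPS-based kernel is
// written once against the alias, and each build resolves it to exactly one
// device context. Everything below is a hypothetical stand-in (the
// Example* types and the SKETCH_WITH_XPU_KP flag are not Paddle names); the
// real code aliases phi::GPUContext or phi::XPUContext to phi::KPSContext.
#include <iostream>

struct ExampleGPUContext { const char* Name() const { return "GPU"; } };
struct ExampleXPUContext { const char* Name() const { return "XPU"; } };

#if defined(SKETCH_WITH_XPU_KP)
using ExampleKPSContext = ExampleXPUContext;  // XPU2 KP build
#else
using ExampleKPSContext = ExampleGPUContext;  // CUDA / HIP build
#endif

// A KPS-style kernel body is written once against the alias...
template <typename Context>
void RunKpsKernel(const Context& ctx) {
  std::cout << "KPS kernel dispatched to the " << ctx.Name() << " backend\n";
}

int main() {
  // ...and each build instantiates it for exactly one device context, which
  // is why GPU and XPU cannot be enabled in the same build.
  ExampleKPSContext ctx;
  RunKpsKernel(ctx);
  return 0;
}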
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) +using KPSContext = GPUContext; +#endif + } // namespace phi diff --git a/paddle/phi/backends/xpu/xpu_context.h b/paddle/phi/backends/xpu/xpu_context.h index 3005d1707e638a346c0d20e83a808c5c0da334e1..b87489c567cabea137850163879ed00d151f60cb 100644 --- a/paddle/phi/backends/xpu/xpu_context.h +++ b/paddle/phi/backends/xpu/xpu_context.h @@ -66,4 +66,12 @@ class XPUContext : public DeviceContext { std::unique_ptr impl_; }; +// KPS (Kernel PrimitiveS API) needs to exist as a kind of backend, +// because we want to implement a KPS-based kernel and make it run +// on GPU and XPU at the same time, so we need KPSContext when registering +// KPS Kernel. Note: XPU and GPU cannot be compiled at the same time! +#if PADDLE_WITH_XPU_KP +using KPSContext = XPUContext; +#endif + } // namespace phi diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 4b7bf65be39cbc83688e7dab3fdd745c2be82b22..a9e12f5d81ed08328afad9e7da6d1e1999d47be1 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -52,6 +52,9 @@ enum class Backend : uint8_t { MKLDNN, GPUDNN, // cuDNN and hipDNN + // paddle kernel primitives backend + KPS, + // end of backend types NUM_BACKENDS, @@ -115,6 +118,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) { case Backend::GPUDNN: os << "GPUDNN"; break; + case Backend::KPS: + os << "KPS"; + break; default: { size_t device_type_id_ = static_cast(backend) - static_cast(Backend::NUM_BACKENDS); @@ -147,6 +153,8 @@ inline Backend StringToBackend(const char* backend_cstr) { return Backend::MKLDNN; } else if (s == std::string("GPUDNN")) { return Backend::GPUDNN; + } else if (s == std::string("KPS")) { + return Backend::KPS; } else { return static_cast(static_cast(Backend::NUM_BACKENDS) + phi::GetOrRegisterGlobalDeviceTypeId(s)); diff --git a/paddle/phi/core/CMakeLists.txt b/paddle/phi/core/CMakeLists.txt index f4f57a0acbbb386a3642a05e0d0dc70cd082a4d8..8ffacbb39bb249c57fb5c9ef1462d03747356f96 100644 --- a/paddle/phi/core/CMakeLists.txt +++ b/paddle/phi/core/CMakeLists.txt @@ -22,8 +22,8 @@ cc_library(sparse_csr_tensor SRCS sparse_csr_tensor.cc DEPS dense_tensor tensor_ cc_library(meta_tensor SRCS meta_tensor.cc DEPS tensor_base tensor_meta dense_tensor) cc_library(infermeta_utils SRCS infermeta_utils.cc DEPS meta_tensor) -cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows) cc_library(selected_rows SRCS selected_rows_impl.cc DEPS dense_tensor phi_enforce ddim memcpy) +cc_library(phi_device_context SRCS device_context.cc DEPS dense_tensor selected_rows) cc_library(phi_custom_kernel SRCS custom_kernel.cc DEPS kernel_factory convert_utils) diff --git a/paddle/phi/core/compat/convert_utils.cc b/paddle/phi/core/compat/convert_utils.cc index 3b7a733ede90464328600ebd3c7d371314b99cc3..b85db07bd9dfa0d798304aac6bd86089a9f0b4c0 100644 --- a/paddle/phi/core/compat/convert_utils.cc +++ b/paddle/phi/core/compat/convert_utils.cc @@ -66,6 +66,14 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) { case phi::Backend::XPU: return phi::XPUPlace( set_device_id ? phi::backends::xpu::GetXPUCurrentDeviceId() : 0); +#endif + case phi::Backend::KPS: +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + return phi::GPUPlace( + set_device_id ? phi::backends::gpu::GetCurrentDeviceId() : 0); +#elif defined(PADDLE_WITH_XPU_KP) + return phi::XPUPlace( + set_device_id ? 
phi::backends::xpu::GetXPUCurrentDeviceId() : 0); #endif default: { #ifdef PADDLE_WITH_CUSTOM_DEVICE diff --git a/paddle/phi/core/custom_kernel.cc b/paddle/phi/core/custom_kernel.cc index f84a2bd8d9c5d0634f29485fc07f649ea9fb1b9e..58f9e1c623e81b4f2877099d1cdc2a8fe2e18b9e 100644 --- a/paddle/phi/core/custom_kernel.cc +++ b/paddle/phi/core/custom_kernel.cc @@ -20,16 +20,16 @@ void RegisterCustomKernels(const CustomKernelMap& custom_kernel_map) { auto& kernel_info_map = custom_kernel_map.GetMap(); VLOG(3) << "Size of custom_kernel_map: " << kernel_info_map.size(); + auto& kernels = KernelFactory::Instance().kernels(); for (auto& pair : kernel_info_map) { - PADDLE_ENFORCE_EQ( - KernelFactory::Instance().HasCompatiblePhiKernel(pair.first), - true, + PADDLE_ENFORCE_NE( + kernels.find(pair.first), + kernels.end(), phi::errors::InvalidArgument( "The kernel %s is not ready for custom kernel registering.", pair.first)); for (auto& info_pair : pair.second) { - auto& kernels = KernelFactory::Instance().kernels(); PADDLE_ENFORCE_EQ( kernels[pair.first].find(info_pair.first), kernels[pair.first].end(), diff --git a/paddle/phi/core/dense_tensor_impl.cc b/paddle/phi/core/dense_tensor_impl.cc index 29e7dc01f32db20e3756677fe8a48fcb138b3883..5ee83089589e89b3cb29f095bd88fb16ff39d296 100644 --- a/paddle/phi/core/dense_tensor_impl.cc +++ b/paddle/phi/core/dense_tensor_impl.cc @@ -73,11 +73,6 @@ void DenseTensor::set_layout(const paddle::framework::DataLayout layout) { // Note: When you reset holder, you need to ensure the offset is correct void DenseTensor::ResetHolder(const std::shared_ptr& holder) { if (holder_) { - // TODO(zyfncg): The change of static_cast<> in check will recover back - // when SetAllocationForOutputTenosr is deleted. - // Now the numel() may return -1, and will cast to a very large number when - // compare with a data with unsigned long type, this will make checking - // failed, so it's a temporary solution to deal with this problem. 
PADDLE_ENFORCE_LE( numel() * static_cast(SizeOf(dtype())) + static_cast(meta_.offset), diff --git a/paddle/phi/core/infermeta_utils.cc b/paddle/phi/core/infermeta_utils.cc index f3dd056911ecf81d5ca0954114acbd1a3ac19ad9..671ba2ec7dc258865c01fff99ce97aacaeddd3cc 100644 --- a/paddle/phi/core/infermeta_utils.cc +++ b/paddle/phi/core/infermeta_utils.cc @@ -75,13 +75,13 @@ paddle::optional InferMetaContext::OptionalInputAt( : paddle::optional{paddle::none}; } -std::vector InferMetaContext::InputsBetween(size_t start, - size_t end) const { - std::vector result; +std::vector InferMetaContext::InputsBetween(size_t start, + size_t end) const { + std::vector result; result.reserve(end - start); for (size_t i = start; i < end; ++i) { - result.emplace_back(*inputs_.at(i)); + result.push_back(inputs_.at(i).get()); } return result; @@ -91,12 +91,12 @@ MetaTensor* InferMetaContext::MutableOutputAt(size_t idx) { return outputs_.at(idx).get(); } -std::vector InferMetaContext::MutableOutputBetween(size_t start, - size_t end) { - std::vector result; +std::vector InferMetaContext::MutableOutputBetween(size_t start, + size_t end) { + std::vector result; result.reserve(end - start); for (size_t i = start; i < end; ++i) { - result.emplace_back(*outputs_.at(i)); + result.emplace_back(outputs_.at(i).get()); } return result; } diff --git a/paddle/phi/core/infermeta_utils.h b/paddle/phi/core/infermeta_utils.h index 203dbb269841ec8616b94c89603af3904eb572c3..a5775db74382c1aeda95a4351842444b5ad1e47e 100644 --- a/paddle/phi/core/infermeta_utils.h +++ b/paddle/phi/core/infermeta_utils.h @@ -50,13 +50,13 @@ class InferMetaContext { const std::pair& OutputRangeAt(size_t idx) const; const MetaConfig& GetMetaConfig() const; - const MetaTensor& InputAt(size_t idx) const; + const MetaTensor& InputAt(size_t idx) const; paddle::optional OptionalInputAt(size_t idx) const; + std::vector InputsBetween(size_t start, size_t end) const; - std::vector InputsBetween(size_t start, size_t end) const; MetaTensor* MutableOutputAt(size_t idx); - std::vector MutableOutputBetween(size_t start, size_t end); + std::vector MutableOutputBetween(size_t start, size_t end); template AttrType AttrAt(size_t idx) { @@ -157,7 +157,7 @@ struct InferMetaFnImpl { }; template - struct InferMetaFnCallHelper&, Tail...> { + struct InferMetaFnCallHelper&, Tail...> { template static void Call(InferMetaContext* ctx, PreviousArgs&... pargs) { static_assert(attr_idx == 0, @@ -165,7 +165,7 @@ struct InferMetaFnImpl { static_assert(out_idx == 0, "InferMeta's Input should appear before Outputs."); const std::pair range = ctx->InputRangeAt(in_idx); - std::vector arg = + std::vector arg = ctx->InputsBetween(range.first, range.second); InferMetaFnCallHelper< Tail...>::template Call(ctx, @@ -210,13 +210,12 @@ struct InferMetaFnImpl { }; template - struct InferMetaFnCallHelper*, Tail...> { + struct InferMetaFnCallHelper, Tail...> { template static void Call(InferMetaContext* ctx, PreviousArgs&... 
pargs) { const std::pair range = ctx->OutputRangeAt(out_idx); - std::vector tmp = + std::vector arg = ctx->MutableOutputBetween(range.first, range.second); - std::vector* arg = &tmp; InferMetaFnCallHelper< Tail...>::template Call(ctx, pargs..., diff --git a/paddle/phi/core/kernel_registry.h b/paddle/phi/core/kernel_registry.h index 6a1688947b986549e1feaf39cdf6c73749b0ff3a..7a05452cbebe08d16a4486a03923431a3e59cb81 100644 --- a/paddle/phi/core/kernel_registry.h +++ b/paddle/phi/core/kernel_registry.h @@ -87,13 +87,11 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); -#ifndef PADDLE_WITH_CUSTOM_KERNEL } else if (arg_type == std::type_index(typeid(const SelectedRows&))) { args_def->AppendInput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); -#endif } else if (arg_type == std::type_index(typeid(DenseTensor*))) { args_def->AppendOutput(default_key.backend(), default_tensor_layout, @@ -105,13 +103,11 @@ struct KernelArgsParseFunctor { default_tensor_layout, default_key.dtype(), arg_type); -#ifndef PADDLE_WITH_CUSTOM_KERNEL } else if (arg_type == std::type_index(typeid(SelectedRows*))) { args_def->AppendOutput(default_key.backend(), default_tensor_layout, default_key.dtype(), arg_type); -#endif } else { // Attribute deal with // TODO(chenweihang): now here allow any types of attribute, maybe diff --git a/paddle/phi/core/kernel_utils.h b/paddle/phi/core/kernel_utils.h index 2ce1c829ce81a57cfad7343e2007ebf75b85ea80..b582375155a1878c52fd8fe9fb13f6e715df7067 100644 --- a/paddle/phi/core/kernel_utils.h +++ b/paddle/phi/core/kernel_utils.h @@ -23,9 +23,7 @@ #include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/enforce.h" #include "paddle/phi/core/kernel_context.h" -#ifndef PADDLE_WITH_CUSTOM_KERNEL #include "paddle/phi/core/selected_rows.h" -#endif #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" #include "paddle/phi/core/type_defs.h" @@ -223,9 +221,7 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SelectedRows); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_INPUT(DenseTensor); -#ifndef PADDLE_WITH_CUSTOM_KERNEL PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SelectedRows); -#endif PT_SPECIALIZE_KernelCallHelper_FOR_INPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_OPTIONAL_INPUT(SparseCooTensor); @@ -260,9 +256,7 @@ struct KernelImpl { PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(DenseTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(DenseTensor); -#ifndef PADDLE_WITH_CUSTOM_KERNEL PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SelectedRows); -#endif PT_SPECIALIZE_KernelCallHelper_FOR_OUTPUT(SparseCooTensor); PT_SPECIALIZE_KernelCallHelper_FOR_MULTI_OUTPUT(SparseCooTensor); diff --git a/paddle/phi/core/tensor_meta.h b/paddle/phi/core/tensor_meta.h index 3d2da542c74176017492bdb9f567396f81308d6a..f4bd0be0b45b867b8ed98a5c50d2e3f58ea49780 100644 --- a/paddle/phi/core/tensor_meta.h +++ b/paddle/phi/core/tensor_meta.h @@ -23,13 +23,6 @@ limitations under the License. */ #include "paddle/utils/any.h" #include "paddle/utils/optional.h" -// Note: mixed_vector include many header now, LoD will be -// used on CUDA device? Can we use small_vector here? 
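// Illustrative sketch of the pointer-based multi-tensor InferMeta convention
// adopted by the InferMetaContext/InferMetaFnCallHelper changes above: outputs
// are handed over as a vector of MetaTensor pointers, so shapes are written
// straight into the caller's tensors and no temporary copy-back is needed.
// ExampleMetaTensor and ExampleSplitInferMeta are hypothetical stand-ins, not
// phi types.
#include <cstdint>
#include <iostream>
#include <vector>

struct ExampleMetaTensor {
  std::vector<int64_t> dims;
};

void ExampleSplitInferMeta(const ExampleMetaTensor& x,
                           int num,
                           std::vector<ExampleMetaTensor*> outs) {
  std::vector<int64_t> out_dims = x.dims;
  out_dims[0] /= num;                 // split evenly along the first axis
  for (ExampleMetaTensor* out : outs) {
    out->dims = out_dims;             // mutate the caller's tensors in place
  }
}

int main() {
  ExampleMetaTensor x{{8, 3}};
  ExampleMetaTensor o0, o1;
  ExampleSplitInferMeta(x, 2, {&o0, &o1});
  std::cout << o0.dims[0] << "x" << o0.dims[1] << "\n";  // prints 4x3
  return 0;
}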
-// @zhanlve: Rollback to original LoD for now -#ifndef PADDLE_WITH_CUSTOM_KERNEL -#include "paddle/fluid/framework/mixed_vector.h" -#endif - namespace phi { using DDim = phi::DDim; diff --git a/paddle/phi/infermeta/binary.cc b/paddle/phi/infermeta/binary.cc index dfaabf7cae21ec9b91624211ce9b852148dd7cc2..675e68af74339b508f589a55a9c3cf3aed37cecb 100644 --- a/paddle/phi/infermeta/binary.cc +++ b/paddle/phi/infermeta/binary.cc @@ -225,6 +225,41 @@ void HuberLossInferMeta(const MetaTensor& input, out->share_lod(input); } +void IndexSampleInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config) { + auto input_dims = x.dims(); + PADDLE_ENFORCE_EQ(input_dims.size(), + 2, + errors::InvalidArgument( + "Inputs(X) shape of IndexSample op should be 2-D, but " + "got X's shape = [%s], please check X shape.", + input_dims)); + + auto index_dims = y.dims(); + PADDLE_ENFORCE_EQ( + index_dims.size(), + 2, + errors::InvalidArgument( + "Inputs(Index) shape of IndexSample op should be 2-D, but " + "got Index's shape [%s] , please check index shape.", + input_dims)); + if (config.is_runtime) { + PADDLE_ENFORCE_EQ(input_dims[0], + index_dims[0], + errors::InvalidArgument( + "Inputs(X)'s value of dimension 0 must same with " + "Inputs(Index)'s value of dimension 0, but " + "got %d of Inputs(X), and got %d of Inputs(Index), " + "please check Inputs shape.", + input_dims[0], + index_dims[0])); + } + out->set_dtype(x.dtype()); + out->set_dims(index_dims); + out->share_lod(y); +} void CrossInferMeta(const MetaTensor& x, const MetaTensor& y, int axis, @@ -271,8 +306,7 @@ void CrossInferMeta(const MetaTensor& x, } void Atan2InferMeta(const MetaTensor& x, const MetaTensor& y, MetaTensor* out) { - auto in_dims = x.dims(); - out->set_dims(in_dims); + out->share_meta(x); } void BCELossInferMeta(const MetaTensor& input, diff --git a/paddle/phi/infermeta/binary.h b/paddle/phi/infermeta/binary.h index 02750482dccaabd53f360fcc361bfdc8e788b89e..a0140c9a5799f79af541b45847d5e44f982a3f58 100644 --- a/paddle/phi/infermeta/binary.h +++ b/paddle/phi/infermeta/binary.h @@ -53,6 +53,11 @@ void HuberLossInferMeta(const MetaTensor& input_meta, MetaTensor* residual, MetaConfig config = MetaConfig()); +void IndexSampleInferMeta(const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out, + MetaConfig config = MetaConfig()); + void CrossInferMeta(const MetaTensor& x, const MetaTensor& y, int axis, diff --git a/paddle/phi/infermeta/multiary.cc b/paddle/phi/infermeta/multiary.cc index 7a0db3d5c17ee3cd40891601009a3841f603bb32..7634e5e01aca4cdaf7fb46399f9594897f2d0e36 100644 --- a/paddle/phi/infermeta/multiary.cc +++ b/paddle/phi/infermeta/multiary.cc @@ -84,7 +84,7 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, out->set_dtype(x.dtype()); } -void ConcatInferMeta(const std::vector& x, +void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, MetaConfig config) { @@ -93,10 +93,19 @@ void ConcatInferMeta(const std::vector& x, phi::errors::InvalidArgument( "The size of input meta vector should be greater" "than 0.")); + if (axis_scalar.FromTensor()) { + auto out_dims = + phi::make_ddim(std::vector(x.at(0)->dims().size(), -1)); + out->set_dims(out_dims); + out->set_dtype(x.at(0)->dtype()); + out->set_layout(x.at(0)->layout()); + out->share_lod(*x.at(0)); + return; + } int axis = axis_scalar.to(); // 1. 
calculate axis - int rank = x.at(0).dims().size(); + int rank = x.at(0)->dims().size(); PADDLE_ENFORCE_EQ( axis >= -rank && axis < rank, true, @@ -111,15 +120,42 @@ void ConcatInferMeta(const std::vector& x, // 2. calculate out dims std::vector x_dims; - for (auto& x_t : x) { - x_dims.push_back(x_t.dims()); + x_dims.reserve(x.size()); + for (const auto* x_t : x) { + x_dims.emplace_back(x_t->dims()); } phi::DDim out_dim = phi::funcs::ComputeAndCheckShape(config.is_runtime, x_dims, axis); out->set_dims(out_dim); - out->set_dtype(x.at(0).dtype()); - out->set_layout(x.at(0).layout()); + out->set_dtype(x.at(0)->dtype()); + out->set_layout(x.at(0)->layout()); + out->share_lod(*x.at(0)); +} + +void WhereInferMeta(const MetaTensor& condition, + const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out) { + auto cond_dims = condition.dims(); + auto x_dims = x.dims(); + auto y_dims = y.dims(); + PADDLE_ENFORCE_EQ( + cond_dims, + x_dims, + phi::errors::InvalidArgument( + "The dims of Inputs(Condition) and Inputs(X) should be same. " + "But received Condition's shape is [%s], X's shape is [%s]", + cond_dims, + x_dims)); + PADDLE_ENFORCE_EQ(x_dims, + y_dims, + phi::errors::InvalidArgument( + "The dims of Inputs(X) and Inputs(Y) should be same. " + "But received X's shape is [%s], Y's shape is [%s]", + x_dims, + y_dims)); + out->share_meta(x); } } // namespace phi diff --git a/paddle/phi/infermeta/multiary.h b/paddle/phi/infermeta/multiary.h index a5fb2a4cbddc33b97b31a26fa29293868808875a..2afb79daa355cc897e3bf4076003e9a41de8b96c 100644 --- a/paddle/phi/infermeta/multiary.h +++ b/paddle/phi/infermeta/multiary.h @@ -25,9 +25,13 @@ void BilinearTensorProductInferMeta(const MetaTensor& x, MetaTensor* out, MetaConfig config = MetaConfig()); -void ConcatInferMeta(const std::vector& x, +void ConcatInferMeta(const std::vector& x, const Scalar& axis_scalar, MetaTensor* out, MetaConfig config = MetaConfig()); +void WhereInferMeta(const MetaTensor& condition, + const MetaTensor& x, + const MetaTensor& y, + MetaTensor* out); } // namespace phi diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc index 49fd0a343a470f2545fc563366256f4f92294297..4696187bd2382a9d81400a0fd088f9d0013ff506 100644 --- a/paddle/phi/infermeta/unary.cc +++ b/paddle/phi/infermeta/unary.cc @@ -459,8 +459,19 @@ void TransferLayoutInferMeta(const MetaTensor& x, void SplitInferMeta(const MetaTensor& x, const ScalarArray& num_or_sections, const Scalar& axis, - std::vector* out, + std::vector out, MetaConfig config) { + if (!config.is_runtime) { + if (axis.FromTensor() || num_or_sections.FromTensor()) { + auto out_dims = phi::make_ddim(std::vector(x.dims().size(), -1)); + for (auto* item : out) { + item->set_dims(out_dims); + item->share_lod(x); + } + return; + } + } + int axis_value = axis.to(); int rank = x.dims().size(); PADDLE_ENFORCE_EQ( @@ -475,27 +486,34 @@ void SplitInferMeta(const MetaTensor& x, axis_value = axis_value + rank; } + std::vector out_dims(out.size(), x.dims()); + auto input_axis_dim = x.dims().at(axis_value); auto num_or_sections_data = num_or_sections.GetData(); - // step1: get formated sections - std::vector sections; // num_or_sections is a number if (num_or_sections_data.size() == 1) { - int num = num_or_sections_data.at(0); + if (config.is_runtime || input_axis_dim > 0) { + int num = num_or_sections_data.at(0); + PADDLE_ENFORCE_EQ( + input_axis_dim % num, + 0, + phi::errors::InvalidArgument( + "The input's size along the split dimension " + "must be evenly divisible by Attr(num_or_sections). 
" + "But received Attr(num_or_sections) " + "= %d, input(X)'s shape = [%s], Attr(dim) = %d.", + num, + x.dims(), + axis_value)); - PADDLE_ENFORCE_EQ(input_axis_dim % num, - 0, - phi::errors::InvalidArgument( - "The input's size along the split dimension " - "must be evenly divisible by Attr(num_or_sections). " - "But received Attr(num_or_sections) " - "= %d, input(X)'s shape = [%s], Attr(dim) = %d.", - num, - x.dims(), - axis_value)); - - for (int i = 0; i < num; ++i) { - sections.push_back(input_axis_dim / num); + size_t out_axis_dim = input_axis_dim / num; + for (auto& out_dim : out_dims) { + out_dim[axis_value] = out_axis_dim; + } + } else { + for (auto& out_dim : out_dims) { + out_dim[axis_value] = -1; + } } } else { // num_or_sections is a sections @@ -503,10 +521,9 @@ void SplitInferMeta(const MetaTensor& x, int unknow_dim_idx = -1; int num_of_unknow = 0; int sum_of_section = 0; + std::vector sections = num_or_sections_data; for (size_t i = 0; i < num_or_sections_data.size(); ++i) { - sections.push_back(num_or_sections_data[i]); - if (num_or_sections_data[i] == unknow_dim_val) { num_of_unknow++; unknow_dim_idx = i; @@ -558,31 +575,22 @@ void SplitInferMeta(const MetaTensor& x, x.dims(), axis_value)); } - } - - // setp2: fill out dims - std::vector out_dims(sections.size(), x.dims()); - if (config.is_runtime || input_axis_dim > 0) { - for (size_t i = 0; i < sections.size(); ++i) { + for (size_t i = 0; i < out_dims.size(); ++i) { out_dims[i][axis_value] = sections[i]; } - } else { - for (size_t i = 0; i < sections.size(); ++i) { - out_dims[i][axis_value] = -1; - } } - for (size_t i = 0; i < sections.size(); ++i) { + for (size_t i = 0; i < out.size(); ++i) { if (axis_value != 0) { // Only pass LoD when not spliting along the first dim. - (*out)[i].set_dtype(x.dtype()); - (*out)[i].set_dims(out_dims[i]); - (*out)[i].set_layout(x.layout()); + out.at(i)->set_dtype(x.dtype()); + out.at(i)->set_dims(out_dims[i]); + out.at(i)->set_layout(x.layout()); } else { - (*out)[i].set_dtype(x.dtype()); - (*out)[i].set_dims(out_dims[i]); - (*out)[i].set_layout(x.layout()); - (*out)[i].share_lod(x); + out.at(i)->set_dtype(x.dtype()); + out.at(i)->set_dims(out_dims[i]); + out.at(i)->set_layout(x.layout()); + out.at(i)->share_lod(x); } } } diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h index 4fab1ec68ec1e71af5e55a9852cd68deccc09a7c..b3929b9d2b47f87ab0f7b42ed74c2881c076f7d9 100644 --- a/paddle/phi/infermeta/unary.h +++ b/paddle/phi/infermeta/unary.h @@ -107,7 +107,7 @@ void TransferLayoutInferMeta(const MetaTensor& x, void SplitInferMeta(const MetaTensor& x_meta, const ScalarArray& num_or_sections, const Scalar& axis, - std::vector* out, + std::vector out, MetaConfig config = MetaConfig()); void UnbindInferMeta(const MetaTensor& x, diff --git a/paddle/phi/kernels/concat_kernel.h b/paddle/phi/kernels/concat_kernel.h index fbc4a86f5af611df3bd6b8f5101a3a2f26473c9d..f13667881468e15183c3d770df638641f1dc6ed0 100644 --- a/paddle/phi/kernels/concat_kernel.h +++ b/paddle/phi/kernels/concat_kernel.h @@ -31,13 +31,16 @@ DenseTensor Concat(const Context& dev_ctx, const std::vector& x, const Scalar& axis) { std::vector meta_x; + meta_x.reserve(x.size()); + std::vector meta_x_ptr; for (const auto& t : x) { meta_x.emplace_back(t); + meta_x_ptr.push_back(&meta_x.back()); } auto dense_out = phi::Empty(dev_ctx); MetaTensor meta_out(&dense_out); - ConcatInferMeta(meta_x, axis.to(), &meta_out, /*is_runtime=*/true); + ConcatInferMeta(meta_x_ptr, axis.to(), &meta_out, /*is_runtime=*/true); 
ConcatKernel(dev_ctx, x, axis, &dense_out); return dense_out; } diff --git a/paddle/phi/kernels/cpu/atan2_grad_kernel.cc b/paddle/phi/kernels/cpu/atan2_grad_kernel.cc index 6ff7431f0c8c556770b54e1328251e5996850fc9..7a519aab0ad71e4cd20270b216bf65262cab8ba6 100644 --- a/paddle/phi/kernels/cpu/atan2_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/atan2_grad_kernel.cc @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/atan2_grad_kernel.h" +#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" + #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" PD_REGISTER_KERNEL(atan2_grad, CPU, diff --git a/paddle/phi/kernels/cpu/atan2_kernel.cc b/paddle/phi/kernels/cpu/atan2_kernel.cc index eb38a6c90b7938ef16cf9d56dfdb93903cc3c6a1..df6f5f59ac0056f36749faec8a300c1b5a1da1c9 100644 --- a/paddle/phi/kernels/cpu/atan2_kernel.cc +++ b/paddle/phi/kernels/cpu/atan2_kernel.cc @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/phi/kernels/atan2_kernel.h" +#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" + #include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" PD_REGISTER_KERNEL(atan2, CPU, diff --git a/paddle/phi/kernels/cpu/concat_kernel.cc b/paddle/phi/kernels/cpu/concat_kernel.cc index 18bb8837b105d91e3e13a0a7519b08c9c47202c4..5c4202837c4487361f33b849df7d975e85f8490d 100644 --- a/paddle/phi/kernels/cpu/concat_kernel.cc +++ b/paddle/phi/kernels/cpu/concat_kernel.cc @@ -37,6 +37,7 @@ void ConcatKernel(const Context& dev_ctx, axis = phi::funcs::ComputeAxis(axis, x[0].dims().size()); std::vector x_dims; + x_dims.reserve(x.size()); for (size_t i = 0; i < x.size(); ++i) { x_dims.push_back(x[i].dims()); } @@ -97,9 +98,10 @@ void ConcatKernel(const Context& dev_ctx, } } else { std::vector inputs; + inputs.reserve(x.size()); for (size_t j = 0; j < x.size(); ++j) { if (x[j].numel() > 0) { - inputs.push_back(x[j]); + inputs.emplace_back(x[j]); } else { continue; } diff --git a/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..006711ceef75edb7d9d3ed2530c0a5dda2b64993 --- /dev/null +++ b/paddle/phi/kernels/cpu/index_sample_grad_kernel.cc @@ -0,0 +1,106 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/index_sample_grad_kernel.h" +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/kernel_registry.h" +namespace phi { +template +void IndexSampleGradInner(const Context& context, + const DenseTensor& out_grad, + const DenseTensor& index, + DenseTensor* x_grad) { + std::vector out_grad_vec; + std::vector index_vec; + paddle::framework::TensorToVector(out_grad, context, &out_grad_vec); + paddle::framework::TensorToVector(index, context, &index_vec); + + auto index_dims = index.dims(); + auto x_grad_dims = x_grad->dims(); + + auto value_length = x_grad_dims[1]; + auto index_length = index_dims[1]; + int index_ids_num = index.numel(); + + std::vector x_grad_vec(x_grad->numel(), 0); + + for (int i = 0; i < index_ids_num; i++) { + int b = floor(i / index_length); + PADDLE_ENFORCE_GE( + index_vec[i], + 0, + errors::InvalidArgument( + "Variable value (index) of OP(index_sample_grad) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + value_length, + index_vec[i])); + PADDLE_ENFORCE_LT( + index_vec[i], + value_length, + errors::InvalidArgument( + "Variable value (index) of OP(index_sample_grad) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + value_length, + index_vec[i])); + int v_i = b * value_length + static_cast(index_vec[i]); + x_grad_vec[v_i] += out_grad_vec[i]; + } + context.template Alloc(x_grad); + paddle::framework::TensorFromVector(x_grad_vec, context, x_grad); + x_grad->Resize(x_grad_dims); +} + +template +void IndexSampleGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* x_grad) { + auto index_type = index.dtype(); + bool index_type_match = + index_type == DataType::INT32 || index_type == DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(index_type)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT32)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType((DataType::INT64))))); + if (index_type == DataType::INT32) { + IndexSampleGradInner(ctx, out_grad, index, x_grad); + } else if (index_type == DataType::INT64) { + IndexSampleGradInner(ctx, out_grad, index, x_grad); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_sample_grad, + CPU, + ALL_LAYOUT, + phi::IndexSampleGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/index_sample_kernel.cc b/paddle/phi/kernels/cpu/index_sample_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..21bf9faee13cfa4da271a7d1b1a9fe482a55da04 --- /dev/null +++ b/paddle/phi/kernels/cpu/index_sample_kernel.cc @@ -0,0 +1,118 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_sample_kernel.h" +#include +#include +#include +#include +#include +#include +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/data_type.h" +#include "paddle/phi/core/kernel_registry.h" +namespace phi { +template +void IndexSampleInner(const Context &context, + const DenseTensor &input, + const DenseTensor &index, + DenseTensor *output) { + auto input_dims = input.dims(); + auto index_dims = index.dims(); + + int batch_size = input_dims[0]; + auto value_length = input_dims[1]; + auto index_length = index_dims[1]; + int index_ids_num = index.numel(); + + std::vector input_vec; + std::vector index_vec; + paddle::framework::TensorToVector(input, context, &input_vec); + paddle::framework::TensorToVector(index, context, &index_vec); + + std::vector res(index_ids_num); + for (int i = 0; i < index_ids_num; i++) { + int b = floor(i / index_length); + PADDLE_ENFORCE_GE( + index_vec[i], + 0, + errors::InvalidArgument( + "Variable value (index) of OP(index_sample) " + "expected >= 0 and < %ld, but got %ld. Please check input " + "value.", + value_length, + index_vec[i])); + PADDLE_ENFORCE_LT( + index_vec[i], + value_length, + errors::InvalidArgument( + "Variable value (index) of OP(index_sample) " + "expected >= 0 and < %ld, but got %ld. 
Please check input " + "value.", + value_length, + index_vec[i])); + + int v_i = b * value_length + static_cast(index_vec[i]); + T v = input_vec[v_i]; + VLOG(4) << "Index Sample: batch = " << b << " index = " << v_i + << " value = " << v; + res[i] = v; + } + + auto ddim = phi::make_ddim({batch_size, index_length}); + context.template Alloc(output); + paddle::framework::TensorFromVector(res, context, output); + output->Resize(ddim); +} + +template +void IndexSampleKernel(const Context &ctx, + const DenseTensor &x, + const DenseTensor &index, + DenseTensor *out) { + ctx.template Alloc(out); + auto index_type = index.dtype(); + bool index_type_match = + index_type == DataType::INT32 || index_type == DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(index_type)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT32)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType((DataType::INT64))))); + if (index_type == DataType::INT32) { + IndexSampleInner(ctx, x, index, out); + } else if (index_type == DataType::INT64) { + IndexSampleInner(ctx, x, index, out); + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(index_sample, + CPU, + ALL_LAYOUT, + phi::IndexSampleKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/logical_kernel.cc b/paddle/phi/kernels/cpu/logical_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..3d179e1e75f4fa98057f32737f09025ce1d6b2fb --- /dev/null +++ b/paddle/phi/kernels/cpu/logical_kernel.cc @@ -0,0 +1,72 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/logical_kernel.h" + +#include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/cpu/elementwise.h" +#include "paddle/phi/kernels/funcs/logical_functor.h" + +// See Note [ Why still include the fluid headers? 
] +#include "paddle/fluid/platform/transform.h" + +namespace phi { + +#define DEFINE_LOGICAL_BINARY_KERNEL(type) \ + template \ + void Logical##type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + funcs::Logical##type##Functor binary_func; \ + ElementwiseCompute, T, bool>( \ + dev_ctx, x, y, -1, binary_func, out); \ + } + +DEFINE_LOGICAL_BINARY_KERNEL(And) +DEFINE_LOGICAL_BINARY_KERNEL(Or) +DEFINE_LOGICAL_BINARY_KERNEL(Xor) +#undef DEFINE_LOGICAL_BINARY_KERNEL + +template +void LogicalNotKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + auto* out_ptr = dev_ctx.template Alloc(out); + funcs::LogicalNotFunctor unary_func; + + paddle::platform::Transform trans; + trans(dev_ctx, x.data(), x.data() + x.numel(), out_ptr, unary_func); +} + +} // namespace phi + +#define REGISTER_LOGICAL_CPU_KERNEL(logical_and, func_type) \ + PD_REGISTER_KERNEL(logical_and, \ + CPU, \ + ALL_LAYOUT, \ + phi::Logical##func_type##Kernel, \ + float, \ + double, \ + bool, \ + int64_t, \ + int, \ + int8_t, \ + int16_t) {} + +REGISTER_LOGICAL_CPU_KERNEL(logical_and, And) +REGISTER_LOGICAL_CPU_KERNEL(logical_or, Or) +REGISTER_LOGICAL_CPU_KERNEL(logical_not, Not) +REGISTER_LOGICAL_CPU_KERNEL(logical_xor, Xor) diff --git a/paddle/phi/kernels/cpu/split_kernel.cc b/paddle/phi/kernels/cpu/split_kernel.cc index 722681fb7bc3f9d9f75b92468b89931910dd532e..4acf9b02028f994c38144d716fdd56c6bbb6afa2 100644 --- a/paddle/phi/kernels/cpu/split_kernel.cc +++ b/paddle/phi/kernels/cpu/split_kernel.cc @@ -28,20 +28,6 @@ void SplitKernel(const Context& dev_ctx, const ScalarArray& num_or_sections, const Scalar& axis_scalar, std::vector outs) { - // need to infershape output - if (num_or_sections.FromTensor() || axis_scalar.FromTensor()) { - std::vector out_metas; - for (size_t i = 0; i < outs.size(); ++i) { - out_metas.push_back(outs[i]); - } - - phi::SplitInferMeta(x, num_or_sections, axis_scalar, &out_metas, true); - - for (size_t i = 0; i < out_metas.size(); ++i) { - outs[i]->Resize(out_metas[i].dims()); - } - } - std::vector shape_refer; for (size_t j = 0; j < outs.size(); ++j) { dev_ctx.template Alloc(outs[j]); diff --git a/paddle/phi/kernels/cpu/uniform_random_kernel.cc b/paddle/phi/kernels/cpu/uniform_random_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..8ec1d9683e15a92c7184d91005f85258cf1dd004 --- /dev/null +++ b/paddle/phi/kernels/cpu/uniform_random_kernel.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/uniform_random_kernel.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +inline void UniformRealDistribution(T *data, + const int64_t &size, + const float &min, + const float &max, + std::shared_ptr engine) { + std::uniform_real_distribution dist(static_cast(min), + static_cast(max)); + for (int64_t i = 0; i < size; ++i) { + data[i] = dist(*engine); + } +} + +template <> +inline void UniformRealDistribution(phi::dtype::bfloat16 *data, + const int64_t &size, + const float &min, + const float &max, + std::shared_ptr engine) { + std::uniform_real_distribution dist(min, max); + for (int64_t i = 0; i < size; ++i) { + data[i] = static_cast(dist(*engine)); + } +} + +template +void UniformRandomRawKernel(const Context &dev_ctx, + const ScalarArray &shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor *out) { + out->Resize(phi::make_ddim(shape.GetData())); + VLOG(4) << out->dims(); + T *data = dev_ctx.template Alloc(out); + auto size = out->numel(); + std::shared_ptr engine; + if (seed) { + engine = std::make_shared(); + engine->seed(seed); + } else { + engine = dev_ctx.GetGenerator()->GetCPUEngine(); + } + UniformRealDistribution(data, size, min, max, engine); + if (diag_num > 0) { + PADDLE_ENFORCE_GT( + size, + (diag_num - 1) * (diag_step + 1), + phi::errors::InvalidArgument( + "ShapeInvalid: the diagonal's elements is equal (num-1) " + "* (step-1) with num %d, step %d," + "It should be smaller than %d, but received %d", + diag_num, + diag_step, + (diag_num - 1) * (diag_step + 1), + size)); + for (int64_t i = 0; i < diag_num; ++i) { + int64_t pos = i * diag_step + i; + data[pos] = diag_val; + } + } +} + +template +void UniformRandomKernel(const Context &dev_ctx, + const ScalarArray &shape, + DataType dtype, + float min, + float max, + int seed, + DenseTensor *out) { + UniformRandomRawKernel( + dev_ctx, shape, dtype, min, max, seed, 0, 0, 0.0f, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(uniform_random_raw, + CPU, + ALL_LAYOUT, + phi::UniformRandomRawKernel, + float, + double, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(uniform_random, + CPU, + ALL_LAYOUT, + phi::UniformRandomKernel, + float, + double, + phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/cpu/where_grad_kernel.cc b/paddle/phi/kernels/cpu/where_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..67c8cee1038c7a990e5961a3fcd17e8d7c591207 --- /dev/null +++ b/paddle/phi/kernels/cpu/where_grad_kernel.cc @@ -0,0 +1,54 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/where_grad_kernel.h" + +namespace phi { + +template +void WhereGradKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + const auto* cond_data = condition.data(); + auto numel = condition.numel(); + auto* dout = out_grad.data(); + + if (x_grad != nullptr) { + auto* dx = ctx.template Alloc(x_grad); + for (int i = 0; i < numel; i++) { + dx[i] = dout[i] * (cond_data[i] ? 1. : 0.); + } + } + if (y_grad != nullptr) { + auto* dy = ctx.template Alloc(y_grad); + for (int i = 0; i < numel; i++) { + dy[i] = dout[i] * (cond_data[i] ? 0. : 1.); + } + } +} + +} // namespace phi + +PD_REGISTER_KERNEL(where_grad, + CPU, + ALL_LAYOUT, + phi::WhereGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/cpu/where_kernel.cc b/paddle/phi/kernels/cpu/where_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..f624c13c262296964cef6b98f7d5d26dfc0b7d56 --- /dev/null +++ b/paddle/phi/kernels/cpu/where_kernel.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/where_kernel.h" + +namespace phi { + +template +void WhereKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + const bool* cond_data = condition.data(); + const T* x_data = x.data(); + const T* y_data = y.data(); + auto x_numel = x.numel(); + + T* out_data = ctx.template Alloc(out); + + for (int i = 0; i < x_numel; i++) { + out_data[i] = cond_data[i] ? x_data[i] : y_data[i]; + } +} + +} // namespace phi + +PD_REGISTER_KERNEL( + where, CPU, ALL_LAYOUT, phi::WhereKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/funcs/aligned_vector.h b/paddle/phi/kernels/funcs/aligned_vector.h new file mode 100644 index 0000000000000000000000000000000000000000..9382b03cf9368cc726235a753a1990baacb60d52 --- /dev/null +++ b/paddle/phi/kernels/funcs/aligned_vector.h @@ -0,0 +1,75 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.1 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.1 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "paddle/phi/core/hostdevice.h" + +namespace phi { + +// Aligned vector generates vectorized load/store on CUDA. 
+template +struct alignas(sizeof(T) * Size) AlignedVector { + T val[Size]; + + HOSTDEVICE inline const T& operator[](int i) const { return val[i]; } + HOSTDEVICE inline T& operator[](int i) { return val[i]; } +}; + +template +HOSTDEVICE inline void Load(const T* addr, AlignedVector* vec) { + const AlignedVector* addr_vec = + reinterpret_cast*>(addr); + *vec = *addr_vec; +} + +template +HOSTDEVICE inline void Store(const AlignedVector& vec, T* addr) { + AlignedVector* addr_vec = + reinterpret_cast*>(addr); + *addr_vec = vec; +} + +/* +* Only the address of input data is the multiplier of 1,2,4, vectorized load +* with corresponding multiplier-value is possible. Moreover, the maximum length +* of vectorized load is 128 bits once. Hence, valid length of vectorized load +* shall be determined under both former constraints. +*/ +template +int GetVectorizedSize(const T* pointer) { + constexpr int max_load_bits = 128; + int valid_vec_size = max_load_bits / CHAR_BIT / sizeof(T); + uint64_t address = reinterpret_cast(pointer); + constexpr int vec8 = std::alignment_of>::value; // NOLINT + constexpr int vec4 = std::alignment_of>::value; // NOLINT + constexpr int vec2 = std::alignment_of>::value; // NOLINT + if (address % vec8 == 0) { + /* + * Currently, decide to deal with no more than 4 data once while adopting + * vectorization load/store, if performance test shows that dealing with + * 8 data once in vectorization load/store does get optimized, return code + * below can be changed into " return std::min(8, valid_vec_size); " . + */ + return std::min(4, valid_vec_size); + } else if (address % vec4 == 0) { + return std::min(4, valid_vec_size); + } else if (address % vec2 == 0) { + return std::min(2, valid_vec_size); + } else { + return 1; + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/funcs/distribution_helper.h b/paddle/phi/kernels/funcs/distribution_helper.h new file mode 100644 index 0000000000000000000000000000000000000000..49e1c82482c0f14a665380e1b55e8f7bd67b1e30 --- /dev/null +++ b/paddle/phi/kernels/funcs/distribution_helper.h @@ -0,0 +1,249 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#ifdef __NVCC__ +#include +#endif +#ifdef __HIPCC__ +#include +#endif + +#include "paddle/phi/backends/gpu/gpu_info.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/generator.h" + +#include "paddle/phi/kernels/funcs/index_impl.cu.h" + +#if defined(__NVCC__) || defined(__HIPCC__) +#include "paddle/phi/kernels/primitive/kernel_primitives.h" +#endif + +#if !defined(_WIN32) +#define UNLIKELY(condition) __builtin_expect(static_cast(condition), 0) +#else +// there is no equivalent intrinsics in msvc. 
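// Illustrative sketch of the alignment probe behind GetVectorizedSize above: a
// pointer only qualifies for a wider vectorized load when its address is a
// multiple of the corresponding AlignedVector's alignment, capped by one
// 128-bit load. Host-only standalone analog (Example* names are hypothetical;
// the vec8 branch of the real function is folded into the 4-wide case here).
#include <climits>
#include <cstdint>
#include <iostream>

template <typename T, int Size>
struct alignas(sizeof(T) * Size) ExampleAlignedVector {
  T val[Size];
};

template <typename T>
int ExampleGetVectorizedSize(const T* pointer) {
  constexpr int kMaxLoadBits = 128;  // at most one 128-bit load per access
  const int valid_vec_size = static_cast<int>(kMaxLoadBits / CHAR_BIT / sizeof(T));
  const uint64_t address = reinterpret_cast<uint64_t>(pointer);
  constexpr int vec4 = alignof(ExampleAlignedVector<T, 4>);
  constexpr int vec2 = alignof(ExampleAlignedVector<T, 2>);
  if (address % vec4 == 0) return valid_vec_size < 4 ? valid_vec_size : 4;
  if (address % vec2 == 0) return valid_vec_size < 2 ? valid_vec_size : 2;
  return 1;
}

int main() {
  alignas(16) float buffer[8] = {0};
  std::cout << ExampleGetVectorizedSize(buffer) << ' '        // 4 (16B aligned)
            << ExampleGetVectorizedSize(buffer + 1) << '\n';  // 1 (4B offset)
  return 0;
}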
+#define UNLIKELY(condition) (condition) +#endif + +namespace phi { +namespace distribution { + +/********************* Transformation Function **********************/ +template +struct exponential_transform { + explicit exponential_transform(T lambda) : lambda_(lambda) {} + + HOSTDEVICE inline T operator()(T val) const { +#if defined(__NVCC__) || defined(__HIPCC__) + if (std::is_same::value) { + return static_cast(-1.0) / lambda_ * log(val); + } else { + return static_cast(-1.0) / lambda_ * __logf(val); + } +#else + return static_cast(-1.0) / lambda_ * std::log(static_cast(1.0) - val); +#endif + } + + private: + T lambda_; +}; + +template +struct uniform_transform { + explicit uniform_transform(T min, T max) : range_(max - min), min_(min) {} + + HOSTDEVICE inline T operator()(T val) const { + if (UNLIKELY(val == static_cast(1.0))) { + return min_; + } else { + return val * range_ + min_; + } + } + + private: + T range_; + T min_; +}; + +template +struct normal_transform { + explicit normal_transform(T mean, T std) : mean_(mean), std_(std) {} + + HOSTDEVICE inline T operator()(T val) const { return val * std_ + mean_; } + + private: + T mean_; + T std_; +}; + +#if defined(__NVCC__) || defined(__HIPCC__) + +namespace kps = phi::kps; + +/*********************** Distribution Function *************************/ +template +struct uniform_distribution; + +template +struct normal_distribution; + +#if defined(__NVCC__) +template <> +struct uniform_distribution { + __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { + return curand_uniform4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline double2 operator()( + curandStatePhilox4_32_10_t *state) const { + return curand_uniform2_double(state); + } + static constexpr int kReturnsCount = 2; +}; + +template <> +struct normal_distribution { + __device__ inline float4 operator()(curandStatePhilox4_32_10_t *state) const { + return curand_normal4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct normal_distribution { + __device__ inline double2 operator()( + curandStatePhilox4_32_10_t *state) const { + return curand_normal2_double(state); + } + static constexpr int kReturnsCount = 2; +}; + +#else +template <> +struct uniform_distribution { + __device__ inline float4 operator()( + hiprandStatePhilox4_32_10_t *state) const { + return hiprand_uniform4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct uniform_distribution { + __device__ inline double2 operator()( + hiprandStatePhilox4_32_10_t *state) const { + return hiprand_uniform2_double(state); + } + static constexpr int kReturnsCount = 2; +}; + +template <> +struct normal_distribution { + __device__ inline float4 operator()( + hiprandStatePhilox4_32_10_t *state) const { + return hiprand_normal4(state); + } + static constexpr int kReturnsCount = 4; +}; + +template <> +struct normal_distribution { + __device__ inline double2 operator()( + hiprandStatePhilox4_32_10_t *state) const { + return hiprand_normal2_double(state); + } + static constexpr int kReturnsCount = 2; +}; +#endif + +/******** Launch GPU function of distribution and transformation *********/ +template +__global__ void DistributionKernel(size_t size, + uint64_t seed, + uint64_t offset, + DistOp dist, + TransformOp trans, + T *out_data, + size_t stride) { + size_t idx = static_cast(BLOCK_ID_X * BLOCK_NUM_X); + static constexpr int kCount = DistOp::kReturnsCount; +#if defined(__NVCC__) 
+  curandStatePhilox4_32_10_t state;
+  curand_init(seed, idx + THREAD_ID_X, offset, &state);
+  using SType = curandStatePhilox4_32_10_t;
+#else
+  hiprandStatePhilox4_32_10_t state;
+  hiprand_init(seed, idx + THREAD_ID_X, offset, &state);
+  using SType = hiprandStatePhilox4_32_10_t;
+#endif
+  size_t total_thread = GRID_NUM_X * BLOCK_NUM_X;
+  T args[kCount];
+  T result[kCount];
+  for (size_t i = idx; i < size; i += total_thread * kCount) {
+    kps::ElementwiseRandom<SType, T, kCount, 1, DistOp>(&args[0], dist, &state);
+    kps::ElementwiseUnary<T, T, kCount, 1, 1, TransformOp>(
+        &result[0], &args[0], trans);
+    kps::WriteData<T, T, kCount, 1, 1, true>(
+        out_data + i, &result[0], size - i, 1, stride, 1);
+    __syncthreads();
+  }
+}
+
+template <typename T, typename DistOp, typename TransformOp>
+void distribution_and_transform(const GPUContext &dev_ctx,
+                                DenseTensor *out,
+                                DistOp dist,
+                                TransformOp trans) {
+  T *out_data = dev_ctx.template Alloc<T>(out);
+  auto size = out->numel();
+
+  int64_t device_id = dev_ctx.GetPlace().GetDeviceId();
+  auto gen_cuda = dev_ctx.GetGenerator();
+
+  size_t block_size = 256;
+  size_t expect_grid_size = (size + block_size - 1) / block_size;
+  const auto &prop = backends::gpu::GetDeviceProperties(device_id);
+  size_t max_grid_size = (prop.maxThreadsPerMultiProcessor / block_size) *
+                         prop.multiProcessorCount;
+  size_t grid_size =
+      expect_grid_size > max_grid_size ? max_grid_size : expect_grid_size;
+
+  size_t total_thread = block_size * grid_size;
+  size_t curand4_loop_times =
+      (size + 4 * total_thread - 1) / (4 * total_thread);
+  // 'increment' should be a multiple of 4
+  uint64_t increment = curand4_loop_times * 4;
+
+  auto seed_offset = gen_cuda->IncrementOffset(increment);
+  uint64_t seed = seed_offset.first;
+  uint64_t offset = seed_offset.second;
+
+  DistributionKernel<
+      T,
+      DistOp,
+      TransformOp><<<grid_size, block_size, 0, dev_ctx.stream()>>>(
+      size, seed, offset, dist, trans, out_data, total_thread);
+}
+
+#endif
+}  // namespace distribution
+}  // namespace phi
diff --git a/paddle/phi/kernels/funcs/index_impl.cu.h b/paddle/phi/kernels/funcs/index_impl.cu.h
new file mode 100644
index 0000000000000000000000000000000000000000..ccb70fe25ddce3ec9fba984a86049213ac51e5fa
--- /dev/null
+++ b/paddle/phi/kernels/funcs/index_impl.cu.h
@@ -0,0 +1,93 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
*/ + +#pragma once + +#include +#include +#include + +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/hostdevice.h" +#include "paddle/phi/kernels/funcs/aligned_vector.h" +#include "paddle/phi/kernels/primitive/kernel_primitives.h" + +namespace phi { + +template +__global__ void VectorizedIndexKernel(T *out, + size_t numel, + size_t main_offset, + Functor func) { + size_t data_offset = BLOCK_ID_X * BLOCK_NUM_X * VecSize; + size_t stride = BLOCK_NUM_X * GRID_NUM_X * VecSize; + size_t args[VecSize]; + T result[VecSize]; + for (; data_offset < main_offset; data_offset += stride) { + kps::InitWithDataIndex(&args[0], data_offset); + kps::ElementwiseUnary( + &result[0], &args[0], func); + kps::WriteData( + out + data_offset, &result[0], BLOCK_NUM_X * VecSize); + } + size_t num = numel - data_offset; + if (num > 0) { + kps::InitWithDataIndex(&args[0], data_offset); + kps::ElementwiseUnary( + &result[0], &args[0], func); + kps::WriteData(out + data_offset, &result[0], num); + } +} + +template +void IndexKernel(const KPDevice &dev_ctx, DenseTensor *out, Functor func) { + int numel = out->numel(); + T *out_data = dev_ctx.template Alloc(out); + if (numel <= 0) return; + int vec_size = phi::GetVectorizedSize(out_data); +#ifdef PADDLE_WITH_XPU_KP + int block = 64; + int grid = 8; + auto stream = dev_ctx.x_context()->xpu_stream; +#else + auto config = + phi::backends::gpu::GetGpuLaunchConfig1D(dev_ctx, numel, vec_size); + int grid = config.block_per_grid.x; + int block = config.thread_per_block.x; + auto stream = dev_ctx.stream(); +#endif + size_t main_offset = (numel / (vec_size * block)) * vec_size * block; + switch (vec_size) { + case 4: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + case 2: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + case 1: + VectorizedIndexKernel<<>>( + out_data, numel, main_offset, func); + break; + default: { + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported vectorized size: %d !", vec_size)); + break; + } + } +} + +} // namespace phi diff --git a/paddle/phi/kernels/funcs/logical_functor.h b/paddle/phi/kernels/funcs/logical_functor.h new file mode 100644 index 0000000000000000000000000000000000000000..1ea7fc43e6b32c85e446044011b0c2ab3c79817c --- /dev/null +++ b/paddle/phi/kernels/funcs/logical_functor.h @@ -0,0 +1,41 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
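An aside on the IndexKernel helper defined in index_impl.cu.h above: it fills an output tensor by applying a functor to every flattened element index, choosing a vectorization width (4/2/1) from the output pointer's alignment and handling the non-divisible tail with the boundary path. As a rough mental model of the functor contract, here is a host-side sketch; the functor and helper names are made up for illustration and are not part of phi:

#include <cstddef>
#include <vector>

// Hypothetical functor: maps a flat element index to a value, mirroring the
// per-element contract of the "Functor func" argument of IndexKernel.
struct IotaTimesTwo {
  float operator()(size_t idx) const { return 2.0f * static_cast<float>(idx); }
};

// Serial reference of what the vectorized device kernel computes: out[i] = func(i).
template <typename T, typename Functor>
void IndexKernelReference(std::vector<T>* out, Functor func) {
  for (size_t i = 0; i < out->size(); ++i) {
    (*out)[i] = func(i);
  }
}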
+ +#pragma once + +namespace phi { +namespace funcs { + +#define LOGICAL_BINARY_FUNCTOR(func_name, op) \ + template \ + struct func_name { \ + using ELEMENT_TYPE = T; \ + HOSTDEVICE bool operator()(const T a, const T b) const { \ + return static_cast(a) op static_cast(b); \ + } \ + }; + +LOGICAL_BINARY_FUNCTOR(LogicalOrFunctor, ||) +LOGICAL_BINARY_FUNCTOR(LogicalAndFunctor, &&) +LOGICAL_BINARY_FUNCTOR(LogicalXorFunctor, ^) +#undef LOGICAL_BINARY_FUNCTOR + +template +struct LogicalNotFunctor { + using ELEMENT_TYPE = T; + HOSTDEVICE bool operator()(const T a) const { return !a; } +}; + +} // namespace funcs +} // namespace phi diff --git a/paddle/phi/kernels/gpu/atan2_grad_kernel.cu b/paddle/phi/kernels/gpu/atan2_grad_kernel.cu index 1cc3311c3639820ef9b6d3a29d9274ac93bb5963..6652d242de5ce44f3bf64d91e6fae16c648c2726 100644 --- a/paddle/phi/kernels/gpu/atan2_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/atan2_grad_kernel.cu @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/atan2_grad_kernel.h" -#include "paddle/phi/kernels/impl/atan2_grad_kernel_impl.h" PD_REGISTER_KERNEL(atan2_grad, GPU, diff --git a/paddle/phi/kernels/gpu/atan2_kernel.cu b/paddle/phi/kernels/gpu/atan2_kernel.cu index 702c959b78f75d0e52511d9bdc9d4330c6838aa4..dd0bba177defef7cdbd41ef7944110d126ca2d7c 100644 --- a/paddle/phi/kernels/gpu/atan2_kernel.cu +++ b/paddle/phi/kernels/gpu/atan2_kernel.cu @@ -12,11 +12,10 @@ // See the License for the specific language governing permissions and // limitations under the License. +#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" + #include "paddle/phi/backends/gpu/gpu_context.h" -#include "paddle/phi/core/device_context.h" #include "paddle/phi/core/kernel_registry.h" -#include "paddle/phi/kernels/atan2_kernel.h" -#include "paddle/phi/kernels/impl/atan2_kernel_impl.h" PD_REGISTER_KERNEL(atan2, GPU, diff --git a/paddle/phi/kernels/gpu/cast_kernel.cu b/paddle/phi/kernels/gpu/cast_kernel.cu index 7a6c99c5fe15f6ddecd190d2d77e359503be7a80..569a46f56d5638584262c0d1c8002459fa8ffd70 100644 --- a/paddle/phi/kernels/gpu/cast_kernel.cu +++ b/paddle/phi/kernels/gpu/cast_kernel.cu @@ -80,8 +80,4 @@ void CastKernel(const Context& dev_ctx, paddle::experimental::DataType::UNDEFINED); \ } -#if !defined(PADDLE_WITH_HIP) PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, phi::dtype::bfloat16) -#else -PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast) -#endif diff --git a/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..8b1ef964124d7d61004ba4cb9f3c53f7c5cec347 --- /dev/null +++ b/paddle/phi/kernels/gpu/index_sample_grad_kernel.cu @@ -0,0 +1,146 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_sample_grad_kernel.h" + +#include +#include +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +namespace { +template +void LimitGridDim(const Context& ctx, dim3* grid_dim) { + auto max_grid_dim = + reinterpret_cast(ctx).GetCUDAMaxGridDimSize(); + grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; + grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; +} +#define PREDEFINED_BLOCK_SIZE_X 512 +#define PREDEFINED_BLOCK_SIZE 1024 +#define MIN(a, b) ((a) < (b) ? (a) : (b)) +}; + +template +__global__ void IndexSampleGrad(const IndexT* index, + T* in_grad, + const T* out_grad, + size_t index_length, + size_t input_length, + size_t batch_size, + bool same_data_in_row = true) { + unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; + + for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { + index_i = blockDim.x * blockIdx.x + threadIdx.x; + for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { + unsigned int index_idx = index_j * index_length + index_i; + unsigned int in_idx = index_j * input_length + index_i; + IndexT sample_idx = index[index_idx]; + if (same_data_in_row) { + paddle::platform::CudaAtomicAdd( + &(in_grad[in_idx - index_i + sample_idx]), out_grad[sample_idx]); + } else { + in_grad[in_idx - index_i + sample_idx] = out_grad[index_idx]; + } + } + } +} + +template +void IndexSampleGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* x_grad) { + const T* output_grad_data = out_grad.data(); + T* input_grad_data = ctx.template Alloc(x_grad); + auto index_type = index.dtype(); + bool index_type_match = + index_type == DataType::INT32 || index_type == DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(index_type)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT32)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType((DataType::INT64))))); + + auto stream = reinterpret_cast(ctx).stream(); + auto input_num = x.numel(); + auto input_dim = x.dims(); + auto index_dim = index.dims(); + size_t batch_size = index_dim[0]; + size_t input_length = input_dim[1]; + size_t index_length = index_dim[1]; + bool same_data_in_index_row = index_length == 1 ? 
false : true; + + auto block_width = paddle::platform::RoundToPowerOfTwo(index_length); + block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); + auto block_height = + paddle::platform::RoundToPowerOfTwo(index_length * batch_size) / + block_width; + block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); + dim3 block_dim(block_width, block_height); + dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, + (batch_size + block_dim.y - 1) / block_dim.y); + LimitGridDim(ctx, &grid_dim); + + phi::funcs::SetConstant set_zero; + set_zero(ctx, x_grad, static_cast(0)); + + if (index_type == DataType::INT64) { + const int64_t* index_data = index.data(); + IndexSampleGrad<<>>( + index_data, + input_grad_data, + output_grad_data, + index_length, + input_length, + batch_size, + same_data_in_index_row); + } else if (index_type == DataType::INT32) { + const int* index_data = index.data(); + IndexSampleGrad<<>>( + index_data, + input_grad_data, + output_grad_data, + index_length, + input_length, + batch_size, + same_data_in_index_row); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(index_sample_grad, + GPU, + ALL_LAYOUT, + phi::IndexSampleGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/index_sample_kernel.cu b/paddle/phi/kernels/gpu/index_sample_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..0e042089e1e3d0a20bf3811de3633f5fea0584fa --- /dev/null +++ b/paddle/phi/kernels/gpu/index_sample_kernel.cu @@ -0,0 +1,119 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/index_sample_kernel.h" + +#include +#include +#include "paddle/fluid/framework/convert_utils.h" +#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/math_function.h" + +namespace phi { + +namespace { +template +void LimitGridDim(const Context& ctx, dim3* grid_dim) { + auto max_grid_dim = + reinterpret_cast(ctx).GetCUDAMaxGridDimSize(); + grid_dim->x = grid_dim->x < max_grid_dim[0] ? grid_dim->x : max_grid_dim[0]; + grid_dim->y = grid_dim->y < max_grid_dim[1] ? grid_dim->y : max_grid_dim[1]; +} +#define PREDEFINED_BLOCK_SIZE_X 512 +#define PREDEFINED_BLOCK_SIZE 1024 +#define MIN(a, b) ((a) < (b) ? 
(a) : (b)) +} + +template +__global__ void IndexSampleForward(const IndexT* index, + const T* in_data, + T* out_data, + size_t index_length, + size_t input_length, + size_t batch_size) { + unsigned int index_i = blockDim.x * blockIdx.x + threadIdx.x; + unsigned int index_j = blockDim.y * blockIdx.y + threadIdx.y; + for (; index_j < batch_size; index_j += blockDim.y * gridDim.y) { + index_i = blockDim.x * blockIdx.x + threadIdx.x; + for (; index_i < index_length; index_i += blockDim.x * gridDim.x) { + unsigned int index_idx = index_j * index_length + index_i; + unsigned int in_idx = index_j * input_length + index_i; + IndexT sample_idx = index[index_idx]; + out_data[index_idx] = in_data[in_idx - index_i + sample_idx]; + } + } +} + +template +void IndexSampleKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* out) { + auto index_type = index.dtype(); + bool index_type_match = + index_type == DataType::INT32 || index_type == DataType::INT64; + PADDLE_ENFORCE_EQ( + index_type_match, + true, + errors::InvalidArgument( + "Input(Index) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(index_type)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType(DataType::INT32)), + paddle::framework::DataTypeToString( + paddle::framework::TransToProtoVarType((DataType::INT64))))); + const T* in_data = x.data(); + T* out_data = ctx.template Alloc(out); + auto stream = reinterpret_cast(ctx).stream(); + auto input_dim = x.dims(); + auto index_dim = index.dims(); + size_t batch_size = input_dim[0]; + size_t input_length = input_dim[1]; + size_t index_length = index_dim[1]; + + auto block_width = paddle::platform::RoundToPowerOfTwo(index_length); + block_width = MIN(block_width, PREDEFINED_BLOCK_SIZE_X); + int block_height = + paddle::platform::RoundToPowerOfTwo(index_length * batch_size) / + block_width; + block_height = MIN(block_height, PREDEFINED_BLOCK_SIZE / block_width); + dim3 block_dim(block_width, block_height); + dim3 grid_dim((index_length + block_dim.x - 1) / block_dim.x, + (batch_size + block_dim.y - 1) / block_dim.y); + LimitGridDim(ctx, &grid_dim); + + if (index_type == DataType::INT64) { + const int64_t* index_data = index.data(); + IndexSampleForward<<>>( + index_data, in_data, out_data, index_length, input_length, batch_size); + } else if (index_type == DataType::INT32) { + const int* index_data = index.data(); + IndexSampleForward<<>>( + index_data, in_data, out_data, index_length, input_length, batch_size); + } +} +} // namespace phi + +PD_REGISTER_KERNEL(index_sample, + GPU, + ALL_LAYOUT, + phi::IndexSampleKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/logical_kernel.cu b/paddle/phi/kernels/gpu/logical_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f32d4c77d4059f4c6c0157fc839d3fa345ed489c --- /dev/null +++ b/paddle/phi/kernels/gpu/logical_kernel.cu @@ -0,0 +1,79 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/logical_kernel.h" + +#include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/elementwise_base.h" +#include "paddle/phi/kernels/funcs/logical_functor.h" +#include "paddle/phi/kernels/gpu/elementwise.h" + +namespace phi { + +#define DEFINE_LOGICAL_BINARY_KERNEL(type) \ + template \ + void Logical##type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out) { \ + using InT = typename funcs::Logical##type##Functor::ELEMENT_TYPE; \ + using OutT = bool; \ + dev_ctx.template Alloc(out); \ + funcs::Logical##type##Functor binary_func; \ + std::vector ins = {&x, &y}; \ + std::vector outs = {out}; \ + funcs::BroadcastKernel( \ + dev_ctx, ins, &outs, -1, binary_func); \ + } + +DEFINE_LOGICAL_BINARY_KERNEL(And) +DEFINE_LOGICAL_BINARY_KERNEL(Or) +DEFINE_LOGICAL_BINARY_KERNEL(Xor) +#undef DEFINE_LOGICAL_BINARY_KERNEL + +template +void LogicalNotKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out) { + using InT = typename funcs::LogicalNotFunctor::ELEMENT_TYPE; + using OutT = bool; + + dev_ctx.template Alloc(out); + funcs::LogicalNotFunctor unary_func; + std::vector ins = {&x}; + std::vector outs = {out}; + funcs::BroadcastKernel( + dev_ctx, ins, &outs, -1, unary_func); +} + +} // namespace phi + +#define REGISTER_LOGICAL_CUDA_KERNEL(logical_and, func_type) \ + PD_REGISTER_KERNEL(logical_and, \ + GPU, \ + ALL_LAYOUT, \ + phi::Logical##func_type##Kernel, \ + float, \ + double, \ + bool, \ + int64_t, \ + int, \ + int8_t, \ + int16_t) {} + +REGISTER_LOGICAL_CUDA_KERNEL(logical_and, And) +REGISTER_LOGICAL_CUDA_KERNEL(logical_or, Or) +REGISTER_LOGICAL_CUDA_KERNEL(logical_not, Not) +REGISTER_LOGICAL_CUDA_KERNEL(logical_xor, Xor) diff --git a/paddle/phi/kernels/gpu/math_kernel.cu b/paddle/phi/kernels/gpu/math_kernel.cu index 56e8b16ccbe0df16fdc96470a8167e6dc6abfb3c..fc73ccca6de18ea169b60fc6e998d42a8cb03919 100644 --- a/paddle/phi/kernels/gpu/math_kernel.cu +++ b/paddle/phi/kernels/gpu/math_kernel.cu @@ -155,6 +155,7 @@ PD_REGISTER_KERNEL(sum_raw, float, double, float16, + bfloat16, int16_t, int, int64_t, diff --git a/paddle/phi/kernels/gpu/scale_kernel.cu b/paddle/phi/kernels/gpu/scale_kernel.cu index d9c8de21c5bc2d26cb371d03be30ed0616a27a64..930c50a24be8fae40535c2d5e6dbbe85e7ced990 100644 --- a/paddle/phi/kernels/gpu/scale_kernel.cu +++ b/paddle/phi/kernels/gpu/scale_kernel.cu @@ -70,6 +70,7 @@ PD_REGISTER_KERNEL(scale, float, double, phi::dtype::float16, + phi::dtype::bfloat16, uint8_t, int8_t, int16_t, diff --git a/paddle/phi/kernels/gpu/split_kernel.cu b/paddle/phi/kernels/gpu/split_kernel.cu index a698b9e716140b59b10a5799647e0a1aa7a8261d..d2473d5b0b110a122247c32c779b7a700c3249b1 100644 --- a/paddle/phi/kernels/gpu/split_kernel.cu +++ b/paddle/phi/kernels/gpu/split_kernel.cu @@ -27,20 +27,6 @@ void SplitKernel(const Context& dev_ctx, const ScalarArray& num_or_sections, const Scalar& axis_scalar, std::vector outs) { - // need to infershape output - if (num_or_sections.FromTensor() || axis_scalar.FromTensor()) { - 
std::vector out_metas; - for (size_t i = 0; i < outs.size(); ++i) { - out_metas.push_back(outs[i]); - } - - phi::SplitInferMeta(x, num_or_sections, axis_scalar, &out_metas, true); - - for (size_t i = 0; i < out_metas.size(); ++i) { - outs[i]->Resize(out_metas[i].dims()); - } - } - std::vector shape_refer; for (size_t j = 0; j < outs.size(); ++j) { dev_ctx.template Alloc(outs[j]); diff --git a/paddle/phi/kernels/gpu/uniform_random_kernel.cu b/paddle/phi/kernels/gpu/uniform_random_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..7f24a6667e562e64d8b523dd3ab1883af27bed5a --- /dev/null +++ b/paddle/phi/kernels/gpu/uniform_random_kernel.cu @@ -0,0 +1,163 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/uniform_random_kernel.h" + +#include "gflags/gflags.h" + +#include "paddle/phi/core/kernel_registry.h" +#include "paddle/phi/kernels/funcs/distribution_helper.h" +#include "paddle/phi/kernels/funcs/index_impl.cu.h" + +DECLARE_bool(use_curand); + +namespace phi { + +template +struct UniformGenerator { + T min_, max_; + unsigned int seed_; + T diag_val_; + unsigned int diag_num_; + unsigned int diag_step_; + __host__ __device__ UniformGenerator( + T min, T max, int seed, int diag_num, int diag_step, T diag_val) + : min_(min), + max_(max), + seed_(seed), + diag_num_(diag_num), + diag_step_(diag_step), + diag_val_(diag_val) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n); + T out = dist(rng); + unsigned int remainder = n % (diag_step_ + 1); + if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { + out = diag_val_; + } + return out; + } +}; + +template +struct UniformGeneratorOffset { + T min_, max_; + unsigned int seed_; + T diag_val_; + unsigned int diag_num_; + unsigned int diag_step_; + int offset_; + __host__ __device__ UniformGeneratorOffset(T min, + T max, + int seed, + int diag_num, + int diag_step, + T diag_val, + int offset) + : min_(min), + max_(max), + seed_(seed), + diag_num_(diag_num), + diag_step_(diag_step), + diag_val_(diag_val), + offset_(offset) {} + + __host__ __device__ T operator()(const unsigned int n) const { + thrust::minstd_rand rng; + rng.seed(seed_); + thrust::uniform_real_distribution dist(min_, max_); + rng.discard(n + offset_); + T out = dist(rng); + unsigned int remainder = n % (diag_step_ + 1); + if (remainder == 0 && diag_num_ > n / (diag_step_ + 1)) { + out = diag_val_; + } + return out; + } +}; + +template +void UniformRandomRawKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor* out) { + out->Resize(phi::make_ddim(shape.GetData())); + T* data = dev_ctx.template Alloc(out); + auto size = out->numel(); + bool seed_flag = false; + if (seed == 0) { + std::random_device 
rd; + seed = rd(); + seed_flag = true; + } + + auto generator = dev_ctx.GetGenerator(); + if (generator->GetIsInitPy() && seed_flag) { + if (FLAGS_use_curand) { + using MT = typename kps::details::MPTypeTrait::Type; + distribution::uniform_distribution dist; + distribution::uniform_transform trans(min, max); + distribution::distribution_and_transform(dev_ctx, out, dist, trans); + } else { + auto seed_offset = generator->IncrementOffset(1); + int64_t gen_offset = size * seed_offset.second; + auto func = UniformGeneratorOffset(min, + max, + seed_offset.first, + diag_num, + diag_step, + diag_val, + gen_offset); + IndexKernel>(dev_ctx, out, func); + } + } else { + auto func = + UniformGenerator(min, max, seed, diag_num, diag_step, diag_val); + IndexKernel>(dev_ctx, out, func); + } +} + +template +void UniformRandomKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + DenseTensor* out) { + UniformRandomRawKernel( + dev_ctx, shape, dtype, min, max, seed, 0, 0, 0.0f, out); +} + +} // namespace phi + +PD_REGISTER_KERNEL(uniform_random_raw, + GPU, + ALL_LAYOUT, + phi::UniformRandomRawKernel, + float, + double) {} + +PD_REGISTER_KERNEL( + uniform_random, GPU, ALL_LAYOUT, phi::UniformRandomKernel, float, double) {} diff --git a/paddle/phi/kernels/gpu/where_grad_kernel.cu b/paddle/phi/kernels/gpu/where_grad_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..f21aca80e21b30de8931b4fcd4ae3922be959958 --- /dev/null +++ b/paddle/phi/kernels/gpu/where_grad_kernel.cu @@ -0,0 +1,64 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/where_grad_kernel.h" + +namespace phi { + +template +__global__ void WhereGradCUDAKernel( + const int N, const T* dout, const bool* cond, T* dx, T* dy) { + int idx = blockDim.x * blockIdx.x + threadIdx.x; + for (; idx < N; idx += blockDim.x * gridDim.x) { + if (dx != nullptr) { + dx[idx] = cond[idx] ? dout[idx] : 0.; + } + if (dy != nullptr) { + dy[idx] = cond[idx] ? 0. : dout[idx]; + } + } +} + +template +void WhereGradKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad) { + const bool* cond_data = condition.data(); + auto numel = condition.numel(); + auto* dout = out_grad.data(); + + T* dx = (x_grad != nullptr) ? ctx.template Alloc(x_grad) : nullptr; + T* dy = (y_grad != nullptr) ? 
ctx.template Alloc(y_grad) : nullptr; + + auto stream = ctx.stream(); + auto config = backends::gpu::GetGpuLaunchConfig1D(ctx, numel); + WhereGradCUDAKernel< + T><<>>( + numel, dout, cond_data, dx, dy); +} + +} // namespace phi + +PD_REGISTER_KERNEL(where_grad, + GPU, + ALL_LAYOUT, + phi::WhereGradKernel, + float, + double, + int, + int64_t) {} diff --git a/paddle/phi/kernels/gpu/where_kernel.cu b/paddle/phi/kernels/gpu/where_kernel.cu new file mode 100644 index 0000000000000000000000000000000000000000..03c24eea3a95af1ed57f5c8df42b01fd09af1fa2 --- /dev/null +++ b/paddle/phi/kernels/gpu/where_kernel.cu @@ -0,0 +1,48 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/kernels/where_kernel.h" + +#include "paddle/phi/kernels/funcs/broadcast_function.h" +#include "paddle/phi/kernels/funcs/elementwise_functor.h" + +namespace phi { + +// Cond +template +struct CondFunctor { + inline HOSTDEVICE T operator()(const bool cond, const T x, const T y) const { + return cond ? x : y; + } +}; + +template +void WhereKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + std::vector ins = {&condition, &x, &y}; + std::vector outs = {out}; + ctx.template Alloc(out); + + CondFunctor func; + funcs::BroadcastKernel( + ctx, ins, &outs, -1, func); +} + +} // namespace phi + +PD_REGISTER_KERNEL( + where, GPU, ALL_LAYOUT, phi::WhereKernel, float, double, int, int64_t) {} diff --git a/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h b/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h index d0dd18298518ab351918aa2492eb48d11d3cf1d7..0eff1378f41de9b31a35375f86ca69a427d19f4f 100644 --- a/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h +++ b/paddle/phi/kernels/impl/atan2_grad_kernel_impl.h @@ -14,9 +14,10 @@ #pragma once -#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/atan2_grad_kernel.h" -#include "paddle/phi/kernels/funcs/for_range.h" + +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/core/dense_tensor.h" namespace phi { diff --git a/paddle/phi/kernels/impl/atan2_kernel_impl.h b/paddle/phi/kernels/impl/atan2_kernel_impl.h index 2cae914e2f61555377f7a41b3d89cdbb2b589247..7653032f2113c6e181673c57feaec2efd6472838 100644 --- a/paddle/phi/kernels/impl/atan2_kernel_impl.h +++ b/paddle/phi/kernels/impl/atan2_kernel_impl.h @@ -14,9 +14,10 @@ #pragma once -#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/kernels/atan2_kernel.h" -#include "paddle/phi/kernels/funcs/for_range.h" + +#include "paddle/fluid/platform/for_range.h" +#include "paddle/phi/core/dense_tensor.h" namespace phi { template diff --git a/paddle/phi/kernels/index_sample_grad_kernel.h b/paddle/phi/kernels/index_sample_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..5c6e101f1b43df04d58da25fd7252f0ff929386e --- /dev/null +++ b/paddle/phi/kernels/index_sample_grad_kernel.h @@ -0,0 +1,28 @@ +// Copyright (c) 
2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IndexSampleGradKernel(const Context& ctx, + const DenseTensor& out_grad, + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* in_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/index_sample_kernel.h b/paddle/phi/kernels/index_sample_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..fb43c0c6c5f97c6d47381c72786c6e44441e7762 --- /dev/null +++ b/paddle/phi/kernels/index_sample_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +template +void IndexSampleKernel(const Context& ctx, + const DenseTensor& x, + const DenseTensor& index, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/logical_kernel.h b/paddle/phi/kernels/logical_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..3ccc03a5b598a0a939cde00d74e1f6126808f655 --- /dev/null +++ b/paddle/phi/kernels/logical_kernel.h @@ -0,0 +1,38 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once + +#include "paddle/phi/core/dense_tensor.h" + +namespace phi { + +#define DECLEAR_LOGICAL_BINARY_KERNEL(type) \ + template \ + void Logical##type##Kernel(const Context& dev_ctx, \ + const DenseTensor& x, \ + const DenseTensor& y, \ + DenseTensor* out); + +DECLEAR_LOGICAL_BINARY_KERNEL(And) +DECLEAR_LOGICAL_BINARY_KERNEL(Or) +DECLEAR_LOGICAL_BINARY_KERNEL(Xor) +#undef DECLEAR_LOGICAL_BINARY_KERNEL + +template +void LogicalNotKernel(const Context& dev_ctx, + const DenseTensor& x, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/math_kernel.cc b/paddle/phi/kernels/math_kernel.cc index 3cb7b66ddf73e5fa3c5502a4acaad2c277a22ac6..480eb56c8b05c12c36337d4649a17b3b03146fdf 100644 --- a/paddle/phi/kernels/math_kernel.cc +++ b/paddle/phi/kernels/math_kernel.cc @@ -165,6 +165,7 @@ PD_REGISTER_KERNEL(sum, float, double, phi::dtype::float16, + phi::dtype::bfloat16, int16_t, int, int64_t, diff --git a/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc b/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..881180b71b151aa48a16dcd15871d4a7cd656fb7 --- /dev/null +++ b/paddle/phi/kernels/selected_rows/uniform_random_kernel.cc @@ -0,0 +1,88 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/kernels/uniform_random_kernel.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void UniformRandomRawSRKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + SelectedRows* out) { + phi::UniformRandomRawKernel(dev_ctx, + shape, + dtype, + min, + max, + seed, + diag_num, + diag_step, + diag_val, + out->mutable_value()); +} + +template +void UniformRandomSRKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + SelectedRows* out) { + phi::UniformRandomKernel( + dev_ctx, shape, dtype, min, max, seed, out->mutable_value()); +} + +} // namespace phi + +PD_REGISTER_KERNEL(uniform_random_raw_sr, + CPU, + ALL_LAYOUT, + phi::UniformRandomRawSRKernel, + float, + double, + phi::dtype::bfloat16) {} + +PD_REGISTER_KERNEL(uniform_random_sr, + CPU, + ALL_LAYOUT, + phi::UniformRandomSRKernel, + float, + double, + phi::dtype::bfloat16) {} + +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + +PD_REGISTER_KERNEL(uniform_random_raw_sr, + GPU, + ALL_LAYOUT, + phi::UniformRandomRawSRKernel, + float, + double) {} + +PD_REGISTER_KERNEL(uniform_random_sr, + GPU, + ALL_LAYOUT, + phi::UniformRandomSRKernel, + float, + double) {} +#endif diff --git a/paddle/phi/kernels/split_kernel.h b/paddle/phi/kernels/split_kernel.h index 1e730d809bc3a225d8dc34d24bde48f857b7ca9a..840fe4366ce7eaca82608612dfb41cc7f7783f4c 100644 --- a/paddle/phi/kernels/split_kernel.h +++ b/paddle/phi/kernels/split_kernel.h @@ -43,18 +43,18 @@ std::vector Split(const Context& dev_ctx, } std::vector out_meta; + std::vector out_meta_ptr; out_meta.reserve(out_number); + out_meta_ptr.reserve(out_number); std::vector result; result.reserve(out_number); for (size_t i = 0; i < out_number; ++i) { - auto dense_out = phi::Empty(dev_ctx); - MetaTensor tmp_meta(&dense_out); - - result.push_back(dense_out); - out_meta.push_back(&result.back()); + result.emplace_back(phi::Empty(dev_ctx)); + out_meta.emplace_back(&result.back()); + out_meta_ptr.push_back(&out_meta.back()); } - SplitInferMeta(x, num_or_sections, axis, &out_meta); + SplitInferMeta(x, num_or_sections, axis, out_meta_ptr); std::vector outs; outs.reserve(out_meta.size()); diff --git a/paddle/phi/kernels/uniform_random_kernel.h b/paddle/phi/kernels/uniform_random_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..5bba127278541e61b142dbeb7d00f10ed3f8437b --- /dev/null +++ b/paddle/phi/kernels/uniform_random_kernel.h @@ -0,0 +1,66 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
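A note on the Split helper change in split_kernel.h a few hunks above: the MetaTensor objects now live in one vector while SplitInferMeta receives a parallel vector of raw pointers into it. The pattern is only safe because the vectors are reserve()d before the loop, so emplace_back never reallocates and the stored addresses stay valid. A minimal stand-alone sketch of that idiom, using generic placeholder types rather than phi's:

#include <vector>

struct Meta { int dims = 0; };

// Consumer that mutates metas through pointers, standing in for SplitInferMeta.
void InferMeta(const std::vector<Meta*>& metas) {
  for (Meta* m : metas) m->dims = 42;
}

std::vector<Meta> BuildAndInfer(size_t n) {
  std::vector<Meta> metas;
  std::vector<Meta*> meta_ptrs;
  metas.reserve(n);      // reserve first: emplace_back must not reallocate,
  meta_ptrs.reserve(n);  // otherwise the pointers taken below would dangle
  for (size_t i = 0; i < n; ++i) {
    metas.emplace_back();
    meta_ptrs.push_back(&metas.back());
  }
  InferMeta(meta_ptrs);
  return metas;
}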
+ +#pragma once + +#include "paddle/phi/common/scalar_array.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/device_context.h" +#include "paddle/phi/core/selected_rows.h" + +namespace phi { + +template +void UniformRandomRawKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + DenseTensor* out); + +template +void UniformRandomKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + DenseTensor* out); + +template +void UniformRandomRawSRKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + int diag_num, + int diag_step, + float diag_val, + SelectedRows* out); + +template +void UniformRandomSRKernel(const Context& dev_ctx, + const ScalarArray& shape, + DataType dtype, + float min, + float max, + int seed, + SelectedRows* out); + +} // namespace phi diff --git a/paddle/phi/kernels/where_grad_kernel.h b/paddle/phi/kernels/where_grad_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..1a3c66ee6ed8403d0b453ed38d21e4beed02661c --- /dev/null +++ b/paddle/phi/kernels/where_grad_kernel.h @@ -0,0 +1,33 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void WhereGradKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + const DenseTensor& out_grad, + DenseTensor* x_grad, + DenseTensor* y_grad); + +} // namespace phi diff --git a/paddle/phi/kernels/where_kernel.h b/paddle/phi/kernels/where_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..254271ac9c7238c66d09ffe41d12e29fe8f23237 --- /dev/null +++ b/paddle/phi/kernels/where_kernel.h @@ -0,0 +1,31 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
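For reference, the where / where_grad pair declared in these headers is an elementwise select and its gradient; the CUDA kernels added earlier in this patch implement exactly this math per element. A scalar host-side sketch in plain C++, not the phi API:

// Forward: out = cond ? x : y
// Backward: dx = cond ? dout : 0,  dy = cond ? 0 : dout
template <typename T>
void WhereReference(const bool* cond, const T* x, const T* y, T* out, int n) {
  for (int i = 0; i < n; ++i) out[i] = cond[i] ? x[i] : y[i];
}

template <typename T>
void WhereGradReference(const bool* cond, const T* dout, T* dx, T* dy, int n) {
  for (int i = 0; i < n; ++i) {
    if (dx) dx[i] = cond[i] ? dout[i] : static_cast<T>(0);
    if (dy) dy[i] = cond[i] ? static_cast<T>(0) : dout[i];
  }
}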
+ +#pragma once + +#include "paddle/phi/backends/all_context.h" +#include "paddle/phi/backends/gpu/gpu_launch_config.h" +#include "paddle/phi/core/dense_tensor.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void WhereKernel(const Context& ctx, + const DenseTensor& condition, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +} // namespace phi diff --git a/paddle/phi/kernels/xpu/full_kernel.cc b/paddle/phi/kernels/xpu/full_kernel.cc index 574f4e991a260e8ebc250fe3f8461736dc3eb7f8..d43126d56e88c868d4e273aaf13bd71bc570d37c 100644 --- a/paddle/phi/kernels/xpu/full_kernel.cc +++ b/paddle/phi/kernels/xpu/full_kernel.cc @@ -59,7 +59,7 @@ void FullKernel(const Context& dev_ctx, const Scalar& val, DataType dtype, DenseTensor* out) { - out->ResizeAndAllocate(phi::make_ddim(shape.GetData())); + out->Resize(phi::make_ddim(shape.GetData())); FullValueXPU(dev_ctx, out, val.to()); } @@ -69,6 +69,7 @@ void FullLikeKernel(const Context& dev_ctx, const Scalar& val, DataType dtype, DenseTensor* out) { + dev_ctx.template Alloc(out); auto value = val.to(); using XPUInTDType = typename XPUTypeTrait::Type; using CommonType = typename std::common_type< diff --git a/paddle/phi/ops/compat/index_sample_sig.cc b/paddle/phi/ops/compat/index_sample_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..0d2aed68a72a5e3aa762b4adbcf7c6e39869b927 --- /dev/null +++ b/paddle/phi/ops/compat/index_sample_sig.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature IndexSampleGradOpArgumentMapping( + const ArgumentMappingContext& ctx) { + return KernelSignature("index_sample_grad", + {GradVarName("Out"), "X", "Index"}, + {}, + {GradVarName("X")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(index_sample_grad, + phi::IndexSampleGradOpArgumentMapping); diff --git a/paddle/phi/ops/compat/uniform_random_sig.cc b/paddle/phi/ops/compat/uniform_random_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..d06d4026f4f5f81a07eee2131b7df7808592132b --- /dev/null +++ b/paddle/phi/ops/compat/uniform_random_sig.cc @@ -0,0 +1,159 @@ +/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature UniformRandomOpArgumentMapping( + const ArgumentMappingContext& ctx) { + int diag_num = paddle::any_cast(ctx.Attr("diag_num")); + if (ctx.IsDenseTensorOutput("Out")) { + if (diag_num) { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature("uniform_random_raw", + {}, + {"ShapeTensorList", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } else { + const auto& shape = + paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature("uniform_random_raw", + {}, + {"ShapeTensor", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } else { + return KernelSignature("uniform_random_raw", + {}, + {"shape", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } + } + } else { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature( + "uniform_random", + {}, + {"ShapeTensorList", "dtype", "min", "max", "seed"}, + {"Out"}); + } else { + const auto& shape = + paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature("uniform_random", + {}, + {"ShapeTensor", "dtype", "min", "max", "seed"}, + {"Out"}); + } else { + return KernelSignature("uniform_random", + {}, + {"shape", "dtype", "min", "max", "seed"}, + {"Out"}); + } + } + } + } else if (ctx.IsSelectedRowsOutput("Out")) { + if (diag_num) { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature("uniform_random_raw_sr", + {}, + {"ShapeTensorList", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } else { + const auto& shape = + paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature("uniform_random_raw_sr", + {}, + {"ShapeTensor", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } else { + return KernelSignature("uniform_random_raw_sr", + {}, + {"shape", + "dtype", + "min", + "max", + "seed", + "diag_num", + "diag_step", + "diag_val"}, + {"Out"}); + } + } + } else { + if (ctx.InputSize("ShapeTensorList") > 0) { + return KernelSignature( + "uniform_random_sr", + {}, + {"ShapeTensorList", "dtype", "min", "max", "seed"}, + {"Out"}); + } else { + const auto& shape = + paddle::any_cast>(ctx.Attr("shape")); + if (ctx.HasInput("ShapeTensor") && shape.empty()) { + return KernelSignature("uniform_random_sr", + {}, + {"ShapeTensor", "dtype", "min", "max", "seed"}, + {"Out"}); + } else { + return KernelSignature("uniform_random_sr", + {}, + {"shape", "dtype", "min", "max", "seed"}, + {"Out"}); + } + } + } + } + return KernelSignature("unregistered", {}, {}, {}); +} +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(uniform_random, phi::UniformRandomOpArgumentMapping); diff --git a/paddle/phi/ops/compat/where_grad_sig.cc b/paddle/phi/ops/compat/where_grad_sig.cc new file mode 100644 index 0000000000000000000000000000000000000000..71984a26d35afd841654d82480c263799bdbf181 --- /dev/null +++ b/paddle/phi/ops/compat/where_grad_sig.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/phi/core/compat/op_utils.h" + +namespace phi { + +KernelSignature WhereGradOpArgumentMapping(const ArgumentMappingContext& ctx) { + return KernelSignature("where_grad", + {"Condition", "X", "Y", GradVarName("Out")}, + {}, + {GradVarName("X"), GradVarName("Y")}); +} + +} // namespace phi + +PD_REGISTER_ARG_MAPPING_FN(where_grad, phi::WhereGradOpArgumentMapping); diff --git a/paddle/phi/tests/common/test_backend.cc b/paddle/phi/tests/common/test_backend.cc index fa4ffc84bf587defae06deb18dae283a64206b75..5d6862c368c57bc7dbfba2bc9eab960818c25e05 100644 --- a/paddle/phi/tests/common/test_backend.cc +++ b/paddle/phi/tests/common/test_backend.cc @@ -44,6 +44,9 @@ TEST(Backend, OStream) { oss << phi::Backend::GPUDNN; EXPECT_EQ(oss.str(), "GPUDNN"); oss.str(""); + oss << phi::Backend::KPS; + EXPECT_EQ(oss.str(), "KPS"); + oss.str(""); try { oss << phi::Backend::NUM_BACKENDS; } catch (const std::exception& exception) { @@ -61,6 +64,7 @@ TEST(Backend, StringToBackend) { EXPECT_EQ(phi::Backend::NPU, pexp::StringToBackend("NPU")); EXPECT_EQ(phi::Backend::MKLDNN, pexp::StringToBackend("MKLDNN")); EXPECT_EQ(phi::Backend::GPUDNN, pexp::StringToBackend("GPUDNN")); + EXPECT_EQ(phi::Backend::KPS, pexp::StringToBackend("KPS")); EXPECT_EQ(static_cast( static_cast(phi::Backend::NUM_BACKENDS) + 1), pexp::StringToBackend("CustomBackend")); diff --git a/paddle/phi/tests/core/test_custom_kernel.cc b/paddle/phi/tests/core/test_custom_kernel.cc index d8e42c9d0d8b11d393dbb71776671d9cb50a7715..69922c055cbac5fe3c3947d0d8d63ee4a1262a4c 100644 --- a/paddle/phi/tests/core/test_custom_kernel.cc +++ b/paddle/phi/tests/core/test_custom_kernel.cc @@ -146,12 +146,10 @@ TEST(CustomKernel, custom_kernel_dot) { custom_fake_dot_kernels.end()); // 3.before register - auto& kernel_factory_instance = phi::KernelFactory::Instance(); auto& kernels = phi::KernelFactory::Instance().kernels(); - EXPECT_TRUE(!kernel_factory_instance.HasCompatiblePhiKernel(op_name)); + EXPECT_TRUE(kernels.find(op_name) == kernels.end()); - // mock fake_dot is supported by phi for HasCompatiblePhiKernel check while - // registering + // mock fake_dot is supported by phi for check while registering auto& fake_dot_kernels = kernels[op_name]; EXPECT_TRUE(fake_dot_kernels.find( @@ -196,7 +194,7 @@ TEST(CustomKernel, custom_kernel_dot) { fake_dot_kernels.end()); // 4.kernel select - auto kernel = kernel_factory_instance.SelectKernelOrThrowError( + auto kernel = phi::KernelFactory::Instance().SelectKernelOrThrowError( op_name, phi::KernelKey(backend, layout, phi::DataType::UINT8)); // 5.prepare parameters for kernel diff --git a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py index d5bc2e6b5307bf477c928380070644aca3c67f62..9d9fbd39a5767ffe72ad579df2d31ac66eda2234 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quant2_int8_mkldnn_pass.py @@ -426,9 +426,6 @@ class Quant2Int8MkldnnPass(object): graph = self._apply_pass(graph, 
'depthwise_conv_mkldnn_pass') graph = self._apply_pass(graph, 'conv_bn_fuse_pass') graph = self._apply_pass(graph, 'conv_eltwiseadd_bn_fuse_pass') - graph = self._apply_pass(graph, 'conv_affine_channel_fuse_pass') - graph = self._apply_pass(graph, - 'conv_eltwiseadd_affine_channel_fuse_pass') graph = self._apply_pass(graph, 'conv_transpose_bn_fuse_pass') graph = self._apply_pass(graph, 'conv_transpose_eltwiseadd_bn_fuse_pass') diff --git a/python/paddle/fluid/dygraph/parallel.py b/python/paddle/fluid/dygraph/parallel.py index ddb86848f842a85acc12dca1044a594c484c06fe..0049f387b707fc853699474b34235f177d4672af 100644 --- a/python/paddle/fluid/dygraph/parallel.py +++ b/python/paddle/fluid/dygraph/parallel.py @@ -560,13 +560,19 @@ class DataParallel(layers.Layer): strategy=None, comm_buffer_size=25, last_comm_buffer_size=1, - find_unused_parameters=False): + find_unused_parameters=False, + process_group=None, + gradient_as_buffer_view=False, + static_graph=False): super(DataParallel, self).__init__(layers.full_name() + "_data_parallel") self._layers = layers self.find_unused_parameters = find_unused_parameters self.grad_need_sync = True + self.process_group = process_group + self.gradient_as_buffer_view = gradient_as_buffer_view + self.static_graph = static_graph # NOTE(chenweihang): The ParallelStrategy here is not strictly a strategy. # It just stores some environment variables, which can be constructed by diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 2361bd270623873384d3cea8cd11eb10a78ec116..7d64cf7bd894553de84293295dc737255e803613 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -590,7 +590,7 @@ foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach(TEST_OP) py_test_modules(test_adam_op_multi_thread MODULES test_adam_op ENVS FLAGS_inner_op_parallelism=4) -if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL) +if (WITH_GPU OR WITH_XPU OR WITH_ASCEND OR WITH_ASCEND_CL OR APPLE) py_test_modules(test_warpctc_op MODULES test_warpctc_op) set_tests_properties(test_warpctc_op PROPERTIES TIMEOUT 120) endif() diff --git a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py index e0529c5d5f82cfccb1fd47705b2a4cda39c17827..00d2a1f71d6bd3f1d8ce3b8981be2b4732163340 100644 --- a/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py +++ b/python/paddle/fluid/tests/unittests/distributed_fused_lamb_test_base.py @@ -144,6 +144,11 @@ def run_model(use_distributed_lamb, use_fp16, use_master_param_norm, **kwargs): grad_clip = kwargs.get('grad_clip', None) clip_after_allreduce = kwargs.get('clip_after_allreduce', True) + parameters = [p.name for p in main.all_parameters()] + exclude_fn = lambda var: var.name in parameters[::4] + kwargs['exclude_from_weight_decay_fn'] = exclude_fn + kwargs['lamb_weight_decay'] = 0.1 + if use_distributed_lamb: optimizer_class = DistributedFusedLamb kwargs = dict(kwargs) diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py deleted file mode 100644 index 5afaf08eec3b1324df312920bd9e8c8970fd7dbc..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_affine_channel_fuse_pass.py +++ /dev/null @@ -1,160 +0,0 
@@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from auto_scan_test import PassAutoScanTest, IgnoreReasons -from program_config import TensorConfig, ProgramConfig, OpConfig -import numpy as np -import paddle.inference as paddle_infer -from functools import partial -from typing import Optional, List, Callable, Dict, Any, Set -import unittest - -import hypothesis -from hypothesis import given, settings, seed, example, assume, reproduce_failure -import hypothesis.strategies as st - - -class TestConvAffineChannelFusePass(PassAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: - return True - - def sample_program_config(self, draw): - padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) - groups = draw(st.integers(min_value=1, max_value=3)) - data_format = draw(st.sampled_from(["NCHW", "NHWC"])) - axis = draw(st.sampled_from([1])) - filter_channel = draw(st.integers(min_value=1, max_value=16)) * 4 - filter_size = draw(st.integers(min_value=1, max_value=4)) - in_channel = groups * filter_channel - out_channel_factor = draw(st.integers(min_value=1, max_value=16)) * 4 - out_channel = groups * out_channel_factor - batch_size = draw(st.integers(min_value=1, max_value=4)) - dilations = draw( - st.lists( - st.integers( - min_value=1, max_value=2), min_size=2, max_size=2)) - paddings = draw( - st.lists( - st.integers( - min_value=0, max_value=2), min_size=2, max_size=2)) - strides = draw( - st.lists( - st.integers( - min_value=1, max_value=2), min_size=2, max_size=2)) - has_bias = draw(st.booleans()) - - x_shape = [ - batch_size, in_channel, 64, 64 - ] if data_format == "NCHW" else [batch_size, 64, 64, in_channel] - w_shape = [out_channel, filter_channel, filter_size, filter_size] - scale_shape = [out_channel] - bias_shape = [out_channel] - - def generate_input(): - return np.random.random(x_shape).astype(np.float32) - - def generate_weight(): - return np.random.random(w_shape).astype(np.float32) - - def generate_bias(): - return np.random.random(bias_shape).astype(np.float32) - - def generate_scale_bias(): - return np.random.random(bias_shape).astype(np.float32) - - conv2d_op = OpConfig( - "conv2d", - inputs={ - "Input": ["input_data"], - "Filter": ["conv2d_weight"], - }, - outputs={"Output": ["conv_output"]}, - data_format=data_format, - dilations=dilations, - padding_algorithm=padding_algorithm, - groups=groups, - paddings=paddings, - strides=strides, - has_bias=has_bias, - is_test=True) - ac_op = OpConfig( - "affine_channel", - inputs={ - "X": ["conv_output"], - "Scale": ["affine_channel_scale"], - "Bias": ["affine_channel_bias"] - }, - outputs={"Out": ["affine_channel_ouput"]}, - data_layout=data_format) - if has_bias == True: - conv2d_op.inputs["Bias"] = ["conv2d_bias"] - ops = [conv2d_op, ac_op] - - program_config = ProgramConfig( - ops=ops, - inputs={ - "input_data": TensorConfig(data_gen=partial(generate_input)), - }, - weights={ - "conv2d_weight": - 
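For readers unfamiliar with the op being fused in the deleted test above: affine_channel applies a per-channel scale and bias. A rough numpy equivalent (illustrative only, NCHW layout as in the test's default):

    import numpy as np

    def affine_channel_nchw(x, scale, bias):
        # x: [N, C, H, W]; scale, bias: [C] broadcast over the channel axis
        return x * scale.reshape(1, -1, 1, 1) + bias.reshape(1, -1, 1, 1)

    x = np.random.rand(2, 16, 8, 8).astype(np.float32)
    scale = np.random.rand(16).astype(np.float32)
    bias = np.random.rand(16).astype(np.float32)
    y = affine_channel_nchw(x, scale, bias)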
TensorConfig(data_gen=partial(generate_weight)), - "affine_channel_scale": - TensorConfig(data_gen=partial(generate_scale_bias)), - "affine_channel_bias": - TensorConfig(data_gen=partial(generate_scale_bias)), - }, - outputs=["affine_channel_ouput"]) - if has_bias == True: - program_config.weights["conv2d_bias"] = TensorConfig( - data_gen=partial(generate_bias)) - return program_config - - def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_gpu=True) - yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) - - config = self.create_inference_config(use_mkldnn=True) - yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) - - def add_ignore_pass_case(self): - # If the problem has been fixed, the judgment - # in is_program_valid needs to be deleted!!! - def teller1(program_config, predictor_config): - if program_config.ops[0].attrs['data_format'] == "NHWC": - return True - return False - - # mkldnn Output has diff with bias! - def teller2(program_config, predictor_config): - return predictor_config.mkldnn_enabled() and program_config.ops[ - 0].attrs['has_bias'] == True - - self.add_ignore_check_case( - teller1, IgnoreReasons.PASS_ACCURACY_ERROR, - "The output format of conv2d is wrong when data_format attribute is NHWC, \ - because currently its fused op (Conv2DFusion) only supports data format of channel first (NCHW)." - ) - - self.add_ignore_check_case( - teller2, IgnoreReasons.PASS_ACCURACY_ERROR, - "Currently mkldnn Output has diff with bias!") - - def test(self): - self.run_and_statis( - quant=False, - passes=["conv_affine_channel_fuse_pass"], ) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_affine_channel_fuse_pass.py b/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_affine_channel_fuse_pass.py deleted file mode 100644 index a8bfdb79ca1daa5caa0cffb945fee76fdef36c36..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/unittests/ir/inference/test_conv_eltwiseadd_affine_channel_fuse_pass.py +++ /dev/null @@ -1,183 +0,0 @@ -# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
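The pass exercised by the deleted test rewrites conv2d + affine_channel into conv2d + elementwise_add, which is why the expected op list after fusion is ['conv2d', 'elementwise_add']. A sketch of the underlying algebra (illustrative only, ignoring groups and the optional conv bias):

    import numpy as np

    # scale * conv(W, x) + bias == conv(scale * W, x) + bias, with scale applied
    # per output channel, so the scale folds into the filter and only a per-channel
    # bias add (an elementwise_add) remains after the conv.
    W = np.random.rand(32, 16, 3, 3).astype(np.float32)   # [out_c, in_c, kh, kw]
    scale = np.random.rand(32).astype(np.float32)
    W_folded = W * scale.reshape(-1, 1, 1, 1)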
- -from auto_scan_test import PassAutoScanTest, IgnoreReasons -from program_config import TensorConfig, ProgramConfig, OpConfig -import numpy as np -import paddle.inference as paddle_infer -from functools import partial -from typing import Optional, List, Callable, Dict, Any, Set -import unittest - -import hypothesis -from hypothesis import given, settings, seed, example, assume -import hypothesis.strategies as st - - -class TestConvEltwiseAddAffineChannelFusePass(PassAutoScanTest): - def is_program_valid(self, program_config: ProgramConfig) -> bool: - attrs = [ - program_config.ops[i].attrs - for i in range(len(program_config.ops)) - ] - - if attrs[0]['data_format'] == "NHWC" and attrs[1]['axis'] != 3: - return False - - return True - - def sample_program_config(self, draw): - padding_algorithm = draw(st.sampled_from(["EXPLICIT", "SAME", "VALID"])) - groups = draw(st.integers(min_value=1, max_value=3)) - data_format = draw(st.sampled_from(["NCHW", "NHWC"])) - axis = draw(st.sampled_from([1])) - filter_channel = draw(st.integers(min_value=1, max_value=16)) * 4 - filter_size = draw(st.integers(min_value=1, max_value=4)) - in_channel = groups * filter_channel - out_channel_factor = draw(st.integers(min_value=1, max_value=16)) * 4 - out_channel = groups * out_channel_factor - batch_size = draw(st.integers(min_value=1, max_value=4)) - dilations = draw( - st.lists( - st.integers( - min_value=1, max_value=2), min_size=2, max_size=2)) - paddings = draw( - st.lists( - st.integers( - min_value=0, max_value=2), min_size=2, max_size=2)) - strides = draw( - st.lists( - st.integers( - min_value=1, max_value=2), min_size=2, max_size=2)) - has_bias = draw(st.booleans()) - - x_shape = [ - batch_size, in_channel, 64, 64 - ] if data_format == "NCHW" else [batch_size, 64, 64, in_channel] - w_shape = [out_channel, filter_channel, filter_size, filter_size] - scale_shape = [out_channel] - bias_shape = [out_channel] - - def generate_input(): - return np.random.random(x_shape).astype(np.float32) - - def generate_weight(): - return np.random.random(w_shape).astype(np.float32) - - def generate_bias(): - return np.random.random(bias_shape).astype(np.float32) - - def generate_scale_bias(): - return np.random.random(bias_shape).astype(np.float32) - - conv2d_op = OpConfig( - "conv2d", - inputs={ - "Input": ["input_data"], - "Filter": ["conv2d_weight"], - }, - outputs={"Output": ["conv_output"]}, - data_format=data_format, - dilations=dilations, - padding_algorithm=padding_algorithm, - groups=groups, - paddings=paddings, - strides=strides, - has_bias=has_bias, - is_test=True) - eltwise_op = OpConfig( - "elementwise_add", - inputs={"X": ["conv_output"], - "Y": ["conv2d_bias"]}, - outputs={"Out": ["elementwise_output"]}, - axis=axis) - ac_op = OpConfig( - "affine_channel", - inputs={ - "X": ["elementwise_output"], - "Scale": ["affine_channel_scale"], - "Bias": ["affine_channel_bias"] - }, - outputs={"Out": ["affine_channel_ouput"]}, - data_layout=data_format) - if has_bias == True: - conv2d_op.inputs["Bias"] = ["conv2d_bias"] - ops = [conv2d_op, eltwise_op, ac_op] - program_config = ProgramConfig( - ops=ops, - inputs={ - "input_data": TensorConfig(data_gen=partial(generate_input)), - }, - weights={ - "conv2d_weight": - TensorConfig(data_gen=partial(generate_weight)), - "conv2d_bias": TensorConfig(data_gen=partial(generate_bias)), - "affine_channel_scale": - TensorConfig(data_gen=partial(generate_scale_bias)), - "affine_channel_bias": - TensorConfig(data_gen=partial(generate_scale_bias)), - }, - 
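The is_program_valid rule above (NHWC programs with elementwise_add axis != 3 are rejected) comes down to where the per-channel bias broadcasts; a small illustration:

    import numpy as np

    nhwc = np.zeros([2, 8, 8, 16], dtype=np.float32)
    bias = np.zeros([16], dtype=np.float32)
    # In NHWC the channel is the last axis, so a [C] bias must be added along
    # axis 3; adding it along axis 1 (the NCHW channel position) cannot broadcast.
    out = nhwc + bias.reshape(1, 1, 1, -1)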
outputs=["affine_channel_ouput"]) - return program_config - - def sample_predictor_configs(self, program_config): - config = self.create_inference_config(use_gpu=True) - yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) - - config = self.create_inference_config(use_mkldnn=True) - yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) - - # TRT - config = self.create_trt_inference_config() - config.enable_tensorrt_engine( - workspace_size=1 << 20, - max_batch_size=4, - min_subgraph_size=1, - precision_mode=paddle_infer.PrecisionType.Float32, - use_static=False, - use_calib_mode=False) - yield config, ['conv2d', 'elementwise_add'], (1e-4, 1e-4) - - def add_ignore_pass_case(self): - # If the problem has been fixed, the judgment - # in is_program_valid needs to be deleted!!! - def teller1(program_config, predictor_config): - if program_config.ops[0].attrs['data_format'] == "NHWC": - return True - return False - - # mkldnn Output has diff with bias! - def teller2(program_config, predictor_config): - return predictor_config.mkldnn_enabled() and program_config.ops[ - 0].attrs['has_bias'] == True - - self.add_ignore_check_case( - teller1, IgnoreReasons.PASS_ACCURACY_ERROR, - "The output format of conv2d is wrong when data_format attribute is NHWC, \ - it will trigger Broadcast dimension mismatch bug \ - when data_format attribute is NHWC and axis of eltwise op is 1 for this pass." - ) - - self.add_ignore_check_case( - teller2, IgnoreReasons.PASS_ACCURACY_ERROR, - "Currently mkldnn Output has diff with bias!") - - def test(self): - self.run_and_statis( - quant=False, - passes=["conv_eltwiseadd_affine_channel_fuse_pass"], ) - - -if __name__ == "__main__": - unittest.main() diff --git a/python/paddle/fluid/tests/unittests/op_test.py b/python/paddle/fluid/tests/unittests/op_test.py index 848ebae0706e3c62e0e0e6579cd3c04f02d43be4..628791afef5f66cd8eeddae7685d7a7ffdb6dd08 100644 --- a/python/paddle/fluid/tests/unittests/op_test.py +++ b/python/paddle/fluid/tests/unittests/op_test.py @@ -482,7 +482,12 @@ class OpTest(unittest.TestCase): op_proto = OpProtoHolder.instance().get_op_proto(self.op_type) "infer datatype from inputs and outputs for this test case" - self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) + if self.is_bfloat16_op(): + self.dtype = np.uint16 + self.__class__.dtype = self.dtype + self.output_dtype = np.uint16 + else: + self.infer_dtype_from_inputs_outputs(self.inputs, self.outputs) inputs = append_input_output(block, op_proto, self.inputs, True, self.dtype) outputs = append_input_output(block, op_proto, self.outputs, False, @@ -1135,7 +1140,7 @@ class OpTest(unittest.TestCase): else: atol = 2 else: - atol = 1e-2 + atol = 1e-1 if no_check_set is not None: if self.op_type not in no_check_set_white_list.no_check_set_white_list: diff --git a/python/paddle/fluid/tests/unittests/test_diff_op.py b/python/paddle/fluid/tests/unittests/test_diff_op.py index 345dad54132bc8c2d8520bc86c3276f651893e99..1ae780f488d2dc6bf37f88505a67723ea867dd94 100644 --- a/python/paddle/fluid/tests/unittests/test_diff_op.py +++ b/python/paddle/fluid/tests/unittests/test_diff_op.py @@ -55,7 +55,7 @@ class TestDiffOp(unittest.TestCase): def test_dygraph(self): for place in self.places: - paddle.disable_static(place) + paddle.disable_static() x = paddle.to_tensor(self.input, place=place) if self.prepend is not None: self.prepend = paddle.to_tensor(self.prepend, place=place) diff --git a/python/paddle/fluid/tests/unittests/test_gather_op.py b/python/paddle/fluid/tests/unittests/test_gather_op.py 
index 83b39a62f152d2c7e02abe313ffeeafe017d033d..978a3d86d882a2e0d59e8244a956f5c97a4bd9ef 100644 --- a/python/paddle/fluid/tests/unittests/test_gather_op.py +++ b/python/paddle/fluid/tests/unittests/test_gather_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid as fluid from paddle.framework import core @@ -117,6 +117,39 @@ class TestCase6(TestGatherOp): self.index_type = "int32" +class TestGatherBF16Op(OpTest): + def setUp(self): + self.op_type = "gather" + self.dtype = np.uint16 + self.config() + xnp = np.random.random(self.x_shape).astype(np.float32) + axis_np = np.array(self.axis).astype(self.axis_type) + index_np = np.array(self.index).astype(self.index_type) + self.inputs = { + 'X': convert_float_to_uint16(xnp), + 'Index': index_np, + 'Axis': axis_np + } + out = gather_numpy(self.inputs['X'], index_np, axis_np[0]) + self.outputs = {'Out': out} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', numeric_grad_delta=0.5) + + def config(self): + """ + For multi-dimension input + """ + self.x_shape = (3, 88, 3) + self.index = [1, 3, 5] + self.index_type = "int32" + self.axis = [1] + self.axis_type = "int32" + + class TestGatherOp1(OpTest): def setUp(self): self.op_type = "gather" diff --git a/python/paddle/fluid/tests/unittests/test_imperative_group.py b/python/paddle/fluid/tests/unittests/test_imperative_group.py index f96358096516e67af6269c321a2722c500489959..89535797ed09890df44939efbc531df53d710304 100644 --- a/python/paddle/fluid/tests/unittests/test_imperative_group.py +++ b/python/paddle/fluid/tests/unittests/test_imperative_group.py @@ -26,159 +26,149 @@ import paddle.fluid.dygraph as dygraph from paddle.fluid.dygraph.nn import Linear import paddle.fluid.core as core from paddle.fluid.optimizer import SGDOptimizer - - -class MLP(fluid.Layer): - def __init__(self, param_attr=None, bias_attr=None): - super(MLP, self).__init__() - - self._linear1 = Linear(784, 10) - self._linear2 = Linear(10, 10) - - def forward(self, inputs): - y = self._linear1(inputs) - y = self._linear2(y) - return y +from paddle.fluid.framework import _test_eager_guard class TestDataParallelGroup(unittest.TestCase): - def create_varbase(self, dtype, shape, - type=core.VarDesc.VarType.LOD_TENSOR): - return core.VarBase(dtype, shape, "", type, True) + def create_varbase(self, dtype, shape): + return paddle.rand(shape=shape, dtype=dtype) + + def assign_group_by_size(self, *args): + return core.assign_group_by_size(*args) def test_construct_group0(self): # one dtype & one limit capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP32, [2, 100])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - res = core.assign_group_by_size(var_list, [False, False, False, False], + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 100])) + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 25])) + res = self.assign_group_by_size(var_list, [False, False, False, False], [400]) self.assertEqual([[0], [1], [2], [3]], res) def test_construct_group1(self): # multi dtype & one limit 
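For reference, the gather computation exercised by TestGatherBF16Op above, shown here in float32 with smaller shapes for readability (axis taken from the Axis input):

    import numpy as np

    x = np.arange(24, dtype=np.float32).reshape(2, 4, 3)
    index = np.array([1, 3], dtype=np.int32)
    out = np.take(x, index, axis=1)   # select rows 1 and 3 along axis 1 -> shape (2, 2, 3)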
capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - res = core.assign_group_by_size( + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [False, False, False, False, False, False], [400]) self.assertEqual([[0, 2], [1, 3], [4], [5]], res) def test_construct_group2(self): # one dtype & multi limit capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - res = core.assign_group_by_size(var_list, [False, False, False, False], + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 50])) + res = self.assign_group_by_size(var_list, [False, False, False, False], [400, 800]) self.assertEqual([[0], [1, 2], [3]], res) def test_construct_group3(self): # multi dtype & multi limit capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - res = core.assign_group_by_size( + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [False, False, False, False, False, False], [200, 400]) self.assertEqual([[0], [1], [2, 4], [3, 5]], res) def test_construct_group4(self): # multi dtype & zero limit capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - res = core.assign_group_by_size( + var_list.append(self.create_varbase("float32", [1, 50])) + 
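A worked reading of the grouping cases above, assuming (as the expected outputs suggest) that the capability limits are byte sizes: a float32 [2, 50] variable occupies 2*50*4 = 400 bytes, so with a single 400-byte limit each such variable closes its own group (test_construct_group0), while with limits [400, 800] the second bucket can hold two of them (test_construct_group2). A simplified model that ignores dtype, sparse flags and tensor_indices:

    # Illustrative bookkeeping only; mirrors the expected [[0], [1, 2], [3]]
    # result of test_construct_group2 with limits [400, 800].
    sizes = [2 * 50 * 4] * 4          # four float32 [2, 50] tensors -> 400 bytes each
    limits = [400, 800]
    groups, cur, limit_idx, acc = [], [], 0, 0
    for i, s in enumerate(sizes):
        cur.append(i)
        acc += s
        if acc >= limits[min(limit_idx, len(limits) - 1)]:
            groups.append(cur)
            cur, acc, limit_idx = [], 0, limit_idx + 1
    if cur:
        groups.append(cur)
    print(groups)   # [[0], [1, 2], [3]]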
var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [False, False, False, False, False, False], [0]) self.assertEqual([[0], [1], [2], [3], [4], [5]], res) def test_construct_group5(self): # multi dtype & infinite capability var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - res = core.assign_group_by_size( + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [False, False, False, False, False, False], [10000]) self.assertEqual([[0, 2, 4], [1, 3, 5]], res) def test_construct_group6(self): # multi dtype & limit capability & multi tensor type var_list = [] - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP32, [1, 50], - core.VarDesc.VarType.SELECTED_ROWS)) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP64, [1, 25], - core.VarDesc.VarType.SELECTED_ROWS)) - res = core.assign_group_by_size( + var_list.append(self.create_varbase( + "float32", + [1, 50], )) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [True, False, False, False, False, True], [400]) self.assertEqual([[0], [1, 3], [2, 4], [5]], res) def test_construct_group7(self): # multi dtype & multi limit capability & multi tensor type var_list = [] - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP32, [1, 50], - core.VarDesc.VarType.SELECTED_ROWS)) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP64, [1, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [1, 50])) - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP64, [1, 25], - core.VarDesc.VarType.SELECTED_ROWS)) - res = core.assign_group_by_size( + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + 
var_list.append(self.create_varbase("float64", [1, 25])) + var_list.append(self.create_varbase("float32", [1, 50])) + var_list.append(self.create_varbase("float64", [1, 25])) + res = self.assign_group_by_size( var_list, [True, False, False, False, False, True], [200, 400]) self.assertEqual([[0], [1], [2], [3], [4], [5]], res) def test_construct_group8(self): # one dtype & one limit capability & have tensor_indices var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP32, [2, 100])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 50])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - res = core.assign_group_by_size(var_list, [False, False, False, False], + var_list.append(self.create_varbase("float32", [2, 25])) + var_list.append(self.create_varbase("float32", [2, 100])) + var_list.append(self.create_varbase("float32", [2, 50])) + var_list.append(self.create_varbase("float32", [2, 25])) + res = self.assign_group_by_size(var_list, [False, False, False, False], [400], [3, 0, 1, 2]) self.assertEqual([[3, 0], [1], [2]], res) def test_construct_group9(self): # one dtype & one limit capability & have tensor_indices var_list = [] - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - var_list.append(self.create_varbase(core.VarDesc.VarType.FP32, [2, 25])) - var_list.append( - self.create_varbase(core.VarDesc.VarType.FP32, [2, 1000])) - res = core.assign_group_by_size(var_list, [False, False, False, True], + var_list.append(self.create_varbase("float32", [2, 25])) + var_list.append(self.create_varbase("float32", [2, 25])) + var_list.append(self.create_varbase("float32", [2, 25])) + var_list.append(self.create_varbase("float32", [2, 1000])) + res = self.assign_group_by_size(var_list, [False, False, False, True], [300], [1, 0, 2, 3]) self.assertEqual([[1, 0], [3], [2]], res) +class TestDataParallelGroupEager(TestDataParallelGroup): + def create_varbase(self, dtype, shape): + with _test_eager_guard(): + return paddle.rand(shape=shape, dtype=dtype) + + def assign_group_by_size(self, *args): + return core.eager_assign_group_by_size(*args) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py index 7dd310d2b88a90e09ba5ceedb541da4be263e559..ca9a489c7496f33cb084f1cd43158cebc7a1add6 100644 --- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py @@ -375,6 +375,53 @@ class TestFP16ScaleBiasLayerNorm(unittest.TestCase): assert_equal(b_g_np_1, b_g_np_2) +class TestBF16ScaleBiasLayerNorm(unittest.TestCase): + def check_main(self, x_np, weight_np, bias_np, dtype): + paddle.disable_static() + + x = paddle.to_tensor(x_np) + weight = paddle.to_tensor(weight_np) + bias = paddle.to_tensor(bias_np) + + if dtype == "bfloat16": + x = x.cast(paddle.fluid.core.VarDesc.VarType.BF16) + + x.stop_gradient = False + weight.stop_gradient = False + bias.stop_gradient = False + + y = F.layer_norm(x, x.shape[1:], weight, bias) + x_g, w_g, b_g = paddle.grad(y, [x, weight, bias]) + + y_np = y.cast('float32').numpy() + x_g_np = x_g.cast('float32').numpy() + w_g_np = w_g.cast('float32').numpy() + b_g_np = b_g.cast('float32').numpy() + + paddle.enable_static() + return y_np, x_g_np, w_g_np, b_g_np + + 
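Two more behaviours visible in the cases above: variables flagged sparse (the True entries in the is_sparse list) each get their own group, and when tensor_indices is given the variables are bucketed in that order. A worked check of test_construct_group8 under the same byte-size assumption:

    # sizes in bytes, visited in tensor_indices order [3, 0, 1, 2], limit 400:
    sizes = {0: 2 * 25 * 4, 1: 2 * 100 * 4, 2: 2 * 50 * 4, 3: 2 * 25 * 4}
    # 200 + 200 = 400 closes the first bucket -> [3, 0]; 800 alone -> [1]; 400 -> [2]
    # which matches the expected [[3, 0], [1], [2]].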
def test_main(self): + if (not core.is_compiled_with_cuda()) or (core.cudnn_version() < 8100): + return + x_np = np.random.random([10, 20]).astype('float32') + weight_np = np.random.random([20]).astype('float32') + bias_np = np.random.random([20]).astype('float32') + + y_np_1, x_g_np_1, w_g_np_1, b_g_np_1 = self.check_main( + x_np, weight_np, bias_np, 'float32') + y_np_2, x_g_np_2, w_g_np_2, b_g_np_2 = self.check_main( + x_np, weight_np, bias_np, 'bfloat16') + + def assert_equal(x, y): + self.assertTrue(np.allclose(x, y, atol=1.e-1)) + + assert_equal(y_np_1, y_np_2) + assert_equal(x_g_np_1, x_g_np_2) + assert_equal(w_g_np_1, w_g_np_2) + assert_equal(b_g_np_1, b_g_np_2) + + class TestGetSetKeepLayerNormScaleBiasFP32Flag(unittest.TestCase): def test_main(self): self.assertTrue(_keep_layer_norm_scale_bias_to_fp32()) diff --git a/python/paddle/fluid/tests/unittests/test_norm_all.py b/python/paddle/fluid/tests/unittests/test_norm_all.py index b20305b78efe2dfe73e069e13f0d0eca3bb84057..575bc653618a583e883783cd1fffe1db371eccff 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_all.py +++ b/python/paddle/fluid/tests/unittests/test_norm_all.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid as fluid import paddle.fluid.core as core @@ -282,6 +282,80 @@ class TestPnormOpFP161(TestPnormOpFP16): self.asvector = True +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestPnormBF16Op(OpTest): + def setUp(self): + self.op_type = "p_norm" + self.init_test_case() + self.x = (np.random.random(self.shape) + 0.5).astype(np.float32) + self.norm = p_norm(self.x, self.axis, self.porder, self.keepdim, + self.asvector) + self.gradient = self.calc_gradient() + self.inputs = {'X': convert_float_to_uint16(self.x)} + self.attrs = { + 'epsilon': self.epsilon, + 'axis': self.axis, + 'keepdim': self.keepdim, + 'porder': float(self.porder), + 'asvector': self.asvector + } + self.outputs = {'Out': convert_float_to_uint16(self.norm)} + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place, atol=1e-3) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', user_defined_grads=self.gradient) + + def init_test_case(self): + self.shape = [2, 3, 4, 5] + self.axis = 1 + self.epsilon = 1e-12 + self.porder = 2.0 + self.keepdim = False + self.dtype = np.uint16 + self.asvector = False + + def calc_gradient(self): + self.attrs = { + 'epsilon': self.epsilon, + 'axis': self.axis, + 'keepdim': self.keepdim, + 'porder': float(self.porder), + 'asvector': self.asvector + } + x = self.x + porder = self.attrs["porder"] + axis = self.attrs["axis"] + asvector = self.attrs["asvector"] + x_dtype = x.dtype + x = x.astype(np.float32) if x.dtype == np.float16 else x + if porder == 0: + grad = np.zeros(x.shape).astype(x.dtype) + elif porder in [float("inf"), float("-inf")]: + norm = p_norm( + x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector) + x_abs = np.abs(x) + grad = np.sign(x) + grad[x_abs != norm] = 0.0 + else: + norm = p_norm( + x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector) + grad = np.power(norm, 1 - porder) * np.power( + np.abs(x), porder - 1) * np.sign(x) + + numel = 1 + for s in x.shape: + numel *= s + divisor = numel if asvector else x.shape[axis] + numel /= divisor + return [grad.astype(x_dtype) * 1 / 
numel] + + def run_fro(self, p, axis, shape_x, dtype, keep_dim, check_dim=False): with fluid.program_guard(fluid.Program()): data = fluid.data(name="X", shape=shape_x, dtype=dtype) diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index faa67e1d6da8f44bf1a09036d0d1dc9e49ff462c..d246356b4ec75a96162d0b37d4d1cbfab9493440 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest, skip_check_grad_ci +from op_test import OpTest, skip_check_grad_ci, convert_float_to_uint16 import paddle import paddle.fluid.core as core import paddle.fluid as fluid @@ -61,6 +61,37 @@ class TestSumOp_fp16(OpTest): self.check_grad(['X'], 'Out', user_defined_grads=self.gradient) +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestSumOp_bf16(OpTest): + def setUp(self): + np.random.seed(100) + self.op_type = "reduce_sum" + self.dtype = np.uint16 + self.x = np.random.uniform(0, 0.1, (2, 5, 10)).astype(np.float32) + self.attrs = {'dim': [0, 1, 2]} + self.out = self.x.sum(axis=tuple(self.attrs['dim'])) + self.gradient = self.calc_gradient() + + self.inputs = {'X': convert_float_to_uint16(self.x)} + self.outputs = {'Out': convert_float_to_uint16(self.out)} + self.gradient = self.calc_gradient() + + def test_check_output(self): + place = core.CUDAPlace(0) + self.check_output_with_place(place) + + def test_check_grad(self): + place = core.CUDAPlace(0) + self.check_grad_with_place( + place, ['X'], 'Out', user_defined_grads=self.gradient) + + def calc_gradient(self): + x = self.x + grad = np.ones(x.shape, dtype=x.dtype) + return [grad] + + class TestSumOp_fp16_withInt(OpTest): def setUp(self): self.op_type = "reduce_sum" diff --git a/python/paddle/fluid/tests/unittests/test_scale_op.py b/python/paddle/fluid/tests/unittests/test_scale_op.py index c1ce032f506127e495dfd3231471fdabe6dfa26b..d432b8057f624831f40b8cd48a0ede694f8d0a55 100644 --- a/python/paddle/fluid/tests/unittests/test_scale_op.py +++ b/python/paddle/fluid/tests/unittests/test_scale_op.py @@ -16,7 +16,7 @@ from __future__ import print_function import unittest import numpy as np -from op_test import OpTest +from op_test import OpTest, convert_float_to_uint16 import paddle import paddle.fluid as fluid import paddle.fluid.core as core @@ -153,6 +153,23 @@ class TestScaleFp16Op(TestScaleOp): place, ["X"], "Out", max_relative_error=0.05) +class TestScaleBF16Op(OpTest): + def setUp(self): + self.op_type = "scale" + self.dtype = np.uint16 + self.attrs = {'scale': -2.3} + x = np.random.random((10, 10)).astype(np.float32) + out = x * np.float32(self.attrs['scale']) + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': convert_float_to_uint16(out)} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['X'], 'Out', numeric_grad_delta=0.8) + + @unittest.skipIf(not core.is_compiled_with_cuda(), "core is not compiled with CUDA") class TestScaleFp16OpSelectedRows(TestScaleOpSelectedRows): diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index eddccd4ff24f1a8b7c23bda3da813bc87c199cbe..7040145a76833588f0a5738b1b09e10061497e8c 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ 
b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -298,6 +298,32 @@ def create_test_sum_fp16_class(parent): globals()[cls_name] = TestSumFp16Case +#----------- test bf16 ----------- +class TestSumBF16Op(OpTest): + def setUp(self): + self.op_type = "sum" + self.init_kernel_type() + x0 = np.random.random((3, 40)).astype(np.float32) + x1 = np.random.random((3, 40)).astype(np.float32) + x2 = np.random.random((3, 40)).astype(np.float32) + y = x0 + x1 + x2 + self.inputs = { + "X": [("x0", convert_float_to_uint16(x0)), + ("x1", convert_float_to_uint16(x1)), + ("x2", convert_float_to_uint16(x2))] + } + self.outputs = {'Out': convert_float_to_uint16(y)} + + def init_kernel_type(self): + self.dtype = np.uint16 + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(['x0'], 'Out', numeric_grad_delta=0.5) + + class API_Test_Add_n(unittest.TestCase): def test_api(self): with fluid.program_guard(fluid.Program(), fluid.Program()): diff --git a/python/paddle/incubate/optimizer/distributed_fused_lamb.py b/python/paddle/incubate/optimizer/distributed_fused_lamb.py index e7c3cfbb7b93b5deffb95e9ee175a7a03d1aaf7f..cc33a909632766e81bfabdb73cc3a1e177c1fe1a 100644 --- a/python/paddle/incubate/optimizer/distributed_fused_lamb.py +++ b/python/paddle/incubate/optimizer/distributed_fused_lamb.py @@ -171,10 +171,7 @@ class DistributedFusedLamb(Optimizer): moment2.is_distributed = True beta1pow = self._create_persistable_var('beta1pow') beta2pow = self._create_persistable_var('beta2pow') - fused_indices = self._create_persistable_var( - 'fused_indices', dtype='int32') - weight_decay = self._create_persistable_var('weight_decay') - weight_decay.is_distributed = True + param_info = self._create_persistable_var('param_info', dtype='int32') param_info.is_distributed = True @@ -189,17 +186,20 @@ class DistributedFusedLamb(Optimizer): 'fp16_partial_fused_offsets', dtype='int32') fp16_partial_fused_offsets.is_distributed = True + param_order = self._create_persistable_var('param_order', dtype='int32') + param_order.is_distributed = True + rank = get_rank() nranks = get_world_size() scale = self._get_or_create_scale() params = [p for p, _ in params_grads] grads = [g for _, g in params_grads] - weight_decay_values = [self._weight_decay] * len(params) + apply_weight_decay = [1] * len(params) if self._exclude_from_weight_decay_fn is not None: for i, p in enumerate(params): if self._exclude_from_weight_decay_fn(p): - weight_decay_values[i] = 0.0 + apply_weight_decay[i] = 0 startup_block = self.helper.startup_program.global_block() for g in grads: @@ -225,8 +225,6 @@ class DistributedFusedLamb(Optimizer): 'Moment2': [moment2], 'Beta1Pow': [beta1pow], 'Beta2Pow': [beta2pow], - 'FusedIndices': [fused_indices], - 'WeightDecay': [weight_decay], 'GlobalScale': [scale], 'ParamInfo': [param_info], 'ParamOut': params, @@ -235,12 +233,13 @@ class DistributedFusedLamb(Optimizer): 'FP32ShardFusedParamOffsets': [fp32_partial_fused_offsets], 'FP16ShardFusedParamOffsets': [fp16_partial_fused_offsets], 'FusedParamOffsets': [fused_offsets], + 'ParamOrder': [param_order], }, attrs={ 'alignment': self._alignment, 'rank': rank, 'nranks': nranks, - 'weight_decay': weight_decay_values, + 'apply_weight_decay': apply_weight_decay, 'moment1': 0.0, 'moment2': 0.0, 'beta1': self._beta1, @@ -272,8 +271,6 @@ class DistributedFusedLamb(Optimizer): 'Moment2': [moment2], 'Beta1Pow': [beta1pow], 'Beta2Pow': [beta2pow], - 'FusedIndices': [fused_indices], - 'WeightDecay': [weight_decay], 'GlobalScale': [scale], 
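The optimizer change above replaces the per-parameter WeightDecay tensor with an integer apply_weight_decay mask, and the decay strength itself becomes a single scalar attribute (added a few hunks below). A hypothetical illustration of how exclude_from_weight_decay_fn produces that mask; the parameter names and exclusion rule here are made up:

    params = ['fc_0.w_0', 'fc_0.b_0', 'fc_1.w_0', 'fc_1.b_0']     # hypothetical names
    exclude_fn = lambda name: name.endswith('.b_0')               # hypothetical rule
    apply_weight_decay = [0 if exclude_fn(p) else 1 for p in params]
    # -> [1, 0, 1, 0]; the strength is the scalar self._weight_decay attribute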
'ParamInfo': [param_info], 'Param': params, @@ -281,6 +278,7 @@ class DistributedFusedLamb(Optimizer): 'FusedParamOffsets': [fused_offsets], 'FP32ShardFusedParamOffsets': [fp32_partial_fused_offsets], 'FP16ShardFusedParamOffsets': [fp16_partial_fused_offsets], + 'ParamOrder': [param_order], }, outputs={ 'FP32FusedParamOut': [fp32_fused_param], @@ -294,6 +292,7 @@ class DistributedFusedLamb(Optimizer): 'FoundInf': [self._found_inf], }, attrs={ + 'weight_decay': self._weight_decay, 'beta1': self._beta1, 'beta2': self._beta2, 'epsilon': self._epsilon, diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index e59ef5ebfb0ab26c16c78933733bc11c0c4148d0..e6efde836284ac361f9781a0cb18b0df72afe354 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1667,11 +1667,11 @@ def cross_entropy(input, label_min = paddle.min(valid_label) label_max = paddle.max(valid_label) if label_min < 0: - raise ValueError("label should not out of bound, but got{}". - format(label_min)) + raise ValueError("Target {} is out of lower bound.".format( + label_min.item())) if label_max >= input.shape[axis]: - raise ValueError("label should not out of bound, but got{}". - format(label_max)) + raise ValueError("Target {} is out of upper bound.".format( + label_max.item())) if core.is_compiled_with_npu() or core.is_compiled_with_mlu(): _, _, out = _C_ops.softmax_with_cross_entropy( input, label, 'soft_label', soft_label, 'ignore_index', diff --git a/python/paddle/utils/code_gen/api_base.py b/python/paddle/utils/code_gen/api_base.py index 5fc9dfe3f6499701f75fffc62bdcf3f9a0c28821..cfd817c24c7367f69673353a8aaceeedec506e15 100644 --- a/python/paddle/utils/code_gen/api_base.py +++ b/python/paddle/utils/code_gen/api_base.py @@ -451,7 +451,20 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. param_code = "" for param in infer_meta_params: if param in input_names: - if param in self.optional_vars: + if self.inputs['input_info'][param] == "const Tensor&": + param_code = param_code + "MakeMetaTensor(*" + PREFIX_TENSOR_NAME + param + "), " + elif self.inputs['input_info'][ + param] == "const std::vector&": + meta_tensor_code = meta_tensor_code + f""" +{code_indent} auto {param}_meta_vec = MakeMetaTensor(*{PREFIX_TENSOR_NAME}{param}); +{code_indent} std::vector {param}_metas({param}_meta_vec.size()); +{code_indent} for (size_t i = 0; i < {param}_meta_vec.size(); ++i) {{ +{code_indent} {param}_metas[i] = &{param}_meta_vec[i]; +{code_indent} }} +""" + + param_code = param_code + param + "_metas, " + elif param in self.optional_vars: meta_tensor_code = meta_tensor_code + f""" {code_indent} paddle::optional {PREFIX_TENSOR_NAME}meta_ref_{param}(paddle::none); {code_indent} auto {PREFIX_TENSOR_NAME}meta_{param} = MakeMetaTensor({PREFIX_TENSOR_NAME}{param}); @@ -461,7 +474,9 @@ PADDLE_API {self.outputs['return_type']} {self.get_api_func_name() + '_'}({self. param_code = param_code + f"{PREFIX_TENSOR_NAME}meta_ref_{param}, " else: - param_code = param_code + "MakeMetaTensor(*" + PREFIX_TENSOR_NAME + param + "), " + raise ValueError( + f"{self.api} : Param of infer_meta error : {self.inputs['input_info'][param]} type is not supported." 
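The reworded cross_entropy errors above fire when a hard label falls outside [0, num_classes). A hypothetical repro of the upper-bound case, kept commented out:

    import paddle
    import paddle.nn.functional as F

    logits = paddle.rand([4, 10])              # 10 classes
    labels = paddle.to_tensor([1, 3, 10, 2])   # 10 is >= num_classes
    # F.cross_entropy(logits, labels)  # would raise: "Target 10 is out of upper bound."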
+ ) elif param in kernel_output_names: meta_tensor_code = meta_tensor_code + code_indent + " phi::MetaTensor " + param.replace( 'kernel_', PREFIX_META_TENSOR_NAME) + "(" + param + ");\n" diff --git a/tools/ci_op_benchmark.sh b/tools/ci_op_benchmark.sh index 80efd32ecf14eebac990dd8a531c134e95e7c039..1db79418b2d8f296c37d7757cace7b7bc2a8141c 100644 --- a/tools/ci_op_benchmark.sh +++ b/tools/ci_op_benchmark.sh @@ -106,7 +106,7 @@ function prepare_benchmark_environment { [ $? -ne 0 ] && LOG "[FATAL] Clone benchmark repo fail." && exit -1 LOG "[INFO] Collect api info ..." python benchmark/api/deploy/collect_api_info.py \ - --test_module_name tests_v2 \ + --test_module_name dynamic_tests_v2 \ --info_file api_info.txt >& 2 [ $? -ne 0 ] && LOG "[FATAL] Collect api info fail." && exit -1 [ ! -f benchmark/ci/scripts/op_benchmark.config ] && LOG "[FATAL] Missing op_benchmark.config!" && exit -1 @@ -185,7 +185,7 @@ function run_op_benchmark_test { logs_dir="$(pwd)/logs-${branch_name}" [ -d $logs_dir ] && rm -rf $logs_dir/* || mkdir -p $logs_dir pushd benchmark/api > /dev/null - bash deploy/main_control.sh tests_v2 \ + bash deploy/main_control.sh dynamic_tests_v2 \ tests_v2/configs \ $logs_dir \ $VISIBLE_DEVICES \ @@ -212,7 +212,7 @@ function check_op_benchmark_result { # there is no need to recompile and install paddle LOG "[INFO] retry ${retry_time} times ..." pushd benchmark/api > /dev/null - bash deploy/main_control.sh tests_v2 \ + bash deploy/main_control.sh dynamic_tests_v2 \ tests_v2/configs \ ${logs_dir} \ $VISIBLE_DEVICES \ diff --git a/tools/parallel_UT_rule.py b/tools/parallel_UT_rule.py index 4df27bfe4e923868ac5267119c5b56b6ba3839c8..7f8e516496f32352fa18f950a4687d5b52f4d10d 100755 --- a/tools/parallel_UT_rule.py +++ b/tools/parallel_UT_rule.py @@ -958,7 +958,6 @@ FOURTH_HIGH_PARALLEL_JOB_NEW = [ 'test_dynamic_rnn_stop_gradient', 'test_raw_program_optimizer', 'test_pow', 'test_inplace_softmax_with_cross_entropy', 'test_transforms', 'test_unfold_op', 'test_assign_op', 'test_isinstance', - 'test_conv_affine_channel_fuse_pass', 'auto_growth_best_fit_allocator_facade_test', 'test_cholesky_op', 'test_adaptive_avg_pool3d', 'test_paddle_save_load_binary', 'test_fused_fc_elementwise_layernorm_op', 'test_sequence_enumerate_op', @@ -1873,7 +1872,6 @@ TETRAD_PARALLEL_JOB = [ 'test_dataloader_unkeep_order', 'test_parallel_executor_profiler', 'test_correlation', - 'test_conv_affine_channel_fuse_pass', 'test_ir_inplace_pass', 'test_moving_average_abs_max_scale_op', 'test_flatten_contiguous_range_op', diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 694283264ca8f63ec3bcbe73a884c6a9f280bc15..7356f0c8db02551c930e424571cf779f0c3dbc9c 100755 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -578,7 +578,6 @@ STATIC_MODE_TESTING_LIST = [ 'test_ir_embedding_eltwise_layernorm_fuse_pass', 'test_ir_fc_fuse_pass', 'test_ir_skip_layernorm_pass', - 'test_conv_affine_channel_fuse_pass', 'test_conv_bias_mkldnn_fuse_pass', 'test_conv_bn_fuse_pass', 'test_conv_elementwise_add2_act_fuse_pass',