Commit 3631f064
Authored on May 24, 2023 by liangjianzhong

Merge remote-tracking branch 'zyc/develop' into semi-auto/rule-base

Parents: 4cd1a2cb, c3ea2a6b

62 changed files with 972 additions and 210 deletions (+972 −210)
Changed files:

.gitmodules (+4 −4)
cmake/external/protobuf.cmake (+2 −0)
cmake/external/xpu.cmake (+1 −1)
paddle/fluid/distributed/auto_parallel/CMakeLists.txt (+0 −3)
paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.cc (+43 −15)
paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h (+20 −4)
paddle/fluid/operators/cinn/cinn_launch_op.cc (+6 −0)
paddle/fluid/operators/cinn/cinn_launch_op.h (+4 −0)
paddle/fluid/operators/collective/alltoall_op.cu.cc (+1 −1)
paddle/fluid/operators/collective/c_allgather_op.cu.cc (+1 −1)
paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc (+1 −1)
paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc (+1 −1)
paddle/fluid/operators/collective/c_broadcast_op.cu.cc (+1 −1)
paddle/fluid/operators/collective/c_concat_op.cu.cc (+1 −1)
paddle/fluid/operators/collective/c_embedding_op.cu (+2 −2)
paddle/fluid/operators/collective/c_identity_op.cu.cc (+1 −1)
paddle/fluid/operators/collective/c_reducescatter_op.cu.cc (+1 −1)
paddle/fluid/operators/collective/c_split_op.cu (+1 −1)
paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc (+1 −1)
paddle/fluid/operators/collective/partial_allgather_op.cu.cc (+1 −1)
paddle/fluid/operators/collective/partial_recv_op.cu.cc (+1 −1)
paddle/fluid/operators/collective/partial_send_op.cu.cc (+1 −1)
paddle/fluid/operators/collective/recv_v2_op.cu.cc (+1 −1)
paddle/fluid/operators/collective/send_v2_op.cu.cc (+1 −1)
paddle/fluid/platform/device/gpu/nccl_helper.h (+2 −2)
paddle/fluid/pybind/auto_parallel_py.cc (+21 −0)
paddle/phi/api/yaml/generator/api_base.py (+3 −3)
paddle/phi/backends/xpu/xpu2_op_list.cc (+2 −0)
paddle/phi/core/distributed/auto_parallel/CMakeLists.txt (+2 −1)
paddle/phi/core/utils/data_type.h (+1 −1)
paddle/phi/kernels/gpu/activation_kernel.cu (+1 −0)
paddle/phi/kernels/gpu/selu_grad_kernel.cu (+1 −0)
paddle/phi/kernels/xpu/nll_loss_grad_kernel.cc (+95 −0)
paddle/phi/kernels/xpu/nll_loss_kernel.cc (+93 −0)
paddle/scripts/paddle_build.sh (+58 −2)
python/paddle/distributed/auto_parallel/dist_attribute.py (+1 −0)
python/paddle/distributed/auto_parallel/operators/dist_matmul.py (+12 −0)
python/paddle/distributed/auto_parallel/utils.py (+62 −48)
python/paddle/fluid/tests/unittests/CMakeLists.txt (+9 −1)
python/paddle/fluid/tests/unittests/test_assign_value_op.py (+1 −1)
python/paddle/fluid/tests/unittests/test_bitwise_op.py (+4 −4)
python/paddle/fluid/tests/unittests/test_compare_op.py (+2 −2)
python/paddle/fluid/tests/unittests/test_expand_v2_op.py (+10 −10)
python/paddle/fluid/tests/unittests/test_lookup_table_op.py (+14 −14)
python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py (+6 −6)
python/paddle/fluid/tests/unittests/test_matmul_op.py (+14 −4)
python/paddle/fluid/tests/unittests/test_matmul_v2_op.py (+50 −7)
python/paddle/fluid/tests/unittests/test_norm_op.py (+20 −8)
python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py (+6 −4)
python/paddle/fluid/tests/unittests/test_selu_op.py (+33 −6)
python/paddle/fluid/tests/unittests/test_shape_op.py (+2 −2)
python/paddle/fluid/tests/unittests/test_sum_op.py (+4 −4)
test/ir/inference/inference_pass_test.py (+6 −1)
test/ir/inference/test_trt_activation_pass.py (+3 −2)
test/ir/inference/test_trt_elementwise_op.py (+3 −2)
test/ir/inference/test_trt_instance_norm_op.py (+3 −2)
test/ir/inference/test_trt_pool3d_op.py (+9 −6)
test/ir/inference/test_trt_pool_op.py (+3 −2)
test/ir/inference/test_trt_skip_layernorm_fuse_pass.py (+12 −8)
test/ir/inference/test_trt_subgraph_pass.py (+18 −12)
test/xpu/test_nll_loss_op_xpu.py (+288 −0)
tools/check_file_diff_approvals.sh (+1 −1)
.gitmodules

@@ -30,10 +30,6 @@
 	path = third_party/xxhash
 	url = https://github.com/Cyan4973/xxHash.git
 	ignore = dirty
-[submodule "third_party/eigen3"]
-	path = third_party/eigen3
-	url = https://gitlab.com/libeigen/eigen.git
-	ignore = dirty
 [submodule "third_party/leveldb"]
 	path = third_party/leveldb
 	url = https://github.com/google/leveldb
@@ -50,3 +46,7 @@
 	path = third_party/glog
 	url = https://github.com/google/glog.git
 	ignore = dirty
+[submodule "third_party/eigen3"]
+	path = third_party/eigen3
+	url = https://gitlab.com/libeigen/eigen.git
+	ignore = dirty
cmake/external/protobuf.cmake

@@ -296,6 +296,8 @@ function(build_protobuf TARGET_NAME BUILD_FOR_HOST)
     PREFIX ${PROTOBUF_PREFIX_DIR}
     SOURCE_DIR ${SOURCE_DIR}
     UPDATE_COMMAND ""
+    PATCH_COMMAND
+    COMMAND cd ${SOURCE_DIR} && git checkout ${PROTOBUF_TAG}
     DEPENDS zlib
     CONFIGURE_COMMAND ${CMAKE_COMMAND} ${SOURCE_DIR}/cmake ${OPTIONAL_ARGS}
cmake/external/xpu.cmake

@@ -8,7 +8,7 @@ set(XPU_API_LIB_NAME "libxpuapi.so")
 set(XPU_RT_LIB_NAME "libxpurt.so")
 set(XPU_XFT_LIB_NAME "libxft.so")
 
-set(XPU_BASE_DATE "20230519")
+set(XPU_BASE_DATE "20230523")
 set(XPU_XCCL_BASE_VERSION "1.0.49.2")
 set(XPU_XFT_BASE_VERSION "latest")
paddle/fluid/distributed/auto_parallel/CMakeLists.txt

@@ -6,6 +6,3 @@ cc_library(
 add_subdirectory(test)
 
 add_subdirectory(spmd_rules)
-
-cc_library(auto_parallel DEPS device_mesh process_mesh dist_attr dist_mapper
-           dist_tensor_spec)
paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.cc

@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h"
 
-#include "paddle/fluid/distributed/auto_parallel/process_mesh.h"
+#include "paddle/phi/core/distributed/auto_parallel/utils.h"
 
 namespace paddle {
 namespace distributed {
@@ -27,28 +27,41 @@ DistTensorSpec::DistTensorSpec(const std::vector<int64_t>& shape,
   dist_attr_.copy_from(dist_attr);
 }
 
 DistTensorSpec::DistTensorSpec(const DistTensorSpec& spec) {
   std::vector<int64_t> spec_shape = spec.get_shape();
   shape_.assign(spec_shape.begin(), spec_shape.end());
   dist_attr_.copy_from(spec.get_dist_attr());
 }
 
 DistTensorSpec::~DistTensorSpec() {}
 
 DistTensorSpec::DistTensorSpec(const Tensor& tensor) {
   shape_ = tensor.shape();
-  std::vector<int64_t> pm_shape, pm_ids;
-  pm_shape = {4};
-  pm_ids = {0, 1, 2, 3};
-  std::vector<std::string> dim_name = {"mp"};
-  ProcessMesh pm(pm_shape, pm_ids, dim_name);
-  std::vector<int64_t> dims_mapping = {-1, 0};
-  TensorDistAttr dist_attr;
-  dist_attr.set_process_mesh(pm);
-  dist_attr.set_dims_mapping(dims_mapping);
-  dist_attr_.copy_from(dist_attr);
-  std::cout << dist_attr_;
+  // std::vector<int64_t> pm_shape, pm_ids;
+  // pm_shape = {4};
+  // pm_ids = {0, 1, 2, 3};
+  // std::vector<std::string> dim_name = {"mp"};
+  // ProcessMesh pm(pm_shape, pm_ids, dim_name);
+  // std::vector<int64_t> dims_mapping = {-1, 0};
+  // TensorDistAttr dist_attr;
+  // dist_attr.set_process_mesh(pm);
+  // dist_attr.set_dims_mapping(dims_mapping);
+  // dist_attr_.copy_from(dist_attr);
+  // std::cout << dist_attr_;
 }
 
+DistTensorSpec& DistTensorSpec::operator=(const DistTensorSpec& spec) {
+  std::vector<int64_t> spec_shape = spec.get_shape();
+  shape_ = spec_shape;
+  dist_attr_.copy_from(spec.get_dist_attr());
+  return *this;
+}
+
-const std::vector<int64_t>& DistTensorSpec::get_dims_mapping() {
+const std::vector<int64_t>& DistTensorSpec::get_dims_mapping() const {
   return dist_attr_.dims_mapping();
 }
@@ -57,7 +70,7 @@ void DistTensorSpec::set_dims_mapping(
   dist_attr_.set_dims_mapping(dims_mapping);
 }
 
-const ProcessMesh& DistTensorSpec::get_process_mesh() {
+const ProcessMesh& DistTensorSpec::get_process_mesh() const {
   return dist_attr_.process_mesh();
 }
@@ -65,7 +78,22 @@ void DistTensorSpec::set_process_mesh(const ProcessMesh& process_mesh) {
   dist_attr_.set_process_mesh(process_mesh);
 }
 
-const std::vector<int64_t>& DistTensorSpec::get_shape() { return shape_; }
+const std::vector<int64_t>& DistTensorSpec::get_shape() const { return shape_; }
+
+const TensorDistAttr& DistTensorSpec::get_dist_attr() const { return dist_attr_; }
+
+void DistTensorSpec::set_dist_attr(const TensorDistAttr& dist_attr) {
+  dist_attr_ = dist_attr;
+}
+
+std::string DistTensorSpec::to_string() const {
+  using phi::distributed::auto_parallel::str_join;
+  std::string spec_str = "{tensor_shape:[" + str_join(shape_) + "], ";
+  spec_str += "dist_attr:" + dist_attr_.to_string() + "}";
+  return spec_str;
+}
 
 }  // namespace auto_parallel
 }  // namespace distributed
paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h

@@ -14,39 +14,55 @@ limitations under the License. */
 #pragma once
 
 #include "paddle/fluid/distributed/auto_parallel/dist_attr.h"
 #include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/core/distributed/auto_parallel/dist_attr.h"
 
 namespace paddle {
 namespace distributed {
 namespace auto_parallel {
 
 using phi::distributed::auto_parallel::ProcessMesh;
 using phi::distributed::auto_parallel::TensorDistAttr;
 
 /**
  * A unified data class for inferring distributed attributes
  * in both dygraph mode and static mode
  */
 class DistTensorSpec {
  public:
   DistTensorSpec() = default;
 
   DistTensorSpec(const std::vector<int64_t>& shape,
                  const TensorDistAttr& dist_attr);
 
   DistTensorSpec(const DistTensorSpec& spec);
 
   // temp function, only for test in dygraph mode
   explicit DistTensorSpec(const Tensor& tensor);
 
   ~DistTensorSpec();
 
+  DistTensorSpec& operator=(const DistTensorSpec& spec);
+
   // get dims_mapping from dist_attr_
-  const std::vector<int64_t>& get_dims_mapping();
+  const std::vector<int64_t>& get_dims_mapping() const;
 
   // set dims_mapping in dist_attr_
   void set_dims_mapping(const std::vector<int64_t>& dims_mapping);
 
   // get process_mesh from dist_attr_
-  const ProcessMesh& get_process_mesh();
+  const ProcessMesh& get_process_mesh() const;
 
   // set process_mesh in dist_attr_
   void set_process_mesh(const ProcessMesh& process_mesh);
 
-  const std::vector<int64_t>& get_shape();
+  const TensorDistAttr& get_dist_attr() const;
+
+  void set_dist_attr(const TensorDistAttr& dist_attr);
+
+  const std::vector<int64_t>& get_shape() const;
+
+  std::string to_string() const;
 
  private:
  std::vector<int64_t> shape_;
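Taken together, the two files above give SPMD rules a single value type that carries a tensor's shape together with its TensorDistAttr. As a hedged illustration only (this snippet is not part of the commit; the mesh and shape values are invented, mirroring the disabled debug code in the .cc):

// Hypothetical usage sketch of DistTensorSpec; mesh/shape values are invented.
using paddle::distributed::auto_parallel::DistTensorSpec;
using phi::distributed::auto_parallel::ProcessMesh;
using phi::distributed::auto_parallel::TensorDistAttr;

void BuildSpecExample() {
  // A 1-D mesh of 4 devices named "mp", as in the commented-out debug code.
  ProcessMesh pm({4}, {0, 1, 2, 3}, {"mp"});
  TensorDistAttr dist_attr;
  dist_attr.set_process_mesh(pm);
  dist_attr.set_dims_mapping({-1, 0});  // dim 0 replicated, dim 1 sharded on "mp"
  DistTensorSpec spec({1024, 512}, dist_attr);
  // spec.to_string() -> "{tensor_shape:[1024, 512], dist_attr:...}"
}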
paddle/fluid/operators/cinn/cinn_launch_op.cc

@@ -17,6 +17,7 @@
 #include <functional>
 #include <vector>
 
+#include "cinn/common/target.h"
 #include "cinn/hlir/framework/graph_compiler.h"
 #include "cinn/runtime/cinn_runtime.h"
 #include "cinn/runtime/flags.h"
@@ -94,6 +95,11 @@ void SetCinnRandomSeed<phi::CPUContext>() {
   ::cinn::runtime::RandomSeed::GetOrSet(seed);
 }
 
+void SetCinnTarget(const ::cinn::common::Target& target) {
+  VLOG(4) << "Set CINN compile target to " << target;
+  ::cinn::runtime::CurrentTarget::SetCurrentTarget(target);
+}
+
 }  // namespace details
 
 class CinnLaunchOp : public framework::OperatorWithKernel {
paddle/fluid/operators/cinn/cinn_launch_op.h

@@ -58,6 +58,9 @@ void SetCinnRuntimeFlags();
 template <typename DeviceContext>
 void SetCinnRandomSeed();
 
+// set CINN compile target
+void SetCinnTarget(const ::cinn::common::Target& target);
+
 }  // namespace details
 
 template <typename T, typename DeviceContext>
@@ -115,6 +118,7 @@ class CinnLaunchOpKernel : public framework::OpKernel<T> {
                               "Step 2. Get compilation result of the graph");
     // Step 2. Get compilation result of the graph
     auto target = details::PlaceToCinnTarget(place);
+    details::SetCinnTarget(target);
     using ClockType = std::chrono::steady_clock;
     std::chrono::time_point<ClockType> start_t, end_t;
     if (VLOG_IS_ON(1)) {
paddle/fluid/operators/collective/alltoall_op.cu.cc

@@ -98,7 +98,7 @@ PD_REGISTER_STRUCT_KERNEL(alltoall,
                           ops::AllToAllOpCUDAKernel,
                           float,
                           double,
-#if NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
                           plat::bfloat16,
 #endif
                           int,
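The same one-line change recurs in every collective kernel below: ncclBfloat16 only exists from NCCL 2.10 onward (NCCL_VERSION_CODE >= 21000), and device-side bfloat16 additionally needs CUDA 11.0, so the registration now requires both. A sketch of the recurring shape, with a hypothetical op name standing in:

// Sketch of the recurring registration pattern; `example_op` and its kernel
// are hypothetical stand-ins, but the strengthened guard is the one used here.
PD_REGISTER_STRUCT_KERNEL(example_op,
                          GPU,
                          ALL_LAYOUT,
                          ops::ExampleOpCUDAKernel,
                          float,
                          double,
#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
                          plat::bfloat16,  // compiled in only when supported
#endif
                          int,
                          plat::float16) {}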
paddle/fluid/operators/collective/c_allgather_op.cu.cc

@@ -95,7 +95,7 @@ PD_REGISTER_STRUCT_KERNEL(c_allgather,
                           ops::CAllGatherOpCUDAKernel,
                           float,
                           double,
-#if NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
                           plat::bfloat16,
 #endif
                           int,
paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc

@@ -28,7 +28,7 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_max,
                           ALL_LAYOUT,
                           ops::CAllReduceMaxCUDAKernel,
                           float,
-#if NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
                           plat::bfloat16,
 #endif
                           double,
paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc

@@ -28,7 +28,7 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_sum,
                           ALL_LAYOUT,
                           ops::CAllReduceSumCUDAKernel,
                           float,
-#if NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
                           plat::bfloat16,
 #endif
                           double,
paddle/fluid/operators/collective/c_broadcast_op.cu.cc

@@ -100,7 +100,7 @@ PD_REGISTER_STRUCT_KERNEL(c_broadcast,
                           int64_t,
                           float,
                           double,
-#if NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
                           plat::bfloat16,
 #endif
                           plat::float16) {
paddle/fluid/operators/collective/c_concat_op.cu.cc

@@ -137,7 +137,7 @@ PD_REGISTER_STRUCT_KERNEL(c_concat,
                           double,
                           int,
                           int64_t,
-#if NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
                           plat::bfloat16,
 #endif
                           plat::float16) {
paddle/fluid/operators/collective/c_embedding_op.cu

@@ -239,7 +239,7 @@ PD_REGISTER_STRUCT_KERNEL(c_embedding,
                           ops::CEmbeddingCUDAKernel,
                           float,
                           double,
-#if NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
                           plat::bfloat16,
 #endif
                           plat::float16) {
@@ -251,7 +251,7 @@ PD_REGISTER_STRUCT_KERNEL(c_embedding_grad,
                           ops::CEmbeddingGradCUDAKernel,
                           float,
                           double,
-#if NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
                           plat::bfloat16,
 #endif
                           plat::float16) {
paddle/fluid/operators/collective/c_identity_op.cu.cc

@@ -25,7 +25,7 @@ PD_REGISTER_STRUCT_KERNEL(c_identity,
                           double,
                           int,
                           int64_t,
-#if NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
                           plat::bfloat16,
 #endif
                           plat::float16) {
paddle/fluid/operators/collective/c_reducescatter_op.cu.cc

@@ -87,7 +87,7 @@ PD_REGISTER_STRUCT_KERNEL(c_reducescatter,
                           ops::CReduceScatterOpCUDAKernel,
                           float,
                           double,
-#if NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
                           plat::bfloat16,
 #endif
                           int,
paddle/fluid/operators/collective/c_split_op.cu

@@ -123,7 +123,7 @@ PD_REGISTER_STRUCT_KERNEL(c_split,
                           double,
                           int,
                           int64_t,
-#if NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
                           plat::bfloat16,
 #endif
                           plat::float16) {
paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc

@@ -31,7 +31,7 @@ PD_REGISTER_STRUCT_KERNEL(mp_allreduce_sum,
                           double,
                           int,
                           int64_t,
-#if NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
                           plat::bfloat16,
 #endif
                           plat::float16) {
paddle/fluid/operators/collective/partial_allgather_op.cu.cc

@@ -108,7 +108,7 @@ PD_REGISTER_STRUCT_KERNEL(partial_allgather,
                           ops::PartialAllGatherOpCUDAKernel,
                           float,
                           double,
-#if NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
                           plat::bfloat16,
 #endif
                           int,
paddle/fluid/operators/collective/partial_recv_op.cu.cc

@@ -124,7 +124,7 @@ PD_REGISTER_STRUCT_KERNEL(partial_recv,
                           ops::PartialRecvOpCUDAKernel,
                           float,
                           double,
-#if NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
                           plat::bfloat16,
 #endif
                           int,
paddle/fluid/operators/collective/partial_send_op.cu.cc

@@ -123,7 +123,7 @@ PD_REGISTER_STRUCT_KERNEL(partial_send,
                           ops::PartialSendCUDAKernel,
                           float,
                           double,
-#if NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
                           plat::bfloat16,
 #endif
                           int,
paddle/fluid/operators/collective/recv_v2_op.cu.cc

@@ -238,7 +238,7 @@ PD_REGISTER_STRUCT_KERNEL(recv_v2,
                           ops::RecvOpV2CUDAKernel,
                           float,
                           double,
-#if NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
                           plat::bfloat16,
 #endif
                           int,
paddle/fluid/operators/collective/send_v2_op.cu.cc

@@ -223,7 +223,7 @@ PD_REGISTER_STRUCT_KERNEL(send_v2,
                           ops::SendOpV2CUDAKernel,
                           float,
                           double,
-#if NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
                           plat::bfloat16,
 #endif
                           int,
paddle/fluid/platform/device/gpu/nccl_helper.h

@@ -59,7 +59,7 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) {
     return ncclUint8;
   } else if (type == framework::proto::VarType::BOOL) {
     return ncclUint8;
-#if NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
   } else if (type == framework::proto::VarType::BF16) {
     return ncclBfloat16;
 #endif
@@ -86,7 +86,7 @@ inline ncclDataType_t ToNCCLDataType(phi::DataType type) {
     return ncclInt8;
   } else if (type == phi::DataType::BOOL) {
     return ncclUint8;
-#if NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
   } else if (type == phi::DataType::BFLOAT16) {
     return ncclBfloat16;
 #endif
paddle/fluid/pybind/auto_parallel_py.cc

@@ -15,6 +15,7 @@
 #include <pybind11/operators.h>
 #include <pybind11/stl.h>
 
+#include "paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h"
 #include "paddle/fluid/framework/op_desc.h"
 #include "paddle/fluid/framework/var_desc.h"
 #include "paddle/fluid/pybind/auto_parallel_py.h"
@@ -29,6 +30,7 @@ namespace py = pybind11;
 namespace paddle {
 namespace pybind {
 
+using paddle::distributed::auto_parallel::DistTensorSpec;
 using paddle::distributed::auto_parallel::OperatorDistAttr;
 using paddle::framework::OpDesc;
 using paddle::framework::VarDesc;
@@ -276,6 +278,25 @@ void BindAutoParallel(py::module *m) {
           py::arg("memo"))
       .def("__str__", &TensorDistAttr::to_string);
 
+  py::class_<DistTensorSpec>(*m, "DistTensorSpec")
+      .def(py::init<>())
+      .def(py::init<const DistTensorSpec &>())
+      .def(py::init<const std::vector<int64_t> &, const TensorDistAttr &>())
+      .def("get_dims_mapping", &DistTensorSpec::get_dims_mapping)
+      .def("set_dims_mapping", &DistTensorSpec::set_dims_mapping)
+      .def("get_process_mesh", &DistTensorSpec::get_process_mesh)
+      .def("set_process_mesh", &DistTensorSpec::set_process_mesh)
+      .def_property_readonly("shape", &DistTensorSpec::get_shape)
+      .def("__str__", &DistTensorSpec::to_string)
+      .def("__copy__",
+           [](const DistTensorSpec &self) { return DistTensorSpec(self); })
+      .def(
+          "__deepcopy__",
+          [](const DistTensorSpec &self, py::dict) {
+            return DistTensorSpec(self);
+          },
+          py::arg("memo"));
+
   py::class_<OperatorDistAttr>(*m, "OperatorDistAttr")
       .def(py::init<>())
       .def(py::init<const OpDesc &>())
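Once these bindings are compiled in, DistTensorSpec is reachable from Python through paddle.fluid.core (re-exported by dist_attribute.py further down). A hedged sketch of the binding surface; the values are invented, and a default-constructed TensorDistAttr is assumed to be acceptable here:

# Hypothetical driver for the new binding; values are for illustration only.
from paddle.fluid.core import DistTensorSpec, TensorDistAttr

spec = DistTensorSpec([8, 16], TensorDistAttr())
spec.set_dims_mapping([-1, 0])    # replicate dim 0, shard dim 1
print(spec.shape)                 # read-only property -> [8, 16]
print(spec.get_dims_mapping())    # -> [-1, 0]
print(spec)                       # __str__ delegates to DistTensorSpec::to_string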
paddle/phi/api/yaml/generator/api_base.py

@@ -1280,7 +1280,7 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
    def gen_dist_tensor_code(self):
        # define the DistTensorSpec vector for input and output tensors
        api_code = "\n  std::vector<paddle::distributed::auto_parallel::DistTensorSpec> input_specs;\n"

        # get DistTensorSpec for each input tensor
        for tensor_name in self.inputs['names']:
@@ -1297,8 +1297,8 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
 PADDLE_API {self.get_return_type(inplace_flag)} {api_func_name}({self.get_define_args(inplace_flag)}) {{
 {self.gene_kernel_select()}
 """
 
-        if api_func_name == 'matmul':
-            api_code += self.gen_dist_tensor_code()
+        # if api_func_name == 'matmul':
+        #     api_code += self.gen_dist_tensor_code()
 
         if len(self.kernel['func']) > 1:
             kernel_dispatch_code = ''
paddle/phi/backends/xpu/xpu2_op_list.cc

@@ -525,6 +525,8 @@ XPUOpMap& get_kl2_ops() {
                     phi::DataType::FLOAT16,
                     phi::DataType::INT64})},
      {"nearest_interp_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})},
+     {"nll_loss", XPUKernelSet({phi::DataType::FLOAT32})},
+     {"nll_loss_grad", XPUKernelSet({phi::DataType::FLOAT32})},
      {"not_equal",
       XPUKernelSet({phi::DataType::INT64,
                     phi::DataType::INT32,
paddle/phi/core/distributed/auto_parallel/CMakeLists.txt

@@ -20,4 +20,5 @@ cc_library(
   SRCS dist_mapper.cc
   DEPS device_mesh auto_parallel_proto phi_enforce)
 
-cc_library(auto_parallel DEPS device_mesh process_mesh dist_attr dist_mapper)
+cc_library(auto_parallel DEPS device_mesh process_mesh dist_attr dist_mapper
+           dist_tensor_spec)
paddle/phi/core/utils/data_type.h

@@ -229,7 +229,7 @@ inline ncclDataType_t ToNCCLDataType(DataType type) {
     return ncclInt8;
   } else if (type == DataType::BOOL) {
     return ncclUint8;
-#if NCCL_VERSION_CODE >= 21000
+#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
   } else if (type == DataType::BFLOAT16) {
     return ncclBfloat16;
 #endif
paddle/phi/kernels/gpu/activation_kernel.cu

@@ -274,4 +274,5 @@ PD_REGISTER_KERNEL(selu,
                    phi::SeluKernel,
                    float,
                    double,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
paddle/phi/kernels/gpu/selu_grad_kernel.cu

@@ -24,4 +24,5 @@ PD_REGISTER_KERNEL(selu_grad,
                    phi::SeluGradKernel,
                    float,
                    double,
                    phi::dtype::float16,
+                   phi::dtype::bfloat16) {}
paddle/phi/kernels/xpu/nll_loss_grad_kernel.cc (new file, mode 100644)

// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/nll_loss_grad_kernel.h"

#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"

namespace phi {

template <typename T, typename Context>
void NllLossGradKernel(const Context& dev_ctx,
                       const DenseTensor& x,
                       const DenseTensor& label,
                       const paddle::optional<DenseTensor>& weight,
                       const DenseTensor& total_weight,
                       const DenseTensor& d_out,
                       int64_t ignore_index,
                       const std::string& reduction,
                       DenseTensor* d_x) {
  using XPUType = typename XPUTypeTrait<T>::Type;
  const auto& label_type = label.dtype();
  bool label_type_match =
      label_type == phi::DataType::INT32 || label_type == phi::DataType::INT64;
  PADDLE_ENFORCE_EQ(label_type_match,
                    true,
                    phi::errors::InvalidArgument(
                        "Input(Label) holds the wrong type, it holds %s, but "
                        "desires to be %s or %s",
                        label_type,
                        phi::DataType::INT32,
                        phi::DataType::INT64));

  auto d_out_data = d_out.data<XPUType>();
  auto d_x_data = dev_ctx.template Alloc<XPUType>(d_x);
  auto d_x_dims = d_x->dims();
  std::vector<int64_t> d_x_shape = phi::vectorize<int64_t>(d_x_dims);
  auto weight_data =
      weight.get_ptr() ? weight.get_ptr()->data<float>() : nullptr;
  int64_t reduction_id = 0;
  if (reduction == "none") {
    reduction_id = 0;
  } else if (reduction == "mean") {
    reduction_id = 1;
  } else if (reduction == "sum") {
    reduction_id = 2;
  }
  auto total_weight_data = total_weight.data<XPUType>();
  int r;
  if (label_type == phi::DataType::INT32) {
    const int* label_data = label.data<int>();
    r = xpu::nll_loss_grad(dev_ctx.x_context(),
                           d_out_data,
                           d_x_data,
                           d_x_shape,
                           label_data,
                           weight_data,
                           reduction_id,
                           ignore_index,
                           total_weight_data);
  } else if (label_type == phi::DataType::INT64) {
    const int64_t* label_data = label.data<int64_t>();
    r = xpu::nll_loss_grad(dev_ctx.x_context(),
                           d_out_data,
                           d_x_data,
                           d_x_shape,
                           label_data,
                           weight_data,
                           reduction_id,
                           ignore_index,
                           total_weight_data);
  }
  PADDLE_ENFORCE_XDNN_SUCCESS(r, "nll_loss_grad");
}

}  // namespace phi

// TODO(xiongkun): add the non-raw kernel register here.
PD_REGISTER_KERNEL(
    nll_loss_grad, XPU, ALL_LAYOUT, phi::NllLossGradKernel, float) {}
paddle/phi/kernels/xpu/nll_loss_kernel.cc (new file, mode 100644)

// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/nll_loss_kernel.h"

#include "paddle/phi/backends/xpu/enforce_xpu.h"
#include "paddle/phi/core/kernel_registry.h"

namespace phi {

template <typename T, typename Context>
void NllLossRawKernel(const Context& dev_ctx,
                      const DenseTensor& x,
                      const DenseTensor& label,
                      const paddle::optional<DenseTensor>& weight,
                      int64_t ignore_index,
                      const std::string& reduction,
                      DenseTensor* out,
                      DenseTensor* total_weight) {
  using XPUType = typename XPUTypeTrait<T>::Type;
  const auto& label_type = label.dtype();
  bool label_type_match =
      label_type == phi::DataType::INT32 || label_type == phi::DataType::INT64;
  PADDLE_ENFORCE_EQ(label_type_match,
                    true,
                    phi::errors::InvalidArgument(
                        "Input(Label) holds the wrong type, it holds %s, but "
                        "desires to be %s or %s",
                        label_type,
                        phi::DataType::INT32,
                        phi::DataType::INT64));

  auto x_data = x.data<XPUType>();
  auto out_data = dev_ctx.template Alloc<XPUType>(out);
  auto weight_data =
      weight.get_ptr() ? weight.get_ptr()->data<XPUType>() : nullptr;
  auto total_weight_data = dev_ctx.template Alloc<XPUType>(total_weight);
  auto x_dims = x.dims();
  std::vector<int64_t> x_shape = phi::vectorize<int64_t>(x_dims);
  int64_t reduction_id = 0;
  if (reduction == "none") {
    reduction_id = 0;
  } else if (reduction == "mean") {
    reduction_id = 1;
  } else if (reduction == "sum") {
    reduction_id = 2;
  }
  int r;
  if (label_type == phi::DataType::INT32) {
    const int* label_data = label.data<int>();
    r = xpu::nll_loss(dev_ctx.x_context(),
                      x_data,
                      out_data,
                      total_weight_data,
                      x_shape,
                      label_data,
                      weight_data,
                      reduction_id,
                      ignore_index);
  } else if (label_type == phi::DataType::INT64) {
    const int64_t* label_data = label.data<int64_t>();
    r = xpu::nll_loss(dev_ctx.x_context(),
                      x_data,
                      out_data,
                      total_weight_data,
                      x_shape,
                      label_data,
                      weight_data,
                      reduction_id,
                      ignore_index);
  }
  PADDLE_ENFORCE_XDNN_SUCCESS(r, "nll_loss");
}

}  // namespace phi

// TODO(xiongkun): add the non-raw kernel register here.
PD_REGISTER_KERNEL(nll_loss, XPU, ALL_LAYOUT, phi::NllLossRawKernel, float) {}
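Both new kernels translate the reduction string into an integer id (none=0, mean=1, sum=2) and dispatch on the label dtype before calling into xpu::nll_loss / xpu::nll_loss_grad. A hedged sketch of exercising them through the public API, assuming a PaddlePaddle build with XPU support and an available device:

# Assumes an XPU build of PaddlePaddle and an attached XPU device.
import paddle
import paddle.nn.functional as F

paddle.set_device('xpu')
logits = paddle.randn([4, 10])
logits.stop_gradient = False
log_probs = F.log_softmax(logits, axis=-1)
labels = paddle.randint(0, 10, [4], dtype='int64')  # int32 labels also accepted
loss = F.nll_loss(log_probs, labels, reduction='mean')  # reduction_id == 1
loss.backward()  # exercises the new nll_loss_grad kernel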
paddle/scripts/paddle_build.sh

@@ -2235,14 +2235,70 @@ set +x
            fi
        done <<< "$test_cases";
        card_test "$single_card_tests" 1
+       failed_test_lists=''
        collect_failed_tests
+       xputest_error=0
+       retry_unittests_record=''
+       retry_time=3
+       exec_times=0
+       exec_time_array=('first' 'second' 'third')
+       exec_retry_threshold=10
+       is_retry_execuate=0
+       if [ -n "$failed_test_lists" ]; then
+           xputest_error=1
+           need_retry_ut_str=$(echo "$failed_test_lists" | grep -oEi "\- .+\(" | sed 's/(//' | sed 's/- //')
+           need_retry_ut_arr=(${need_retry_ut_str})
+           need_retry_ut_count=${#need_retry_ut_arr[@]}
+           retry_unittests=$(echo "$failed_test_lists" | grep -oEi "\- .+\(" | sed 's/(//' | sed 's/- //')
+           if [ $need_retry_ut_count -lt $exec_retry_threshold ]; then
+               while ( [ $exec_times -lt $retry_time ] )
+                   do
+                       set +e
+                       retry_unittests_record="$retry_unittests_record$failed_test_lists"
+                       failed_test_lists_ult=`echo "${failed_test_lists}"`
+                       set -e
+                       if [[ "${exec_times}" == "1" ]]; then
+                           if [[ "${failed_test_lists}" == "" ]]; then
+                               break
+                           else
+                               retry_unittests=$(echo "$failed_test_lists" | grep -oEi "\- .+\(" | sed 's/(//' | sed 's/- //')
+                           fi
+                       fi
+                       echo "========================================="
+                       echo "This is the ${exec_time_array[$exec_times]} time to re-run"
+                       echo "========================================="
+                       echo "The following unittest will be re-run:"
+                       echo "${retry_unittests}"
+                       echo "========================================="
+                       retry_unittests_regular=''
+                       for line in ${retry_unittests[@]} ;
+                           do
+                               if [[ "$retry_unittests_regular" == "" ]]; then
+                                   retry_unittests_regular="^$line$"
+                               else
+                                   retry_unittests_regular="$retry_unittests_regular|^$line$"
+                               fi
+                           done
+                       rm -f $tmp_dir/*
+                       failed_test_lists=''
+                       ctest -R "($retry_unittests_regular)" --output-on-failure -j $2 | tee $tmpfile
+                       collect_failed_tests
+                       exec_times=$[$exec_times+1]
+                   done
+           else
+               # There are more than 10 failed unit tests, so no unit test retry
+               is_retry_execuate=1
+           fi
+       fi
        set -x
        ut_endTime_s=`date +%s`
        echo "XPU testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s"
        python ${PADDLE_ROOT}/build/test/xpu/get_test_cover_info.py
        unset XPU_OP_LIST_DIR
-       if [[ "$EXIT_CODE" != "0" ]]; then
-           exit 8;
+       if [ "$xputest_error" != 0 ]; then
+           show_ut_retry_result
        fi
    }
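The core of the retry block is the regular expression it feeds to ctest -R: failed test names are joined into an anchored alternation so that only those tests re-run. A minimal standalone sketch of that construction, with made-up test names:

# Standalone sketch of the retry-regex construction above; names are made up.
retry_unittests="test_foo_op
test_bar_op"
retry_unittests_regular=''
for line in ${retry_unittests[@]}; do
    if [[ "$retry_unittests_regular" == "" ]]; then
        retry_unittests_regular="^$line$"
    else
        retry_unittests_regular="$retry_unittests_regular|^$line$"
    fi
done
echo "$retry_unittests_regular"   # prints: ^test_foo_op$|^test_bar_op$
# ctest -R "($retry_unittests_regular)" --output-on-failure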
python/paddle/distributed/auto_parallel/dist_attribute.py

@@ -12,5 +12,6 @@
 # See the License for the specific language governing permissions and
 # limitations under the License
 
+from paddle.fluid.core import DistTensorSpec  # noqa: F401
 from paddle.fluid.core import OperatorDistAttr  # noqa: F401
 from paddle.fluid.core import TensorDistAttr  # noqa: F401
python/paddle/distributed/auto_parallel/operators/dist_matmul.py

@@ -105,6 +105,18 @@ def _update_dims_mapping_for_matmul(dist_op):
     changed = False
     op_desc = dist_op.serial_op.desc
     op_dist_attr = dist_op.dist_attr
+
+    # test DistTensorSpec
+    # input_name_list = []
+    # output_name_list = []
+    # input_name_list.append(op_desc.input('X')[0])
+    # input_name_list.append(op_desc.input('Y')[0])
+    # output_name_list.append(op_desc.output('Out')[0])
+    # attr_name_list = ['trans_x', 'trans_y']
+    # input_specs, output_specs, attrs = wrap_data_for_completion(
+    #     dist_op, input_name_list, output_name_list, attr_name_list
+    # )
+
     x_name = op_desc.input('X')[0]
     y_name = op_desc.input('Y')[0]
     out_name = op_desc.output('Out')[0]
python/paddle/distributed/auto_parallel/utils.py

@@ -26,7 +26,7 @@ from paddle.framework import core
 from paddle.framework.io_utils import is_belong_to_optimizer, is_parameter
 from paddle.static import Variable
 
-from .dist_attribute import OperatorDistAttr, TensorDistAttr
+from .dist_attribute import DistTensorSpec, OperatorDistAttr, TensorDistAttr
 from .process_group import get_all_process_groups
 from .process_mesh import ProcessMesh
@@ -2357,50 +2357,64 @@ def is_dep_skip_op(op):
     return False
 
-# def wrap_data_for_completion(
-#     dist_op: DistributedOperator,
-#     input_names: list,
-#     output_names: list,
-#     attr_names: list
-# ):
-#     """
-#     Get data used in inferring distributed attributes, including:
-#       1. DistTensorSpec for each input and output tensor of this dist_op.
-#       2. Operator attributes of this dist_op, e.g. transpose_x in matmul op.
-#
-#     Args:
-#       dist_op: the DistributedOperator
-#       input_names: list, name of the dist_op's input tensors
-#       output_names: list, name of the dist_op's output tensors
-#       attr_names: list, attribute name of the dist_op's corresponding serial op
-#
-#     Returns:
-#       input_specs: list, DistTensorSpec for each input tensor of the dist_op
-#       output_specs: list, DistTensorSpec for each output tensor of the dist_op
-#       attrs: dict, attribute map of the dist op
-#     """
-#
-#     input_specs = []
-#     output_specs = []
-#     attrs = {}
-#
-#     serial_op = dist_op.serial_op
-#
-#     # Construct each input tensor's DistTensorSpec with shape and dist_attr
-#     for name in input_names:
-#         tensor_dist_attr = dist_op.dist_attr.get_input_dist_attr(name)
-#         var = serial_op.block._var_recursive(name)
-#         tensor_shape = var.shape
-#         dist_spec = DistTensorSpec(tensor_shape, tensor_dist_attr)
-#         input_specs.append(dist_spec)
-#
-#     # Construct each output tensor's DistTensorSpec with shape and dist_attr
-#     for name in output_names:
-#         tensor_dist_attr = dist_op.dist_attr.get_output_dist_attr(name)
-#         var = serial_op.block._var_recursive(name)
-#         tensor_shape = var.shape
-#         dist_spec = DistTensorSpec(tensor_shape, tensor_dist_attr)
-#         output_specs.append(dist_spec)
-#
-#     for attr_name in attr_names:
-#         attrs[attr_name] = serial_op.desc.attr(attr_name)
+def wrap_data_for_completion(
+    dist_op, input_names: list, output_names: list, attr_names: list
+):
+    """
+    Get data used in inferring distributed attributes, including:
+      1. DistTensorSpec for each input and output tensor of this dist_op.
+      2. Operator attributes of this dist_op, e.g. transpose_x in matmul op.
+
+    Args:
+      dist_op: the DistributedOperator
+      input_names: list, name of the dist_op's input tensors
+      output_names: list, name of the dist_op's output tensors
+      attr_names: list, attribute name of the dist_op's corresponding serial op
+
+    Returns:
+      input_specs: list, DistTensorSpec for each input tensor of the dist_op
+      output_specs: list, DistTensorSpec for each output tensor of the dist_op
+      attrs: dict, attribute map of the dist op
+
+    Usage:
+      op_desc = dist_op.serial_op.desc
+      input_name_list = []
+      output_name_list = []
+      input_name_list.append(op_desc.input('X')[0])  # 'X' is the arg name for op
+      input_name_list.append(op_desc.input('Y')[0])
+      output_name_list.append(op_desc.output('Out')[0])
+      attr_name_list = ['trans_x', 'trans_y']
+      input_specs, output_specs, attrs = wrap_data_for_completion(
+          dist_op,
+          input_name_list,
+          output_name_list,
+          attr_name_list)
+    """
+
+    input_specs = []
+    output_specs = []
+    attrs = {}
+
+    serial_op = dist_op.serial_op
+
+    # Construct each input tensor's DistTensorSpec with shape and dist_attr
+    for name in input_names:
+        tensor_dist_attr = dist_op.dist_attr.get_input_dist_attr(name)
+        var = serial_op.block._var_recursive(name)
+        tensor_shape = var.shape
+        dist_spec = DistTensorSpec(tensor_shape, tensor_dist_attr)
+        input_specs.append(dist_spec)
+
+    # Construct each output tensor's DistTensorSpec with shape and dist_attr
+    for name in output_names:
+        tensor_dist_attr = dist_op.dist_attr.get_output_dist_attr(name)
+        var = serial_op.block._var_recursive(name)
+        tensor_shape = var.shape
+        dist_spec = DistTensorSpec(tensor_shape, tensor_dist_attr)
+        output_specs.append(dist_spec)
+
+    for attr_name in attr_names:
+        attrs[attr_name] = serial_op.desc.attr(attr_name)
+
+    return input_specs, output_specs, attrs
python/paddle/fluid/tests/unittests/CMakeLists.txt

@@ -1116,7 +1116,15 @@ set(TEST_CINN_OPS
     test_tile_op
     test_roll_op
     test_sum_op
-    test_elementwise_min_op)
+    test_elementwise_min_op
+    test_bitwise_op
+    test_compare_op
+    test_shape_op
+    test_assign_value_op
+    test_lookup_table_op
+    test_lookup_table_v2_op
+    test_norm_op
+    test_one_hot_v2_op)
 
 foreach(TEST_CINN_OPS ${TEST_CINN_OPS})
   if(WITH_CINN)
python/paddle/fluid/tests/unittests/test_assign_value_op.py

@@ -49,7 +49,7 @@ class TestAssignValueOp(eager_op_test.OpTest):
         self.attrs["fp32_values"] = [float(v) for v in self.value.flat]
 
     def test_forward(self):
-        self.check_output()
+        self.check_output(check_cinn=True)
 
 
 class TestAssignValueOp2(TestAssignValueOp):
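The test files that follow all make the same mechanical change: passing check_cinn=True asks OpTest to additionally build the operator with the CINN compiler and compare its outputs (and, where the flag is threaded through check_grad, its gradients) against the reference results. Sketched on a hypothetical case:

# Sketch of the recurring change; MyOpTest is a hypothetical OpTest subclass.
class MyOpTest(OpTest):
    def test_check_output(self):
        # also runs the CINN-compiled kernel and compares results
        self.check_output(check_cinn=True)

    def test_check_grad(self):
        self.check_grad(['X'], 'Out', check_cinn=True)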
python/paddle/fluid/tests/unittests/test_bitwise_op.py

@@ -43,7 +43,7 @@ class TestBitwiseAnd(OpTest):
         self.outputs = {'Out': out}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)
 
     def test_check_grad(self):
         pass
@@ -150,7 +150,7 @@ class TestBitwiseOr(OpTest):
         self.outputs = {'Out': out}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)
 
     def test_check_grad(self):
         pass
@@ -258,7 +258,7 @@ class TestBitwiseXor(OpTest):
         self.outputs = {'Out': out}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)
 
     def test_check_grad(self):
         pass
@@ -363,7 +363,7 @@ class TestBitwiseNot(OpTest):
         self.outputs = {'Out': out}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)
 
     def test_check_grad(self):
         pass
python/paddle/fluid/tests/unittests/test_compare_op.py

@@ -35,7 +35,7 @@ def create_test_class(op_type, typename, callback):
             self.op_type = op_type
 
         def test_output(self):
-            self.check_output()
+            self.check_output(check_cinn=True)
 
         def test_errors(self):
             paddle.enable_static()
@@ -460,7 +460,7 @@ def create_bf16_case(op_type, callback):
             self.outputs = {'Out': real_result}
 
         def test_check_output(self):
-            self.check_output()
+            self.check_output(check_cinn=True)
 
     cls_name = f"BF16TestCase_{op_type}"
     TestCompareOpBF16Op.__name__ = cls_name
python/paddle/fluid/tests/unittests/test_expand_v2_op.py

@@ -44,7 +44,7 @@ class TestExpandV2OpRank1(OpTest):
         self.expand_times = [1]
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=self.enable_cinn)
 
     def test_check_grad(self):
         self.check_grad(['X'], 'Out', check_prim=True)
@@ -107,10 +107,10 @@ class TestExpandV2OpRank1_tensor_attr(OpTest):
         self.infer_expand_shape = [-1]
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', check_cinn=True)
 
 
 class TestExpandV2OpRank2_Corner_tensor_attr(TestExpandV2OpRank1_tensor_attr):
@@ -144,10 +144,10 @@ class TestExpandV2OpRank1_tensor(OpTest):
         self.expand_shape = [2, 100]
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)
 
     def test_check_grad(self):
-        self.check_grad(['X'], 'Out')
+        self.check_grad(['X'], 'Out', check_cinn=True)
 
 
 # Situation 4: input x is Integer
@@ -165,7 +165,7 @@ class TestExpandV2OpInteger(OpTest):
         self.outputs = {'Out': output}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)
 
 
 # Situation 5: input x is Bool
@@ -181,7 +181,7 @@ class TestExpandV2OpBoolean(OpTest):
         self.outputs = {'Out': output}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)
 
 
 # Situation 6: input x is Integer
@@ -199,7 +199,7 @@ class TestExpandV2OpInt64_t(OpTest):
         self.outputs = {'Out': output}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)
 
 
 # Situation 7: input x is Float16
@@ -218,7 +218,7 @@ class TestExpandV2FP16Op(OpTest):
         self.outputs = {'Out': output}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)
 
     def test_check_grad(self):
         self.check_grad(['X'], 'Out', check_prim=True)
@@ -245,7 +245,7 @@ class TestExpandV2BF16Op(OpTest):
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place(place)
+        self.check_output_with_place(place, check_cinn=True)
 
     def test_check_grad(self):
         place = core.CUDAPlace(0)
python/paddle/fluid/tests/unittests/test_lookup_table_op.py

@@ -39,10 +39,10 @@ class TestLookupTableOp(OpTest):
         self.outputs = {'Out': table[ids]}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)
 
     def test_check_grad(self):
-        self.check_grad(['W'], 'Out', no_grad_set=set('Ids'))
+        self.check_grad(['W'], 'Out', no_grad_set=set('Ids'), check_cinn=True)
 
 
 class TestLookupTableOpWithTensorIds(OpTest):
@@ -56,10 +56,10 @@ class TestLookupTableOpWithTensorIds(OpTest):
         self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)
 
     def test_check_grad(self):
-        self.check_grad(['W'], 'Out', no_grad_set=set('Ids'))
+        self.check_grad(['W'], 'Out', no_grad_set=set('Ids'), check_cinn=True)
 
 
 @skip_check_grad_ci(
@@ -73,7 +73,7 @@ class TestLookupTableOpWithPadding(TestLookupTableOp):
         padding_idx = np.random.choice(ids, 1)[0]
         self.outputs['Out'][ids == padding_idx] = np.zeros(31)
         self.attrs = {'padding_idx': int(padding_idx)}
-        self.check_output()
+        self.check_output(check_cinn=True)
 
 
 @skip_check_grad_ci(
@@ -88,7 +88,7 @@ class TestLookupTableOpWithTensorIdsAndPadding(TestLookupTableOpWithTensorIds):
         padding_idx = np.random.choice(flatten_idx, 1)[0]
         self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31)
         self.attrs = {'padding_idx': padding_idx}
-        self.check_output()
+        self.check_output(check_cinn=True)
 
 
 class TestLookupTableWIsSelectedRows(unittest.TestCase):
@@ -212,7 +212,7 @@ class TestLookupTableOpInt8(OpTest):
         self.outputs = {'Out': table[ids]}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)
 
     def test_check_grad(self):
         # since int8 type only be used in test and inference, there is
@@ -233,7 +233,7 @@ class TestLookupTableOpWithTensorIdsInt8(OpTest):
         self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)
 
     def test_check_grad(self):
         # since int8 type only be used in test and inference, there is
@@ -247,7 +247,7 @@ class TestLookupTableOpWithPaddingInt8(TestLookupTableOpInt8):
         padding_idx = np.random.choice(ids, 1)[0]
         self.outputs['Out'][ids == padding_idx] = np.zeros(31)
         self.attrs = {'padding_idx': int(padding_idx)}
-        self.check_output()
+        self.check_output(check_cinn=True)
 
     def test_check_grad(self):
         # Since paddings are not trainable and fixed in forward, the gradient of
@@ -264,7 +264,7 @@ class TestLookupTableOpWithTensorIdsAndPaddingInt8(
         padding_idx = np.random.choice(flatten_idx, 1)[0]
         self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31)
         self.attrs = {'padding_idx': padding_idx}
-        self.check_output()
+        self.check_output(check_cinn=True)
 
     def test_check_grad(self):
         # Since paddings are not trainable and fixed in forward, the gradient of
@@ -354,7 +354,7 @@ class TestLookupTableOpInt16(OpTest):
         self.outputs = {'Out': table[ids]}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)
 
 
 @skip_check_grad_ci(reason="Int16 type only be used in test and inference.")
@@ -371,7 +371,7 @@ class TestLookupTableOpWithTensorIdsInt16(OpTest):
         self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)
 
 
 @skip_check_grad_ci(reason="Int16 type only be used in test and inference.")
@@ -381,7 +381,7 @@ class TestLookupTableOpWithPaddingInt16(TestLookupTableOpInt16):
         padding_idx = np.random.choice(ids, 1)[0]
         self.outputs['Out'][ids == padding_idx] = np.zeros(31)
         self.attrs = {'padding_idx': int(padding_idx)}
-        self.check_output()
+        self.check_output(check_cinn=True)
 
 
 @skip_check_grad_ci(reason="Int16 type only be used in test and inference.")
@@ -394,7 +394,7 @@ class TestLookupTableOpWithTensorIdsAndPaddingInt16(
         padding_idx = np.random.choice(flatten_idx, 1)[0]
         self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31)
         self.attrs = {'padding_idx': padding_idx}
-        self.check_output()
+        self.check_output(check_cinn=True)
 
 
 class TestLookupTableWIsSelectedRowsInt16(unittest.TestCase):
python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py

@@ -56,10 +56,10 @@ class TestLookupTableOp(OpTest):
         return "int64"
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)
 
     def test_check_grad(self):
-        self.check_grad(['W'], 'Out', no_grad_set=set('Ids'))
+        self.check_grad(['W'], 'Out', no_grad_set=set('Ids'), check_cinn=True)
 
 
 class TestLookupTableOpInt16(OpTest):
@@ -87,10 +87,10 @@ class TestLookupTableOpWithTensorIds(OpTest):
         self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)
 
     def test_check_grad(self):
-        self.check_grad(['W'], 'Out', no_grad_set=set('Ids'))
+        self.check_grad(['W'], 'Out', no_grad_set=set('Ids'), check_cinn=True)
 
 
 @skip_check_grad_ci(
@@ -104,7 +104,7 @@ class TestLookupTableOpWithPadding(TestLookupTableOp):
         padding_idx = np.random.choice(ids, 1)[0]
         self.outputs['Out'][ids == padding_idx] = np.zeros(31)
         self.attrs = {'padding_idx': int(padding_idx)}
-        self.check_output()
+        self.check_output(check_cinn=True)
 
 
 @skip_check_grad_ci(
@@ -119,7 +119,7 @@ class TestLookupTableOpWithTensorIdsAndPadding(TestLookupTableOpWithTensorIds):
         padding_idx = np.random.choice(flatten_idx, 1)[0]
         self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31)
         self.attrs = {'padding_idx': padding_idx}
-        self.check_output()
+        self.check_output(check_cinn=True)
 
 
 class TestLookupTableWIsSelectedRows(unittest.TestCase):
python/paddle/fluid/tests/unittests/test_matmul_op.py

@@ -100,19 +100,29 @@ class Generator:
         self.outputs = {'Out': Out}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)
 
     def test_check_grad_normal(self):
-        self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-3)
+        self.check_grad(
+            ['X', 'Y'], 'Out', max_relative_error=1e-3, check_cinn=True
+        )
 
     def test_check_grad_ignore_x(self):
         self.check_grad(
-            ['Y'], 'Out', max_relative_error=1e-3, no_grad_set=set("X")
+            ['Y'],
+            'Out',
+            max_relative_error=1e-3,
+            no_grad_set=set("X"),
+            check_cinn=True,
         )
 
     def test_check_grad_ignore_y(self):
         self.check_grad(
-            ['X'], 'Out', max_relative_error=1e-3, no_grad_set=set('Y')
+            ['X'],
+            'Out',
+            max_relative_error=1e-3,
+            no_grad_set=set('Y'),
+            check_cinn=True,
        )
python/paddle/fluid/tests/unittests/test_matmul_v2_op.py

@@ -103,13 +103,28 @@ class TestMatMulV2Op(OpTest):
         self.outputs = {'Out': result}
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(
+            check_cinn=self.check_cinn if hasattr(self, 'check_cinn') else True
+        )
 
     def test_check_grad(self):
         if core.is_compiled_with_rocm():
-            self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-2)
+            self.check_grad(
+                ['X', 'Y'],
+                'Out',
+                max_relative_error=1e-2,
+                check_cinn=self.check_cinn
+                if hasattr(self, 'check_cinn')
+                else True,
+            )
         else:
-            self.check_grad(['X', 'Y'], 'Out')
+            self.check_grad(
+                ['X', 'Y'],
+                'Out',
+                check_cinn=self.check_cinn
+                if hasattr(self, 'check_cinn')
+                else True,
+            )
 
 
 class TestMatMulOp2(TestMatMulV2Op):
@@ -290,6 +305,7 @@ class TestMatMulOp16(TestMatMulV2Op):
         self.y_shape = (1, 2, 2, 100, 2)
         self.trans_x = False
         self.trans_y = False
+        self.check_cinn = False
 
 
 class TestMatMulOp17(TestMatMulV2Op):
@@ -343,7 +359,13 @@ def create_test_fp16_class(parent, atol=0.001, max_relative_error=1.0):
            if core.is_compiled_with_cuda():
                place = core.CUDAPlace(0)
                if core.is_float16_supported(place):
-                    self.check_output_with_place(place, atol=atol)
+                    self.check_output_with_place(
+                        place,
+                        atol=atol,
+                        check_cinn=self.check_cinn
+                        if hasattr(self, 'check_cinn')
+                        else True,
+                    )
 
        def test_check_grad(self):
            place = core.CUDAPlace(0)
@@ -353,6 +375,9 @@ def create_test_fp16_class(parent, atol=0.001, max_relative_error=1.0):
                    ['X', 'Y'],
                    'Out',
                    max_relative_error=max_relative_error,
+                    check_cinn=self.check_cinn
+                    if hasattr(self, 'check_cinn')
+                    else True,
                )
 
    cls_name = "{}_{}".format(parent.__name__, "Fp16")
@@ -405,7 +430,13 @@ def create_test_bf16_class(parent, atol=0.01):
        def test_check_output(self):
            place = core.CUDAPlace(0)
-            self.check_output_with_place(place, atol=atol)
+            self.check_output_with_place(
+                place,
+                atol=atol,
+                check_cinn=self.check_cinn
+                if hasattr(self, 'check_cinn')
+                else True,
+            )
 
        def test_check_grad_x(self):
            place = core.CUDAPlace(0)
@@ -416,6 +447,9 @@ def create_test_bf16_class(parent, atol=0.01):
                'Out',
                no_grad_set={'Y'},
                user_defined_grads=[numeric_grads],
+                check_cinn=self.check_cinn
+                if hasattr(self, 'check_cinn')
+                else True,
            )
 
        def test_check_grad_y(self):
@@ -427,6 +461,9 @@ def create_test_bf16_class(parent, atol=0.01):
                'Out',
                no_grad_set={'X'},
                user_defined_grads=[numeric_grads],
+                check_cinn=self.check_cinn
+                if hasattr(self, 'check_cinn')
+                else True,
            )
 
        def test_check_grad(self):
@@ -596,7 +633,7 @@ class TestComplexMatMulOp(OpTest):
         self.grad_y = np.matmul(np.conj(self.x).T, self.grad_out)
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=False)
 
     def test_check_grad_normal(self):
         self.check_grad(
@@ -604,6 +641,7 @@ class TestComplexMatMulOp(OpTest):
             'Out',
             user_defined_grads=[self.grad_x, self.grad_y],
             user_defined_grad_outputs=[self.grad_out],
+            check_cinn=False,
         )
 
     def test_check_grad_ingore_x(self):
@@ -613,6 +651,7 @@ class TestComplexMatMulOp(OpTest):
             no_grad_set=set("X"),
             user_defined_grads=[self.grad_y],
             user_defined_grad_outputs=[self.grad_out],
+            check_cinn=False,
         )
 
     def test_check_grad_ingore_y(self):
@@ -622,6 +661,7 @@ class TestComplexMatMulOp(OpTest):
             no_grad_set=set('Y'),
             user_defined_grads=[self.grad_x],
             user_defined_grad_outputs=[self.grad_out],
+            check_cinn=False,
         )
@@ -662,7 +702,7 @@ class TestComplexMatMulOpBroadcast(OpTest):
         )
 
     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=False)
 
     def test_check_grad_normal(self):
         self.check_grad(
@@ -670,6 +710,7 @@ class TestComplexMatMulOpBroadcast(OpTest):
             'Out',
             user_defined_grads=[self.grad_x, self.grad_y],
             user_defined_grad_outputs=[self.grad_out],
+            check_cinn=False,
         )
 
     def test_check_grad_ingore_x(self):
@@ -679,6 +720,7 @@ class TestComplexMatMulOpBroadcast(OpTest):
             no_grad_set=set("X"),
             user_defined_grads=[self.grad_y],
             user_defined_grad_outputs=[self.grad_out],
+            check_cinn=False,
         )
 
     def test_check_grad_ingore_y(self):
@@ -688,6 +730,7 @@ class TestComplexMatMulOpBroadcast(OpTest):
             no_grad_set=set('Y'),
             user_defined_grads=[self.grad_x],
             user_defined_grad_outputs=[self.grad_out],
+            check_cinn=False,
         )
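Here the flag is threaded through an optional attribute so individual cases can opt out: TestMatMulOp16 sets self.check_cinn = False, and the complex-dtype tests pass check_cinn=False outright. The repeated conditional is equivalent to this more compact form:

# Equivalent, more compact form of the expression repeated above.
check_cinn = getattr(self, 'check_cinn', True)
self.check_output(check_cinn=check_cinn)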
python/paddle/fluid/tests/unittests/test_norm_op.py
浏览文件 @
3631f064
...
...
@@ -48,10 +48,10 @@ class TestNormOp(OpTest):
self
.
python_out_sig
=
[
'Out'
]
def
test_check_output
(
self
):
self
.
check_output
()
self
.
check_output
(
check_cinn
=
True
)
def
test_check_grad
(
self
):
self
.
check_grad
([
'X'
],
'Out'
)
self
.
check_grad
([
'X'
],
'Out'
,
check_cinn
=
True
)
def
init_test_case
(
self
):
self
.
shape
=
[
2
,
3
,
4
,
5
]
...
...
@@ -109,7 +109,7 @@ class TestNormOp6(TestNormOp):
self
.
dtype
=
"float32"
def
test_check_grad
(
self
):
self
.
check_grad
([
'X'
],
'Out'
,
max_relative_error
=
0.008
)
self
.
check_grad
([
'X'
],
'Out'
,
max_relative_error
=
0.008
,
check_cinn
=
True
)
@
unittest
.
skipIf
(
...
...
@@ -120,11 +120,17 @@ class TestNormOp7(TestNormOp):
         self.dtype = "float16"

     def test_check_output(self):
-        self.check_output_with_place(fluid.core.CUDAPlace(0), atol=5e-2)
+        self.check_output_with_place(
+            fluid.core.CUDAPlace(0), atol=5e-2, check_cinn=True
+        )

     def test_check_grad(self):
         self.check_grad_with_place(
-            fluid.core.CUDAPlace(0), ['X'], 'Out', max_relative_error=0.05
+            fluid.core.CUDAPlace(0),
+            ['X'],
+            'Out',
+            max_relative_error=0.05,
+            check_cinn=True,
         )
...
...
@@ -147,7 +153,7 @@ class TestNormTestOp(OpTest):
     def test_check_output(self):
         # dynamic graph just supports float tensor
-        self.check_output(check_dygraph=True)
+        self.check_output(check_dygraph=True, check_cinn=True)

     def test_check_grad(self):
         pass
...
...
@@ -176,11 +182,17 @@ class TestNormBF16Op(OpTest):
         self.python_out_sig = ['Out']

     def test_check_output(self):
-        self.check_output_with_place(core.CUDAPlace(0), atol=1e-1)
+        self.check_output_with_place(
+            core.CUDAPlace(0), atol=1e-1, check_cinn=True
+        )

     def test_check_grad(self):
         self.check_grad_with_place(
-            core.CUDAPlace(0), ['X'], 'Out', max_relative_error=1e-2
+            core.CUDAPlace(0),
+            ['X'],
+            'Out',
+            max_relative_error=1e-2,
+            check_cinn=True,
         )

     def init_test_case(self):
...
...
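The norm tests above enable check_cinn=True with loose tolerances for the low-precision variants (atol=5e-2 for fp16, 1e-1 for bf16). For reference, a hedged numpy sketch of the l2-normalize semantics these checks are assumed to compare against (the epsilon handling in particular is an assumption, not taken from this diff):

    import numpy as np

    def l2_normalize(x, axis, epsilon=1e-10):
        # normalize each slice along `axis` to unit l2 norm
        norm = np.sqrt(np.sum(np.square(x), axis=axis, keepdims=True) + epsilon)
        return x / norm

    x = np.random.rand(2, 3, 4, 5).astype('float32')
    out = l2_normalize(x, axis=1)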
python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py
View file @ 3631f064
...
...
@@ -49,7 +49,7 @@ class TestOneHotOp(OpTest):
         self.outputs = {'Out': (out, x_lod)}

     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)


 class TestOneHotOp_attr(OpTest):
...
...
@@ -57,6 +57,7 @@ class TestOneHotOp_attr(OpTest):
         self.op_type = 'one_hot_v2'
         self.python_api = one_hot_wrapper
         depth = 10
+        depth_np = np.array(10).astype('int32')
         dimension = 12
         x_lod = [[4, 1, 3, 3]]
         x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
...
...
@@ -69,12 +70,12 @@ class TestOneHotOp_attr(OpTest):
         for i in range(np.product(x.shape)):
             out[i, 0, x[i]] = 1.0

-        self.inputs = {'X': (x, x_lod)}
+        self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np}
         self.attrs = {'dtype': int(core.VarDesc.VarType.FP32), 'depth': depth}
         self.outputs = {'Out': (out, x_lod)}

     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)


 class TestOneHotOp_default_dtype(OpTest):
...
...
@@ -98,7 +99,7 @@ class TestOneHotOp_default_dtype(OpTest):
         self.outputs = {'Out': (out, x_lod)}

     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)


 class TestOneHotOp_default_dtype_attr(OpTest):
...
...
@@ -106,6 +107,7 @@ class TestOneHotOp_default_dtype_attr(OpTest):
         self.op_type = 'one_hot_v2'
         self.python_api = one_hot_wrapper
         depth = 10
+        depth_np = np.array(depth).astype('int32')
         dimension = 12
         x_lod = [[4, 1, 3, 3]]
         x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))]
...
...
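Both one_hot hunks add a depth_tensor input next to the existing depth attribute, so the kernel's runtime-tensor depth path is exercised as well as the compile-time one. A minimal sketch of the reference output the tests build (shapes simplified to 2-D here; the real tests carry an extra LoD dimension):

    import numpy as np

    depth = 10
    depth_np = np.array(depth).astype('int32')  # fed as the 'depth_tensor' input
    x = np.array([1, 3, 0, 9])
    out = np.zeros((x.shape[0], depth)).astype('float32')
    out[np.arange(x.shape[0]), x] = 1.0  # one hot row per label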
python/paddle/fluid/tests/unittests/test_selu_op.py
View file @ 3631f064
...
...
@@ -15,7 +15,7 @@
 import unittest

 import numpy as np
-from eager_op_test import OpTest
+from eager_op_test import OpTest, convert_float_to_uint16

 import paddle
 import paddle.nn.functional as F
...
...
@@ -43,13 +43,15 @@ class SeluTest(OpTest):
         self.op_type = "selu"
         self.python_api = paddle.nn.functional.selu
         self.x_shape = [3, 5, 5, 10]
-        self.dtype = np.float64
         self.init_x_shape()
         self.init_dtype()

         alpha = 1.6732632423543772848170429916717
         scale = 1.0507009873554804934193349852946

-        x = np.random.normal(size=self.x_shape).astype(self.dtype)
+        if self.dtype == np.uint16:
+            x = np.random.normal(size=self.x_shape).astype(np.float32)
+        else:
+            x = np.random.normal(size=self.x_shape).astype(self.dtype)

         # Since zero point in selu is not differentiable, avoid randomize
...
...
@@ -58,6 +60,10 @@ class SeluTest(OpTest):
         out = ref_selu(x, scale, alpha)

-        self.inputs = {'X': x}
-        self.outputs = {'Out': out}
+        if self.dtype == np.uint16:
+            self.inputs = {'X': convert_float_to_uint16(x)}
+            self.outputs = {'Out': convert_float_to_uint16(out)}
+        else:
+            self.inputs = {'X': x}
+            self.outputs = {'Out': out}
...
...
@@ -70,7 +76,7 @@ class SeluTest(OpTest):
         pass

     def init_dtype(self):
-        pass
+        self.dtype = np.float64

     def test_check_output(self):
         self.check_output()
...
...
@@ -79,6 +85,27 @@ class SeluTest(OpTest):
         self.check_grad(['X'], 'Out')


+class SeluTestFP16OP(SeluTest):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+@unittest.skipIf(
+    not core.is_compiled_with_cuda()
+    or not core.is_bfloat16_supported(core.CUDAPlace(0)),
+    "core is not compiled with CUDA and do not support bfloat16",
+)
+class SeluTestBF16OP(SeluTest):
+    def init_dtype(self):
+        self.dtype = np.uint16
+
+    def test_check_output(self):
+        self.check_output_with_place(core.CUDAPlace(0))
+
+    def test_check_grad(self):
+        self.check_grad_with_place(core.CUDAPlace(0), ['X'], 'Out')
+
+
 class TestSeluAPI(unittest.TestCase):
     # test paddle.nn.SELU, paddle.nn.functional.selu
     def setUp(self):
...
...
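The new SeluTestBF16OP follows OpTest's bfloat16 convention: data is generated in float32 and stored as uint16 bit patterns via convert_float_to_uint16. A sketch of that representation, assuming simple truncation to the high 16 bits (the real helper may round to nearest even):

    import numpy as np

    def to_bf16_bits(x_fp32):
        # reinterpret fp32 as uint32 and keep the upper half
        return (x_fp32.view(np.uint32) >> 16).astype(np.uint16)

    x = np.random.normal(size=(3, 5)).astype(np.float32)
    x_bf16 = to_bf16_bits(x)
    assert x_bf16.dtype == np.uint16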
python/paddle/fluid/tests/unittests/test_shape_op.py
View file @ 3631f064
...
...
@@ -36,7 +36,7 @@ class TestShapeOp(OpTest):
         self.dtype = np.float32

     def test_check_output(self):
-        self.check_output()
+        self.check_output(check_cinn=True)


 class case1(TestShapeOp):
...
...
@@ -125,7 +125,7 @@ class TestShapeOpBf16(OpTest):
     def test_check_output(self):
         place = core.CUDAPlace(0)
-        self.check_output_with_place(place)
+        self.check_output_with_place(place, check_cinn=True)


 class case1Bf16(TestShapeOpBf16):
...
...
python/paddle/fluid/tests/unittests/test_sum_op.py
View file @ 3631f064
...
...
@@ -62,10 +62,10 @@ class TestSumOp(OpTest):
         self.dtype = np.float64

     def test_check_output(self):
-        self.check_output(check_prim=True)
+        self.check_output(check_prim=True, check_cinn=True)

     def test_check_grad(self):
-        self.check_grad(['x0'], 'Out', check_prim=True)
+        self.check_grad(['x0'], 'Out', check_prim=True, check_cinn=True)


 class TestSelectedRowsSumOp(unittest.TestCase):
...
...
@@ -299,14 +299,14 @@ class TestFP16SumOp(TestSumOp):
     def test_check_output(self):
         place = core.CUDAPlace(0)
         if core.is_float16_supported(place):
-            self.check_output_with_place(place)
+            self.check_output_with_place(place, check_cinn=True)

     # FIXME: Because of the precision fp16, max_relative_error
     # should be 0.15 here.
     def test_check_grad(self):
         place = core.CUDAPlace(0)
         if core.is_float16_supported(place):
-            self.check_grad(['x0'], 'Out')
+            self.check_grad(['x0'], 'Out', check_cinn=True)


 def create_test_sum_fp16_class(parent):
...
...
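create_test_sum_fp16_class(parent) continues the factory pattern used throughout these files: derive a low-precision variant from a parent OpTest class and register it under a new name. A hedged sketch of that pattern (the names and the init_kernel_type hook are illustrative, not this file's actual code):

    import numpy as np

    def create_fp16_variant(parent):
        class Fp16Case(parent):
            def init_kernel_type(self):
                self.dtype = np.float16

        Fp16Case.__name__ = parent.__name__ + 'Fp16'
        return Fp16Case

    class Base:
        def init_kernel_type(self):
            self.dtype = np.float64

    Fp16Base = create_fp16_variant(Base)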
test/ir/inference/inference_pass_test.py
View file @ 3631f064
...
...
@@ -12,7 +12,9 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 import os
+import random
+import tempfile
 import unittest

 import numpy as np
...
...
@@ -41,7 +43,10 @@ class InferencePassTest(unittest.TestCase):
         self.dynamic_shape_params = None
         self.enable_lite = False
         self.lite_parameters = None
-        self.path = "./inference_pass/" + self.__class__.__name__ + "/"
+        self.temp_dir = tempfile.TemporaryDirectory()
+        self.path = os.path.join(
+            self.temp_dir.name, 'inference_pass', self.__class__.__name__
+        )
         np.random.seed(1)
+        random.seed(1)
...
...
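The setUp change above swaps a hard-coded ./inference_pass/<ClassName>/ directory for a per-test tempfile.TemporaryDirectory, which isolates concurrent test runs and can be removed wholesale. A minimal sketch of the pattern (the cleanup() call in tearDown is an assumption about how the suite releases it):

    import os
    import tempfile

    temp_dir = tempfile.TemporaryDirectory()
    path = os.path.join(temp_dir.name, 'inference_pass', 'MyTestCase')
    os.makedirs(path, exist_ok=True)
    # ... write the inference model and params under `path` ...
    temp_dir.cleanup()  # deletes everything under temp_dir.name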
test/ir/inference/test_trt_activation_pass.py
View file @ 3631f064
...
...
@@ -53,8 +53,9 @@ class TensorRTSubgraphPassActivationTest(InferencePassTest):
     def test_check_output(self):
         if core.is_compiled_with_cuda():
             use_gpu = True
-            if os.path.exists(self.path + "_opt_cache"):
-                shutil.rmtree(self.path + "_opt_cache")
+            opt_path = os.path.join(self.path, '_opt_cache')
+            if os.path.exists(opt_path):
+                shutil.rmtree(opt_path)
             if (
                 self.trt_parameters.precision == AnalysisConfig.Precision.Float32
...
...
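This hunk and the ones that follow all make the same fix: the old self.path ended with '/', so self.path + "_opt_cache" happened to name a subdirectory, but the new os.path.join-built path has no trailing separator, and plain concatenation would now name a sibling file instead. The difference at a glance:

    import os

    path = '/tmp/inference_pass/MyTest'      # no trailing '/'
    print(path + '_opt_cache')               # .../MyTest_opt_cache  (sibling)
    print(os.path.join(path, '_opt_cache'))  # .../MyTest/_opt_cache (child)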
test/ir/inference/test_trt_elementwise_op.py
View file @ 3631f064
...
...
@@ -53,8 +53,9 @@ class TensorRTSubgraphPassElementwiseBroadcastTest(InferencePassTest):
         return paddle.tensor.math.add(x=data1, y=data2)

     def test_check_output(self):
-        if os.path.exists(self.path + "_opt_cache"):
-            shutil.rmtree(self.path + "_opt_cache")
+        opt_path = os.path.join(self.path, '_opt_cache')
+        if os.path.exists(opt_path):
+            shutil.rmtree(opt_path)
         if core.is_compiled_with_cuda():
             use_gpu = True
             self.check_output_with_option(use_gpu)
...
...
test/ir/inference/test_trt_instance_norm_op.py
View file @ 3631f064
...
...
@@ -55,8 +55,9 @@ class TRTInstanceNormTest(InferencePassTest):
         self.fetch_list = [out]

     def check_output(self, remove_cache=False):
-        if remove_cache and os.path.exists(self.path + "_opt_cache"):
-            shutil.rmtree(self.path + "_opt_cache")
+        opt_path = os.path.join(self.path, '_opt_cache')
+        if remove_cache and os.path.exists(opt_path):
+            shutil.rmtree(opt_path)
         if core.is_compiled_with_cuda():
             use_gpu = True
             atol = 1e-5
...
...
test/ir/inference/test_trt_pool3d_op.py
View file @ 3631f064
...
...
@@ -84,8 +84,9 @@ class TensorRTPool3dTest(InferencePassTest):
         self.fetch_list = [pool_out]

     def check_output(self):
-        if os.path.exists(self.path + "_opt_cache"):
-            shutil.rmtree(self.path + "_opt_cache")
+        opt_path = os.path.join(self.path, '_opt_cache')
+        if os.path.exists(opt_path):
+            shutil.rmtree(opt_path)
         if core.is_compiled_with_cuda():
             use_gpu = True
             if self.precision == AnalysisConfig.Precision.Float32:
...
...
@@ -200,8 +201,9 @@ class TensorRTAdaptiveAvgPool3DTest(InferencePassTest):
         self.fetch_list = [pool_out]

     def check_output(self):
-        if os.path.exists(self.path + "_opt_cache"):
-            shutil.rmtree(self.path + "_opt_cache")
+        opt_path = os.path.join(self.path, '_opt_cache')
+        if os.path.exists(opt_path):
+            shutil.rmtree(opt_path)
         if core.is_compiled_with_cuda():
             use_gpu = True
             self.check_output_with_option(use_gpu)
...
...
@@ -300,8 +302,9 @@ class TensorRTAdaptiveMaxPool3DTest(InferencePassTest):
         self.fetch_list = [pool_out]

     def check_output(self):
-        if os.path.exists(self.path + "_opt_cache"):
-            shutil.rmtree(self.path + "_opt_cache")
+        opt_path = os.path.join(self.path, '_opt_cache')
+        if os.path.exists(opt_path):
+            shutil.rmtree(opt_path)
         if core.is_compiled_with_cuda():
             use_gpu = True
             self.check_output_with_option(use_gpu)
...
...
test/ir/inference/test_trt_pool_op.py
View file @ 3631f064
...
...
@@ -86,8 +86,9 @@ class TensorRTPoolTest(InferencePassTest):
         self.fetch_list = [out]

     def check_output(self):
-        if os.path.exists(self.path + "_opt_cache"):
-            shutil.rmtree(self.path + "_opt_cache")
+        opt_path = os.path.join(self.path, '_opt_cache')
+        if os.path.exists(opt_path):
+            shutil.rmtree(opt_path)
         if core.is_compiled_with_cuda():
             use_gpu = True
             if self.precision == AnalysisConfig.Precision.Float32:
...
...
test/ir/inference/test_trt_skip_layernorm_fuse_pass.py
View file @ 3631f064
...
...
@@ -60,8 +60,9 @@ class SkipLayernormFusePassTest0(InferencePassTest):
         return paddle.add(data1, data2)

     def test_check_output(self):
-        if os.path.exists(self.path + "_opt_cache"):
-            shutil.rmtree(self.path + "_opt_cache")
+        opt_path = os.path.join(self.path, '_opt_cache')
+        if os.path.exists(opt_path):
+            shutil.rmtree(opt_path)
         if core.is_compiled_with_cuda():
             use_gpu = True
             self.check_output_with_option(use_gpu, atol=0.01, rtol=0.00001)
...
...
@@ -107,8 +108,9 @@ class SkipLayernormFusePassTest1(InferencePassTest):
         return paddle.add(data1, data2)

     def test_check_output(self):
-        if os.path.exists(self.path + "_opt_cache"):
-            shutil.rmtree(self.path + "_opt_cache")
+        opt_path = os.path.join(self.path, '_opt_cache')
+        if os.path.exists(opt_path):
+            shutil.rmtree(opt_path)
         if core.is_compiled_with_cuda():
             use_gpu = True
             self.check_output_with_option(use_gpu, atol=0.01, rtol=0.00001)
...
...
@@ -154,8 +156,9 @@ class SkipLayernormFusePassTest2(InferencePassTest):
         return paddle.add(data1, data2)

     def test_check_output(self):
-        if os.path.exists(self.path + "_opt_cache"):
-            shutil.rmtree(self.path + "_opt_cache")
+        opt_path = os.path.join(self.path, '_opt_cache')
+        if os.path.exists(opt_path):
+            shutil.rmtree(opt_path)
         if core.is_compiled_with_cuda():
             use_gpu = True
             self.check_output_with_option(use_gpu, atol=0.1, rtol=0.00001)
...
...
@@ -201,8 +204,9 @@ class SkipLayernormFusePassTest3(InferencePassTest):
         return paddle.add(data1, data2)

     def test_check_output(self):
-        if os.path.exists(self.path + "_opt_cache"):
-            shutil.rmtree(self.path + "_opt_cache")
+        opt_path = os.path.join(self.path, '_opt_cache')
+        if os.path.exists(opt_path):
+            shutil.rmtree(opt_path)
         if core.is_compiled_with_cuda():
             use_gpu = True
             self.check_output_with_option(use_gpu, atol=0.1, rtol=0.00001)
...
...
test/ir/inference/test_trt_subgraph_pass.py
View file @ 3631f064
...
...
@@ -128,8 +128,9 @@ class TensorRTSubgraphPassSplitSerializeTest(InferencePassTest):
     def test_check_output(self):
         if paddle.is_compiled_with_cuda():
             use_gpu = True
-            if os.path.exists(self.path + "_opt_cache"):
-                shutil.rmtree(self.path + "_opt_cache")
+            opt_path = os.path.join(self.path, '_opt_cache')
+            if os.path.exists(opt_path):
+                shutil.rmtree(opt_path)
             self.check_output_with_option(use_gpu)
             self.assertTrue(
                 PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')
...
...
@@ -164,8 +165,9 @@ class TensorRTSubgraphPassDynamicSplitFp16SerializeTest(InferencePassTest):
     def test_check_output(self):
         if paddle.is_compiled_with_cuda():
             use_gpu = True
-            if os.path.exists(self.path + "_opt_cache"):
-                shutil.rmtree(self.path + "_opt_cache")
+            opt_path = os.path.join(self.path, '_opt_cache')
+            if os.path.exists(opt_path):
+                shutil.rmtree(opt_path)
             self.check_output_with_option(use_gpu, 1e-3)
             self.assertTrue(
                 PassVersionChecker.IsCompatible('tensorrt_subgraph_pass')
...
...
@@ -313,8 +315,9 @@ class TensorRTSubgraphPassLayerNormDynamicTest(InferencePassTest):
         self.serialize = True

     def test_check_output(self):
-        if os.path.exists(self.path + "_opt_cache"):
-            shutil.rmtree(self.path + "_opt_cache")
+        opt_path = os.path.join(self.path, '_opt_cache')
+        if os.path.exists(opt_path):
+            shutil.rmtree(opt_path)
         if paddle.is_compiled_with_cuda():
             use_gpu = True
             self.check_output_with_option(use_gpu)
...
...
@@ -332,8 +335,9 @@ class TensorRTSubgraphPassLayerNormDynamicFP16Test(
         self.serialize = True

     def test_check_output(self):
-        if os.path.exists(self.path + "_opt_cache"):
-            shutil.rmtree(self.path + "_opt_cache")
+        opt_path = os.path.join(self.path, '_opt_cache')
+        if os.path.exists(opt_path):
+            shutil.rmtree(opt_path)
         if paddle.is_compiled_with_cuda():
             use_gpu = True
             self.check_output_with_option(use_gpu, atol=0.01, rtol=0.01)
...
...
@@ -406,8 +410,9 @@ class TensorRTSubgraphPassElementwiseSerializeTest(
         )

     def test_check_output(self):
-        if os.path.exists(self.path + "_opt_cache"):
-            shutil.rmtree(self.path + "_opt_cache")
+        opt_path = os.path.join(self.path, '_opt_cache')
+        if os.path.exists(opt_path):
+            shutil.rmtree(opt_path)
         super().test_check_output()
...
...
@@ -444,8 +449,9 @@ class TensorRTSubgraphPassElementwiseBroadcastDynamicTest(InferencePassTest):
         return paddle.add(x=data1, y=data2)

     def test_check_output(self):
-        if os.path.exists(self.path + "_opt_cache"):
-            shutil.rmtree(self.path + "_opt_cache")
+        opt_path = os.path.join(self.path, '_opt_cache')
+        if os.path.exists(opt_path):
+            shutil.rmtree(opt_path)
         if paddle.is_compiled_with_cuda():
             use_gpu = True
             self.check_output_with_option(use_gpu)
...
...
test/xpu/test_nll_loss_op_xpu.py
0 → 100644
View file @ 3631f064
# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest

import numpy as np
from get_test_cover_info import (
    XPUOpTestWrapper,
    create_test_class,
    get_xpu_op_support_types,
)
from op_test_xpu import XPUOpTest

import paddle

paddle.enable_static()
def nll_loss_1d(
    logs, dtype, targets, weight=None, reduction='mean', ignore_index=-100
):
    input_shape = logs.shape
    N = input_shape[0]
    C = input_shape[1]
    out = np.zeros_like(targets).astype(dtype)
    total_weight = 0
    for i in range(N):
        cur_target = targets[i]
        if cur_target == ignore_index:
            out[i] = 0
            continue
        cur_weight = weight[cur_target] if weight is not None else 1
        total_weight += cur_weight
        out[i] = -logs[i][cur_target] * cur_weight
    if reduction == 'sum':
        out = np.sum(out)
        total_weight = np.array([total_weight]).astype(dtype)
        return {'Out': out, 'Total_weight': total_weight}
    elif reduction == 'mean':
        out = np.sum(out)
        if total_weight != 0:
            out /= total_weight
        total_weight = np.array([total_weight]).astype(dtype)
        return {'Out': out, 'Total_weight': total_weight}
    elif reduction == 'none':
        total_weight = np.array([0]).astype(dtype)
        return {'Out': out, 'Total_weight': total_weight}
def nll_loss_2d(
    logs, dtype, targets, weight=None, reduction='mean', ignore_index=-100
):
    input_shape = logs.shape
    N = input_shape[0]
    H = input_shape[2]
    W = input_shape[3]
    out = np.zeros_like(targets).astype(dtype)
    total_weight = 0
    for i in range(N):
        for h in range(H):
            for w in range(W):
                cur_target = targets[i][h][w]
                if cur_target == ignore_index:
                    out[i][h][w] = 0
                    continue
                cur_weight = weight[cur_target] if weight is not None else 1
                total_weight += cur_weight
                out[i][h][w] = -logs[i][cur_target][h][w] * cur_weight
    if reduction == 'sum':
        out = np.sum(out)
        total_weight = np.array([total_weight]).astype(dtype)
        return {'Out': out, 'Total_weight': total_weight}
    elif reduction == 'mean':
        out = np.sum(out)
        if total_weight != 0:
            out /= total_weight
        total_weight = np.array([total_weight]).astype(dtype)
        return {'Out': out, 'Total_weight': total_weight}
    elif reduction == 'none':
        total_weight = np.array([0]).astype(dtype)
        return {'Out': out, 'Total_weight': total_weight}
class XPUTestNLLLossOP(XPUOpTestWrapper):
    def __init__(self):
        self.op_name = 'nll_loss'
        self.use_dynamic_create_class = False

    class TestNLLLossOpBase1D(XPUOpTest):
        op_type = 'nll_loss'

        def setUp(self):
            self.dtype = self.in_type
            self.place = paddle.XPUPlace(0)
            self.set_attrs()
            self.set_inputs()
            self.inputs = {
                'X': self.x,
                'Label': self.label,
            }
            if self.weight is not None:
                self.inputs['Weight'] = self.weight
            self.outputs = nll_loss_1d(
                self.x,
                self.dtype,
                self.label,
                self.weight,
                self.attrs['reduction'],
            )

        def set_attrs(self):
            self.attrs = {'reduction': 'none'}

        def set_inputs(self):
            self.class_num = 3
            x_shape = [5, self.class_num]
            label_shape = [5]
            self.x = np.random.random(x_shape).astype(self.dtype)
            self.label = np.random.randint(
                low=0, high=self.class_num, size=label_shape
            ).astype(np.int64)
            self.weight = np.random.random(self.class_num).astype(self.dtype)

        def test_check_output(self):
            self.check_output_with_place(self.place)

        def test_check_grad(self):
            self.check_grad_with_place(self.place, ['X'], 'Out')
    class TestNLLLossOpWithWeightMean1D(TestNLLLossOpBase1D):
        def set_attrs(self):
            self.attrs = {'reduction': 'mean'}

    class TestNLLLossOpWithWeightSum1D(TestNLLLossOpBase1D):
        def set_attrs(self):
            self.attrs = {'reduction': 'sum'}

    class TestNLLLossOpWithoutWeightNone1D(TestNLLLossOpBase1D):
        def set_inputs(self):
            self.class_num = 3
            x_shape = [5, self.class_num]
            label_shape = [5]
            self.x = np.random.random(x_shape).astype(self.dtype)
            self.label = np.random.randint(
                low=0, high=self.class_num, size=label_shape
            ).astype(np.int64)
            self.weight = None

        def set_attrs(self):
            self.attrs = {'reduction': 'none'}

    class TestNLLLossOpWithoutWeightMean1D(TestNLLLossOpBase1D):
        def set_inputs(self):
            self.class_num = 3
            x_shape = [5, self.class_num]
            label_shape = [5]
            self.x = np.random.random(x_shape).astype(self.dtype)
            self.label = np.random.randint(
                low=0, high=self.class_num, size=label_shape
            ).astype(np.int64)
            self.weight = None

        def set_attrs(self):
            self.attrs = {'reduction': 'mean'}

    class TestNLLLossOpWithoutWeightSum1D(TestNLLLossOpBase1D):
        def set_inputs(self):
            self.class_num = 3
            x_shape = [5, self.class_num]
            label_shape = [5]
            self.x = np.random.random(x_shape).astype(self.dtype)
            self.label = np.random.randint(
                low=0, high=self.class_num, size=label_shape
            ).astype(np.int64)
            self.weight = None

        def set_attrs(self):
            self.attrs = {'reduction': 'sum'}
    class TestNLLLossOpBase2D(XPUOpTest):
        op_type = 'nll_loss'

        def setUp(self):
            self.dtype = self.in_type
            self.place = paddle.XPUPlace(0)
            self.set_attrs()
            self.set_inputs()
            self.inputs = {'X': self.x, 'Label': self.label}
            if self.weight is not None:
                self.inputs['Weight'] = self.weight
            self.outputs = nll_loss_2d(
                self.x,
                self.dtype,
                self.label,
                self.weight,
                self.attrs['reduction'],
            )

        def set_attrs(self):
            self.attrs = {'reduction': 'none'}

        def set_inputs(self):
            self.class_num = 3
            x_shape = [5, self.class_num, 7, 11]
            label_shape = [5, 7, 11]
            self.x = np.random.random(x_shape).astype(self.dtype)
            self.label = np.random.randint(
                low=0, high=self.class_num, size=label_shape
            ).astype(np.int64)
            self.weight = np.random.random(self.class_num).astype(self.dtype)

        def test_check_output(self):
            self.check_output_with_place(self.place)

        def test_check_grad(self):
            self.check_grad_with_place(self.place, ['X'], 'Out')
    class TestNLLLossOpWithWeightMean2D(TestNLLLossOpBase2D):
        def set_attrs(self):
            self.attrs = {'reduction': 'mean'}

    class TestNLLLossOpWithWeightSum2D(TestNLLLossOpBase2D):
        def set_attrs(self):
            self.attrs = {'reduction': 'sum'}

    class TestNLLLossOpWithoutWeightNone2D(TestNLLLossOpBase2D):
        def set_inputs(self):
            self.dtype = self.in_type
            self.class_num = 3
            x_shape = [5, self.class_num, 7, 11]
            label_shape = [5, 7, 11]
            self.x = np.random.random(x_shape).astype(self.dtype)
            self.label = np.random.randint(
                low=0, high=self.class_num, size=label_shape
            ).astype(np.int64)
            self.weight = None

        def set_attrs(self):
            self.attrs = {'reduction': 'none'}

    class TestNLLLossOpWithoutWeightMean2D(TestNLLLossOpBase2D):
        def set_inputs(self):
            self.dtype = self.in_type
            self.class_num = 3
            x_shape = [5, self.class_num, 7, 11]
            label_shape = [5, 7, 11]
            self.x = np.random.random(x_shape).astype(self.dtype)
            self.label = np.random.randint(
                low=0, high=self.class_num, size=label_shape
            ).astype(np.int64)
            self.weight = None

        def set_attrs(self):
            self.attrs = {'reduction': 'mean'}

    class TestNLLLossOpWithoutWeightSum2D(TestNLLLossOpBase2D):
        def set_inputs(self):
            self.dtype = self.in_type
            self.class_num = 3
            x_shape = [5, self.class_num, 7, 11]
            label_shape = [5, 7, 11]
            self.x = np.random.random(x_shape).astype(self.dtype)
            self.label = np.random.randint(
                low=0, high=self.class_num, size=label_shape
            ).astype(np.int64)
            self.weight = None

        def set_attrs(self):
            self.attrs = {'reduction': 'sum'}
support_types = get_xpu_op_support_types('nll_loss')
for stype in support_types:
    create_test_class(globals(), XPUTestNLLLossOP, stype)

if __name__ == '__main__':
    unittest.main()
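The reference helpers above return both outputs the op produces, 'Out' and 'Total_weight', and create_test_class then stamps out one concrete test class per dtype reported by get_xpu_op_support_types. A hedged usage sketch of the 1-D helper, assuming nll_loss_1d is in scope as defined in this file (values illustrative):

    import numpy as np

    logs = np.random.random((5, 3)).astype('float32')  # [N, C] log-probabilities
    targets = np.random.randint(0, 3, size=(5,)).astype(np.int64)
    weight = np.random.random(3).astype('float32')

    expected = nll_loss_1d(logs, 'float32', targets, weight, reduction='mean')
    print(expected['Out'], expected['Total_weight'])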
tools/check_file_diff_approvals.sh
View file @ 3631f064
...
...
@@ -344,7 +344,7 @@ fi
 OUTPUT_LOG=`echo "$ALL_ADDED_LINES" | grep -Ew "print|printf|fprintf|std::cout" || true`
 if [ "$OUTPUT_LOG" != "" ];then
     echo_line="print or std::cout is not recommended for direct use, please use loggin or glog. If it is necessary to use, please contact tianshuo78520a (Recommend) or zhangbo9674 review and approve.\n"
-    check_approval 1 tianshuo7852a zhangbo9674
+    check_approval 1 tianshuo78520a zhangbo9674
 fi

 HAS_MODIFIED_PHI_FILES=`git diff --name-only upstream/$BRANCH | grep "paddle/phi/" || true`
...
...
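For reference, grep -Ew treats the pattern as an extended regex and requires whole-word matches, so an identifier like printing does not trip the check while a bare printf( does. A Python sketch of the equivalent word-boundary match (an analogy, not the script's actual code):

    import re

    pattern = re.compile(r'\b(print|printf|fprintf|std::cout)\b')
    for line in ['  printing = 1;', '  printf("x");', '  std::cout << x;']:
        print(bool(pattern.search(line)), line)  # False, True, True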