diff --git a/.gitmodules b/.gitmodules index 4c3d2b21424dd6f541f407bf3d62612a3c04e0ba..8c294e25bd6095ee8bbc9f51d82ffebbf1bd7bcf 100644 --- a/.gitmodules +++ b/.gitmodules @@ -30,10 +30,6 @@ path = third_party/xxhash url = https://github.com/Cyan4973/xxHash.git ignore = dirty -[submodule "third_party/eigen3"] - path = third_party/eigen3 - url = https://gitlab.com/libeigen/eigen.git - ignore = dirty [submodule "third_party/leveldb"] path = third_party/leveldb url = https://github.com/google/leveldb @@ -50,3 +46,7 @@ path = third_party/glog url = https://github.com/google/glog.git ignore = dirty +[submodule "third_party/eigen3"] + path = third_party/eigen3 + url = https://gitlab.com/libeigen/eigen.git + ignore = dirty diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index c6f3eb23e38d6ed9bd26704ea7e048451eb8ad8d..993d079b63f25756b377ce5bca1783b666262a02 100755 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -296,6 +296,8 @@ function(build_protobuf TARGET_NAME BUILD_FOR_HOST) PREFIX ${PROTOBUF_PREFIX_DIR} SOURCE_DIR ${SOURCE_DIR} UPDATE_COMMAND "" + PATCH_COMMAND + COMMAND cd ${SOURCE_DIR} && git checkout ${PROTOBUF_TAG} DEPENDS zlib CONFIGURE_COMMAND ${CMAKE_COMMAND} ${SOURCE_DIR}/cmake ${OPTIONAL_ARGS} diff --git a/cmake/external/xpu.cmake b/cmake/external/xpu.cmake index 61188ae383a2d580f4b1621e6ff69d2a8f4234db..1ba00fe42c6d7660583745a218021ee7b61f699f 100644 --- a/cmake/external/xpu.cmake +++ b/cmake/external/xpu.cmake @@ -8,7 +8,7 @@ set(XPU_API_LIB_NAME "libxpuapi.so") set(XPU_RT_LIB_NAME "libxpurt.so") set(XPU_XFT_LIB_NAME "libxft.so") -set(XPU_BASE_DATE "20230519") +set(XPU_BASE_DATE "20230523") set(XPU_XCCL_BASE_VERSION "1.0.49.2") set(XPU_XFT_BASE_VERSION "latest") diff --git a/paddle/fluid/distributed/auto_parallel/CMakeLists.txt b/paddle/fluid/distributed/auto_parallel/CMakeLists.txt index d4c74fe946c1283c653fc464ec0718f5c9499079..1245aebdf152a42364ab062b9c7513d217faa43f 100644 --- a/paddle/fluid/distributed/auto_parallel/CMakeLists.txt +++ b/paddle/fluid/distributed/auto_parallel/CMakeLists.txt @@ -6,6 +6,3 @@ cc_library( add_subdirectory(test) add_subdirectory(spmd_rules) - -cc_library(auto_parallel DEPS device_mesh process_mesh dist_attr dist_mapper - dist_tensor_spec) diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.cc b/paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.cc index 5775e72527a7591082b30c8f6ed90d8c99331855..c756c54c4adfcafd4621a686468efe80785c9010 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.cc +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h" -#include "paddle/fluid/distributed/auto_parallel/process_mesh.h" +#include "paddle/phi/core/distributed/auto_parallel/utils.h" namespace paddle { namespace distributed { @@ -27,28 +27,41 @@ DistTensorSpec::DistTensorSpec(const std::vector& shape, dist_attr_.copy_from(dist_attr); } +DistTensorSpec::DistTensorSpec(const DistTensorSpec& spec) { + std::vector spec_shape = spec.get_shape(); + shape_.assign(spec_shape.begin(), spec_shape.end()); + dist_attr_.copy_from(spec.get_dist_attr()); +} + DistTensorSpec::~DistTensorSpec() {} DistTensorSpec::DistTensorSpec(const Tensor& tensor) { shape_ = tensor.shape(); - std::vector pm_shape, pm_ids; - pm_shape = {4}; - pm_ids = {0, 1, 2, 3}; - std::vector dim_name = {"mp"}; + // std::vector pm_shape, pm_ids; + // pm_shape = {4}; + // pm_ids = {0, 1, 2, 3}; + // std::vector dim_name = {"mp"}; - ProcessMesh pm(pm_shape, pm_ids, dim_name); - std::vector dims_mapping = {-1, 0}; - TensorDistAttr dist_attr; - dist_attr.set_process_mesh(pm); - dist_attr.set_dims_mapping(dims_mapping); + // ProcessMesh pm(pm_shape, pm_ids, dim_name); + // std::vector dims_mapping = {-1, 0}; + // TensorDistAttr dist_attr; + // dist_attr.set_process_mesh(pm); + // dist_attr.set_dims_mapping(dims_mapping); - dist_attr_.copy_from(dist_attr); + // dist_attr_.copy_from(dist_attr); - std::cout << dist_attr_; + // std::cout << dist_attr_; } -const std::vector& DistTensorSpec::get_dims_mapping() { +DistTensorSpec& DistTensorSpec::operator=(const DistTensorSpec& spec) { + std::vector spec_shape = spec.get_shape(); + shape_ = spec_shape; + dist_attr_.copy_from(spec.get_dist_attr()); + return *this; +} + +const std::vector& DistTensorSpec::get_dims_mapping() const { return dist_attr_.dims_mapping(); } @@ -57,7 +70,7 @@ void DistTensorSpec::set_dims_mapping( dist_attr_.set_dims_mapping(dims_mapping); } -const ProcessMesh& DistTensorSpec::get_process_mesh() { +const ProcessMesh& DistTensorSpec::get_process_mesh() const { return dist_attr_.process_mesh(); } @@ -65,7 +78,22 @@ void DistTensorSpec::set_process_mesh(const ProcessMesh& process_mesh) { dist_attr_.set_process_mesh(process_mesh); } -const std::vector& DistTensorSpec::get_shape() { return shape_; } +const std::vector& DistTensorSpec::get_shape() const { return shape_; } + +const TensorDistAttr& DistTensorSpec::get_dist_attr() const { + return dist_attr_; +} + +void DistTensorSpec::set_dist_attr(const TensorDistAttr& dist_attr) { + dist_attr_ = dist_attr; +} + +std::string DistTensorSpec::to_string() const { + using phi::distributed::auto_parallel::str_join; + std::string spec_str = "{tensor_shape:[" + str_join(shape_) + "], "; + spec_str += "dist_attr:" + dist_attr_.to_string() + "}"; + return spec_str; +} } // namespace auto_parallel } // namespace distributed diff --git a/paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h b/paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h index 2e79148ab0efb6a291c21731c00771dc82cd09e4..dc1f157ccbfb39e5b98d7b92de8722a2e7fdb374 100644 --- a/paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h +++ b/paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h @@ -14,39 +14,55 @@ limitations under the License. 
*/ #pragma once -#include "paddle/fluid/distributed/auto_parallel/dist_attr.h" #include "paddle/phi/api/include/tensor.h" +#include "paddle/phi/core/distributed/auto_parallel/dist_attr.h" namespace paddle { namespace distributed { namespace auto_parallel { +using phi::distributed::auto_parallel::ProcessMesh; +using phi::distributed::auto_parallel::TensorDistAttr; + /** * A unified data class for inferring distributed attributes * in both dygraph mode and static mode */ class DistTensorSpec { public: + DistTensorSpec() = default; + DistTensorSpec(const std::vector& shape, const TensorDistAttr& dist_attr); + DistTensorSpec(const DistTensorSpec& spec); + + // temp function, only for test in dygraph mode explicit DistTensorSpec(const Tensor& tensor); ~DistTensorSpec(); + DistTensorSpec& operator=(const DistTensorSpec& spec); + // get dims_mapping from dist_attr_ - const std::vector& get_dims_mapping(); + const std::vector& get_dims_mapping() const; // set dims_mapping in dist_attr_ void set_dims_mapping(const std::vector& dims_mapping); // get process_mesh from dist_attr_ - const ProcessMesh& get_process_mesh(); + const ProcessMesh& get_process_mesh() const; // set process_mesh in dist_attr_ void set_process_mesh(const ProcessMesh& process_mesh); - const std::vector& get_shape(); + const TensorDistAttr& get_dist_attr() const; + + void set_dist_attr(const TensorDistAttr& dist_attr); + + const std::vector& get_shape() const; + + std::string to_string() const; private: std::vector shape_; diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.cc b/paddle/fluid/operators/cinn/cinn_launch_op.cc index ad74d88f70e1d8e96b74e6451d941d21a67462e6..3ab9f6ba99b58470e780cc65a05af03cd701a7ed 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.cc +++ b/paddle/fluid/operators/cinn/cinn_launch_op.cc @@ -17,6 +17,7 @@ #include #include +#include "cinn/common/target.h" #include "cinn/hlir/framework/graph_compiler.h" #include "cinn/runtime/cinn_runtime.h" #include "cinn/runtime/flags.h" @@ -94,6 +95,11 @@ void SetCinnRandomSeed() { ::cinn::runtime::RandomSeed::GetOrSet(seed); } +void SetCinnTarget(const ::cinn::common::Target& target) { + VLOG(4) << "Set CINN compile target to " << target; + ::cinn::runtime::CurrentTarget::SetCurrentTarget(target); +} + } // namespace details class CinnLaunchOp : public framework::OperatorWithKernel { diff --git a/paddle/fluid/operators/cinn/cinn_launch_op.h b/paddle/fluid/operators/cinn/cinn_launch_op.h index 59970412ea6a3b772dd282caa9674ab738b7d7d4..90751c72c605bada307f999ec9cf4177ff1c3671 100644 --- a/paddle/fluid/operators/cinn/cinn_launch_op.h +++ b/paddle/fluid/operators/cinn/cinn_launch_op.h @@ -58,6 +58,9 @@ void SetCinnRuntimeFlags(); template void SetCinnRandomSeed(); +// set CINN compile target +void SetCinnTarget(const ::cinn::common::Target& target); + } // namespace details template @@ -115,6 +118,7 @@ class CinnLaunchOpKernel : public framework::OpKernel { "Step 2. Get compilation result of the graph"); // Step 2. 
Get compilation result of the graph auto target = details::PlaceToCinnTarget(place); + details::SetCinnTarget(target); using ClockType = std::chrono::steady_clock; std::chrono::time_point start_t, end_t; if (VLOG_IS_ON(1)) { diff --git a/paddle/fluid/operators/collective/alltoall_op.cu.cc b/paddle/fluid/operators/collective/alltoall_op.cu.cc index aacd76af4af0586de2cd2c97b439d8c380eaeefc..0ae338c745ae3a890b3e36a86ebc012b0f04636f 100644 --- a/paddle/fluid/operators/collective/alltoall_op.cu.cc +++ b/paddle/fluid/operators/collective/alltoall_op.cu.cc @@ -98,7 +98,7 @@ PD_REGISTER_STRUCT_KERNEL(alltoall, ops::AllToAllOpCUDAKernel, float, double, -#if NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, #endif int, diff --git a/paddle/fluid/operators/collective/c_allgather_op.cu.cc b/paddle/fluid/operators/collective/c_allgather_op.cu.cc index 70b7d70dc93b31b032bf80e9e41121eeb57c4848..c3eff905851e3fff741024d850cda95ef9ec3bcd 100644 --- a/paddle/fluid/operators/collective/c_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allgather_op.cu.cc @@ -95,7 +95,7 @@ PD_REGISTER_STRUCT_KERNEL(c_allgather, ops::CAllGatherOpCUDAKernel, float, double, -#if NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, #endif int, diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc index 9be9674bb082bd1b5f6890f422521626d6da232b..277988b56916f8e682b8e67abd4adf20ef78fed5 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op.cu.cc @@ -28,7 +28,7 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_max, ALL_LAYOUT, ops::CAllReduceMaxCUDAKernel, float, -#if NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, #endif double, diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc index 8e45b7e43b2ed15b17bd8cf1a5198ef6ff613fe6..76d809cd234f03813fdea62f982757340c85e3f2 100644 --- a/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/c_allreduce_sum_op.cu.cc @@ -28,7 +28,7 @@ PD_REGISTER_STRUCT_KERNEL(c_allreduce_sum, ALL_LAYOUT, ops::CAllReduceSumCUDAKernel, float, -#if NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, #endif double, diff --git a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc index d0d3ebb1394cbbee971e070502bcc3d03a3681ec..e37657a1747dec1b3ccd14ea9b32188d7a636b76 100644 --- a/paddle/fluid/operators/collective/c_broadcast_op.cu.cc +++ b/paddle/fluid/operators/collective/c_broadcast_op.cu.cc @@ -100,7 +100,7 @@ PD_REGISTER_STRUCT_KERNEL(c_broadcast, int64_t, float, double, -#if NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, #endif plat::float16) { diff --git a/paddle/fluid/operators/collective/c_concat_op.cu.cc b/paddle/fluid/operators/collective/c_concat_op.cu.cc index dc47c184c221db33b07ff79599dad1a54fe284a4..1760b6ea3909393c3d8f982de3a6ad5af1891108 100644 --- a/paddle/fluid/operators/collective/c_concat_op.cu.cc +++ b/paddle/fluid/operators/collective/c_concat_op.cu.cc @@ -137,7 +137,7 @@ PD_REGISTER_STRUCT_KERNEL(c_concat, double, int, int64_t, -#if NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 
11000 plat::bfloat16, #endif plat::float16) { diff --git a/paddle/fluid/operators/collective/c_embedding_op.cu b/paddle/fluid/operators/collective/c_embedding_op.cu index 4861b5d26ab0f026563305dcda4fa32da1dd0409..758734ada66e83ee46dfc0476628eb8275d5accf 100644 --- a/paddle/fluid/operators/collective/c_embedding_op.cu +++ b/paddle/fluid/operators/collective/c_embedding_op.cu @@ -239,7 +239,7 @@ PD_REGISTER_STRUCT_KERNEL(c_embedding, ops::CEmbeddingCUDAKernel, float, double, -#if NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, #endif plat::float16) { @@ -251,7 +251,7 @@ PD_REGISTER_STRUCT_KERNEL(c_embedding_grad, ops::CEmbeddingGradCUDAKernel, float, double, -#if NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, #endif plat::float16) { diff --git a/paddle/fluid/operators/collective/c_identity_op.cu.cc b/paddle/fluid/operators/collective/c_identity_op.cu.cc index 3d5f16c218c8c5fa5840c5af00a08e9e1c871bd5..9571168db152c61c4cb12461406730ebfc8b27c9 100644 --- a/paddle/fluid/operators/collective/c_identity_op.cu.cc +++ b/paddle/fluid/operators/collective/c_identity_op.cu.cc @@ -25,7 +25,7 @@ PD_REGISTER_STRUCT_KERNEL(c_identity, double, int, int64_t, -#if NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, #endif plat::float16) { diff --git a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc index 35053b1511fcc3707aaabba690d01b42eb08e5c6..edae8feb61257b9678724124cedb01a29fac78b7 100644 --- a/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc +++ b/paddle/fluid/operators/collective/c_reducescatter_op.cu.cc @@ -87,7 +87,7 @@ PD_REGISTER_STRUCT_KERNEL(c_reducescatter, ops::CReduceScatterOpCUDAKernel, float, double, -#if NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, #endif int, diff --git a/paddle/fluid/operators/collective/c_split_op.cu b/paddle/fluid/operators/collective/c_split_op.cu index b01ed790e851715063312336b3eeae18b0382a26..0b3e2aaf781dbe227c646c2c2161d49b954d6829 100644 --- a/paddle/fluid/operators/collective/c_split_op.cu +++ b/paddle/fluid/operators/collective/c_split_op.cu @@ -123,7 +123,7 @@ PD_REGISTER_STRUCT_KERNEL(c_split, double, int, int64_t, -#if NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, #endif plat::float16) { diff --git a/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc b/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc index b6af2dbd1c847ca8e347fe3ce99a5b0a6ffc2ccf..b4773a8eb54562f3bb6c6a85e39f31788002c0cc 100644 --- a/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc +++ b/paddle/fluid/operators/collective/mp_allreduce_sum_op.cu.cc @@ -31,7 +31,7 @@ PD_REGISTER_STRUCT_KERNEL(mp_allreduce_sum, double, int, int64_t, -#if NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, #endif plat::float16) { diff --git a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc index 2374f4a4aed8239053a4ccb51803377c0d75b596..d22fd70bd0f61846ec18eda7994ee2a31c9f2d70 100644 --- a/paddle/fluid/operators/collective/partial_allgather_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_allgather_op.cu.cc @@ -108,7 +108,7 @@ PD_REGISTER_STRUCT_KERNEL(partial_allgather, ops::PartialAllGatherOpCUDAKernel, float, 
double, -#if NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, #endif int, diff --git a/paddle/fluid/operators/collective/partial_recv_op.cu.cc b/paddle/fluid/operators/collective/partial_recv_op.cu.cc index b0df94194e4f87801b38a1a6df65236e8f9944a2..0c33ca7c25c3268db652356a2d78d8126dd53a5a 100644 --- a/paddle/fluid/operators/collective/partial_recv_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_recv_op.cu.cc @@ -124,7 +124,7 @@ PD_REGISTER_STRUCT_KERNEL(partial_recv, ops::PartialRecvOpCUDAKernel, float, double, -#if NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, #endif int, diff --git a/paddle/fluid/operators/collective/partial_send_op.cu.cc b/paddle/fluid/operators/collective/partial_send_op.cu.cc index dc24ea01fc98e96f59409f5a0628ba36642cc6c7..4f9fc41bc4e16fc1c8c243de7a329bebbcdc8324 100644 --- a/paddle/fluid/operators/collective/partial_send_op.cu.cc +++ b/paddle/fluid/operators/collective/partial_send_op.cu.cc @@ -123,7 +123,7 @@ PD_REGISTER_STRUCT_KERNEL(partial_send, ops::PartialSendCUDAKernel, float, double, -#if NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, #endif int, diff --git a/paddle/fluid/operators/collective/recv_v2_op.cu.cc b/paddle/fluid/operators/collective/recv_v2_op.cu.cc index bfa12f911946d4b3eb17c99ce75caba3ba436c64..28058aa4868cd688e7470e83fea90d403b19065a 100644 --- a/paddle/fluid/operators/collective/recv_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/recv_v2_op.cu.cc @@ -238,7 +238,7 @@ PD_REGISTER_STRUCT_KERNEL(recv_v2, ops::RecvOpV2CUDAKernel, float, double, -#if NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, #endif int, diff --git a/paddle/fluid/operators/collective/send_v2_op.cu.cc b/paddle/fluid/operators/collective/send_v2_op.cu.cc index adea7db0b8088e61583f88c65a3b4f386177b5cd..a80dc1f91e45d55ac778e0f3a95050f299de30c6 100644 --- a/paddle/fluid/operators/collective/send_v2_op.cu.cc +++ b/paddle/fluid/operators/collective/send_v2_op.cu.cc @@ -223,7 +223,7 @@ PD_REGISTER_STRUCT_KERNEL(send_v2, ops::SendOpV2CUDAKernel, float, double, -#if NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 plat::bfloat16, #endif int, diff --git a/paddle/fluid/platform/device/gpu/nccl_helper.h b/paddle/fluid/platform/device/gpu/nccl_helper.h index 8dd0639ce72f3bd30a43c7f5141852b2338ad4df..6afcd2eb7cd9720c7dfffdfc2625f26ba9910a16 100644 --- a/paddle/fluid/platform/device/gpu/nccl_helper.h +++ b/paddle/fluid/platform/device/gpu/nccl_helper.h @@ -59,7 +59,7 @@ inline ncclDataType_t ToNCCLDataType(framework::proto::VarType::Type type) { return ncclUint8; } else if (type == framework::proto::VarType::BOOL) { return ncclUint8; -#if NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 } else if (type == framework::proto::VarType::BF16) { return ncclBfloat16; #endif @@ -86,7 +86,7 @@ inline ncclDataType_t ToNCCLDataType(phi::DataType type) { return ncclInt8; } else if (type == phi::DataType::BOOL) { return ncclUint8; -#if NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 } else if (type == phi::DataType::BFLOAT16) { return ncclBfloat16; #endif diff --git a/paddle/fluid/pybind/auto_parallel_py.cc b/paddle/fluid/pybind/auto_parallel_py.cc index fdac30be8f07b397996681ac42c87c531ee9a3c1..439aa6a623769c2d07559a36f4571caa3d76a7f4 100644 --- 
a/paddle/fluid/pybind/auto_parallel_py.cc +++ b/paddle/fluid/pybind/auto_parallel_py.cc @@ -15,6 +15,7 @@ #include #include +#include "paddle/fluid/distributed/auto_parallel/spmd_rules/dist_tensor_spec.h" #include "paddle/fluid/framework/op_desc.h" #include "paddle/fluid/framework/var_desc.h" #include "paddle/fluid/pybind/auto_parallel_py.h" @@ -29,6 +30,7 @@ namespace py = pybind11; namespace paddle { namespace pybind { +using paddle::distributed::auto_parallel::DistTensorSpec; using paddle::distributed::auto_parallel::OperatorDistAttr; using paddle::framework::OpDesc; using paddle::framework::VarDesc; @@ -276,6 +278,25 @@ void BindAutoParallel(py::module *m) { py::arg("memo")) .def("__str__", &TensorDistAttr::to_string); + py::class_(*m, "DistTensorSpec") + .def(py::init<>()) + .def(py::init()) + .def(py::init &, const TensorDistAttr &>()) + .def("get_dims_mapping", &DistTensorSpec::get_dims_mapping) + .def("set_dims_mapping", &DistTensorSpec::set_dims_mapping) + .def("get_process_mesh", &DistTensorSpec::get_process_mesh) + .def("set_process_mesh", &DistTensorSpec::set_process_mesh) + .def_property_readonly("shape", &DistTensorSpec::get_shape) + .def("__str__", &DistTensorSpec::to_string) + .def("__copy__", + [](const DistTensorSpec &self) { return DistTensorSpec(self); }) + .def( + "__deepcopy__", + [](const DistTensorSpec &self, py::dict) { + return DistTensorSpec(self); + }, + py::arg("memo")); + py::class_(*m, "OperatorDistAttr") .def(py::init<>()) .def(py::init()) diff --git a/paddle/phi/api/yaml/generator/api_base.py b/paddle/phi/api/yaml/generator/api_base.py index 23d3be56a11ca16cd735b7bf4b94407f800c2595..db858bd85e562c03062090a2653fb3e008dff6a2 100644 --- a/paddle/phi/api/yaml/generator/api_base.py +++ b/paddle/phi/api/yaml/generator/api_base.py @@ -1280,7 +1280,7 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d def gen_dist_tensor_code(self): # define the DistTensorSpec vector for input and output tensors - api_code = " \nstd::vector input_specs;\n" + api_code = " \n std::vector input_specs;\n" # get DistTensorSpec for each input tensor for tensor_name in self.inputs['names']: @@ -1297,8 +1297,8 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d PADDLE_API {self.get_return_type(inplace_flag)} {api_func_name}({self.get_define_args(inplace_flag)}) {{ {self.gene_kernel_select()} """ - if api_func_name == 'matmul': - api_code += self.gen_dist_tensor_code() + # if api_func_name == 'matmul': + # api_code += self.gen_dist_tensor_code() if len(self.kernel['func']) > 1: kernel_dispatch_code = '' diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index a8bf526cf87b41a3e2a1436a3f23bda1e127af00..5b7c847d76d91b4a872e774c881f6151b280fd8e 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -525,6 +525,8 @@ XPUOpMap& get_kl2_ops() { phi::DataType::FLOAT16, phi::DataType::INT64})}, {"nearest_interp_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})}, + {"nll_loss", XPUKernelSet({phi::DataType::FLOAT32})}, + {"nll_loss_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"not_equal", XPUKernelSet({phi::DataType::INT64, phi::DataType::INT32, diff --git a/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt b/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt index 2c4728c5a4c21e4801e67e6ece7776377b066aed..7121d93c05eaa795ac692f3ad6ce9a532324d7ac 100644 --- a/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt +++ 
b/paddle/phi/core/distributed/auto_parallel/CMakeLists.txt @@ -20,4 +20,5 @@ cc_library( SRCS dist_mapper.cc DEPS device_mesh auto_parallel_proto phi_enforce) -cc_library(auto_parallel DEPS device_mesh process_mesh dist_attr dist_mapper) +cc_library(auto_parallel DEPS device_mesh process_mesh dist_attr dist_mapper + dist_tensor_spec) diff --git a/paddle/phi/core/utils/data_type.h b/paddle/phi/core/utils/data_type.h index 16b73e0f2baa6738702971a89101df99ce68c99f..018672e45b5970c4cb26d73cb47a1bb48a2842b2 100644 --- a/paddle/phi/core/utils/data_type.h +++ b/paddle/phi/core/utils/data_type.h @@ -229,7 +229,7 @@ inline ncclDataType_t ToNCCLDataType(DataType type) { return ncclInt8; } else if (type == DataType::BOOL) { return ncclUint8; -#if NCCL_VERSION_CODE >= 21000 +#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000 } else if (type == DataType::BFLOAT16) { return ncclBfloat16; #endif diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu index cf3c66f53de2d56663c75e9b5ecbc17798716dfb..794d442ce2acc1114cfb06ab05f7b23d14f92a02 100644 --- a/paddle/phi/kernels/gpu/activation_kernel.cu +++ b/paddle/phi/kernels/gpu/activation_kernel.cu @@ -274,4 +274,5 @@ PD_REGISTER_KERNEL(selu, phi::SeluKernel, float, double, + phi::dtype::float16, phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/gpu/selu_grad_kernel.cu b/paddle/phi/kernels/gpu/selu_grad_kernel.cu index c715831ffc7ffcef400eb7ff11551cf5d636f055..68f91aa2b45e730ba22a52f1c193f70455856bcd 100644 --- a/paddle/phi/kernels/gpu/selu_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/selu_grad_kernel.cu @@ -24,4 +24,5 @@ PD_REGISTER_KERNEL(selu_grad, phi::SeluGradKernel, float, double, + phi::dtype::float16, phi::dtype::bfloat16) {} diff --git a/paddle/phi/kernels/xpu/nll_loss_grad_kernel.cc b/paddle/phi/kernels/xpu/nll_loss_grad_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..1dbe679e67498e3cd32af71ac5ef75f2058f53f4 --- /dev/null +++ b/paddle/phi/kernels/xpu/nll_loss_grad_kernel.cc @@ -0,0 +1,95 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/nll_loss_grad_kernel.h" +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void NllLossGradKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + const paddle::optional& weight, + const DenseTensor& total_weight, + const DenseTensor& d_out, + int64_t ignore_index, + const std::string& reduction, + DenseTensor* d_x) { + using XPUType = typename XPUTypeTrait::Type; + const auto& label_type = label.dtype(); + bool label_type_match = + label_type == phi::DataType::INT32 || label_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ(label_type_match, + true, + phi::errors::InvalidArgument( + "Input(Label) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + label_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + auto d_out_data = d_out.data(); + auto d_x_data = dev_ctx.template Alloc(d_x); + + auto d_x_dims = d_x->dims(); + std::vector d_x_shape = phi::vectorize(d_x_dims); + + auto weight_data = + weight.get_ptr() ? weight.get_ptr()->data() : nullptr; + + int64_t reduction_id = 0; + if (reduction == "none") { + reduction_id = 0; + } else if (reduction == "mean") { + reduction_id = 1; + } else if (reduction == "sum") { + reduction_id = 2; + } + + auto total_weight_data = total_weight.data(); + + int r; + if (label_type == phi::DataType::INT32) { + const int* label_data = label.data(); + r = xpu::nll_loss_grad(dev_ctx.x_context(), + d_out_data, + d_x_data, + d_x_shape, + label_data, + weight_data, + reduction_id, + ignore_index, + total_weight_data); + } else if (label_type == phi::DataType::INT64) { + const int64_t* label_data = label.data(); + r = xpu::nll_loss_grad(dev_ctx.x_context(), + d_out_data, + d_x_data, + d_x_shape, + label_data, + weight_data, + reduction_id, + ignore_index, + total_weight_data); + } + PADDLE_ENFORCE_XDNN_SUCCESS(r, "nll_loss_grad"); +} + +} // namespace phi + +// TODO(xiongkun): add the non-raw kernel register here. +PD_REGISTER_KERNEL( + nll_loss_grad, XPU, ALL_LAYOUT, phi::NllLossGradKernel, float) {} diff --git a/paddle/phi/kernels/xpu/nll_loss_kernel.cc b/paddle/phi/kernels/xpu/nll_loss_kernel.cc new file mode 100644 index 0000000000000000000000000000000000000000..2d9bf5baf576707a5bb9c6f53e3bb0575e3f777f --- /dev/null +++ b/paddle/phi/kernels/xpu/nll_loss_kernel.cc @@ -0,0 +1,93 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/phi/kernels/nll_loss_kernel.h" +#include "paddle/phi/backends/xpu/enforce_xpu.h" +#include "paddle/phi/core/kernel_registry.h" + +namespace phi { + +template +void NllLossRawKernel(const Context& dev_ctx, + const DenseTensor& x, + const DenseTensor& label, + const paddle::optional& weight, + int64_t ignore_index, + const std::string& reduction, + DenseTensor* out, + DenseTensor* total_weight) { + using XPUType = typename XPUTypeTrait::Type; + const auto& label_type = label.dtype(); + bool label_type_match = + label_type == phi::DataType::INT32 || label_type == phi::DataType::INT64; + PADDLE_ENFORCE_EQ(label_type_match, + true, + phi::errors::InvalidArgument( + "Input(Label) holds the wrong type, it holds %s, but " + "desires to be %s or %s", + label_type, + phi::DataType::INT32, + phi::DataType::INT64)); + + auto x_data = x.data(); + auto out_data = dev_ctx.template Alloc(out); + + auto weight_data = + weight.get_ptr() ? weight.get_ptr()->data() : nullptr; + + auto total_weight_data = dev_ctx.template Alloc(total_weight); + + auto x_dims = x.dims(); + std::vector x_shape = phi::vectorize(x_dims); + + int64_t reduction_id = 0; + if (reduction == "none") { + reduction_id = 0; + } else if (reduction == "mean") { + reduction_id = 1; + } else if (reduction == "sum") { + reduction_id = 2; + } + + int r; + if (label_type == phi::DataType::INT32) { + const int* label_data = label.data(); + r = xpu::nll_loss(dev_ctx.x_context(), + x_data, + out_data, + total_weight_data, + x_shape, + label_data, + weight_data, + reduction_id, + ignore_index); + } else if (label_type == phi::DataType::INT64) { + const int64_t* label_data = label.data(); + r = xpu::nll_loss(dev_ctx.x_context(), + x_data, + out_data, + total_weight_data, + x_shape, + label_data, + weight_data, + reduction_id, + ignore_index); + } + PADDLE_ENFORCE_XDNN_SUCCESS(r, "nll_loss"); +} + +} // namespace phi + +// TODO(xiongkun): add the non-raw kernel register here. 
+PD_REGISTER_KERNEL(nll_loss, XPU, ALL_LAYOUT, phi::NllLossRawKernel, float) {} diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 77ed8e531da0261bf89a078b0ffe531e3d10135d..697b74c39a41ea9d74ff93618dea5aa273931a2b 100644 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -2235,14 +2235,70 @@ set +x fi done <<< "$test_cases"; card_test "$single_card_tests" 1 + failed_test_lists='' collect_failed_tests + xputest_error=0 + retry_unittests_record='' + retry_time=3 + exec_times=0 + exec_time_array=('first' 'second' 'third') + exec_retry_threshold=10 + is_retry_execuate=0 + if [ -n "$failed_test_lists" ];then + xputest_error=1 + need_retry_ut_str=$(echo "$failed_test_lists" | grep -oEi "\-.+\(" | sed 's/(//' | sed 's/- //' ) + need_retry_ut_arr=(${need_retry_ut_str}) + need_retry_ut_count=${#need_retry_ut_arr[@]} + retry_unittests=$(echo "$failed_test_lists" | grep -oEi "\-.+\(" | sed 's/(//' | sed 's/- //' ) + if [ $need_retry_ut_count -lt $exec_retry_threshold ];then + while ( [ $exec_times -lt $retry_time ] ) + do + set +e + retry_unittests_record="$retry_unittests_record$failed_test_lists" + failed_test_lists_ult=`echo "${failed_test_lists}"` + set -e + if [[ "${exec_times}" == "1" ]];then + if [[ "${failed_test_lists}" == "" ]];then + break + else + retry_unittests=$(echo "$failed_test_lists" | grep -oEi "\-.+\(" | sed 's/(//' | sed 's/- //' ) + fi + fi + echo "=========================================" + echo "This is the ${exec_time_array[$exec_times]} time to re-run" + echo "=========================================" + echo "The following unittest will be re-run:" + echo "${retry_unittests}" + echo "=========================================" + + retry_unittests_regular='' + for line in ${retry_unittests[@]} ; + do + if [[ "$retry_unittests_regular" == "" ]];then + retry_unittests_regular="^$line$" + else + retry_unittests_regular="$retry_unittests_regular|^$line$" + fi + done + rm -f $tmp_dir/* + failed_test_lists='' + ctest -R "($retry_unittests_regular)" --output-on-failure -j $2 | tee $tmpfile + collect_failed_tests + exec_times=$[$exec_times+1] + done + else + # There are more than 10 failed unit tests, so no unit test retry + is_retry_execuate=1 + fi + + fi set -x ut_endTime_s=`date +%s` echo "XPU testCase Time: $[ $ut_endTime_s - $ut_startTime_s ]s" python ${PADDLE_ROOT}/build/test/xpu/get_test_cover_info.py unset XPU_OP_LIST_DIR - if [[ "$EXIT_CODE" != "0" ]]; then - exit 8; + if [ "$xputest_error" != 0 ];then + show_ut_retry_result fi fi } diff --git a/python/paddle/distributed/auto_parallel/dist_attribute.py b/python/paddle/distributed/auto_parallel/dist_attribute.py index 5c7fadf2e20771a263315670d0c4fa325c8296de..d31df134d6b6a0ff25e4ba8bdb93e36d172889d4 100644 --- a/python/paddle/distributed/auto_parallel/dist_attribute.py +++ b/python/paddle/distributed/auto_parallel/dist_attribute.py @@ -12,5 +12,6 @@ # See the License for the specific language governing permissions and # limitations under the License +from paddle.fluid.core import DistTensorSpec # noqa: F401 from paddle.fluid.core import OperatorDistAttr # noqa: F401 from paddle.fluid.core import TensorDistAttr # noqa: F401 diff --git a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py index 8825e14d9aba7da62d52e09d4f228be8afc1e056..a7e539d460a7047d8ae0132a56b7d65b8446b704 100644 --- a/python/paddle/distributed/auto_parallel/operators/dist_matmul.py +++ 
b/python/paddle/distributed/auto_parallel/operators/dist_matmul.py @@ -105,6 +105,18 @@ def _update_dims_mapping_for_matmul(dist_op): changed = False op_desc = dist_op.serial_op.desc op_dist_attr = dist_op.dist_attr + + # test DistTensorSpec + # input_name_list = [] + # output_name_list = [] + # input_name_list.append(op_desc.input('X')[0]) + # input_name_list.append(op_desc.input('Y')[0]) + # output_name_list.append(op_desc.output('Out')[0]) + # attr_name_list = ['trans_x', 'trans_y'] + # input_specs, output_specs, attrs = wrap_data_for_completion( + # dist_op, input_name_list, output_name_list, attr_name_list + # ) + x_name = op_desc.input('X')[0] y_name = op_desc.input('Y')[0] out_name = op_desc.output('Out')[0] diff --git a/python/paddle/distributed/auto_parallel/utils.py b/python/paddle/distributed/auto_parallel/utils.py index 1a3299e20a48a9b116baed255c079c60d4726a83..43b293b750a9313f6f1cb55f5ae40ed43e676f9c 100644 --- a/python/paddle/distributed/auto_parallel/utils.py +++ b/python/paddle/distributed/auto_parallel/utils.py @@ -26,7 +26,7 @@ from paddle.framework import core from paddle.framework.io_utils import is_belong_to_optimizer, is_parameter from paddle.static import Variable -from .dist_attribute import OperatorDistAttr, TensorDistAttr +from .dist_attribute import DistTensorSpec, OperatorDistAttr, TensorDistAttr from .process_group import get_all_process_groups from .process_mesh import ProcessMesh @@ -2357,50 +2357,64 @@ def is_dep_skip_op(op): return False -# def wrap_data_for_completion( -# dist_op: DistributedOperator, -# input_names: list, -# output_names: list, -# attr_names: list -# ): -# """ -# Get data used in inferring distributed attributes, including: -# 1. DistTensorSpec for each input and output tensor of this dist_op. -# 2. Operator attributes of this dist_op, e.g. transpose_x in matmul op. -# -# Args: -# dist_op: the DistributedOperator -# input_names: list, name of the dist_op's input tensors -# output_names: list, name of the dist_op's output tensors -# attr_names: list, attribute name of the dist_op's corresponding serial op -# -# Returns: -# input_specs: list, DistTensorSpec for each input tensor of the dist_op -# output_specs: list, DistTensorSpec for each output tensor of the dist_op -# attrs: dict, attribute map of the dist op -# """ -# -# input_specs = [] -# output_specs = [] -# attrs = {} -# -# serial_op = dist_op.serial_op -# -# # Construct each input tensor's DistTensorSpec with shape and dist_attr -# for name in input_names: -# tensor_dist_attr = dist_op.dist_attr.get_input_dist_attr(name) -# var = serial_op.block._var_recursive(name) -# tensor_shape = var.shape -# dist_spec = DistTensorSpec(tensor_shape, tensor_dist_attr) -# input_specs.append(dist_spec) -# -# # Construct each output tensor's DistTensorSpec with shape and dist_attr -# for name in output_names: -# tensor_dist_attr = dist_op.dist_attr.get_output_dist_attr(name) -# var = serial_op.block._var_recursive(name) -# tensor_shape = var.shape -# dist_spec = DistTensorSpec(tensor_shape, tensor_dist_attr) -# output_specs.append(dist_spec) -# -# for attr_name in attr_names: -# attrs[attr_name] = serial_op.desc.attr(attr_name) +def wrap_data_for_completion( + dist_op, input_names: list, output_names: list, attr_names: list +): + """ + Get data used in inferring distributed attributes, including: + 1. DistTensorSpec for each input and output tensor of this dist_op. + 2. Operator attributes of this dist_op, e.g. transpose_x in matmul op. 
+ + Args: + dist_op: the DistributedOperator + input_names: list, name of the dist_op's input tensors + output_names: list, name of the dist_op's output tensors + attr_names: list, attribute name of the dist_op's corresponding serial op + + Returns: + input_specs: list, DistTensorSpec for each input tensor of the dist_op + output_specs: list, DistTensorSpec for each output tensor of the dist_op + attrs: dict, attribute map of the dist op + + Usage: + op_desc = dist_op.serial_op.desc + input_name_list = [] + output_name_list = [] + input_name_list.append(op_desc.input('X')[0]) # 'X' is the arg name for op + input_name_list.append(op_desc.input('Y')[0]) + output_name_list.append(op_desc.output('Out')[0]) + attr_name_list = ['trans_x', 'trans_y'] + input_specs, output_specs, attrs = wrap_data_for_completion( + dist_op, + input_name_list, + output_name_list, + attr_name_list) + + """ + + input_specs = [] + output_specs = [] + attrs = {} + + serial_op = dist_op.serial_op + + # Construct each input tensor's DistTensorSpec with shape and dist_attr + for name in input_names: + tensor_dist_attr = dist_op.dist_attr.get_input_dist_attr(name) + var = serial_op.block._var_recursive(name) + tensor_shape = var.shape + dist_spec = DistTensorSpec(tensor_shape, tensor_dist_attr) + input_specs.append(dist_spec) + + # Construct each output tensor's DistTensorSpec with shape and dist_attr + for name in output_names: + tensor_dist_attr = dist_op.dist_attr.get_output_dist_attr(name) + var = serial_op.block._var_recursive(name) + tensor_shape = var.shape + dist_spec = DistTensorSpec(tensor_shape, tensor_dist_attr) + output_specs.append(dist_spec) + + for attr_name in attr_names: + attrs[attr_name] = serial_op.desc.attr(attr_name) + + return input_specs, output_specs, attrs diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 257e3d1a6b72cce73d579da15c495f1296ba3ca0..d4c50707cbe78238042985aa4141f5ae2099a249 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -1116,7 +1116,15 @@ set(TEST_CINN_OPS test_tile_op test_roll_op test_sum_op - test_elementwise_min_op) + test_elementwise_min_op + test_bitwise_op + test_compare_op + test_shape_op + test_assign_value_op + test_lookup_table_op + test_lookup_table_v2_op + test_norm_op + test_one_hot_v2_op) foreach(TEST_CINN_OPS ${TEST_CINN_OPS}) if(WITH_CINN) diff --git a/python/paddle/fluid/tests/unittests/test_assign_value_op.py b/python/paddle/fluid/tests/unittests/test_assign_value_op.py index 243dccc2422444a87d29248f63c0bfe783779036..7cb5dece346c88476ea4710ce9b60a0f1ff60cf7 100644 --- a/python/paddle/fluid/tests/unittests/test_assign_value_op.py +++ b/python/paddle/fluid/tests/unittests/test_assign_value_op.py @@ -49,7 +49,7 @@ class TestAssignValueOp(eager_op_test.OpTest): self.attrs["fp32_values"] = [float(v) for v in self.value.flat] def test_forward(self): - self.check_output() + self.check_output(check_cinn=True) class TestAssignValueOp2(TestAssignValueOp): diff --git a/python/paddle/fluid/tests/unittests/test_bitwise_op.py b/python/paddle/fluid/tests/unittests/test_bitwise_op.py index 084552e6b1ae653ccc01f54e44c97b6560478204..728ea62dbf2cb182aadadf80e8f71a6f8e229a48 100644 --- a/python/paddle/fluid/tests/unittests/test_bitwise_op.py +++ b/python/paddle/fluid/tests/unittests/test_bitwise_op.py @@ -43,7 +43,7 @@ class TestBitwiseAnd(OpTest): self.outputs = {'Out': out} def test_check_output(self): - self.check_output() + 
self.check_output(check_cinn=True) def test_check_grad(self): pass @@ -150,7 +150,7 @@ class TestBitwiseOr(OpTest): self.outputs = {'Out': out} def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) def test_check_grad(self): pass @@ -258,7 +258,7 @@ class TestBitwiseXor(OpTest): self.outputs = {'Out': out} def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) def test_check_grad(self): pass @@ -363,7 +363,7 @@ class TestBitwiseNot(OpTest): self.outputs = {'Out': out} def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) def test_check_grad(self): pass diff --git a/python/paddle/fluid/tests/unittests/test_compare_op.py b/python/paddle/fluid/tests/unittests/test_compare_op.py index 0b8c4aa8eae4215545a3953fe96a05040e2f557a..2f4e12f2b4e40db3fdf3792a80f4f6b975232bac 100755 --- a/python/paddle/fluid/tests/unittests/test_compare_op.py +++ b/python/paddle/fluid/tests/unittests/test_compare_op.py @@ -35,7 +35,7 @@ def create_test_class(op_type, typename, callback): self.op_type = op_type def test_output(self): - self.check_output() + self.check_output(check_cinn=True) def test_errors(self): paddle.enable_static() @@ -460,7 +460,7 @@ def create_bf16_case(op_type, callback): self.outputs = {'Out': real_result} def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) cls_name = f"BF16TestCase_{op_type}" TestCompareOpBF16Op.__name__ = cls_name diff --git a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py index 27fc92292f36f976a6049b3ecda73302450c783b..92cf190cb60a21288ec7a3c858aca6f31237092e 100644 --- a/python/paddle/fluid/tests/unittests/test_expand_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_expand_v2_op.py @@ -44,7 +44,7 @@ class TestExpandV2OpRank1(OpTest): self.expand_times = [1] def test_check_output(self): - self.check_output() + self.check_output(check_cinn=self.enable_cinn) def test_check_grad(self): self.check_grad(['X'], 'Out', check_prim=True) @@ -107,10 +107,10 @@ class TestExpandV2OpRank1_tensor_attr(OpTest): self.infer_expand_shape = [-1] def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_cinn=True) class TestExpandV2OpRank2_Corner_tensor_attr(TestExpandV2OpRank1_tensor_attr): @@ -144,10 +144,10 @@ class TestExpandV2OpRank1_tensor(OpTest): self.expand_shape = [2, 100] def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_cinn=True) # Situation 4: input x is Integer @@ -165,7 +165,7 @@ class TestExpandV2OpInteger(OpTest): self.outputs = {'Out': output} def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) # Situation 5: input x is Bool @@ -181,7 +181,7 @@ class TestExpandV2OpBoolean(OpTest): self.outputs = {'Out': output} def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) # Situation 6: input x is Integer @@ -199,7 +199,7 @@ class TestExpandV2OpInt64_t(OpTest): self.outputs = {'Out': output} def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) # Situation 7: input x is Float16 @@ -218,7 +218,7 @@ class TestExpandV2FP16Op(OpTest): self.outputs = {'Out': output} def test_check_output(self): - self.check_output() + 
self.check_output(check_cinn=True) def test_check_grad(self): self.check_grad(['X'], 'Out', check_prim=True) @@ -245,7 +245,7 @@ class TestExpandV2BF16Op(OpTest): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place) + self.check_output_with_place(place, check_cinn=True) def test_check_grad(self): place = core.CUDAPlace(0) diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py index de868db11fb1a69244526cee698367899518e819..cd26f390747ee495779dfd8f5ea00eea50a47853 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py @@ -39,10 +39,10 @@ class TestLookupTableOp(OpTest): self.outputs = {'Out': table[ids]} def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) def test_check_grad(self): - self.check_grad(['W'], 'Out', no_grad_set=set('Ids')) + self.check_grad(['W'], 'Out', no_grad_set=set('Ids'), check_cinn=True) class TestLookupTableOpWithTensorIds(OpTest): @@ -56,10 +56,10 @@ class TestLookupTableOpWithTensorIds(OpTest): self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))} def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) def test_check_grad(self): - self.check_grad(['W'], 'Out', no_grad_set=set('Ids')) + self.check_grad(['W'], 'Out', no_grad_set=set('Ids'), check_cinn=True) @skip_check_grad_ci( @@ -73,7 +73,7 @@ class TestLookupTableOpWithPadding(TestLookupTableOp): padding_idx = np.random.choice(ids, 1)[0] self.outputs['Out'][ids == padding_idx] = np.zeros(31) self.attrs = {'padding_idx': int(padding_idx)} - self.check_output() + self.check_output(check_cinn=True) @skip_check_grad_ci( @@ -88,7 +88,7 @@ class TestLookupTableOpWithTensorIdsAndPadding(TestLookupTableOpWithTensorIds): padding_idx = np.random.choice(flatten_idx, 1)[0] self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31) self.attrs = {'padding_idx': padding_idx} - self.check_output() + self.check_output(check_cinn=True) class TestLookupTableWIsSelectedRows(unittest.TestCase): @@ -212,7 +212,7 @@ class TestLookupTableOpInt8(OpTest): self.outputs = {'Out': table[ids]} def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) def test_check_grad(self): # since int8 type only be used in test and inference, there is @@ -233,7 +233,7 @@ class TestLookupTableOpWithTensorIdsInt8(OpTest): self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))} def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) def test_check_grad(self): # since int8 type only be used in test and inference, there is @@ -247,7 +247,7 @@ class TestLookupTableOpWithPaddingInt8(TestLookupTableOpInt8): padding_idx = np.random.choice(ids, 1)[0] self.outputs['Out'][ids == padding_idx] = np.zeros(31) self.attrs = {'padding_idx': int(padding_idx)} - self.check_output() + self.check_output(check_cinn=True) def test_check_grad(self): # Since paddings are not trainable and fixed in forward, the gradient of @@ -264,7 +264,7 @@ class TestLookupTableOpWithTensorIdsAndPaddingInt8( padding_idx = np.random.choice(flatten_idx, 1)[0] self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31) self.attrs = {'padding_idx': padding_idx} - self.check_output() + self.check_output(check_cinn=True) def test_check_grad(self): # Since paddings are not trainable and fixed in forward, the gradient of @@ -354,7 +354,7 
@@ class TestLookupTableOpInt16(OpTest): self.outputs = {'Out': table[ids]} def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) @skip_check_grad_ci(reason="Int16 type only be used in test and inference.") @@ -371,7 +371,7 @@ class TestLookupTableOpWithTensorIdsInt16(OpTest): self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))} def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) @skip_check_grad_ci(reason="Int16 type only be used in test and inference.") @@ -381,7 +381,7 @@ class TestLookupTableOpWithPaddingInt16(TestLookupTableOpInt16): padding_idx = np.random.choice(ids, 1)[0] self.outputs['Out'][ids == padding_idx] = np.zeros(31) self.attrs = {'padding_idx': int(padding_idx)} - self.check_output() + self.check_output(check_cinn=True) @skip_check_grad_ci(reason="Int16 type only be used in test and inference.") @@ -394,7 +394,7 @@ class TestLookupTableOpWithTensorIdsAndPaddingInt16( padding_idx = np.random.choice(flatten_idx, 1)[0] self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31) self.attrs = {'padding_idx': padding_idx} - self.check_output() + self.check_output(check_cinn=True) class TestLookupTableWIsSelectedRowsInt16(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py index 54e9992a13d6baf410290848a7adf4e61866391b..b36f914a25786bc0d8d9ee76da22884f5f791e51 100644 --- a/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_lookup_table_v2_op.py @@ -56,10 +56,10 @@ class TestLookupTableOp(OpTest): return "int64" def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) def test_check_grad(self): - self.check_grad(['W'], 'Out', no_grad_set=set('Ids')) + self.check_grad(['W'], 'Out', no_grad_set=set('Ids'), check_cinn=True) class TestLookupTableOpInt16(OpTest): @@ -87,10 +87,10 @@ class TestLookupTableOpWithTensorIds(OpTest): self.outputs = {'Out': table[ids.flatten()].reshape((2, 4, 5, 31))} def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) def test_check_grad(self): - self.check_grad(['W'], 'Out', no_grad_set=set('Ids')) + self.check_grad(['W'], 'Out', no_grad_set=set('Ids'), check_cinn=True) @skip_check_grad_ci( @@ -104,7 +104,7 @@ class TestLookupTableOpWithPadding(TestLookupTableOp): padding_idx = np.random.choice(ids, 1)[0] self.outputs['Out'][ids == padding_idx] = np.zeros(31) self.attrs = {'padding_idx': int(padding_idx)} - self.check_output() + self.check_output(check_cinn=True) @skip_check_grad_ci( @@ -119,7 +119,7 @@ class TestLookupTableOpWithTensorIdsAndPadding(TestLookupTableOpWithTensorIds): padding_idx = np.random.choice(flatten_idx, 1)[0] self.outputs['Out'][np.squeeze(ids == padding_idx)] = np.zeros(31) self.attrs = {'padding_idx': padding_idx} - self.check_output() + self.check_output(check_cinn=True) class TestLookupTableWIsSelectedRows(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_matmul_op.py b/python/paddle/fluid/tests/unittests/test_matmul_op.py index 30085a841de314588ee07217d5d1d5f1c345930a..c7c870b3c46eb67f6c5635ec83a4acc01852754b 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_op.py @@ -100,19 +100,29 @@ class Generator: self.outputs = {'Out': Out} def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) 
def test_check_grad_normal(self): - self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-3) + self.check_grad( + ['X', 'Y'], 'Out', max_relative_error=1e-3, check_cinn=True + ) def test_check_grad_ignore_x(self): self.check_grad( - ['Y'], 'Out', max_relative_error=1e-3, no_grad_set=set("X") + ['Y'], + 'Out', + max_relative_error=1e-3, + no_grad_set=set("X"), + check_cinn=True, ) def test_check_grad_ignore_y(self): self.check_grad( - ['X'], 'Out', max_relative_error=1e-3, no_grad_set=set('Y') + ['X'], + 'Out', + max_relative_error=1e-3, + no_grad_set=set('Y'), + check_cinn=True, ) diff --git a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py index a0c41b63b05f2e9b2388566f1ddf5f0510c14e80..e0dcc3bfdd35ef896b4b0ea55402468abe7f1931 100644 --- a/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_matmul_v2_op.py @@ -103,13 +103,28 @@ class TestMatMulV2Op(OpTest): self.outputs = {'Out': result} def test_check_output(self): - self.check_output() + self.check_output( + check_cinn=self.check_cinn if hasattr(self, 'check_cinn') else True + ) def test_check_grad(self): if core.is_compiled_with_rocm(): - self.check_grad(['X', 'Y'], 'Out', max_relative_error=1e-2) + self.check_grad( + ['X', 'Y'], + 'Out', + max_relative_error=1e-2, + check_cinn=self.check_cinn + if hasattr(self, 'check_cinn') + else True, + ) else: - self.check_grad(['X', 'Y'], 'Out') + self.check_grad( + ['X', 'Y'], + 'Out', + check_cinn=self.check_cinn + if hasattr(self, 'check_cinn') + else True, + ) class TestMatMulOp2(TestMatMulV2Op): @@ -290,6 +305,7 @@ class TestMatMulOp16(TestMatMulV2Op): self.y_shape = (1, 2, 2, 100, 2) self.trans_x = False self.trans_y = False + self.check_cinn = False class TestMatMulOp17(TestMatMulV2Op): @@ -343,7 +359,13 @@ def create_test_fp16_class(parent, atol=0.001, max_relative_error=1.0): if core.is_compiled_with_cuda(): place = core.CUDAPlace(0) if core.is_float16_supported(place): - self.check_output_with_place(place, atol=atol) + self.check_output_with_place( + place, + atol=atol, + check_cinn=self.check_cinn + if hasattr(self, 'check_cinn') + else True, + ) def test_check_grad(self): place = core.CUDAPlace(0) @@ -353,6 +375,9 @@ def create_test_fp16_class(parent, atol=0.001, max_relative_error=1.0): ['X', 'Y'], 'Out', max_relative_error=max_relative_error, + check_cinn=self.check_cinn + if hasattr(self, 'check_cinn') + else True, ) cls_name = "{}_{}".format(parent.__name__, "Fp16") @@ -405,7 +430,13 @@ def create_test_bf16_class(parent, atol=0.01): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=atol) + self.check_output_with_place( + place, + atol=atol, + check_cinn=self.check_cinn + if hasattr(self, 'check_cinn') + else True, + ) def test_check_grad_x(self): place = core.CUDAPlace(0) @@ -416,6 +447,9 @@ def create_test_bf16_class(parent, atol=0.01): 'Out', no_grad_set={'Y'}, user_defined_grads=[numeric_grads], + check_cinn=self.check_cinn + if hasattr(self, 'check_cinn') + else True, ) def test_check_grad_y(self): @@ -427,6 +461,9 @@ def create_test_bf16_class(parent, atol=0.01): 'Out', no_grad_set={'X'}, user_defined_grads=[numeric_grads], + check_cinn=self.check_cinn + if hasattr(self, 'check_cinn') + else True, ) def test_check_grad(self): @@ -596,7 +633,7 @@ class TestComplexMatMulOp(OpTest): self.grad_y = np.matmul(np.conj(self.x).T, self.grad_out) def test_check_output(self): - self.check_output() + 
self.check_output(check_cinn=False) def test_check_grad_normal(self): self.check_grad( @@ -604,6 +641,7 @@ class TestComplexMatMulOp(OpTest): 'Out', user_defined_grads=[self.grad_x, self.grad_y], user_defined_grad_outputs=[self.grad_out], + check_cinn=False, ) def test_check_grad_ingore_x(self): @@ -613,6 +651,7 @@ class TestComplexMatMulOp(OpTest): no_grad_set=set("X"), user_defined_grads=[self.grad_y], user_defined_grad_outputs=[self.grad_out], + check_cinn=False, ) def test_check_grad_ingore_y(self): @@ -622,6 +661,7 @@ class TestComplexMatMulOp(OpTest): no_grad_set=set('Y'), user_defined_grads=[self.grad_x], user_defined_grad_outputs=[self.grad_out], + check_cinn=False, ) @@ -662,7 +702,7 @@ class TestComplexMatMulOpBroadcast(OpTest): ) def test_check_output(self): - self.check_output() + self.check_output(check_cinn=False) def test_check_grad_normal(self): self.check_grad( @@ -670,6 +710,7 @@ class TestComplexMatMulOpBroadcast(OpTest): 'Out', user_defined_grads=[self.grad_x, self.grad_y], user_defined_grad_outputs=[self.grad_out], + check_cinn=False, ) def test_check_grad_ingore_x(self): @@ -679,6 +720,7 @@ class TestComplexMatMulOpBroadcast(OpTest): no_grad_set=set("X"), user_defined_grads=[self.grad_y], user_defined_grad_outputs=[self.grad_out], + check_cinn=False, ) def test_check_grad_ingore_y(self): @@ -688,6 +730,7 @@ class TestComplexMatMulOpBroadcast(OpTest): no_grad_set=set('Y'), user_defined_grads=[self.grad_x], user_defined_grad_outputs=[self.grad_out], + check_cinn=False, ) diff --git a/python/paddle/fluid/tests/unittests/test_norm_op.py b/python/paddle/fluid/tests/unittests/test_norm_op.py index f87d5250f1cf80037f7493f7085b87f37282c979..3144ec189ed4e16fe819a71deb3956641cfcbc87 100644 --- a/python/paddle/fluid/tests/unittests/test_norm_op.py +++ b/python/paddle/fluid/tests/unittests/test_norm_op.py @@ -48,10 +48,10 @@ class TestNormOp(OpTest): self.python_out_sig = ['Out'] def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) def test_check_grad(self): - self.check_grad(['X'], 'Out') + self.check_grad(['X'], 'Out', check_cinn=True) def init_test_case(self): self.shape = [2, 3, 4, 5] @@ -109,7 +109,7 @@ class TestNormOp6(TestNormOp): self.dtype = "float32" def test_check_grad(self): - self.check_grad(['X'], 'Out', max_relative_error=0.008) + self.check_grad(['X'], 'Out', max_relative_error=0.008, check_cinn=True) @unittest.skipIf( @@ -120,11 +120,17 @@ class TestNormOp7(TestNormOp): self.dtype = "float16" def test_check_output(self): - self.check_output_with_place(fluid.core.CUDAPlace(0), atol=5e-2) + self.check_output_with_place( + fluid.core.CUDAPlace(0), atol=5e-2, check_cinn=True + ) def test_check_grad(self): self.check_grad_with_place( - fluid.core.CUDAPlace(0), ['X'], 'Out', max_relative_error=0.05 + fluid.core.CUDAPlace(0), + ['X'], + 'Out', + max_relative_error=0.05, + check_cinn=True, ) @@ -147,7 +153,7 @@ class TestNormTestOp(OpTest): def test_check_output(self): # dynamic graph just supports float tensor - self.check_output(check_dygraph=True) + self.check_output(check_dygraph=True, check_cinn=True) def test_check_grad(self): pass @@ -176,11 +182,17 @@ class TestNormBF16Op(OpTest): self.python_out_sig = ['Out'] def test_check_output(self): - self.check_output_with_place(core.CUDAPlace(0), atol=1e-1) + self.check_output_with_place( + core.CUDAPlace(0), atol=1e-1, check_cinn=True + ) def test_check_grad(self): self.check_grad_with_place( - core.CUDAPlace(0), ['X'], 'Out', max_relative_error=1e-2 + core.CUDAPlace(0), + ['X'], + 
'Out', + max_relative_error=1e-2, + check_cinn=True, ) def init_test_case(self): diff --git a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py index 173dae20ac690c2e552cdb90192d0fc18333d13d..a49060e536de8a1f777efd72fb9d868fe8becb20 100644 --- a/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py +++ b/python/paddle/fluid/tests/unittests/test_one_hot_v2_op.py @@ -49,7 +49,7 @@ class TestOneHotOp(OpTest): self.outputs = {'Out': (out, x_lod)} def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) class TestOneHotOp_attr(OpTest): @@ -57,6 +57,7 @@ class TestOneHotOp_attr(OpTest): self.op_type = 'one_hot_v2' self.python_api = one_hot_wrapper depth = 10 + depth_np = np.array(10).astype('int32') dimension = 12 x_lod = [[4, 1, 3, 3]] x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] @@ -69,12 +70,12 @@ class TestOneHotOp_attr(OpTest): for i in range(np.product(x.shape)): out[i, 0, x[i]] = 1.0 - self.inputs = {'X': (x, x_lod)} + self.inputs = {'X': (x, x_lod), 'depth_tensor': depth_np} self.attrs = {'dtype': int(core.VarDesc.VarType.FP32), 'depth': depth} self.outputs = {'Out': (out, x_lod)} def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) class TestOneHotOp_default_dtype(OpTest): @@ -98,7 +99,7 @@ class TestOneHotOp_default_dtype(OpTest): self.outputs = {'Out': (out, x_lod)} def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) class TestOneHotOp_default_dtype_attr(OpTest): @@ -106,6 +107,7 @@ class TestOneHotOp_default_dtype_attr(OpTest): self.op_type = 'one_hot_v2' self.python_api = one_hot_wrapper depth = 10 + depth_np = np.array(depth).astype('int32') dimension = 12 x_lod = [[4, 1, 3, 3]] x = [np.random.randint(0, depth - 1) for i in range(sum(x_lod[0]))] diff --git a/python/paddle/fluid/tests/unittests/test_selu_op.py b/python/paddle/fluid/tests/unittests/test_selu_op.py index 845cf97d7a2c802f56c2d5fe8ba69c5afc08cfbe..acc9214220ef37af3ebb7dafec7ab3972f6844c6 100644 --- a/python/paddle/fluid/tests/unittests/test_selu_op.py +++ b/python/paddle/fluid/tests/unittests/test_selu_op.py @@ -15,7 +15,7 @@ import unittest import numpy as np -from eager_op_test import OpTest +from eager_op_test import OpTest, convert_float_to_uint16 import paddle import paddle.nn.functional as F @@ -43,14 +43,16 @@ class SeluTest(OpTest): self.op_type = "selu" self.python_api = paddle.nn.functional.selu self.x_shape = [3, 5, 5, 10] - self.dtype = np.float64 self.init_x_shape() self.init_dtype() alpha = 1.6732632423543772848170429916717 scale = 1.0507009873554804934193349852946 - x = np.random.normal(size=self.x_shape).astype(self.dtype) + if self.dtype == np.uint16: + x = np.random.normal(size=self.x_shape).astype(np.float32) + else: + x = np.random.normal(size=self.x_shape).astype(self.dtype) # Since zero point in selu is not differentiable, avoid randomize # zero. 
@@ -58,8 +60,12 @@ class SeluTest(OpTest): out = ref_selu(x, scale, alpha) - self.inputs = {'X': x} - self.outputs = {'Out': out} + if self.dtype == np.uint16: + self.inputs = {'X': convert_float_to_uint16(x)} + self.outputs = {'Out': convert_float_to_uint16(out)} + else: + self.inputs = {'X': x} + self.outputs = {'Out': out} self.attrs = { 'alpha': alpha, @@ -70,7 +76,7 @@ class SeluTest(OpTest): pass def init_dtype(self): - pass + self.dtype = np.float64 def test_check_output(self): self.check_output() @@ -79,6 +85,27 @@ class SeluTest(OpTest): self.check_grad(['X'], 'Out') +class SeluTestFP16OP(SeluTest): + def init_dtype(self): + self.dtype = np.float16 + + +@unittest.skipIf( + not core.is_compiled_with_cuda() + or not core.is_bfloat16_supported(core.CUDAPlace(0)), + "core is not compiled with CUDA and do not support bfloat16", +) +class SeluTestBF16OP(SeluTest): + def init_dtype(self): + self.dtype = np.uint16 + + def test_check_output(self): + self.check_output_with_place(core.CUDAPlace(0)) + + def test_check_grad(self): + self.check_grad_with_place(core.CUDAPlace(0), ['X'], 'Out') + + class TestSeluAPI(unittest.TestCase): # test paddle.nn.SELU, paddle.nn.functional.selu def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_shape_op.py b/python/paddle/fluid/tests/unittests/test_shape_op.py index 3609370e73cf852df8d107d87f8be70cc1b3073c..d9dade1cf99ce700ea8850e0b6a51768e6a6d36a 100644 --- a/python/paddle/fluid/tests/unittests/test_shape_op.py +++ b/python/paddle/fluid/tests/unittests/test_shape_op.py @@ -36,7 +36,7 @@ class TestShapeOp(OpTest): self.dtype = np.float32 def test_check_output(self): - self.check_output() + self.check_output(check_cinn=True) class case1(TestShapeOp): @@ -125,7 +125,7 @@ class TestShapeOpBf16(OpTest): def test_check_output(self): place = core.CUDAPlace(0) - self.check_output_with_place(place) + self.check_output_with_place(place, check_cinn=True) class case1Bf16(TestShapeOpBf16): diff --git a/python/paddle/fluid/tests/unittests/test_sum_op.py b/python/paddle/fluid/tests/unittests/test_sum_op.py index 99406f4599c837459119f1ea4d47d4c42af4f108..49e42b5434242ba3f24eee4ae61f374f88c4a2c1 100644 --- a/python/paddle/fluid/tests/unittests/test_sum_op.py +++ b/python/paddle/fluid/tests/unittests/test_sum_op.py @@ -62,10 +62,10 @@ class TestSumOp(OpTest): self.dtype = np.float64 def test_check_output(self): - self.check_output(check_prim=True) + self.check_output(check_prim=True, check_cinn=True) def test_check_grad(self): - self.check_grad(['x0'], 'Out', check_prim=True) + self.check_grad(['x0'], 'Out', check_prim=True, check_cinn=True) class TestSelectedRowsSumOp(unittest.TestCase): @@ -299,14 +299,14 @@ class TestFP16SumOp(TestSumOp): def test_check_output(self): place = core.CUDAPlace(0) if core.is_float16_supported(place): - self.check_output_with_place(place) + self.check_output_with_place(place, check_cinn=True) # FIXME: Because of the precision fp16, max_relative_error # should be 0.15 here. 
def test_check_grad(self): place = core.CUDAPlace(0) if core.is_float16_supported(place): - self.check_grad(['x0'], 'Out') + self.check_grad(['x0'], 'Out', check_cinn=True) def create_test_sum_fp16_class(parent): diff --git a/test/ir/inference/inference_pass_test.py b/test/ir/inference/inference_pass_test.py index 4ac3c861213303e70823c3e338a499afac5bd921..cee27eca7d28a08498c47b6f76e984e165bf98aa 100644 --- a/test/ir/inference/inference_pass_test.py +++ b/test/ir/inference/inference_pass_test.py @@ -12,7 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +import os import random +import tempfile import unittest import numpy as np @@ -41,7 +43,10 @@ class InferencePassTest(unittest.TestCase): self.dynamic_shape_params = None self.enable_lite = False self.lite_parameters = None - self.path = "./inference_pass/" + self.__class__.__name__ + "/" + self.temp_dir = tempfile.TemporaryDirectory() + self.path = os.path.join( + self.temp_dir.name, 'inference_pass', self.__class__.__name__ + ) np.random.seed(1) random.seed(1) diff --git a/test/ir/inference/test_trt_activation_pass.py b/test/ir/inference/test_trt_activation_pass.py index 858a307dd629e881d767d98330f93f2ed14fc3a1..cf63d203224503f4d71f41d8fcec3bc7ff9f3668 100644 --- a/test/ir/inference/test_trt_activation_pass.py +++ b/test/ir/inference/test_trt_activation_pass.py @@ -53,8 +53,9 @@ class TensorRTSubgraphPassActivationTest(InferencePassTest): def test_check_output(self): if core.is_compiled_with_cuda(): use_gpu = True - if os.path.exists(self.path + "_opt_cache"): - shutil.rmtree(self.path + "_opt_cache") + opt_path = os.path.join(self.path, '_opt_cache') + if os.path.exists(opt_path): + shutil.rmtree(opt_path) if ( self.trt_parameters.precision == AnalysisConfig.Precision.Float32 diff --git a/test/ir/inference/test_trt_elementwise_op.py b/test/ir/inference/test_trt_elementwise_op.py index 0f9caee745b9a6af6814521fb9d43e722351c40f..7f4a34db52fbb33941e9c71490aa2559481349a1 100644 --- a/test/ir/inference/test_trt_elementwise_op.py +++ b/test/ir/inference/test_trt_elementwise_op.py @@ -53,8 +53,9 @@ class TensorRTSubgraphPassElementwiseBroadcastTest(InferencePassTest): return paddle.tensor.math.add(x=data1, y=data2) def test_check_output(self): - if os.path.exists(self.path + "_opt_cache"): - shutil.rmtree(self.path + "_opt_cache") + opt_path = os.path.join(self.path, '_opt_cache') + if os.path.exists(opt_path): + shutil.rmtree(opt_path) if core.is_compiled_with_cuda(): use_gpu = True self.check_output_with_option(use_gpu) diff --git a/test/ir/inference/test_trt_instance_norm_op.py b/test/ir/inference/test_trt_instance_norm_op.py index 532c9e8c08a112d2ec0819a4ea3a7c96acbd1fef..fdf3523e880cec53c50ed3c336574d87d32fbadb 100644 --- a/test/ir/inference/test_trt_instance_norm_op.py +++ b/test/ir/inference/test_trt_instance_norm_op.py @@ -55,8 +55,9 @@ class TRTInstanceNormTest(InferencePassTest): self.fetch_list = [out] def check_output(self, remove_cache=False): - if remove_cache and os.path.exists(self.path + "_opt_cache"): - shutil.rmtree(self.path + "_opt_cache") + opt_path = os.path.join(self.path, '_opt_cache') + if remove_cache and os.path.exists(opt_path): + shutil.rmtree(opt_path) if core.is_compiled_with_cuda(): use_gpu = True atol = 1e-5 diff --git a/test/ir/inference/test_trt_pool3d_op.py b/test/ir/inference/test_trt_pool3d_op.py index 886f46dbfd76e62dc4b10c92d99bc727b29d58e3..f64ff97e4e8db1426235d8ee33ddbc121452ecf9 100644 --- a/test/ir/inference/test_trt_pool3d_op.py +++ 
b/test/ir/inference/test_trt_pool3d_op.py @@ -84,8 +84,9 @@ class TensorRTPool3dTest(InferencePassTest): self.fetch_list = [pool_out] def check_output(self): - if os.path.exists(self.path + "_opt_cache"): - shutil.rmtree(self.path + "_opt_cache") + opt_path = os.path.join(self.path, '_opt_cache') + if os.path.exists(opt_path): + shutil.rmtree(opt_path) if core.is_compiled_with_cuda(): use_gpu = True if self.precision == AnalysisConfig.Precision.Float32: @@ -200,8 +201,9 @@ class TensorRTAdaptiveAvgPool3DTest(InferencePassTest): self.fetch_list = [pool_out] def check_output(self): - if os.path.exists(self.path + "_opt_cache"): - shutil.rmtree(self.path + "_opt_cache") + opt_path = os.path.join(self.path, '_opt_cache') + if os.path.exists(opt_path): + shutil.rmtree(opt_path) if core.is_compiled_with_cuda(): use_gpu = True self.check_output_with_option(use_gpu) @@ -300,8 +302,9 @@ class TensorRTAdaptiveMaxPool3DTest(InferencePassTest): self.fetch_list = [pool_out] def check_output(self): - if os.path.exists(self.path + "_opt_cache"): - shutil.rmtree(self.path + "_opt_cache") + opt_path = os.path.join(self.path, '_opt_cache') + if os.path.exists(opt_path): + shutil.rmtree(opt_path) if core.is_compiled_with_cuda(): use_gpu = True self.check_output_with_option(use_gpu) diff --git a/test/ir/inference/test_trt_pool_op.py b/test/ir/inference/test_trt_pool_op.py index 0885ff6acc319af1ba91b20b5144f5413b8ffe50..8826a3f06cd01e7e44e36491c639f78da3820af1 100644 --- a/test/ir/inference/test_trt_pool_op.py +++ b/test/ir/inference/test_trt_pool_op.py @@ -86,8 +86,9 @@ class TensorRTPoolTest(InferencePassTest): self.fetch_list = [out] def check_output(self): - if os.path.exists(self.path + "_opt_cache"): - shutil.rmtree(self.path + "_opt_cache") + opt_path = os.path.join(self.path, '_opt_cache') + if os.path.exists(opt_path): + shutil.rmtree(opt_path) if core.is_compiled_with_cuda(): use_gpu = True if self.precision == AnalysisConfig.Precision.Float32: diff --git a/test/ir/inference/test_trt_skip_layernorm_fuse_pass.py b/test/ir/inference/test_trt_skip_layernorm_fuse_pass.py index a91809584f63c5fb94197113d44a3e506216b953..1bf140a365aaeac004966bd5f8341958c3fc47f5 100644 --- a/test/ir/inference/test_trt_skip_layernorm_fuse_pass.py +++ b/test/ir/inference/test_trt_skip_layernorm_fuse_pass.py @@ -60,8 +60,9 @@ class SkipLayernormFusePassTest0(InferencePassTest): return paddle.add(data1, data2) def test_check_output(self): - if os.path.exists(self.path + "_opt_cache"): - shutil.rmtree(self.path + "_opt_cache") + opt_path = os.path.join(self.path, '_opt_cache') + if os.path.exists(opt_path): + shutil.rmtree(opt_path) if core.is_compiled_with_cuda(): use_gpu = True self.check_output_with_option(use_gpu, atol=0.01, rtol=0.00001) @@ -107,8 +108,9 @@ class SkipLayernormFusePassTest1(InferencePassTest): return paddle.add(data1, data2) def test_check_output(self): - if os.path.exists(self.path + "_opt_cache"): - shutil.rmtree(self.path + "_opt_cache") + opt_path = os.path.join(self.path, '_opt_cache') + if os.path.exists(opt_path): + shutil.rmtree(opt_path) if core.is_compiled_with_cuda(): use_gpu = True self.check_output_with_option(use_gpu, atol=0.01, rtol=0.00001) @@ -154,8 +156,9 @@ class SkipLayernormFusePassTest2(InferencePassTest): return paddle.add(data1, data2) def test_check_output(self): - if os.path.exists(self.path + "_opt_cache"): - shutil.rmtree(self.path + "_opt_cache") + opt_path = os.path.join(self.path, '_opt_cache') + if os.path.exists(opt_path): + shutil.rmtree(opt_path) if 
core.is_compiled_with_cuda(): use_gpu = True self.check_output_with_option(use_gpu, atol=0.1, rtol=0.00001) @@ -201,8 +204,9 @@ class SkipLayernormFusePassTest3(InferencePassTest): return paddle.add(data1, data2) def test_check_output(self): - if os.path.exists(self.path + "_opt_cache"): - shutil.rmtree(self.path + "_opt_cache") + opt_path = os.path.join(self.path, '_opt_cache') + if os.path.exists(opt_path): + shutil.rmtree(opt_path) if core.is_compiled_with_cuda(): use_gpu = True self.check_output_with_option(use_gpu, atol=0.1, rtol=0.00001) diff --git a/test/ir/inference/test_trt_subgraph_pass.py b/test/ir/inference/test_trt_subgraph_pass.py index bc102b60b30ef5c0c6127c9f2f9f76f982e6ecd9..4031a882758b99809843bfeed9d5e4034ee3655e 100644 --- a/test/ir/inference/test_trt_subgraph_pass.py +++ b/test/ir/inference/test_trt_subgraph_pass.py @@ -128,8 +128,9 @@ class TensorRTSubgraphPassSplitSerializeTest(InferencePassTest): def test_check_output(self): if paddle.is_compiled_with_cuda(): use_gpu = True - if os.path.exists(self.path + "_opt_cache"): - shutil.rmtree(self.path + "_opt_cache") + opt_path = os.path.join(self.path, '_opt_cache') + if os.path.exists(opt_path): + shutil.rmtree(opt_path) self.check_output_with_option(use_gpu) self.assertTrue( PassVersionChecker.IsCompatible('tensorrt_subgraph_pass') @@ -164,8 +165,9 @@ class TensorRTSubgraphPassDynamicSplitFp16SerializeTest(InferencePassTest): def test_check_output(self): if paddle.is_compiled_with_cuda(): use_gpu = True - if os.path.exists(self.path + "_opt_cache"): - shutil.rmtree(self.path + "_opt_cache") + opt_path = os.path.join(self.path, '_opt_cache') + if os.path.exists(opt_path): + shutil.rmtree(opt_path) self.check_output_with_option(use_gpu, 1e-3) self.assertTrue( PassVersionChecker.IsCompatible('tensorrt_subgraph_pass') @@ -313,8 +315,9 @@ class TensorRTSubgraphPassLayerNormDynamicTest(InferencePassTest): self.serialize = True def test_check_output(self): - if os.path.exists(self.path + "_opt_cache"): - shutil.rmtree(self.path + "_opt_cache") + opt_path = os.path.join(self.path, '_opt_cache') + if os.path.exists(opt_path): + shutil.rmtree(opt_path) if paddle.is_compiled_with_cuda(): use_gpu = True self.check_output_with_option(use_gpu) @@ -332,8 +335,9 @@ class TensorRTSubgraphPassLayerNormDynamicFP16Test( self.serialize = True def test_check_output(self): - if os.path.exists(self.path + "_opt_cache"): - shutil.rmtree(self.path + "_opt_cache") + opt_path = os.path.join(self.path, '_opt_cache') + if os.path.exists(opt_path): + shutil.rmtree(opt_path) if paddle.is_compiled_with_cuda(): use_gpu = True self.check_output_with_option(use_gpu, atol=0.01, rtol=0.01) @@ -406,8 +410,9 @@ class TensorRTSubgraphPassElementwiseSerializeTest( ) def test_check_output(self): - if os.path.exists(self.path + "_opt_cache"): - shutil.rmtree(self.path + "_opt_cache") + opt_path = os.path.join(self.path, '_opt_cache') + if os.path.exists(opt_path): + shutil.rmtree(opt_path) super().test_check_output() @@ -444,8 +449,9 @@ class TensorRTSubgraphPassElementwiseBroadcastDynamicTest(InferencePassTest): return paddle.add(x=data1, y=data2) def test_check_output(self): - if os.path.exists(self.path + "_opt_cache"): - shutil.rmtree(self.path + "_opt_cache") + opt_path = os.path.join(self.path, '_opt_cache') + if os.path.exists(opt_path): + shutil.rmtree(opt_path) if paddle.is_compiled_with_cuda(): use_gpu = True self.check_output_with_option(use_gpu) diff --git a/test/xpu/test_nll_loss_op_xpu.py b/test/xpu/test_nll_loss_op_xpu.py new file mode 100644 
index 0000000000000000000000000000000000000000..71ce382933449c8af9624c04c1658be0ff8e2dcd --- /dev/null +++ b/test/xpu/test_nll_loss_op_xpu.py @@ -0,0 +1,288 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +import numpy as np +from get_test_cover_info import ( + XPUOpTestWrapper, + create_test_class, + get_xpu_op_support_types, +) +from op_test_xpu import XPUOpTest + +import paddle + +paddle.enable_static() + + +def nll_loss_1d( + logs, dtype, targets, weight=None, reduction='mean', ignore_index=-100 +): + input_shape = logs.shape + N = input_shape[0] + C = input_shape[1] + out = np.zeros_like(targets).astype(dtype) + total_weight = 0 + for i in range(N): + cur_target = targets[i] + if cur_target == ignore_index: + out[i] = 0 + continue + cur_weight = weight[cur_target] if weight is not None else 1 + total_weight += cur_weight + out[i] = -logs[i][cur_target] * cur_weight + if reduction == 'sum': + out = np.sum(out) + total_weight = np.array([total_weight]).astype(dtype) + return {'Out': out, 'Total_weight': total_weight} + elif reduction == 'mean': + out = np.sum(out) + if total_weight != 0: + out /= total_weight + total_weight = np.array([total_weight]).astype(dtype) + return {'Out': out, 'Total_weight': total_weight} + elif reduction == 'none': + total_weight = np.array([0]).astype(dtype) + return {'Out': out, 'Total_weight': total_weight} + + +def nll_loss_2d( + logs, dtype, targets, weight=None, reduction='mean', ignore_index=-100 +): + input_shape = logs.shape + N = input_shape[0] + H = input_shape[2] + W = input_shape[3] + out = np.zeros_like(targets).astype(dtype) + total_weight = 0 + for i in range(N): + for h in range(H): + for w in range(W): + cur_target = targets[i][h][w] + if cur_target == ignore_index: + out[i][h][w] = 0 + continue + cur_weight = weight[cur_target] if weight is not None else 1 + total_weight += cur_weight + out[i][h][w] = -logs[i][cur_target][h][w] * cur_weight + if reduction == 'sum': + out = np.sum(out) + total_weight = np.array([total_weight]).astype(dtype) + return {'Out': out, 'Total_weight': total_weight} + elif reduction == 'mean': + out = np.sum(out) + if total_weight != 0: + out /= total_weight + total_weight = np.array([total_weight]).astype(dtype) + return {'Out': out, 'Total_weight': total_weight} + elif reduction == 'none': + total_weight = np.array([0]).astype(dtype) + return {'Out': out, 'Total_weight': total_weight} + + +class XPUTestNLLLossOP(XPUOpTestWrapper): + def __init__(self): + self.op_name = 'nll_loss' + self.use_dynamic_create_class = False + + class TestNLLLossOpBase1D(XPUOpTest): + op_type = 'nll_loss' + + def setUp(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.set_attrs() + self.set_inputs() + self.inputs = { + 'X': self.x, + 'Label': self.label, + } + if self.weight is not None: + self.inputs['Weight'] = self.weight + self.outputs = nll_loss_1d( + self.x, + self.dtype, + self.label, + self.weight, + 
self.attrs['reduction'], + ) + + def set_attrs(self): + self.attrs = {'reduction': 'none'} + + def set_inputs(self): + self.class_num = 3 + x_shape = [5, self.class_num] + label_shape = [5] + self.x = np.random.random(x_shape).astype(self.dtype) + self.label = np.random.randint( + low=0, high=self.class_num, size=label_shape + ).astype(np.int64) + self.weight = np.random.random(self.class_num).astype(self.dtype) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + class TestNLLLossOpWithWeightMean1D(TestNLLLossOpBase1D): + def set_attrs(self): + self.attrs = {'reduction': 'mean'} + + class TestNLLLossOpWithWeightSum1D(TestNLLLossOpBase1D): + def set_attrs(self): + self.attrs = {'reduction': 'sum'} + + class TestNLLLossOpWithoutWeightNone1D(TestNLLLossOpBase1D): + def set_inputs(self): + self.class_num = 3 + x_shape = [5, self.class_num] + label_shape = [5] + self.x = np.random.random(x_shape).astype(self.dtype) + self.label = np.random.randint( + low=0, high=self.class_num, size=label_shape + ).astype(np.int64) + self.weight = None + + def set_attrs(self): + self.attrs = {'reduction': 'none'} + + class TestNLLLossOpWithoutWeightMean1D(TestNLLLossOpBase1D): + def set_inputs(self): + self.class_num = 3 + x_shape = [5, self.class_num] + label_shape = [5] + self.x = np.random.random(x_shape).astype(self.dtype) + self.label = np.random.randint( + low=0, high=self.class_num, size=label_shape + ).astype(np.int64) + self.weight = None + + def set_attrs(self): + self.attrs = {'reduction': 'mean'} + + class TestNLLLossOpWithoutWeightSum1D(TestNLLLossOpBase1D): + def set_inputs(self): + self.class_num = 3 + x_shape = [5, self.class_num] + label_shape = [5] + self.x = np.random.random(x_shape).astype(self.dtype) + self.label = np.random.randint( + low=0, high=self.class_num, size=label_shape + ).astype(np.int64) + self.weight = None + + def set_attrs(self): + self.attrs = {'reduction': 'sum'} + + class TestNLLLossOpBase2D(XPUOpTest): + op_type = 'nll_loss' + + def setUp(self): + self.dtype = self.in_type + self.place = paddle.XPUPlace(0) + self.set_attrs() + self.set_inputs() + self.inputs = {'X': self.x, 'Label': self.label} + if self.weight is not None: + self.inputs['Weight'] = self.weight + self.outputs = nll_loss_2d( + self.x, + self.dtype, + self.label, + self.weight, + self.attrs['reduction'], + ) + + def set_attrs(self): + self.attrs = {'reduction': 'none'} + + def set_inputs(self): + self.class_num = 3 + x_shape = [5, self.class_num, 7, 11] + label_shape = [5, 7, 11] + self.x = np.random.random(x_shape).astype(self.dtype) + self.label = np.random.randint( + low=0, high=self.class_num, size=label_shape + ).astype(np.int64) + self.weight = np.random.random(self.class_num).astype(self.dtype) + + def test_check_output(self): + self.check_output_with_place(self.place) + + def test_check_grad(self): + self.check_grad_with_place(self.place, ['X'], 'Out') + + class TestNLLLossOpWithWeightMean2D(TestNLLLossOpBase2D): + def set_attrs(self): + self.attrs = {'reduction': 'mean'} + + class TestNLLLossOpWithWeightSum2D(TestNLLLossOpBase2D): + def set_attrs(self): + self.attrs = {'reduction': 'sum'} + + class TestNLLLossOpWithoutWeightNone2D(TestNLLLossOpBase2D): + def set_inputs(self): + self.dtype = self.in_type + self.class_num = 3 + x_shape = [5, self.class_num, 7, 11] + label_shape = [5, 7, 11] + self.x = np.random.random(x_shape).astype(self.dtype) + self.label = np.random.randint( + low=0, 
high=self.class_num, size=label_shape + ).astype(np.int64) + self.weight = None + + def set_attrs(self): + self.attrs = {'reduction': 'none'} + + class TestNLLLossOpWithoutWeightMean2D(TestNLLLossOpBase2D): + def set_inputs(self): + self.dtype = self.in_type + self.class_num = 3 + x_shape = [5, self.class_num, 7, 11] + label_shape = [5, 7, 11] + self.x = np.random.random(x_shape).astype(self.dtype) + self.label = np.random.randint( + low=0, high=self.class_num, size=label_shape + ).astype(np.int64) + self.weight = None + + def set_attrs(self): + self.attrs = {'reduction': 'mean'} + + class TestNLLLossOpWithoutWeightSum2D(TestNLLLossOpBase2D): + def set_inputs(self): + self.dtype = self.in_type + self.class_num = 3 + x_shape = [5, self.class_num, 7, 11] + label_shape = [5, 7, 11] + self.x = np.random.random(x_shape).astype(self.dtype) + self.label = np.random.randint( + low=0, high=self.class_num, size=label_shape + ).astype(np.int64) + self.weight = None + + def set_attrs(self): + self.attrs = {'reduction': 'sum'} + + +support_types = get_xpu_op_support_types('nll_loss') +for stype in support_types: + create_test_class(globals(), XPUTestNLLLossOP, stype) + +if __name__ == '__main__': + unittest.main() diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index 97a5ee146df7008ce16b03ea30f02e117cb33892..80be47871ef4820f0eb4ae0f9f2c3f74175f3dd3 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -344,7 +344,7 @@ fi OUTPUT_LOG=`echo "$ALL_ADDED_LINES" | grep -Ew "print|printf|fprintf|std::cout" || true` if [ "$OUTPUT_LOG" != "" ];then echo_line="print or std::cout is not recommended for direct use, please use loggin or glog. If it is necessary to use, please contact tianshuo78520a (Recommend) or zhangbo9674 review and approve.\n" - check_approval 1 tianshuo7852a zhangbo9674 + check_approval 1 tianshuo78520a zhangbo9674 fi HAS_MODIFIED_PHI_FILES=`git diff --name-only upstream/$BRANCH | grep "paddle/phi/" || true`
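The matmul hunks above thread a check_cinn flag into every check_output/check_grad call, falling back to True when a subclass does not define the attribute, so individual cases such as TestMatMulOp16 can opt out of CINN checking by setting check_cinn = False. Below is a minimal sketch of that opt-out pattern; the class names are hypothetical and the stand-in base class only mimics the relevant piece of OpTest, whose real check methods take many more arguments. Note that getattr(self, 'check_cinn', True) is an equivalent, shorter spelling of the patch's hasattr expression, not what the patch itself uses.

# Minimal sketch of the check_cinn opt-out pattern (hypothetical names).
class FakeOpTest:
    # Stand-in for OpTest; the real check_output accepts many more arguments.
    def check_output(self, check_cinn=True):
        print(f"check_output(check_cinn={check_cinn})")


class MatMulLikeTest(FakeOpTest):
    def test_check_output(self):
        # Equivalent to the patch's
        # `self.check_cinn if hasattr(self, 'check_cinn') else True`.
        self.check_output(check_cinn=getattr(self, 'check_cinn', True))


class MatMulOp16LikeTest(MatMulLikeTest):
    # Mirrors TestMatMulOp16, which sets check_cinn = False to skip CINN.
    check_cinn = False


if __name__ == '__main__':
    MatMulLikeTest().test_check_output()      # prints check_cinn=True
    MatMulOp16LikeTest().test_check_output()  # prints check_cinn=False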
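The inference-pass hunks replace the hard-coded "./inference_pass/&lt;class name&gt;/" directory with a tempfile.TemporaryDirectory and build the "_opt_cache" path with os.path.join before deleting it. A short self-contained sketch of that lifecycle follows, under the assumption that cleanup happens in tearDown; the hunks shown here do not include a tearDown, so that part is illustrative only.

# Illustrative sketch of the per-test scratch-directory pattern (hypothetical test class).
import os
import shutil
import tempfile
import unittest


class TempPathPassTest(unittest.TestCase):
    def setUp(self):
        # Per-test scratch directory instead of a fixed "./inference_pass/..." path.
        self.temp_dir = tempfile.TemporaryDirectory()
        self.path = os.path.join(
            self.temp_dir.name, 'inference_pass', self.__class__.__name__
        )
        os.makedirs(self.path, exist_ok=True)

    def tearDown(self):
        # Assumed cleanup step; not shown in the diff above.
        self.temp_dir.cleanup()

    def test_remove_opt_cache(self):
        # Same cache-removal idiom as the TensorRT tests above.
        opt_path = os.path.join(self.path, '_opt_cache')
        os.makedirs(opt_path, exist_ok=True)
        if os.path.exists(opt_path):
            shutil.rmtree(opt_path)
        self.assertFalse(os.path.exists(opt_path))


if __name__ == '__main__':
    unittest.main()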
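The new XPU NLL-loss test builds its expected outputs with the NumPy reference nll_loss_1d, where each sample contributes -logs[i][target[i]] * weight[target[i]] and the 'mean' reduction divides the summed losses by the summed weights. A small hand-checked example of that arithmetic, using made-up numbers rather than anything from the test:

# Sanity check of the nll_loss_1d 'mean' reduction with made-up values.
import numpy as np

logs = np.array([[0.2, 0.5, 0.3],
                 [0.1, 0.8, 0.1]], dtype=np.float64)
targets = np.array([1, 2])
weight = np.array([1.0, 2.0, 0.5])

# Per-sample loss: -logs[i, targets[i]] * weight[targets[i]]
per_sample = -logs[np.arange(len(targets)), targets] * weight[targets]
total_weight = weight[targets].sum()

mean_loss = per_sample.sum() / total_weight  # the 'mean' reduction
print(mean_loss)  # (-0.5*2.0 + -0.1*0.5) / 2.5 == -0.42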