diff --git a/cmake/phi.cmake b/cmake/phi.cmake
index ebb686d8ad0f31917e64161d6f7d2ecd4644fadd..1c4dd723b9b71ffaab33599eb13ee8235393a097 100644
--- a/cmake/phi.cmake
+++ b/cmake/phi.cmake
@@ -118,7 +118,7 @@ function(kernel_library TARGET)
         if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
             list(APPEND common_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
         endif()
-        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
+        if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc AND NOT WITH_XPU_KP)
             list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
         endif()
         if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc)
@@ -151,6 +151,9 @@ function(kernel_library TARGET)
                 file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
                 list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
             endif()
+            if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc )
+                list(APPEND kps_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
+            endif()
         endif()
     else()
         # TODO(chenweihang): impl compile by source later
diff --git a/paddle/fluid/eager/tests/task_tests/generated_test.cc b/paddle/fluid/eager/tests/task_tests/generated_test.cc
index 49e517dc9b3f3271ef26dfbece46f799ef805c57..3c237b76e64b0d15ba86a20a2308e968557d6800 100644
--- a/paddle/fluid/eager/tests/task_tests/generated_test.cc
+++ b/paddle/fluid/eager/tests/task_tests/generated_test.cc
@@ -35,6 +35,8 @@ PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sigmoid, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sigmoid_grad, CPU, ALL_LAYOUT);
 
 namespace egr {
 
diff --git a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc
index b86865e2d126fbfc0b00495a6e3208932ac6de39..8524be7800bfdda4f588007f8f21548e2995b0c3 100644
--- a/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc
+++ b/paddle/fluid/eager/tests/task_tests/hook_test_intermidiate.cc
@@ -32,6 +32,8 @@ PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sigmoid, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sigmoid_grad, CPU, ALL_LAYOUT);
 
 namespace egr {
 
diff --git a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
index 17663ecf6baa35f698aca35e451de34c647d2214..4236dc55d518686b97ea47aa12e756eb39304f35 100644
--- a/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/mkldnn_conv_bn_fuse_pass_tester.cc
@@ -25,6 +25,12 @@
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_DECLARE_KERNEL(conv2d_transpose, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(batch_norm, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(gelu, CPU, ALL_LAYOUT);
 
 USE_OP_ITSELF(batch_norm);
 USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN);
diff --git a/paddle/fluid/framework/new_executor/CMakeLists.txt b/paddle/fluid/framework/new_executor/CMakeLists.txt
index 46f340d681a225c03dbf4ad5cfd8f24ffd42ec0d..c2f32f5fe22311aa3df8051817d0e363373879b7 100644
--- a/paddle/fluid/framework/new_executor/CMakeLists.txt
+++ b/paddle/fluid/framework/new_executor/CMakeLists.txt
@@ -23,7 +23,7 @@ cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore)
 # cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
 # skip win32 since wget is not installed by default on windows machine.
 # skip COVERAGE_CI since the test runs slowly because of instrumentation.
-if (WITH_CUDA AND WITH_TESTING AND NOT WIN32 AND NOT WITH_COVERAGE AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
+if (WITH_GPU AND WITH_TESTING AND NOT WIN32 AND NOT WITH_COVERAGE AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
     add_custom_target(
         download_program
         COMMAND wget -nc https://paddle-ci.gz.bcebos.com/new_exec/lm_main_program 
diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc
index 878b845211ca1ae9e92f43fcc6ac82da366264d4..62e801b76955d74f15bfd81f8da641671de7307b 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore.cc
@@ -41,6 +41,7 @@ namespace paddle {
 namespace framework {
 // NOTE(Aurelius84): Need a better strategy to determine it.
 static constexpr size_t kHostNumThreads = 4;
+static constexpr size_t kDeviceNumThreads = 1;
 
 bool IsInterpretercoreFastGCEnabled() {
   return FLAGS_fast_eager_deletion_mode && FLAGS_use_stream_safe_cuda_allocator;
@@ -54,8 +55,8 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
       global_scope_(global_scope),
       stream_analyzer_(place) {
   is_build_ = false;
-  async_work_queue_.reset(
-      new interpreter::AsyncWorkQueue(kHostNumThreads, &main_thread_blocker_));
+  async_work_queue_.reset(new interpreter::AsyncWorkQueue(
+      kHostNumThreads, kDeviceNumThreads, &main_thread_blocker_));
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (IsInterpretercoreFastGCEnabled()) {
@@ -271,6 +272,10 @@ void InterpreterCore::Convert(
   if (FLAGS_new_executor_use_inplace) {
     BuildInplace();
   }
+
+  // Start preparing the atomic counters asynchronously for the first run.
+  async_work_queue_->PrepareAtomicDeps(dependecy_count_);
+  async_work_queue_->PrepareAtomicVarRef(vec_meta_info);
 }
 
 bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) {
@@ -388,18 +393,18 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
                            : global_scope_->GetMutableScope();
   auto op_with_kernel = dynamic_cast<const framework::OperatorWithKernel*>(op);
   {
-    platform::RecordEvent infershape_event(
-        "infer_shape", platform::TracerEventType::OperatorInner, 1,
-        platform::EventRole::kInnerOp);
-    // If it is OperatorBase, InferShape do nothing.
-    if (op_with_kernel != nullptr)
+    if (op_with_kernel != nullptr) {
+      platform::RecordEvent infershape_event(
+          "infer_shape", platform::TracerEventType::OperatorInner, 1,
+          platform::EventRole::kInnerOp);
+      // Only OperatorWithKernel has InferShape; plain OperatorBase skips it.
       op_with_kernel->Info().infer_shape_(
           instr_node.InnerInferShapeContext().get());
+    }
   }
 
-  if (op_with_kernel != nullptr &&
-      FLAGS_new_executor_use_inplace) {  // TODO(xiongkun03) Does operator
-                                         // base support inplace ?
+  if (op_with_kernel != nullptr && FLAGS_new_executor_use_inplace) {
+    // TODO(xiongkun03) Does operator base support inplace ?
     for (auto& pair : instr_node.InplaceInfo()) {
       const auto& in = paddle::framework::details::GetTensorFromVar(pair.first);
       auto* out =
@@ -409,6 +414,7 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
       }
     }
   }
+
   {
     platform::RecordEvent compute_event(
         "compute", platform::TracerEventType::OperatorInner, 1,
@@ -458,16 +464,24 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
 
 void InterpreterCore::ExecuteInstructionList(
     const std::vector<Instruction>& vec_instr) {
+  // NOTE(zhiqiu): take the counters prepared earlier (waiting on their
+  // std::future), then asynchronously prepare a fresh set for the next step.
+  auto atomic_deps = async_work_queue_->AtomicDeps();
+  auto atomic_var_ref = async_work_queue_->AtomicVarRef();
+
   async_work_queue_->PrepareAtomicDeps(dependecy_count_);
   async_work_queue_->PrepareAtomicVarRef(global_scope_->VecMetaInfo());
+
   unfinished_op_numer_ = vec_instr.size();
 
   exception_holder_.Clear();
 
   for (size_t i = 0; i < dependecy_count_.size(); ++i) {
     if (dependecy_count_[i] == 0) {
-      async_work_queue_->AddTask(vec_instr.at(i).KernelType(),
-                                 [&, i] { RunInstructionAsync(i); });
+      async_work_queue_->AddTask(vec_instr.at(i).KernelType(), [
+        this, i, atomic_deps = atomic_deps.get(),
+        atomic_var_ref = atomic_var_ref.get()
+      ] { RunInstructionAsync(i, atomic_deps, atomic_var_ref); });
     }
   }
 
@@ -490,11 +504,16 @@ void InterpreterCore::ExecuteInstructionList(
 }
 
 void InterpreterCore::RunNextInstructions(
-    const Instruction& instr, std::queue<size_t>* reserved_next_ops) {
+    const Instruction& instr, std::queue<size_t>* reserved_next_ops,
+    std::vector<std::atomic<size_t>>* atomic_deps,
+    std::vector<std::atomic<size_t>>* atomic_var_ref) {
+  VLOG(4) << "atomic 1:" << atomic_deps;
   auto& next_instr = instr.NextInstructions();
-  auto& atomic_deps = async_work_queue_->AtomicDeps();
-  auto IsReady = [&](size_t next_id) {
-    return atomic_deps[next_id]->fetch_sub(1, std::memory_order_relaxed) == 1;
+
+  auto IsReady = [atomic_deps](size_t next_id) {
+    VLOG(4) << "atomic:" << atomic_deps << " " << &(*atomic_deps)[next_id]
+            << " " << next_id;
+    return (*atomic_deps)[next_id].fetch_sub(1, std::memory_order_relaxed) == 1;
   };
 
   if (instr.KernelType() == OpFuncType::kQueueAsync) {
@@ -503,7 +522,9 @@ void InterpreterCore::RunNextInstructions(
       if (IsReady(next_id)) {
         async_work_queue_->AddTask(
             vec_instruction_[next_id].KernelType(),
-            [&, next_id] { RunInstructionAsync(next_id); });
+            [this, next_id, atomic_deps, atomic_var_ref]() {
+              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
+            });
       }
     }
     // keep all async_ops running in current thread
@@ -523,7 +544,9 @@ void InterpreterCore::RunNextInstructions(
       if (IsReady(next_id)) {
         async_work_queue_->AddTask(
             vec_instruction_[next_id].KernelType(),
-            [&, next_id] { RunInstructionAsync(next_id); });
+            [this, next_id, atomic_deps, atomic_var_ref] {
+              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
+            });
       }
     }
     auto direct_run_ops = interpreter::merge_vector(next_instr.SyncRunIds(),
@@ -539,14 +562,18 @@ void InterpreterCore::RunNextInstructions(
         // move rest ops into other threads
         async_work_queue_->AddTask(
             vec_instruction_[next_id].KernelType(),
-            [&, next_id] { RunInstructionAsync(next_id); });
+            [this, next_id, atomic_deps, atomic_var_ref] {
+              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
+            });
       }
     }
     if (first_op != 0) reserved_next_ops->push(first_op);
   }
 }
 
-void InterpreterCore::RunInstructionAsync(size_t instr_id) {
+void InterpreterCore::RunInstructionAsync(
+    size_t instr_id, std::vector<std::atomic<size_t>>* atomic_deps,
+    std::vector<std::atomic<size_t>>* atomic_var_ref) {
   std::queue<size_t> ready_ops;
   ready_ops.push(instr_id);
   while (!ready_ops.empty()) {
@@ -571,7 +598,7 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       RecordStreamForGC(instr_node);
 #endif
-      CheckGC(instr_node);
+      CheckGC(instr_node, atomic_var_ref);
     } catch (platform::EnforceNotMet& ex) {
       framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex);
       exception_holder_.Catch(std::make_exception_ptr(std::move(ex)));
@@ -605,7 +632,7 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) {
 
     interpreter::RecordEvent(instr_node, place_);
 
-    RunNextInstructions(instr_node, &ready_ops);
+    RunNextInstructions(instr_node, &ready_ops, atomic_deps, atomic_var_ref);
   }
 }
 
@@ -703,17 +730,19 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) {
 }
 #endif
 
-void InterpreterCore::CheckGC(const Instruction& instr) {
+void InterpreterCore::CheckGC(
+    const Instruction& instr,
+    std::vector<std::atomic<size_t>>* atomic_var_ref) {
   size_t instr_id = instr.Id();
   auto& var_scope = *global_scope_;
-  auto& atomic_var_ref = async_work_queue_->AtomicVarRef();
 
   for (auto var_id : instr.GCCheckVars()) {
     VLOG(4) << "GC " << global_scope_->GetNameById(var_id) << " "
             << var_scope.VarDesc(var_id);
-
+    VLOG(4) << "atomic:" << atomic_var_ref << " " << &(*atomic_var_ref)[var_id]
+            << " " << var_id;
     bool is_ready =
-        atomic_var_ref[var_id]->fetch_sub(1, std::memory_order_relaxed) == 1;
+        (*atomic_var_ref)[var_id].fetch_sub(1, std::memory_order_relaxed) == 1;
     // ignore all persistable var while GC
     if (var_scope.VarDesc(var_id) && var_scope.VarDesc(var_id)->Persistable()) {
       continue;
diff --git a/paddle/fluid/framework/new_executor/interpretercore.h b/paddle/fluid/framework/new_executor/interpretercore.h
index 51734abbb1bf82345b525e6df3f766f99921b8b1..c1ade85e1384c0e1f6fe3f3d6480b606e8a24391 100644
--- a/paddle/fluid/framework/new_executor/interpretercore.h
+++ b/paddle/fluid/framework/new_executor/interpretercore.h
@@ -76,11 +76,16 @@ class InterpreterCore {
   void RecordStreamForGC(const Instruction& instr);
 #endif
 
-  void CheckGC(const Instruction& instr);
+  void CheckGC(const Instruction& instr,
+               std::vector<std::atomic<size_t>>* atomic_var_ref);
 
-  void RunInstructionAsync(size_t instr_id);
+  void RunInstructionAsync(size_t instr_id,
+                           std::vector<std::atomic<size_t>>* atomic_deps,
+                           std::vector<std::atomic<size_t>>* atomic_var_ref);
   void RunNextInstructions(const Instruction& instr_id,
-                           std::queue<size_t>* reserved_next_ops);
+                           std::queue<size_t>* reserved_next_ops,
+                           std::vector<std::atomic<size_t>>* atomic_deps,
+                           std::vector<std::atomic<size_t>>* atomic_var_ref);
 
   void BuildSkipShareLoDInfo();
 
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.cc b/paddle/fluid/framework/new_executor/interpretercore_util.cc
index d595af58257d4f6e0f6bd1fd009ab78e181f96f7..a045d6c7f4a65fdda83578f319b75788d0f68f95 100644
--- a/paddle/fluid/framework/new_executor/interpretercore_util.cc
+++ b/paddle/fluid/framework/new_executor/interpretercore_util.cc
@@ -44,32 +44,58 @@ void AsyncWorkQueue::AddTask(const OpFuncType& op_func_type,
 
 using VariableIdMap = std::map<std::string, std::vector<int>>;
 
-AtomicVectorSizeT& AsyncWorkQueue::PrepareAtomicDeps(
+// Asynchronously builds a fresh vector of atomic dependency counters.
+//
+// A task is queued on work-queue index 2; it copies every entry of
+// `dependecy_count` into a newly allocated
+// std::vector<std::atomic<size_t>> and publishes it through `atomic_deps_`.
+//
+// `dependecy_count` is captured by reference, so it must stay alive and
+// unmodified until the queued task has finished.
+void AsyncWorkQueue::PrepareAtomicDeps(
     const std::vector<size_t>& dependecy_count) {
-  if (atomic_deps_.size() != dependecy_count.size()) {
-    atomic_deps_.clear();
-    std::generate_n(std::back_inserter(atomic_deps_), dependecy_count.size(),
-                    [] { return std::make_unique<std::atomic<size_t>>(0); });
-  }
-
-  for (size_t i = 0; i < dependecy_count.size(); ++i) {
-    atomic_deps_[i]->store(dependecy_count[i]);
-  }
-  return atomic_deps_;
+  VLOG(4) << "PrepareAtomicDeps";
+  auto p = std::make_shared<
+      std::promise<std::unique_ptr<std::vector<std::atomic<size_t>>>>>();
+  atomic_deps_ = p->get_future();
+  queue_group_->AddTask(2, [&dependecy_count, p] {
+    // make_unique owns the vector from the moment it is allocated, so nothing
+    // leaks if a later statement in this task throws.
+    auto op_deps = std::make_unique<std::vector<std::atomic<size_t>>>(
+        dependecy_count.size());
+    for (size_t i = 0; i < dependecy_count.size(); ++i) {
+      (*op_deps)[i] = dependecy_count[i];
+    }
+    VLOG(4) << "AtomicDeps:" << op_deps.get() << " " << (*op_deps).size();
+    p->set_value(std::move(op_deps));
+  });
 }
 
-AtomicVectorSizeT& AsyncWorkQueue::PrepareAtomicVarRef(
+// Asynchronously builds a fresh vector of atomic variable reference counters.
+//
+// A task is queued on work-queue index 2; it copies `var_ref_count_` of every
+// entry of `vec_meta_info` into a newly allocated
+// std::vector<std::atomic<size_t>> and publishes it through `atomic_var_ref_`.
+//
+// `vec_meta_info` is captured by reference, so it must stay alive and
+// unmodified until the queued task has finished.
+void AsyncWorkQueue::PrepareAtomicVarRef(
     const std::vector<VariableMetaInfo>& vec_meta_info) {
-  if (atomic_var_ref_.size() != vec_meta_info.size()) {
-    atomic_var_ref_.clear();
-    std::generate_n(std::back_inserter(atomic_var_ref_), vec_meta_info.size(),
-                    [] { return std::make_unique<std::atomic<size_t>>(0); });
-  }
-
-  for (size_t i = 0; i < vec_meta_info.size(); ++i) {
-    atomic_var_ref_[i]->store(vec_meta_info[i].var_ref_count_);
-  }
-  return atomic_var_ref_;
+  VLOG(4) << "PrepareAtomicVarRef";
+  auto p = std::make_shared<
+      std::promise<std::unique_ptr<std::vector<std::atomic<size_t>>>>>();
+  atomic_var_ref_ = p->get_future();
+  queue_group_->AddTask(2, [&vec_meta_info, p] {
+    // make_unique owns the vector from the moment it is allocated, so nothing
+    // leaks if a later statement in this task throws.
+    auto var_ref = std::make_unique<std::vector<std::atomic<size_t>>>(
+        vec_meta_info.size());
+    for (size_t i = 0; i < vec_meta_info.size(); ++i) {
+      (*var_ref)[i] = vec_meta_info[i].var_ref_count_;
+    }
+    VLOG(4) << "AtomicVarRef:" << var_ref.get() << " " << (*var_ref).size();
+    p->set_value(std::move(var_ref));
+  });
 }
 
 bool var_can_be_deleted(const std::string& name, const BlockDesc& block) {
diff --git a/paddle/fluid/framework/new_executor/interpretercore_util.h b/paddle/fluid/framework/new_executor/interpretercore_util.h
index 81c05df62ec41970bcfbefe5e001527e777051ed..044a9ea368cbc506ce4a30bb82562177263786f9 100644
--- a/paddle/fluid/framework/new_executor/interpretercore_util.h
+++ b/paddle/fluid/framework/new_executor/interpretercore_util.h
@@ -50,11 +50,13 @@ namespace framework {
 
 namespace interpreter {
 
-using AtomicVectorSizeT = std::vector<std::unique_ptr<std::atomic<size_t>>>;
+using AtomicVectorSizeT =
+    std::future<std::unique_ptr<std::vector<std::atomic<size_t>>>>;
 
 class AsyncWorkQueue {
  public:
-  AsyncWorkQueue(size_t host_num_threads, EventsWaiter* waiter)
+  AsyncWorkQueue(size_t host_num_threads, size_t deivce_num_threads,
+                 EventsWaiter* waiter)
       : host_num_thread_(host_num_threads) {
     std::vector<WorkQueueOptions> group_options;
     // for execute host Kernel
@@ -66,6 +68,13 @@ class AsyncWorkQueue {
                                /*events_waiter*/ waiter);
     // for launch device Kernel
     group_options.emplace_back(/*name*/ "DeviceKernelLaunch",
+                               /*num_threads*/ deivce_num_threads,
+                               /*allow_spinning*/ true,
+                               /*track_task*/ false,
+                               /*detached*/ true,
+                               /*events_waiter*/ waiter);
+    // for preparing atomic deps / var-ref counters and other auxiliary tasks
+    group_options.emplace_back(/*name*/ "Prepare",
                                /*num_threads*/ 1,
                                /*allow_spinning*/ true,
                                /*track_task*/ false,
@@ -74,10 +83,8 @@ class AsyncWorkQueue {
     queue_group_ = CreateWorkQueueGroup(group_options);
   }
 
-  AtomicVectorSizeT& PrepareAtomicDeps(
-      const std::vector<size_t>& dependecy_count);
-  AtomicVectorSizeT& PrepareAtomicVarRef(
-      const std::vector<VariableMetaInfo>& vec_meta_info);
+  void PrepareAtomicDeps(const std::vector<size_t>& dependecy_count);
+  void PrepareAtomicVarRef(const std::vector<VariableMetaInfo>& vec_meta_info);
 
   // void WaitEmpty() { queue_group_->WaitQueueGroupEmpty(); }
 
@@ -85,8 +92,12 @@ class AsyncWorkQueue {
 
   void Cancel() { queue_group_->Cancel(); }
 
-  AtomicVectorSizeT& AtomicDeps() { return atomic_deps_; }
-  AtomicVectorSizeT& AtomicVarRef() { return atomic_var_ref_; }
+  std::unique_ptr<std::vector<std::atomic<size_t>>> AtomicDeps() {
+    return atomic_deps_.get();
+  }
+  std::unique_ptr<std::vector<std::atomic<size_t>>> AtomicVarRef() {
+    return atomic_var_ref_.get();
+  }
 
  private:
   size_t host_num_thread_;
diff --git a/paddle/fluid/framework/new_executor/standalone_executor_test.cc b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
index 7fe1852f7396cb8cebe4b83f4cc80a8023421351..8d5058a586b9e2324bc204bc89d1f3bf0c53bd45 100644
--- a/paddle/fluid/framework/new_executor/standalone_executor_test.cc
+++ b/paddle/fluid/framework/new_executor/standalone_executor_test.cc
@@ -20,45 +20,65 @@
 // #include "gperftools/profiler.h"
 
 #include "paddle/fluid/framework/new_executor/standalone_executor.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 USE_OP_ITSELF(fill_constant);
-USE_OP(uniform_random);
+USE_OP_ITSELF(uniform_random);
 USE_OP(lookup_table);
-USE_OP(transpose2);
+USE_OP_ITSELF(transpose2);
 USE_OP_ITSELF(reshape2);
-USE_OP(split);
-USE_OP(slice);
-USE_OP(concat);
-USE_OP(matmul);
+USE_OP_ITSELF(split);
+USE_OP_ITSELF(slice);
+USE_OP_ITSELF(concat);
+USE_OP_ITSELF(matmul);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_ITSELF(sigmoid);
 USE_OP_ITSELF(tanh);
-USE_OP(elementwise_mul);
+USE_OP_ITSELF(elementwise_mul);
 USE_OP(softmax_with_cross_entropy);
 USE_OP_ITSELF(reduce_mean);
 USE_OP_ITSELF(reduce_sum);
 USE_OP_ITSELF(reduce_sum_grad);
 USE_OP_ITSELF(reduce_mean_grad);
 USE_OP_ITSELF(reshape2_grad);
-USE_OP(softmax_with_cross_entropy_grad);
+USE_OP_ITSELF(softmax_with_cross_entropy_grad);
 USE_OP_ITSELF(elementwise_add_grad);
-USE_OP(matmul_grad);
-USE_OP(square);
-USE_OP(transpose2_grad);
+USE_OP_ITSELF(matmul_grad);
+USE_OP_ITSELF(square);
+USE_OP_ITSELF(transpose2_grad);
 USE_OP(concat_grad);
 USE_OP_ITSELF(elementwise_mul_grad);
 USE_OP_ITSELF(sigmoid_grad);
 USE_OP_ITSELF(tanh_grad);
 USE_OP(sum);
-USE_OP(slice_grad);
-USE_OP(lookup_table_grad);
+USE_OP_ITSELF(slice_grad);
+USE_OP_ITSELF(lookup_table_grad);
 USE_OP(sqrt);
 USE_OP(elementwise_max);
 USE_OP_ITSELF(elementwise_div);
-USE_OP(sgd);
+USE_OP_ITSELF(sgd);
 USE_OP(squared_l2_norm);
-USE_OP(memcpy_h2d);
-USE_OP(memcpy_d2h);
+USE_OP_ITSELF(memcpy_h2d);
+USE_OP_ITSELF(memcpy_d2h);
+
+PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(uniform_random_raw, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(transpose, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(reshape, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(split, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(concat, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add_raw, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(reshape_grad, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(transpose_grad, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(max_raw, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sgd, GPU, ALL_LAYOUT);
+
 DECLARE_double(eager_delete_tensor_gb);
 
 namespace paddle {
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 42fbeb5d29ce4ac3a1498704b1fff88570c9c092..15777c287b42241350193583a21f3796febf5f02 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -1122,7 +1122,15 @@ static void CheckTensorNANOrInf(const std::string& op_type,
 
 bool OperatorWithKernel::SupportsMKLDNN(
     const proto::VarType::Type data_type) const {
-  auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_);
+  auto op_kernel_iter = OperatorWithKernel::AllOpKernels().find(type_);
+  if (op_kernel_iter == OperatorWithKernel::AllOpKernels().end()) {
+    VLOG(6) << "Warning: " << type_ << " don't find its MKLDNN Kernel in Fluid "
+                                       "Registered Kernels. And We don't "
+                                       "search its kernels in phi lib, "
+                                       "SupportsMKLDNN() return false.";
+    return false;
+  }
+  auto& op_kernels = op_kernel_iter->second;
   return std::any_of(op_kernels.begin(), op_kernels.end(),
                      [data_type](OpKernelMap::const_reference kern_pair) {
                        return platform::is_cpu_place(kern_pair.first.place_) &&
diff --git a/paddle/fluid/framework/scope.h b/paddle/fluid/framework/scope.h
index db4f6761bcec9d970863bd7f38b110ad719271ca..1669fba1327e5c08053735036b1946d35f2e8e49 100644
--- a/paddle/fluid/framework/scope.h
+++ b/paddle/fluid/framework/scope.h
@@ -144,6 +144,9 @@ class Scope : public ScopeBase {
   void Rename(const std::string& origin_name,
               const std::string& new_name) const;
 
+  // Return the number of variables stored in this scope's vars_ container.
+  size_t Size() const { return vars_.size(); }
+
   // Rename variable to a new name and return the new name
   std::string Rename(const std::string& origin_name) const;
 
diff --git a/paddle/fluid/imperative/tests/test_tracer.cc b/paddle/fluid/imperative/tests/test_tracer.cc
index f754c6fdd0ee7742f0e544baad0225502c172848..75876e07fb5c78fb6ec6949489efac9fcf618a69 100644
--- a/paddle/fluid/imperative/tests/test_tracer.cc
+++ b/paddle/fluid/imperative/tests/test_tracer.cc
@@ -32,6 +32,8 @@
 
 PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT);
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index 845d0ed073b32cc136ec6b9d76c9e3073d7b051a..8f7b62a2c9d27b699688d53ff1545b80a8f4400c 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -1496,6 +1496,9 @@ REGISTER_ACTIVATION_OP(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,
                        HardSigmoidGradFunctor);
 REGISTER_ACTIVATION_OP(logsigmoid, LogSigmoid, LogSigmoidFunctor,
                        LogSigmoidGradFunctor);
+REGISTER_ACTIVATION_OP(log2, Log2, Log2Functor, Log2GradFunctor);
+REGISTER_ACTIVATION_OP(log10, Log10, Log10Functor, Log10GradFunctor);
+REGISTER_ACTIVATION_OP(log1p, Log1p, Log1pFunctor, Log1pGradFunctor);
 
 /* ==========================    sigmoid register  =============================
  */
@@ -1867,15 +1870,6 @@ REGISTER_OPERATOR(
     ops::ActivationOpDoubleGrad<ops::LogGradGradFunctor<float>::FwdDeps()>,
     ops::ActivationDoubleGradOpInplaceInferer);
 
-REGISTER_ACTIVATION_CPU_KERNEL(log, Log, LogFunctor, LogGradFunctor);
-
-REGISTER_OP_CPU_KERNEL(
-    log_grad_grad, ops::LogDoubleGradKernel<plat::CPUDeviceContext,
-                                            ops::LogGradGradFunctor<float>>,
-    ops::LogDoubleGradKernel<plat::CPUDeviceContext,
-                             ops::LogGradGradFunctor<double>>,
-    ops::LogDoubleGradKernel<plat::CPUDeviceContext,
-                             ops::LogGradGradFunctor<plat::float16>>);
 /* ========================================================================== */
 
 /* ==========================  register checkpoint ===========================*/
diff --git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h
index f1984af6e15eac6682bd341f470727b899e82f3a..7db5675c16b2d3cc7df7176a40e2010355ebd37c 100644
--- a/paddle/fluid/operators/activation_op.h
+++ b/paddle/fluid/operators/activation_op.h
@@ -281,6 +281,11 @@ USE_PHI_DOUBLE_GRAD_FUNCTOR(Sigmoid)
 USE_PHI_TRIPLE_GRAD_FUNCTOR(Sigmoid)
 USE_PHI_FUNCTOR(LogSigmoid)
 USE_PHI_FUNCTOR(HardSigmoid)
+USE_PHI_FUNCTOR(Log)
+USE_PHI_DOUBLE_GRAD_FUNCTOR(Log)
+USE_PHI_FUNCTOR(Log2)
+USE_PHI_FUNCTOR(Log10)
+USE_PHI_FUNCTOR(Log1p)
 
 template <typename T>
 using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor<T>;
@@ -448,88 +453,6 @@ struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-// log(x) = natural logarithm of x
-template <typename T>
-struct LogFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.log();
-  }
-};
-
-template <typename T>
-struct LogGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * (static_cast<T>(1) / x);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-// log2(x) = logarithm to the base 2 of the elements of x
-template <typename T>
-struct Log2Functor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.log() / static_cast<T>(log(2));
-  }
-};
-
-// the gradient of log2(x) is 1/(x*ln(2))
-template <typename T>
-struct Log2GradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * static_cast<T>(1) / (x * static_cast<T>(log(2)));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-// log10(x) = logarithm to the base 10 of the elements of x
-template <typename T>
-struct Log10Functor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = x.log() / static_cast<T>(log(10));
-  }
-};
-
-// the gradient of log10(x) is 1/(x*ln(10))
-template <typename T>
-struct Log10GradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * static_cast<T>(1) / (x * static_cast<T>(log(10)));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-// log1p(x) = natural logarithm of x+1
-template <typename T>
-struct Log1pFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out>
-  void operator()(Device d, X x, Out out) const {
-    out.device(d) = (static_cast<T>(1) + x).log();
-  }
-};
-
-template <typename T>
-struct Log1pGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device, typename X, typename Out, typename dOut,
-            typename dX>
-  void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
-    dx.device(d) = dout * (static_cast<T>(1) / (x + static_cast<T>(1)));
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 // square(x) = x^2
 template <typename T>
 struct SquareFunctor : public BaseActivationFunctor<T> {
@@ -1197,37 +1120,6 @@ class SquareDoubleGradKernel
   }
 };
 
-template <typename DeviceContext, typename Functor>
-class LogDoubleGradKernel
-    : public SquareDoubleGradKernel<DeviceContext, Functor> {};
-
-template <typename DeviceContext, typename Functor>
-class ELUDoubleGradKernel
-    : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
- public:
-  using T = typename Functor::ELEMENT_TYPE;
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    const framework::Tensor *X, *ddX, *dOut;
-    X = ddX = dOut = nullptr;
-    framework::Tensor *dX, *ddOut;
-    dX = ddOut = nullptr;
-
-    ExtractDoubleGradTensorWithInputDOut(ctx, &X, &ddX, &dX, &dOut, &ddOut);
-
-    if (dX) dX->mutable_data<T>(X->dims(), ctx.GetPlace());
-    if (ddOut) ddOut->mutable_data<T>(ctx.GetPlace());
-
-    auto& place = ctx.template device_context<DeviceContext>();
-
-    Functor functor;
-    auto attrs = functor.GetAttrs();
-    for (auto& attr : attrs) {
-      *attr.second = ctx.Attr<float>(attr.first);
-    }
-    functor(place, X, ddX, ddOut, dOut, dX);
-  }
-};
-
 template <typename DeviceContext, typename Functor>
 class CELUDoubleGradKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
@@ -1522,36 +1414,6 @@ class LogitGradKernel : public framework::OpKernel<T> {
   }
 };
 
-template <typename T>
-struct LogGradGradFunctor : public BaseActivationFunctor<T> {
-  template <typename Device>
-  void operator()(const Device& dev, const framework::Tensor* X,
-                  const framework::Tensor* ddX, framework::Tensor* ddOut,
-                  const framework::Tensor* dOut, framework::Tensor* dX) const {
-    auto* d = dev.eigen_device();
-    auto ddx = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(ddX, "Input", "DDX", "LogGradGrad"));
-    auto x = framework::EigenVector<T>::Flatten(
-        GET_DATA_SAFELY(X, "Input", "X", "LogGradGrad"));
-    // ddout = ddx / x; dx = -(dout / x) * (ddx / x)
-    // calculate dx first, so ddout can inplace ddx
-    if (dX) {
-      auto dout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dOut, "Output", "DOut", "LogGradGrad"));
-      auto dx = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(dX, "Output", "DX", "LogGradGrad"));
-      dx.device(*d) = dout * static_cast<T>(-1) * ddx / (x * x);
-    }
-    if (ddOut) {
-      auto ddout = framework::EigenVector<T>::Flatten(
-          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "LogGradGrad"));
-      ddout.device(*d) = ddx * static_cast<T>(1) / x;
-    }
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 }  // namespace operators
 }  // namespace paddle
 
@@ -1560,9 +1422,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
   __macro(floor, Floor, FloorFunctor, ZeroGradFunctor);                      \
   __macro(round, Round, RoundFunctor, ZeroGradFunctor);                      \
   __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
-  __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor);                     \
-  __macro(log2, Log2, Log2Functor, Log2GradFunctor);                         \
-  __macro(log10, Log10, Log10Functor, Log10GradFunctor);                     \
   __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor);        \
   __macro(stanh, STanh, STanhFunctor, STanhGradFunctor);                     \
   __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor);         \
diff --git a/paddle/fluid/operators/activation_op.kps b/paddle/fluid/operators/activation_op.kps
index 7c1b288080162e2a5bf847a795fc640ab5e5e4e1..bb08cee5bcde929f71415e5844a30c4d47f37b0a 100644
--- a/paddle/fluid/operators/activation_op.kps
+++ b/paddle/fluid/operators/activation_op.kps
@@ -131,27 +131,6 @@ struct CudaExpm1GradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-template <typename T>
-struct CudaLogFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // log(x) = log(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(log(x));
-  }
-};
-
-template <typename T>
-struct CudaLogGradFunctor : public BaseActivationFunctor<T> {
-  // dx = dout / x
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    return dout / x;
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename T>
 struct CudaSquareFunctor : public BaseActivationFunctor<T> {
   // square(x) = x * x
@@ -220,78 +199,6 @@ struct CudaRsqrtGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
-template <typename T>
-struct CudaLog1pFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  MPType one = static_cast<MPType>(1.0f);
-
-  // log1p(x) = log(1 + x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(log(one + x));
-  }
-};
-
-template <typename T>
-struct CudaLog1pGradFunctor : public BaseActivationFunctor<T> {
-  T one = static_cast<T>(1.0f);
-
-  // dx = dout / (1 + x)
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    return dout / (one + x);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct CudaLog2Functor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // log2(x) = log2(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(log2(x));
-  }
-};
-
-template <typename T>
-struct CudaLog2GradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  T log_two = static_cast<T>(log(static_cast<MPType>(2.0f)));
-
-  // dx = dout / (x * log(2))
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    return dout / (x * log_two);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
-template <typename T>
-struct CudaLog10Functor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-
-  // log10(x) = log10(x)
-  __device__ __forceinline__ T operator()(const T arg_x) const {
-    MPType x = static_cast<MPType>(arg_x);
-    return static_cast<T>(log10(x));
-  }
-};
-
-template <typename T>
-struct CudaLog10GradFunctor : public BaseActivationFunctor<T> {
-  using MPType = typename details::MPTypeTrait<T>::Type;
-  T log_ten = static_cast<T>(log(static_cast<MPType>(10.0f)));
-
-  // dx = dout / (x * log(10))
-  __device__ __forceinline__ T operator()(const T dout, const T x) const {
-    return dout / (x * log_ten);
-  }
-
-  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
-};
-
 template <typename T>
 struct CudaSoftReluFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
@@ -773,6 +680,10 @@ USE_PHI_FUNCTOR(CudaELU)
 USE_PHI_FUNCTOR(CudaSigmoid)
 USE_PHI_FUNCTOR(CudaLogSigmoid)
 USE_PHI_FUNCTOR(CudaHardSigmoid)
+USE_PHI_FUNCTOR(CudaLog)
+USE_PHI_FUNCTOR(CudaLog2)
+USE_PHI_FUNCTOR(CudaLog10)
+USE_PHI_FUNCTOR(CudaLog1p)
 
 template <typename T>
 using CudaELUGradNegativeAlphaFunctor =
@@ -975,18 +886,6 @@ REGISTER_OP_CUDA_KERNEL(
                                   ops::CudaExpm1GradFunctor<plat::float16>>);
 /* ========================================================================== */
 
-/* ==========================  Log register ==================================*/
-REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor);
-
-REGISTER_OP_CUDA_KERNEL(
-    log_grad_grad, ops::LogDoubleGradKernel<plat::CUDADeviceContext,
-                                            ops::LogGradGradFunctor<float>>,
-    ops::LogDoubleGradKernel<plat::CUDADeviceContext,
-                             ops::LogGradGradFunctor<double>>,
-    ops::LogDoubleGradKernel<plat::CUDADeviceContext,
-                             ops::LogGradGradFunctor<plat::float16>>);
-/* ========================================================================== */
-
 #define FOR_EACH_ACTIVATION_CUDA_OP(__macro)                                  \
   __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor,                      \
           CudaSoftShrinkGradFunctor);                                         \
@@ -995,9 +894,6 @@ REGISTER_OP_CUDA_KERNEL(
   __macro(round, Round, CudaRoundFunctor, CudaZeroGradFunctor);               \
   __macro(reciprocal, Reciprocal, CudaReciprocalFunctor,                      \
           CudaReciprocalGradFunctor);                                         \
-  __macro(log1p, Log1p, CudaLog1pFunctor, CudaLog1pGradFunctor);              \
-  __macro(log2, Log2, CudaLog2Functor, CudaLog2GradFunctor);                  \
-  __macro(log10, Log10, CudaLog10Functor, CudaLog10GradFunctor);              \
   __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \
   __macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor);              \
   __macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor);  \
diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
index f699dac7976c5aa6745ca5d08079699e3cc0a63c..57e7cbb74079ed44a3f5554cda00243dc51f3a31 100644
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.cc
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/fill_constant_batch_size_like_op.h"
 #include "paddle/fluid/operators/batch_size_like.h"
 
 namespace paddle {
@@ -23,9 +22,13 @@ class FillConstantBatchSizeLikeOp : public BatchSizeLikeOp {
   using BatchSizeLikeOp::BatchSizeLikeOp;
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
+    framework::OpKernelType kernel_type(
         static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
         ctx.device_context());
+    // The dtype comes from the "dtype" attribute and the place from the device
+    // context; "force_cpu" then overrides that place with CPUPlace.
+    if (ctx.Attr<bool>("force_cpu")) kernel_type.place_ = platform::CPUPlace();
+    return kernel_type;
   }
 };
 
@@ -64,15 +67,3 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
     ops::FillConstantBatchSizeLikeOpMaker,
     ops::BatchSizeLikeNoNeedBufferVarsInferer);
-REGISTER_OP_CPU_KERNEL(
-    fill_constant_batch_size_like,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
-                                           float>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
-                                           double>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
-                                           int>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
-                                           int64_t>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
-                                           bool>);
diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc
deleted file mode 100644
index de06aeb01e4dda4e8ca4b4e70ca2c3ad6aa4b5dc..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.cu.cc
+++ /dev/null
@@ -1,32 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include "paddle/fluid/operators/fill_constant_batch_size_like_op.h"
-#include "paddle/fluid/framework/op_registry.h"
-
-namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    fill_constant_batch_size_like,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
-                                           paddle::platform::float16>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
-                                           float>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
-                                           double>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
-                                           int>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
-                                           int64_t>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext,
-                                           bool>);
diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.h b/paddle/fluid/operators/fill_constant_batch_size_like_op.h
deleted file mode 100644
index 31471c6b622684ac2134366bd23b8919ba1f93e5..0000000000000000000000000000000000000000
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.h
+++ /dev/null
@@ -1,84 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <string>
-#include "paddle/fluid/framework/convert_utils.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/phi/kernels/funcs/math_function.h"
-
-namespace paddle {
-namespace operators {
-
-template <typename DeviceContext, typename T>
-class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext &ctx) const override {
-    auto data_type =
-        static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
-    auto float_value = ctx.Attr<float>("value");
-    auto str_value = ctx.Attr<std::string>("str_value");
-    auto force_cpu = ctx.Attr<bool>("force_cpu");
-
-    auto *out = ctx.Output<framework::Tensor>("Out");
-    auto *in = ctx.Input<framework::LoDTensor>("Input");
-    if (in->lod().size() && ctx.Attr<int>("input_dim_idx") == 0) {
-      // set the correct batch size for the LoDTensor.
-      auto odims = out->dims();
-      int output_dim_idx = ctx.Attr<int>("output_dim_idx");
-      odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1;
-      out->mutable_data<T>(odims, ctx.GetPlace());
-    }
-
-    T value;
-    if (str_value.empty()) {
-      value = static_cast<T>(float_value);
-    } else {
-      std::stringstream convert_stream(str_value);
-      if (std::is_same<int64_t, T>::value) {
-        int64_t tmp_value;
-        convert_stream >> tmp_value;
-        value = static_cast<T>(tmp_value);
-      } else {
-        double tmp_value;
-        convert_stream >> tmp_value;
-        value = static_cast<T>(tmp_value);
-      }
-    }
-
-    platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
-    bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace();
-    if (cpu_place) {
-      auto &dev_ctx = *pool.Get(platform::CPUPlace());
-      phi::funcs::SetConstant<platform::CPUDeviceContext, T> functor;
-      out->mutable_data(platform::CPUPlace(),
-                        framework::TransToPhiDataType(data_type));
-      functor(reinterpret_cast<const platform::CPUDeviceContext &>(dev_ctx),
-              out, static_cast<T>(value));
-    }
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-    if (!cpu_place) {
-      auto &dev_ctx = *pool.Get(ctx.GetPlace());
-      phi::funcs::SetConstant<platform::CUDADeviceContext, T> functor;
-      out->mutable_data(ctx.GetPlace(),
-                        framework::TransToPhiDataType(data_type));
-      functor(reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx),
-              out, static_cast<T>(value));
-    }
-#endif
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/squeeze_op.cc b/paddle/fluid/operators/squeeze_op.cc
index b3403a960a128dae86395c2d7feabf2c07461c03..ff378396b188fbd55c8228dbba6226cb799204dc 100644
--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
@@ -19,7 +19,9 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -113,13 +115,13 @@ class SqueezeOp : public framework::OperatorWithKernel {
     auto input_data_type =
         framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
 
-    //#ifdef PADDLE_WITH_MKLDNN
+    // #ifdef PADDLE_WITH_MKLDNN
     //    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
     //      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
     //                                     framework::DataLayout::kMKLDNN,
     //                                     framework::LibraryType::kMKLDNN);
     //    }
-    //#endif
+    // #endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -140,13 +142,13 @@ class SqueezeGradOp : public framework::OperatorWithKernel {
     auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(
         ctx, framework::GradVarName("Out"));
 
-    //#ifdef PADDLE_WITH_MKLDNN
+    // #ifdef PADDLE_WITH_MKLDNN
     //    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
     //      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
     //                                     framework::DataLayout::kMKLDNN,
     //                                     framework::LibraryType::kMKLDNN);
     //    }
-    //#endif
+    // #endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -201,53 +203,18 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
 class Squeeze2Op : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
-
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Squeeze2");
-    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Squeeze2");
-
-    const auto &x_dims = ctx->GetInputDim("X");
-    // Check input tensor dims (<6) Eigen limit.
-    PADDLE_ENFORCE_LE(x_dims.size(), 6,
-                      platform::errors::InvalidArgument(
-                          "The dimensions of Input(X) "
-                          "should be in the range of [1, 6] (Eigen limit)."
-                          "But received X's dimensions = %d, X's shape = [%s].",
-                          x_dims.size(), x_dims));
-
-    const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
-
-    auto out_dims = GetOutputShape(axes, x_dims, false);
-    ctx->SetOutputDim("Out", out_dims);
-    if (x_dims[0] == out_dims[0]) {
-      // Only pass LoD when the first dimension of output and Input(X)
-      // are the same.
-      ctx->ShareLoD("X", "Out");
-    }
-
-    if (!ctx->HasOutput("XShape")) return;
-
-    std::vector<int64_t> xshape_dims(x_dims.size() + 1);
-    xshape_dims[0] = 0;
-    for (int i = 0; i < x_dims.size(); ++i) {
-      xshape_dims[i + 1] = x_dims[i];
-    }
-    ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims));
-    ctx->ShareLoD("X", /*->*/ "XShape");
-  }
-
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     auto input_data_type =
         framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
 
-    //#ifdef PADDLE_WITH_MKLDNN
+    // #ifdef PADDLE_WITH_MKLDNN
     //    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
     //      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
     //                                     framework::DataLayout::kMKLDNN,
     //                                     framework::LibraryType::kMKLDNN);
     //    }
-    //#endif
+    // #endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -287,13 +254,13 @@ class Squeeze2GradOp : public framework::OperatorWithKernel {
     auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(
         ctx, framework::GradVarName("Out"));
 
-    //#ifdef PADDLE_WITH_MKLDNN
+    // #ifdef PADDLE_WITH_MKLDNN
     //    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
     //      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
     //                                     framework::DataLayout::kMKLDNN,
     //                                     framework::LibraryType::kMKLDNN);
     //    }
-    //#endif
+    // #endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -365,6 +332,10 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(SqueezeGradNoNeedBufferVarsInferer, "X");
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+// Infer-shape functor for the squeeze2 op, backed by phi::SqueezeInferMeta.
+DECLARE_INFER_SHAPE_FUNCTOR(squeeze2, Squeeze2InferShapeFunctor,
+                            PD_INFER_META(phi::SqueezeInferMeta));
+
 REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
                   ops::SqueezeGradOpMaker<paddle::framework::OpDesc>,
                   ops::SqueezeGradOpMaker<paddle::imperative::OpBase>);
@@ -376,7 +347,7 @@ REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp,
 REGISTER_OPERATOR(squeeze2, ops::Squeeze2Op, ops::Squeeze2OpMaker,
                   ops::Squeeze2GradOpMaker<paddle::framework::OpDesc>,
                   ops::Squeeze2GradOpMaker<paddle::imperative::OpBase>,
-                  ops::SqueezeInplaceInferer);
+                  ops::SqueezeInplaceInferer, Squeeze2InferShapeFunctor);
 REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp,
                   ops::Squeeze2DoubleGradOpMaker<paddle::framework::OpDesc>,
                   ops::Squeeze2DoubleGradOpMaker<paddle::imperative::OpBase>,
@@ -411,34 +382,3 @@ REGISTER_OP_CPU_KERNEL(
                            paddle::platform::complex<double>>,
     ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext,
                            paddle::platform::bfloat16>);
-
-REGISTER_OP_CPU_KERNEL(
-    squeeze2, ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, float>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, double>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, bool>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, uint8_t>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext,
-                        paddle::platform::complex<float>>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext,
-                        paddle::platform::complex<double>>,
-    ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext,
-                        paddle::platform::bfloat16>);
-
-REGISTER_OP_CPU_KERNEL(
-    squeeze2_grad,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, bool>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, uint8_t>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext,
-                            paddle::platform::complex<float>>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext,
-                            paddle::platform::complex<double>>,
-    ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext,
-                            paddle::platform::bfloat16>);
diff --git a/paddle/fluid/operators/squeeze_op.cu.cc b/paddle/fluid/operators/squeeze_op.cu.cc
index 8d7c0e5b4ff0e1351a3ebfccb70e33c36115d4de..19aa12cb55e2feb1bc7bb2070b165d923befb9a5 100644
--- a/paddle/fluid/operators/squeeze_op.cu.cc
+++ b/paddle/fluid/operators/squeeze_op.cu.cc
@@ -46,33 +46,3 @@ REGISTER_OP_CUDA_KERNEL(
                            paddle::platform::complex<float>>,
     ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext,
                            paddle::platform::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    squeeze2, ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, float>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, double>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, plat::float16>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, plat::bfloat16>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, bool>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, uint8_t>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext,
-                        paddle::platform::complex<float>>,
-    ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext,
-                        paddle::platform::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    squeeze2_grad,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext,
-                            plat::bfloat16>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, bool>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, uint8_t>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext,
-                            paddle::platform::complex<float>>,
-    ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext,
-                            paddle::platform::complex<double>>);
diff --git a/paddle/fluid/operators/unsqueeze_op.cc b/paddle/fluid/operators/unsqueeze_op.cc
index 6389c5b2680138930a9b6408da6128c8a5eefcd6..445e8cd468bf3d55215fb4e756a849500a3fbb2d 100644
--- a/paddle/fluid/operators/unsqueeze_op.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cc
@@ -18,7 +18,9 @@ limitations under the License. */
 #include <string>
 #include <vector>
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -251,19 +253,6 @@ class UnsqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
 class Unsqueeze2Op : public UnsqueezeOp {
  public:
   using UnsqueezeOp::UnsqueezeOp;
-  void InferShape(framework::InferShapeContext *ctx) const override {
-    UnsqueezeOp::InferShape(ctx);
-    const auto &x_dims = ctx->GetInputDim("X");
-
-    if (!ctx->HasOutput("XShape")) return;
-    std::vector<int64_t> xshape_dims(x_dims.size() + 1);
-    xshape_dims[0] = 0;
-    for (int i = 0; i < x_dims.size(); ++i) {
-      xshape_dims[i + 1] = x_dims[i];
-    }
-    ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims));
-    ctx->ShareLoD("X", /*->*/ "XShape");
-  }
 };
 
 class Unsqueeze2OpMaker : public UnsqueezeOpMaker {
@@ -339,10 +328,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnsqueezeGradOpNoNeedBufferVarInferer, "X");
 }  // namespace operators
 }  // namespace paddle
 
+DECLARE_INFER_SHAPE_FUNCTOR(unsqueeze2, Unsqueeze2InferShapeFunctor,
+                            PD_INFER_META(phi::UnsqueezeInferMeta));
+
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker,
                   ops::UnsqueezeGradOpMaker<paddle::framework::OpDesc>,
                   ops::UnsqueezeGradOpMaker<paddle::imperative::OpBase>);
+
 REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp,
                   ops::UnsqueezeDoubleGradOpMaker<paddle::framework::OpDesc>,
                   ops::UnsqueezeDoubleGradOpMaker<paddle::imperative::OpBase>,
@@ -351,7 +344,8 @@ REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp,
 REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker,
                   ops::Unsqueeze2GradOpMaker<paddle::framework::OpDesc>,
                   ops::Unsqueeze2GradOpMaker<paddle::imperative::OpBase>,
-                  ops::UnsqueezeInplaceInferer);
+                  Unsqueeze2InferShapeFunctor, ops::UnsqueezeInplaceInferer);
+
 REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp,
                   ops::Unsqueeze2DoubleGradOpMaker<paddle::framework::OpDesc>,
                   ops::Unsqueeze2DoubleGradOpMaker<paddle::imperative::OpBase>,
@@ -388,34 +382,3 @@ REGISTER_OP_CPU_KERNEL(
                              paddle::platform::complex<double>>,
     ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext,
                              paddle::platform::bfloat16>);
-REGISTER_OP_CPU_KERNEL(
-    unsqueeze2, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, bool>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int16_t>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, uint8_t>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext,
-                         paddle::platform::complex<float>>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext,
-                         paddle::platform::complex<double>>,
-    ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext,
-                         paddle::platform::bfloat16>);
-REGISTER_OP_CPU_KERNEL(
-    unsqueeze2_grad,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, float>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, double>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, bool>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int16_t>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, uint8_t>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int64_t>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext,
-                              paddle::platform::complex<float>>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext,
-                              paddle::platform::complex<double>>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext,
-                              paddle::platform::bfloat16>);
diff --git a/paddle/fluid/operators/unsqueeze_op.cu.cc b/paddle/fluid/operators/unsqueeze_op.cu.cc
index 2dcc4d2152a5c82a8f344b96084e70ba4df25bdd..f20ddb5c881e41fbecbb48d884004c1554e1fdb5 100644
--- a/paddle/fluid/operators/unsqueeze_op.cu.cc
+++ b/paddle/fluid/operators/unsqueeze_op.cu.cc
@@ -50,37 +50,3 @@ REGISTER_OP_CUDA_KERNEL(
                              paddle::platform::complex<float>>,
     ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext,
                              paddle::platform::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    unsqueeze2,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, plat::float16>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, plat::bfloat16>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, bool>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int16_t>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, uint8_t>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext,
-                         paddle::platform::complex<float>>,
-    ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext,
-                         paddle::platform::complex<double>>);
-REGISTER_OP_CUDA_KERNEL(
-    unsqueeze2_grad,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, float>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, double>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext,
-                              plat::float16>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext,
-                              plat::bfloat16>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, bool>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int16_t>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, uint8_t>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int64_t>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext,
-                              paddle::platform::complex<float>>,
-    ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext,
-                              paddle::platform::complex<double>>);
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index bbaa7e3dd6471587c82d271ef881276818dd1b79..dcfad030a689c278b72a0061cfb170762d1a3156 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -1761,6 +1761,7 @@ All parameter, weight, gradient are variables in Paddle.
                out (core.Variable|None): the found variable or None.
            )DOC",
            py::return_value_policy::reference)
+      .def("size", &Scope::Size)
       .def("erase", &Scope::EraseVars, py::arg("names"),
            R"DOC(
            Find variable named :code:`name` in the current scope or
@@ -2857,6 +2858,9 @@ All parameter, weight, gradient are variables in Paddle.
       .def("run",
            [](StandaloneExecutor &self, std::vector<std::string> feed_names,
               std::vector<std::string> fetch_names) {
+             platform::RecordEvent record_event(
+                 "StandaloneExecutor:run",
+                 platform::TracerEventType::UserDefined, 1);
              paddle::framework::FetchList ret;
              {
                pybind11::gil_scoped_release release;
diff --git a/paddle/phi/core/compat/op_utils.h b/paddle/phi/core/compat/op_utils.h
index 946230cb169d20db56a46399552b629348c4783f..613a2f9960a6ffd2ca4a02f20710018fcc00eaed 100644
--- a/paddle/phi/core/compat/op_utils.h
+++ b/paddle/phi/core/compat/op_utils.h
@@ -42,6 +42,10 @@ const std::unordered_set<std::string> deprecated_op_names({"diag",
                                                            "flatten_grad",
                                                            "isinf",
                                                            "isnan",
+                                                           "unsqueeze",
+                                                           "unsqueeze_grad",
+                                                           "squeeze",
+                                                           "squeeze_grad",
                                                            "isfinite",
                                                            "matmul",
                                                            "matmul_grad",
diff --git a/paddle/phi/infermeta/unary.cc b/paddle/phi/infermeta/unary.cc
index 80503dd2430927223dedd80d8e44c08473536997..e44032285ac1af0e8e1930583b360b818c7455a4 100644
--- a/paddle/phi/infermeta/unary.cc
+++ b/paddle/phi/infermeta/unary.cc
@@ -24,6 +24,7 @@ limitations under the License. */
 #include "paddle/phi/core/infermeta_utils.h"
 #include "paddle/phi/kernels/funcs/pooling.h"
 #include "paddle/phi/kernels/funcs/unfold_functor.h"
+#include "paddle/phi/kernels/funcs/unsqueeze.h"
 
 namespace phi {
 
@@ -1497,6 +1498,40 @@ void SplitInferMeta(const MetaTensor& x,
   }
 }
 
+// Infers `out` (squeezed dims) and `xshape` ([0, x_dims...]) metadata from x.
+void SqueezeInferMeta(const MetaTensor& x,
+                      const std::vector<int>& axes,
+                      MetaTensor* xshape,
+                      MetaTensor* out) {
+  const auto& x_dims = x.dims();
+  // Eigen limit: reject inputs whose rank exceeds 6.
+  PADDLE_ENFORCE_LE(x_dims.size(),
+                    6,
+                    phi::errors::InvalidArgument(
+                        "The dimensions of Input(X) "
+                        "should be in the range of [1, 6] (Eigen limit). "
+                        "But received X's dimensions = %d, X's shape = [%s].",
+                        x_dims.size(),
+                        x_dims));
+
+  auto out_dims = funcs::GetOutputSqueezeShape(axes, x_dims, false);
+  out->set_dims(out_dims);
+  // Share LoD with `out` only when the leading dimension is unchanged.
+  if (x_dims[0] == out_dims[0]) {
+    out->share_lod(x);
+  }
+
+  std::vector<int64_t> xshape_dims(x_dims.size() + 1);
+  xshape_dims[0] = 0;
+  for (int i = 0; i < x_dims.size(); ++i) {
+    xshape_dims[i + 1] = x_dims[i];
+  }
+  xshape->set_dims(phi::make_ddim(xshape_dims));
+  xshape->share_lod(x);
+  xshape->set_dtype(x.dtype());
+  out->set_dtype(x.dtype());
+}
+
 /*  Why not use SumRawInferMeta directly?
     Because we need make InferMetaFunction's args follow the design of api.yaml
 */
@@ -1982,6 +2017,41 @@ void UnfoldInferMeta(const MetaTensor& x,
   out->set_dims(phi::make_ddim(out_dims));
 }
 
+// Infers `out` (unsqueezed dims) and `xshape` ([0, x_dims...]) from x.
+void UnsqueezeInferMeta(const MetaTensor& x,
+                        const ScalarArray& axes,
+                        MetaTensor* xshape,
+                        MetaTensor* out) {
+  const auto& x_dims = x.dims();
+  // Eigen limit: reject inputs whose rank exceeds 6.
+  PADDLE_ENFORCE_LE(x_dims.size(),
+                    6,
+                    phi::errors::InvalidArgument(
+                        "Invalid "
+                        "dimensions, the rank of Input(X) "
+                        "should be in the range of [1, 6] (Eigen limit)"));
+  // NOTE(review): with empty axes, out's dims are left untouched here;
+  // presumably they are resolved elsewhere -- confirm.
+  const auto& axes_data = axes.GetData();
+  if (!axes_data.empty()) {
+    std::vector<int32_t> tmp(axes_data.begin(), axes_data.end());
+    auto out_dims = funcs::GetUnsqueezeShape(tmp, x_dims);
+    out->set_dims(out_dims);
+    if (x_dims[0] == out_dims[0]) {
+      out->share_lod(x);
+    }
+  }
+  std::vector<int64_t> xshape_dims(x_dims.size() + 1);
+  xshape_dims[0] = 0;
+  for (int i = 0; i < x_dims.size(); ++i) {
+    xshape_dims[i + 1] = x_dims[i];
+  }
+  xshape->set_dims(phi::make_ddim(xshape_dims));
+  xshape->share_lod(x);
+  out->set_dtype(x.dtype());
+  xshape->set_dtype(x.dtype());
+}
+
 void OneHotRawInferMeta(const MetaTensor& x,
                         int32_t depth,
                         DataType dtype,
@@ -1992,7 +2062,6 @@ void OneHotRawInferMeta(const MetaTensor& x,
       x_dims.size(),
       1,
       phi::errors::InvalidArgument("Rank of Input(X) should be at least 1."));
-
   auto out_dims_vec = phi::vectorize(x_dims);
   out_dims_vec.push_back(depth);
   auto out_dims = phi::make_ddim(out_dims_vec);
diff --git a/paddle/phi/infermeta/unary.h b/paddle/phi/infermeta/unary.h
index 0322a18fc3153b996e03aace0f705f1a776ad99f..f623f14a709adb09bf23e60c1e5ce98759238bd9 100644
--- a/paddle/phi/infermeta/unary.h
+++ b/paddle/phi/infermeta/unary.h
@@ -229,6 +229,11 @@ void SplitInferMeta(const MetaTensor& x_meta,
                     std::vector<MetaTensor*> out,
                     MetaConfig config = MetaConfig());
 
+void SqueezeInferMeta(const MetaTensor& x,
+                      const std::vector<int>& axes,
+                      MetaTensor* xshape,
+                      MetaTensor* out);
+
 void SumInferMeta(const MetaTensor& x,
                   const std::vector<int64_t>& axis,
                   DataType dtype,
@@ -290,6 +295,11 @@ void UnfoldInferMeta(const MetaTensor& x,
                      MetaTensor* out,
                      MetaConfig config = MetaConfig());
 
+void UnsqueezeInferMeta(const MetaTensor& x,
+                        const ScalarArray& axes,
+                        MetaTensor* xshape,
+                        MetaTensor* out);
+
 void OneHotRawInferMeta(const MetaTensor& x,
                         int32_t depth,
                         DataType dtype,
diff --git a/paddle/phi/kernels/CMakeLists.txt b/paddle/phi/kernels/CMakeLists.txt
index d140912aa783047ba021be171805adff071bf22b..59540dbaefdd81ace1ca232a1c54ba68fe953562 100644
--- a/paddle/phi/kernels/CMakeLists.txt
+++ b/paddle/phi/kernels/CMakeLists.txt
@@ -62,3 +62,6 @@ register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS $
 add_subdirectory(sparse)
 
 copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
+
+# 5. kernel autotune (GPU timer helper and its unit test)
+add_subdirectory(autotune)
diff --git a/paddle/phi/kernels/activation_grad_kernel.h b/paddle/phi/kernels/activation_grad_kernel.h
index 241a80d85ead2d7bb6cd63105feb345c62a29a62..6ad28f348f22fc3c8bc6ba7e1ce2bb4d431971be 100644
--- a/paddle/phi/kernels/activation_grad_kernel.h
+++ b/paddle/phi/kernels/activation_grad_kernel.h
@@ -135,6 +135,14 @@ void SigmoidTripleGradKernel(const Context& dev_ctx,
                              DenseTensor* d_dout,
                              DenseTensor* d_ddx);
 
+template <typename T, typename Context>
+void LogDoubleGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& dout,
+                         const DenseTensor& ddx,
+                         DenseTensor* dx,
+                         DenseTensor* ddout);
+
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cos);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Tan);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acos);
@@ -149,6 +157,10 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atanh);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Silu);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log2);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log10);
+DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log1p);
 
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu);
 DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh);
diff --git a/paddle/phi/kernels/activation_kernel.h b/paddle/phi/kernels/activation_kernel.h
index dbc63a636edb188e4640fdd02895868034f1dd80..785d1089f06e8eece211a6d5978e1e0b1b67ba7f 100644
--- a/paddle/phi/kernels/activation_kernel.h
+++ b/paddle/phi/kernels/activation_kernel.h
@@ -56,6 +56,10 @@ DECLARE_ACTIVATION_KERNEL(TanhShrink)
 DECLARE_ACTIVATION_KERNEL(Silu)
 DECLARE_ACTIVATION_KERNEL(Sigmoid)
 DECLARE_ACTIVATION_KERNEL(LogSigmoid)
+DECLARE_ACTIVATION_KERNEL(Log)
+DECLARE_ACTIVATION_KERNEL(Log2)
+DECLARE_ACTIVATION_KERNEL(Log10)
+DECLARE_ACTIVATION_KERNEL(Log1p)
 
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha)
 DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold)
diff --git a/paddle/phi/kernels/autotune/CMakeLists.txt b/paddle/phi/kernels/autotune/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..c7bb30d2d767cfc712fc19152f35bb406a89eac9
--- /dev/null
+++ b/paddle/phi/kernels/autotune/CMakeLists.txt
@@ -0,0 +1,6 @@
+# gpu_timer_test is built only for GPU builds: nv_test for CUDA, hip_test for ROCm.
+if (WITH_GPU)
+    nv_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest)
+elseif (WITH_ROCM)
+    hip_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest)
+endif()
diff --git a/paddle/phi/kernels/autotune/gpu_timer.h b/paddle/phi/kernels/autotune/gpu_timer.h
new file mode 100644
index 0000000000000000000000000000000000000000..87eca2613a7b5290341b448e6910ddbbcc833325
--- /dev/null
+++ b/paddle/phi/kernels/autotune/gpu_timer.h
@@ -0,0 +1,100 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/backends/gpu/gpu_decls.h"
+#include "paddle/phi/core/enforce.h"
+#include "paddle/phi/core/errors.h"
+#ifdef PADDLE_WITH_CUDA
+#include <cuda_runtime.h>
+#endif
+#ifdef PADDLE_WITH_HIP
+#include <hip/hip_runtime.h>
+#endif
+
+namespace phi {
+
+// Times GPU work with a pair of events: Start() and Stop() each record an
+// event on the given stream, and ElapsedTime() reports the milliseconds
+// between the two. Uses the HIP runtime under PADDLE_WITH_HIP, CUDA otherwise.
+class GpuTimer {
+ public:
+  GpuTimer() {
+#ifdef PADDLE_WITH_HIP
+    hipEventCreate(&start_);
+    hipEventCreate(&stop_);
+#else
+    cudaEventCreate(&start_);
+    cudaEventCreate(&stop_);
+#endif
+    // Members default to nullptr, so a handle left unset fails these checks.
+    PADDLE_ENFORCE_NOT_NULL(
+        start_, phi::errors::PreconditionNotMet("Start Event is not ready."));
+    PADDLE_ENFORCE_NOT_NULL(
+        stop_, phi::errors::PreconditionNotMet("Stop Event is not ready."));
+  }
+
+  // The timer owns its event handles; copying would destroy them twice.
+  GpuTimer(const GpuTimer&) = delete;
+  GpuTimer& operator=(const GpuTimer&) = delete;
+
+  ~GpuTimer() {
+#ifdef PADDLE_WITH_HIP
+    hipEventDestroy(start_);
+    hipEventDestroy(stop_);
+#else
+    cudaEventDestroy(start_);
+    cudaEventDestroy(stop_);
+#endif
+  }
+
+  // Records the start event on `stream`.
+  void Start(gpuStream_t stream) {
+#ifdef PADDLE_WITH_HIP
+    hipEventRecord(start_, stream);
+#else
+    cudaEventRecord(start_, stream);
+#endif
+  }
+
+  // Records the stop event on `stream`.
+  void Stop(gpuStream_t stream) {
+#ifdef PADDLE_WITH_HIP
+    hipEventRecord(stop_, stream);
+#else
+    cudaEventRecord(stop_, stream);
+#endif
+  }
+
+  // Waits for the stop event to complete, then returns the time in
+  // milliseconds between the start and stop events.
+  float ElapsedTime() {
+    float milliseconds = 0;
+#ifdef PADDLE_WITH_HIP
+    hipEventSynchronize(stop_);
+    hipEventElapsedTime(&milliseconds, start_, stop_);
+#else
+    cudaEventSynchronize(stop_);
+    cudaEventElapsedTime(&milliseconds, start_, stop_);
+#endif
+    return milliseconds;
+  }
+
+ private:
+  gpuEvent_t start_ = nullptr;
+  gpuEvent_t stop_ = nullptr;
+};
+
+}  // namespace phi
diff --git a/paddle/phi/kernels/autotune/gpu_timer_test.cu b/paddle/phi/kernels/autotune/gpu_timer_test.cu
new file mode 100644
index 0000000000000000000000000000000000000000..b6eb345885f30e2c0ab2406b65bbe5f2d01f944e
--- /dev/null
+++ b/paddle/phi/kernels/autotune/gpu_timer_test.cu
@@ -0,0 +1,135 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <gtest/gtest.h>
+#include <functional>
+#include <vector>
+#include "glog/logging.h"
+#include "paddle/phi/kernels/autotune/gpu_timer.h"
+#include "paddle/phi/kernels/funcs/aligned_vector.h"
+
+// Adds x into y element-wise (y[i] += x[i]), handling VecSize elements per
+// load/store. NOTE(review): assumes N is a multiple of VecSize -- confirm.
+template <typename T, int VecSize>
+__global__ void VecSum(T *x, T *y, int N) {
+#ifdef __HIPCC__
+  int idx = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
+#else
+  int idx = blockDim.x * blockIdx.x + threadIdx.x;
+#endif
+  using LoadT = phi::AlignedVector<T, VecSize>;
+  for (int i = idx * VecSize; i < N; i += blockDim.x * gridDim.x * VecSize) {
+    LoadT x_vec;
+    LoadT y_vec;
+    phi::Load<T, VecSize>(&x[i], &x_vec);
+    phi::Load<T, VecSize>(&y[i], &y_vec);
+#pragma unroll
+    for (int j = 0; j < VecSize; j++) {
+      y_vec[j] = x_vec[j] + y_vec[j];
+    }
+    phi::Store<T, VecSize>(y_vec, &y[i]);
+  }
+}
+
+// Launches VecSum<float, Vecsize> with `Blocks` blocks of `Threads` threads on
+// the default stream.
+template <int Vecsize, int Threads, size_t Blocks>
+void Algo(float *d_in, float *d_out, size_t N) {
+#ifdef __HIPCC__
+  hipLaunchKernelGGL(HIP_KERNEL_NAME(VecSum<float, Vecsize>),
+                     dim3(Blocks),
+                     dim3(Threads),
+                     0,
+                     0,
+                     d_in,
+                     d_out,
+                     N);
+#else
+  VecSum<float, Vecsize><<<Blocks, Threads>>>(d_in, d_out, N);
+#endif
+}
+
+// Times three launch configurations of VecSum with GpuTimer and verifies the
+// accumulated result.
+TEST(GpuTimer, Sum) {
+  float *in1, *in2, *out;
+  float *d_in1, *d_in2;
+  size_t N = 1 << 20;
+  size_t size = sizeof(float) * N;
+#ifdef __HIPCC__
+  hipMalloc(reinterpret_cast<void **>(&d_in1), size);
+  hipMalloc(reinterpret_cast<void **>(&d_in2), size);
+#else
+  cudaMalloc(reinterpret_cast<void **>(&d_in1), size);
+  cudaMalloc(reinterpret_cast<void **>(&d_in2), size);
+#endif
+  in1 = reinterpret_cast<float *>(malloc(size));
+  in2 = reinterpret_cast<float *>(malloc(size));
+  out = reinterpret_cast<float *>(malloc(size));
+  for (size_t i = 0; i < N; i++) {
+    in1[i] = 1.0f;
+    in2[i] = 2.0f;
+  }
+
+#ifdef __HIPCC__
+  hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);
+  hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice);
+#else
+  cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
+  cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
+#endif
+
+  using Functor = std::function<void(float *, float *, size_t)>;
+  Functor algo0 = Algo<4, 256, 1024>;
+  Functor algo1 = Algo<1, 256, 1024>;
+  Functor algo2 = Algo<1, 256, 8>;
+
+  std::vector<Functor> algos = {algo0, algo1, algo2};
+
+  for (size_t j = 0; j < algos.size(); ++j) {
+    auto algo = algos[j];
+    phi::GpuTimer timer;
+    timer.Start(0);
+    algo(d_in1, d_in2, N);
+    timer.Stop(0);
+    VLOG(3) << "algo: " << j << " cost: " << timer.ElapsedTime() << "ms";
+  }
+
+#ifdef __HIPCC__
+  hipMemcpy(out, d_in2, size, hipMemcpyDeviceToHost);
+#else
+  cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
+#endif
+  // Every launch adds in1 (1.0f) into d_in2 (initially 2.0f) exactly once per
+  // element, so each element must equal 2.0f + algos.size() * 1.0f.
+  const float expected = 2.0f + static_cast<float>(algos.size());
+  size_t mismatches = 0;
+  for (size_t i = 0; i < N; i++) {
+    if (out[i] != expected) {
+      ++mismatches;
+    }
+  }
+  EXPECT_EQ(mismatches, static_cast<size_t>(0));
+
+  free(in1);
+  free(in2);
+  free(out);
+#ifdef __HIPCC__
+  hipFree(d_in1);
+  hipFree(d_in2);
+#else
+  cudaFree(d_in1);
+  cudaFree(d_in2);
+#endif
+}
diff --git a/paddle/phi/kernels/cpu/activation_grad_kernel.cc b/paddle/phi/kernels/cpu/activation_grad_kernel.cc
index c582261596221f4db8bd03599386082cee909096..0776e570e9cd35be4a0f7d10c8fb4dd40aa07171 100644
--- a/paddle/phi/kernels/cpu/activation_grad_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_grad_kernel.cc
@@ -121,6 +121,10 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, LogGradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, Log2GradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log10, Log10GradFunctor);
+DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log1p, Log1pGradFunctor);
 
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor);
 DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor);
@@ -233,3 +237,8 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(log_grad, LogGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(log2_grad, Log2GradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(log10_grad, Log10GradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(log1p_grad, Log1pGradKernel)
+PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(log_double_grad, LogDoubleGradKernel)
diff --git a/paddle/phi/kernels/cpu/activation_kernel.cc b/paddle/phi/kernels/cpu/activation_kernel.cc
index 1d7b77ea4445f494105d4c23516f31f349847089..c8709261d2cb0b162306b0c52d5dfefd6f09ef52 100644
--- a/paddle/phi/kernels/cpu/activation_kernel.cc
+++ b/paddle/phi/kernels/cpu/activation_kernel.cc
@@ -74,6 +74,10 @@ DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor)
 DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Log, LogFunctor)
+DEFINE_CPU_ACTIVATION_KERNEL(Log2, Log2Functor)
+DEFINE_CPU_ACTIVATION_KERNEL(Log10, Log10Functor)
+DEFINE_CPU_ACTIVATION_KERNEL(Log1p, Log1pFunctor)
 
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha)
 DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
@@ -118,3 +122,7 @@ PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
 PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel)
 PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)
+PD_REGISTER_ACTIVATION_KERNEL(log, LogKernel)
+PD_REGISTER_ACTIVATION_KERNEL(log2, Log2Kernel)
+PD_REGISTER_ACTIVATION_KERNEL(log10, Log10Kernel)
+PD_REGISTER_ACTIVATION_KERNEL(log1p, Log1pKernel)
diff --git a/paddle/phi/kernels/cpu/squeeze_grad_kernel.cc b/paddle/phi/kernels/cpu/squeeze_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..5f605e6c2504baec2276ed9288c72f466f3c40f0
--- /dev/null
+++ b/paddle/phi/kernels/cpu/squeeze_grad_kernel.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/squeeze_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/squeeze_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(squeeze_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::SqueezeGradKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16,
+                   bool,
+                   int,
+                   uint8_t,
+                   int8_t,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/squeeze_kernel.cc b/paddle/phi/kernels/cpu/squeeze_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..7d5a6ca4e884e80a20e9583be4aa9f48f9484793
--- /dev/null
+++ b/paddle/phi/kernels/cpu/squeeze_kernel.cc
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/squeeze_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/squeeze_kernel_impl.h"
+
+PD_REGISTER_KERNEL(squeeze,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::SqueezeKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16,
+                   bool,
+                   int,
+                   uint8_t,
+                   int8_t,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/cpu/unsqueeze_grad_kernel.cc b/paddle/phi/kernels/cpu/unsqueeze_grad_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0cbccac4734a768db143387bdbfa48fd0ca148f3
--- /dev/null
+++ b/paddle/phi/kernels/cpu/unsqueeze_grad_kernel.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/unsqueeze_grad_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/unsqueeze_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(unsqueeze_grad,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::UnsqueezeGradKernel,
+                   phi::dtype::bfloat16,
+                   bool,
+                   int,
+                   int16_t,
+                   uint8_t,
+                   int8_t,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>,
+                   float,
+                   double) {}
diff --git a/paddle/phi/kernels/cpu/unsqueeze_kernel.cc b/paddle/phi/kernels/cpu/unsqueeze_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0152a31f80ba80280af137dcbc544172bf89c430
--- /dev/null
+++ b/paddle/phi/kernels/cpu/unsqueeze_kernel.cc
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/unsqueeze_kernel.h"
+
+#include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/unsqueeze_kernel_impl.h"
+
+PD_REGISTER_KERNEL(unsqueeze,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::UnsqueezeKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16,
+                   bool,
+                   int,
+                   int16_t,
+                   uint8_t,
+                   int8_t,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}
diff --git a/paddle/phi/kernels/full_kernel.cc b/paddle/phi/kernels/full_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..9622bff5c255aef470cbd50c9e8496e39bf7d02b
--- /dev/null
+++ b/paddle/phi/kernels/full_kernel.cc
@@ -0,0 +1,65 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/kernels/full_kernel.h"
+
+#include "paddle/phi/core/kernel_registry.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void FullBatchSizeLikeKernel(const Context& dev_ctx,
+                             const DenseTensor& x,
+                             const std::vector<int>& shape,  // not read here
+                             const Scalar& val,
+                             DataType dtype,
+                             int x_batch_size_dim,
+                             int out_batch_size_dim,
+                             DenseTensor* out) {
+  if (x.lod().size() && x_batch_size_dim == 0) {
+    // LoD input: batch size = entry count of x's last LoD level minus one.
+    auto odims = out->dims();
+    odims[out_batch_size_dim] = static_cast<int>(x.lod().back().size()) - 1;
+    FullKernel<T, Context>(dev_ctx, phi::vectorize(odims), val, dtype, out);
+  }  // NOTE(review): no return here, so FullLikeKernel below runs too.
+  FullLikeKernel<T, Context>(dev_ctx, x, val, dtype, out);
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(full_batch_size_like,
+                   CPU,
+                   ALL_LAYOUT,
+                   phi::FullBatchSizeLikeKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool) {
+  kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
+}
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_KERNEL(full_batch_size_like,
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::FullBatchSizeLikeKernel,
+                   float,
+                   double,
+                   int,
+                   int64_t,
+                   bool,
+                   phi::dtype::float16) {
+  kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
+}
+#endif
diff --git a/paddle/phi/kernels/full_kernel.h b/paddle/phi/kernels/full_kernel.h
index 41fc96b6db1fae5eb54b24923b68b4491c158d93..df82e651a0b268cba49d64dff6307d41a377fe0a 100644
--- a/paddle/phi/kernels/full_kernel.h
+++ b/paddle/phi/kernels/full_kernel.h
@@ -14,6 +14,8 @@
 
 #pragma once
 
+#include <vector>
+
 #include "paddle/phi/common/scalar.h"
 #include "paddle/phi/common/scalar_array.h"
 #include "paddle/phi/core/dense_tensor.h"
@@ -37,6 +39,18 @@ void FullLikeKernel(const Context& dev_ctx,
                     DataType dtype,
                     DenseTensor* out);
 
+// Kept for compatibility with the fill_constant_batch_size_like op, which is
+// still used by the 2.x APIs.
+template <typename T, typename Context>
+void FullBatchSizeLikeKernel(const Context& dev_ctx,
+                             const DenseTensor& x,
+                             const std::vector<int>& shape,
+                             const Scalar& val,
+                             DataType dtype,
+                             int x_batch_size_dim,
+                             int out_batch_size_dim,
+                             DenseTensor* out);
+
 template <typename T, typename Context>
 void Full(const Context& dev_ctx,
           const ScalarArray& shape,
diff --git a/paddle/phi/kernels/funcs/activation_functor.h b/paddle/phi/kernels/funcs/activation_functor.h
index 6c5ffbd06e3a435d9568a6c4717d8ce83b5aec00..6e536bd00a4a12fc2e4034920c344993f54f447b 100644
--- a/paddle/phi/kernels/funcs/activation_functor.h
+++ b/paddle/phi/kernels/funcs/activation_functor.h
@@ -1223,6 +1223,133 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
+// Forward functor for log: writes the element-wise natural logarithm of x.
+template <typename T>
+struct LogFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename In, typename Result>
+  void operator()(Device dev, In in, Result result) const {
+    result.device(dev) = in.log();
+  }
+};
+
+// Backward functor for log: dx = dout * (1 / x); only forward input X is read.
+template <typename T>
+struct LogGradFunctor : public BaseActivationFunctor<T> {
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device dev, X x, Out /*out*/, dOut dout, dX dx) const {
+    dx.device(dev) = dout * (static_cast<T>(1) / x);
+  }
+};
+
+// Forward functor for log2: computes ln(x) / ln(2) element-wise.
+template <typename T>
+struct Log2Functor : public BaseActivationFunctor<T> {
+  template <typename Device, typename In, typename Result>
+  void operator()(Device dev, In in, Result result) const {
+    result.device(dev) = in.log() / static_cast<T>(log(2));
+  }
+};
+
+// Backward functor for log2: dx = dout * 1 / (x * ln(2)); reads only X.
+template <typename T>
+struct Log2GradFunctor : public BaseActivationFunctor<T> {
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device dev, X x, Out /*out*/, dOut dout, dX dx) const {
+    dx.device(dev) = dout * static_cast<T>(1) / (x * static_cast<T>(log(2)));
+  }
+};
+
+// Forward functor for log10: computes ln(x) / ln(10) element-wise.
+template <typename T>
+struct Log10Functor : public BaseActivationFunctor<T> {
+  template <typename Device, typename In, typename Result>
+  void operator()(Device dev, In in, Result result) const {
+    result.device(dev) = in.log() / static_cast<T>(log(10));
+  }
+};
+
+// Backward functor for log10: dx = dout * 1 / (x * ln(10)); reads only X.
+template <typename T>
+struct Log10GradFunctor : public BaseActivationFunctor<T> {
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device dev, X x, Out /*out*/, dOut dout, dX dx) const {
+    dx.device(dev) = dout * static_cast<T>(1) / (x * static_cast<T>(log(10)));
+  }
+};
+
+// Forward functor for log1p: computes the natural logarithm of (1 + x).
+template <typename T>
+struct Log1pFunctor : public BaseActivationFunctor<T> {
+  template <typename Device, typename In, typename Result>
+  void operator()(Device dev, In in, Result result) const {
+    result.device(dev) = (static_cast<T>(1) + in).log();
+  }
+};
+
+// Backward functor for log1p: dx = dout * (1 / (x + 1)); reads only X.
+template <typename T>
+struct Log1pGradFunctor : public BaseActivationFunctor<T> {
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+  template <typename Device,
+            typename X,
+            typename Out,
+            typename dOut,
+            typename dX>
+  void operator()(Device dev, X x, Out /*out*/, dOut dout, dX dx) const {
+    dx.device(dev) = dout * (static_cast<T>(1) / (x + static_cast<T>(1)));
+  }
+};
+
+// Double-backward functor for log: dx = -dout * ddx / x^2, ddout = ddx / x.
+template <typename T>
+struct LogGradGradFunctor : public BaseActivationFunctor<T> {
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+
+  template <typename Device>
+  void operator()(const Device& dev,
+                  const DenseTensor* X,
+                  const DenseTensor* ddX,
+                  DenseTensor* ddOut,
+                  const DenseTensor* dOut,
+                  DenseTensor* dX) const {
+    auto* place = dev.eigen_device();
+    auto ddx_e = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(ddX, "Input", "DDX", "LogGradGrad"));
+    auto x_e = EigenVector<T>::Flatten(
+        GET_DATA_SAFELY(X, "Input", "X", "LogGradGrad"));
+    // dx must be written before ddout: ddout may be computed in place on ddx.
+    if (dX != nullptr) {
+      auto dout_e = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dOut, "Output", "DOut", "LogGradGrad"));
+      auto dx_e = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(dX, "Output", "DX", "LogGradGrad"));
+      dx_e.device(*place) = dout_e * static_cast<T>(-1) * ddx_e / (x_e * x_e);
+    }
+    if (ddOut != nullptr) {
+      auto ddout_e = EigenVector<T>::Flatten(
+          GET_DATA_SAFELY(ddOut, "Output", "DDOut", "LogGradGrad"));
+      ddout_e.device(*place) = ddx_e * static_cast<T>(1) / x_e;
+    }
+  }
+};
+
 #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
 template <typename T>
 struct CudaReluFunctor : public BaseActivationFunctor<T> {
@@ -1970,6 +2097,99 @@ struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
+// Device functor for log: evaluates log() in MPType and casts back to T.
+template <typename T>
+struct CudaLogFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    const MPType wide_x = static_cast<MPType>(arg_x);
+    return static_cast<T>(log(wide_x));
+  }
+};
+
+// Device functor for log backward: returns dout / x, computed in T.
+template <typename T>
+struct CudaLogGradFunctor : public BaseActivationFunctor<T> {
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+
+  __device__ __forceinline__ T operator()(const T grad_out, const T in) const {
+    return grad_out / in;
+  }
+};
+
+// Device functor for log1p: returns log(1 + x), evaluated in MPType.
+template <typename T>
+struct CudaLog1pFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  MPType one = static_cast<MPType>(1.0f);
+
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    const MPType shifted = one + static_cast<MPType>(arg_x);
+    return static_cast<T>(log(shifted));
+  }
+};
+
+// Device functor for log1p backward: returns dout / (1 + x), computed in T.
+template <typename T>
+struct CudaLog1pGradFunctor : public BaseActivationFunctor<T> {
+  T one = static_cast<T>(1.0f);
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+
+  __device__ __forceinline__ T operator()(const T grad_out, const T in) const {
+    return grad_out / (one + in);
+  }
+};
+
+// Device functor for log2: evaluates log2() in MPType and casts back to T.
+template <typename T>
+struct CudaLog2Functor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    const MPType wide_x = static_cast<MPType>(arg_x);
+    return static_cast<T>(log2(wide_x));
+  }
+};
+
+// Device functor for log2 backward: dx = dout / (x * ln(2)), computed in T.
+template <typename T>
+struct CudaLog2GradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  T log_two = static_cast<T>(log(static_cast<MPType>(2.0f)));
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+
+  __device__ __forceinline__ T operator()(const T grad_out, const T in) const {
+    return grad_out / (in * log_two);
+  }
+};
+
+// Device functor for log10: evaluates log10() in MPType and casts back to T.
+template <typename T>
+struct CudaLog10Functor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+
+  __device__ __forceinline__ T operator()(const T arg_x) const {
+    const MPType wide_x = static_cast<MPType>(arg_x);
+    return static_cast<T>(log10(wide_x));
+  }
+};
+
+// Device functor for log10 backward: dx = dout / (x * ln(10)), computed in T.
+template <typename T>
+struct CudaLog10GradFunctor : public BaseActivationFunctor<T> {
+  using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
+  T log_ten = static_cast<T>(log(static_cast<MPType>(10.0f)));
+
+  static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
+
+  __device__ __forceinline__ T operator()(const T grad_out, const T in) const {
+    return grad_out / (in * log_ten);
+  }
+};
+
 #endif
 
 }  // namespace funcs
diff --git a/paddle/phi/kernels/funcs/unsqueeze.h b/paddle/phi/kernels/funcs/unsqueeze.h
index 7b8a81471ef769dc5ddf18889f60813641d86d22..2d77c809bf9c9fd16d6c92d99ad686dae7642203 100644
--- a/paddle/phi/kernels/funcs/unsqueeze.h
+++ b/paddle/phi/kernels/funcs/unsqueeze.h
@@ -21,6 +21,118 @@
 
 namespace phi {
 namespace funcs {
+// Computes the output shape of squeeze.
+//   squeeze_dims: axes to squeeze (negative values count from the end); when
+//                 empty, every dimension of extent 1 is squeezed.
+//   in_dims:      input shape.
+//   is_runtime:   when false (compile time), a listed axis with an unknown
+//                 extent of -1 is also squeezed.
+// Listed axes whose extent is not squeezable are silently kept. Raises
+// InvalidArgument if an axis is outside [-rank, rank - 1].
+inline DDim GetOutputSqueezeShape(const std::vector<int> squeeze_dims,
+                                  const DDim& in_dims,
+                                  bool is_runtime) {
+  const int rank = in_dims.size();
+  std::vector<bool> should_squeeze(rank, false);
+
+  // Mark dimensions need to be squeezed.
+  if (squeeze_dims.empty()) {
+    for (int i = 0; i < rank; ++i) {
+      should_squeeze[i] = (in_dims[i] == 1);
+    }
+  } else {
+    for (int axis : squeeze_dims) {
+      const int current = axis < 0 ? axis + rank : axis;
+
+      // Report the axis as given by the caller: the normalized value would be
+      // misleading for out-of-range negative axes.
+      PADDLE_ENFORCE_GE(
+          current,
+          0,
+          phi::errors::InvalidArgument(
+              "Each axis in Attr(axes) should be in the range of [%d, %d]. "
+              "But current axis is:%d, input tensor's shape = [%s].",
+              -rank,
+              rank - 1,
+              axis,
+              in_dims));
+      PADDLE_ENFORCE_LT(
+          current,
+          rank,
+          phi::errors::InvalidArgument(
+              "Each axis in Attr(axes) should be in the range of [%d, %d]. "
+              "But current axis is:%d, input tensor's shape = [%s].",
+              -rank,
+              rank - 1,
+              axis,
+              in_dims));
+
+      // An extent of 1 is always squeezable; before run time an unknown
+      // extent (-1) is accepted as well. Other extents are left in place.
+      const auto extent = in_dims[current];
+      if (extent == 1 || (!is_runtime && extent == -1)) {
+        should_squeeze[current] = true;
+      }
+    }
+  }
+  // Make output dimensions
+  std::vector<int64_t> output_shape;
+  for (int i = 0; i < rank; ++i) {
+    if (!should_squeeze[i]) {
+      output_shape.push_back(in_dims[i]);
+    }
+  }
+  return phi::make_ddim(output_shape);
+}
+
+// Computes the output shape of unsqueeze.
+//   unsqz_dims: positions at which a dimension of extent 1 is inserted,
+//               applied in order. Each axis refers to the shape produced by
+//               the previous insertions; a negative axis counts from the end
+//               of that shape, so -1 appends.
+//   in_dims:    input shape.
+// Returns in_dims with the new unit dimensions inserted.
+// Raises InvalidArgument if the output rank exceeds 6 or an axis falls
+// outside [0, current rank].
+inline DDim GetUnsqueezeShape(const std::vector<int> unsqz_dims,
+                              const DDim& in_dims) {
+  const int output_size = in_dims.size() + static_cast<int>(unsqz_dims.size());
+
+  // Validity Check: rank range.
+  PADDLE_ENFORCE_LE(
+      output_size,
+      6,
+      phi::errors::InvalidArgument("The output "
+                                   "tensor's rank should be no more than 6."));
+
+  // Start from the input shape and grow it one axis at a time.
+  std::vector<int64_t> output_shape;
+  output_shape.reserve(output_size);
+  for (int i = 0; i < in_dims.size(); ++i) {
+    output_shape.push_back(in_dims[i]);
+  }
+
+  for (int axis : unsqz_dims) {
+    const int cur_output_size = static_cast<int>(output_shape.size());
+    const int cur = axis < 0 ? axis + cur_output_size + 1 : axis;
+    // Validity Check: the axis bound
+    PADDLE_ENFORCE_GE(
+        cur,
+        0,
+        phi::errors::InvalidArgument("The insert dimension value should "
+                                     "not be less than 0"));
+    PADDLE_ENFORCE_LE(cur,
+                      cur_output_size,
+                      phi::errors::InvalidArgument(
+                          "The insert dimension value should not be larger "
+                          "than the dimension size of input tensor"));
+    // Inserting shifts every later dimension one position to the right;
+    // cur == cur_output_size appends the new dimension.
+    output_shape.insert(output_shape.begin() + cur, static_cast<int64_t>(1));
+  }
+
+  return phi::make_ddim(output_shape);
+}
 
 inline const DenseTensor Unsqueeze(const DenseTensor& x, int axis = 0) {
   // don't copy data, only change the dims
diff --git a/paddle/phi/kernels/gpu/activation_grad_kernel.cu b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
index c912d0c4686ff3fee88925f4d7121f38f24a5485..3cc41555a898b041921aecea309eb6d97d58a0ba 100644
--- a/paddle/phi/kernels/gpu/activation_grad_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_grad_kernel.cu
@@ -177,6 +177,10 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor);
 DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, CudaLogGradFunctor);  // log family; each functor's FwdDeps() is kDepX
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, CudaLog2GradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log10, CudaLog10GradFunctor);
+DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log1p, CudaLog1pGradFunctor);
 
 DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
                                                CudaLeakyReluGradFunctor,
@@ -300,3 +304,14 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel)
 PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(log_grad, LogGradKernel)  // first-order grads of the log family
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(log2_grad, Log2GradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(log10_grad, Log10GradKernel)
+PD_REGISTER_ACTIVATION_GRAD_KERNEL(log1p_grad, Log1pGradKernel)
+PD_REGISTER_KERNEL(log_double_grad,  // second-order grad of log, registered with an explicit dtype list
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::LogDoubleGradKernel,
+                   float,
+                   double,
+                   phi::dtype::float16) {}  // empty body: no extra kernel setup
diff --git a/paddle/phi/kernels/gpu/activation_kernel.cu b/paddle/phi/kernels/gpu/activation_kernel.cu
index 6b598c764debb059072ba3ae3ac90e6985479133..fb4e2e07b21cbed384248fd8400a04a6e1965320 100644
--- a/paddle/phi/kernels/gpu/activation_kernel.cu
+++ b/paddle/phi/kernels/gpu/activation_kernel.cu
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/phi/common/float16.h"
 #include "paddle/phi/core/kernel_registry.h"
 #include "paddle/phi/kernels/funcs/elementwise_base.h"
-#include "paddle/phi/kernels/impl/activation_grad_impl.h"
+#include "paddle/phi/kernels/impl/activation_impl.h"
 
 #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
 
@@ -93,6 +93,10 @@ DEFINE_GPU_ACTIVATION_KERNEL(TanhShrink, CudaTanhShrinkFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Silu, CudaSiluFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(Sigmoid, CudaSigmoidFunctor)
 DEFINE_GPU_ACTIVATION_KERNEL(LogSigmoid, CudaLogSigmoidFunctor)
+DEFINE_GPU_ACTIVATION_KERNEL(Log, CudaLogFunctor)  // log family forward kernels, one per Cuda*Functor
+DEFINE_GPU_ACTIVATION_KERNEL(Log2, CudaLog2Functor)
+DEFINE_GPU_ACTIVATION_KERNEL(Log10, CudaLog10Functor)
+DEFINE_GPU_ACTIVATION_KERNEL(Log1p, CudaLog1pFunctor)
 
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha)
 DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
@@ -164,3 +168,7 @@ PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
 PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
 PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel)
 PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)
+PD_REGISTER_ACTIVATION_KERNEL(log, LogKernel)  // registers the log family forward kernels
+PD_REGISTER_ACTIVATION_KERNEL(log2, Log2Kernel)
+PD_REGISTER_ACTIVATION_KERNEL(log10, Log10Kernel)
+PD_REGISTER_ACTIVATION_KERNEL(log1p, Log1pKernel)
diff --git a/paddle/phi/kernels/gpu/squeeze_grad_kernel.cu b/paddle/phi/kernels/gpu/squeeze_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..c5a243f45bd9769b3f6a052248e979dad10e0a4f
--- /dev/null
+++ b/paddle/phi/kernels/gpu/squeeze_grad_kernel.cu
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/squeeze_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/squeeze_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(squeeze_grad,  // GPU registration of phi::SqueezeGradKernel
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SqueezeGradKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16,
+                   phi::dtype::float16,
+                   bool,
+                   int,  // NOTE(review): unsqueeze_grad also registers int16_t; confirm the omission here is intended
+                   uint8_t,
+                   int8_t,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}  // empty body: no extra kernel setup
diff --git a/paddle/phi/kernels/gpu/squeeze_kernel.cu b/paddle/phi/kernels/gpu/squeeze_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..ae15e210a02e72fd681d18fc6679f6e61c74e2bf
--- /dev/null
+++ b/paddle/phi/kernels/gpu/squeeze_kernel.cu
@@ -0,0 +1,35 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/squeeze_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/squeeze_kernel_impl.h"
+
+PD_REGISTER_KERNEL(squeeze,  // GPU registration of phi::SqueezeKernel
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::SqueezeKernel,
+                   float,
+                   double,
+                   phi::dtype::bfloat16,
+                   phi::dtype::float16,
+                   bool,
+                   int,  // NOTE(review): unsqueeze also registers int16_t; confirm the omission here is intended
+                   uint8_t,
+                   int8_t,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}  // empty body: no extra kernel setup
diff --git a/paddle/phi/kernels/gpu/unsqueeze_grad_kernel.cu b/paddle/phi/kernels/gpu/unsqueeze_grad_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..6c3a2066f0f2d18f35e8c2e58b9a26ae3b5624d3
--- /dev/null
+++ b/paddle/phi/kernels/gpu/unsqueeze_grad_kernel.cu
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/unsqueeze_grad_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/unsqueeze_grad_kernel_impl.h"
+
+PD_REGISTER_KERNEL(unsqueeze_grad,  // GPU registration of phi::UnsqueezeGradKernel
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::UnsqueezeGradKernel,
+                   phi::dtype::bfloat16,
+                   phi::dtype::float16,
+                   bool,
+                   int,
+                   int16_t,  // registered here but not for squeeze_grad
+                   uint8_t,
+                   int8_t,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>,
+                   float,
+                   double) {}  // empty body: no extra kernel setup
diff --git a/paddle/phi/kernels/gpu/unsqueeze_kernel.cu b/paddle/phi/kernels/gpu/unsqueeze_kernel.cu
new file mode 100644
index 0000000000000000000000000000000000000000..86b44622546375321086216ff5a8b8c5b3429f55
--- /dev/null
+++ b/paddle/phi/kernels/gpu/unsqueeze_kernel.cu
@@ -0,0 +1,36 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/kernels/unsqueeze_kernel.h"
+
+#include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/impl/unsqueeze_kernel_impl.h"
+
+PD_REGISTER_KERNEL(unsqueeze,  // GPU registration of phi::UnsqueezeKernel
+                   GPU,
+                   ALL_LAYOUT,
+                   phi::UnsqueezeKernel,
+                   float,
+                   double,
+                   phi::dtype::float16,
+                   phi::dtype::bfloat16,
+                   bool,
+                   int,
+                   int16_t,  // registered here but not for squeeze
+                   uint8_t,
+                   int8_t,
+                   int64_t,
+                   phi::dtype::complex<float>,
+                   phi::dtype::complex<double>) {}  // empty body: no extra kernel setup
diff --git a/paddle/phi/kernels/impl/activation_grad_impl.h b/paddle/phi/kernels/impl/activation_grad_impl.h
index 7d6b6dc72ea60214ff4c9974b4ff885feecb5822..7ef8a0887c75cce8be9618593073e84089739c30 100644
--- a/paddle/phi/kernels/impl/activation_grad_impl.h
+++ b/paddle/phi/kernels/impl/activation_grad_impl.h
@@ -275,4 +275,22 @@ void SigmoidTripleGradKernel(const Context& dev_ctx,
           d_ddx);
 }
 
+// Double-backward kernel for log: allocates outputs, then runs the functor.
+template <typename T, typename Context>
+void LogDoubleGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const DenseTensor& dout,
+                         const DenseTensor& ddx,
+                         DenseTensor* dx,
+                         DenseTensor* ddout) {
+  if (dx != nullptr) {
+    dx->Resize(x.dims());  // dx takes the shape of x
+    dev_ctx.template Alloc<T>(dx);
+  }
+  if (ddout != nullptr) {
+    dev_ctx.template Alloc<T>(ddout);
+  }
+  funcs::LogGradGradFunctor<T>()(dev_ctx, &x, &ddx, ddout, &dout, dx);
+}
+
 }  // namespace phi
diff --git a/paddle/phi/kernels/impl/squeeze_grad_kernel_impl.h b/paddle/phi/kernels/impl/squeeze_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..c74aa5c7243f3ccd24c2e44042ea88826637b6a5
--- /dev/null
+++ b/paddle/phi/kernels/impl/squeeze_grad_kernel_impl.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+
+namespace phi {
+// Gradient of squeeze: copies dout into dx, then gives dx the shape recorded
+// in xshape with its leading dimension dropped. `axes` is not used here.
+template <typename T, typename Context>
+void SqueezeGradKernel(const Context& dev_ctx,
+                       const DenseTensor& xshape,
+                       const DenseTensor& dout,
+                       const std::vector<int>& axes,
+                       DenseTensor* dx) {
+  const auto dx_dims = phi::slice_ddim(xshape.dims(), 1, xshape.dims().size());
+  dev_ctx.template Alloc<T>(dx);
+  phi::Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx);
+  dx->Resize(dx_dims);
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/squeeze_kernel_impl.h b/paddle/phi/kernels/impl/squeeze_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..d2b40824a91c99ac4f69b689b2219822e21c6aea
--- /dev/null
+++ b/paddle/phi/kernels/impl/squeeze_kernel_impl.h
@@ -0,0 +1,34 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/unsqueeze.h"
+
+namespace phi {
+// Squeeze forward: copies x into out, then reshapes out to the squeezed shape
+// computed with is_runtime = true. `xshape` is not written here.
+template <typename T, typename Context>
+void SqueezeKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const std::vector<int>& axes,
+                   DenseTensor* xshape,
+                   DenseTensor* out) {
+  const auto squeezed = funcs::GetOutputSqueezeShape(axes, x.dims(), true);
+  dev_ctx.template Alloc<T>(out);
+  phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
+  out->Resize(squeezed);
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/unsqueeze_grad_kernel_impl.h b/paddle/phi/kernels/impl/unsqueeze_grad_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..54b332ea4c898d10b63037375805d07f04ab2e63
--- /dev/null
+++ b/paddle/phi/kernels/impl/unsqueeze_grad_kernel_impl.h
@@ -0,0 +1,31 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+
+namespace phi {
+// Gradient of unsqueeze: copies dout into dx, reshaped per x_shape dims[1:].
+template <typename T, typename Context>
+void UnsqueezeGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x_shape,
+                         const DenseTensor& dout,
+                         DenseTensor* dx) {
+  const auto dims = phi::slice_ddim(x_shape.dims(), 1, x_shape.dims().size());
+  dev_ctx.template Alloc<T>(dx);
+  phi::Copy(dev_ctx, dout, dev_ctx.GetPlace(), true, dx);
+  dx->Resize(dims);
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h b/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..884fa26df451c680fe8352f0c0b21b2ce8a33b6c
--- /dev/null
+++ b/paddle/phi/kernels/impl/unsqueeze_kernel_impl.h
@@ -0,0 +1,42 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/copy_kernel.h"
+#include "paddle/phi/kernels/funcs/unsqueeze.h"
+
+namespace phi {
+// Unsqueeze forward: copies x into out and gives out the unsqueezed shape. The
+// shape already set on out is kept unless the axes come from a tensor.
+template <typename T, typename Context>
+void UnsqueezeKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const ScalarArray& axes,
+                     DenseTensor* xshape,
+                     DenseTensor* out) {
+  auto target_dims = out->dims();
+  if (axes.FromTensor()) {
+    // GetUnsqueezeShape takes int axes, so narrow the 64-bit values first.
+    std::vector<int32_t> axes32;
+    axes32.reserve(axes.GetData().size());
+    for (const int64_t& axis : axes.GetData()) axes32.push_back(axis);
+    target_dims = funcs::GetUnsqueezeShape(axes32, x.dims());
+  }
+  out->Resize(target_dims);
+  dev_ctx.template Alloc<T>(out);
+  phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
+  out->Resize(target_dims);  // copy will reset the dims.
+}
+}  // namespace phi
diff --git a/paddle/phi/kernels/squeeze_grad_kernel.h b/paddle/phi/kernels/squeeze_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..52b02bdbb95299b37fc63532e8210af8be6a2e28
--- /dev/null
+++ b/paddle/phi/kernels/squeeze_grad_kernel.h
@@ -0,0 +1,33 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+// Squeeze backward kernel (declaration only).
+// squeeze_sig.cc maps op squeeze2_grad to "squeeze_grad" with inputs
+// {"XShape", GradVarName("Out")}, attribute {"axes"} and output
+// {GradVarName("X")}.
+template <typename T, typename Context>
+void SqueezeGradKernel(const Context& dev_ctx,
+                       const DenseTensor& xshape,
+                       const DenseTensor& dout,
+                       const std::vector<int>& axes,
+                       DenseTensor* dx);
+}  // namespace phi
diff --git a/paddle/phi/kernels/squeeze_kernel.h b/paddle/phi/kernels/squeeze_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..22254eacfcefcd3a3dd33870c4d068c1114335b7
--- /dev/null
+++ b/paddle/phi/kernels/squeeze_kernel.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+// Squeeze forward kernel (declaration only).
+// squeeze_sig.cc maps op squeeze2 to "squeeze" with input {"X"}, attribute
+// {"axes"} and outputs {"XShape", "Out"}.
+template <typename T, typename Context>
+void SqueezeKernel(const Context& dev_ctx,
+                   const DenseTensor& x,
+                   const std::vector<int>& axes,
+                   DenseTensor* xshape,
+                   DenseTensor* out);
+}  // namespace phi
diff --git a/paddle/phi/kernels/unsqueeze_grad_kernel.h b/paddle/phi/kernels/unsqueeze_grad_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..0c5afe7be6039d408e4ad0b05144cc2fbe2c11cf
--- /dev/null
+++ b/paddle/phi/kernels/unsqueeze_grad_kernel.h
@@ -0,0 +1,30 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+// Unsqueeze backward kernel (declaration only).
+// unsqueeze_sig.cc maps op unsqueeze2_grad to "unsqueeze_grad" with inputs
+// {"XShape", GradVarName("Out")}, no attributes and output
+// {GradVarName("X")}.
+template <typename T, typename Context>
+void UnsqueezeGradKernel(const Context& dev_ctx,
+                         const DenseTensor& x_shape,
+                         const DenseTensor& dout,
+                         DenseTensor* dx);
+}  // namespace phi
diff --git a/paddle/phi/kernels/unsqueeze_kernel.h b/paddle/phi/kernels/unsqueeze_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..8f818a1b4904223e093c18107168afde2fd599a5
--- /dev/null
+++ b/paddle/phi/kernels/unsqueeze_kernel.h
@@ -0,0 +1,32 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/phi/common/scalar_array.h"
+#include "paddle/phi/core/dense_tensor.h"
+
+namespace phi {
+
+// Unsqueeze forward kernel (declaration only).
+// unsqueeze_sig.cc maps op unsqueeze2 to "unsqueeze" with input {"X"}, one
+// of {"AxesTensorList"}, {"AxesTensor"} or {"axes"} as the axes argument,
+// and outputs {"XShape", "Out"}.
+template <typename T, typename Context>
+void UnsqueezeKernel(const Context& dev_ctx,
+                     const DenseTensor& x,
+                     const ScalarArray& axes,
+                     DenseTensor* xshape,
+                     DenseTensor* out);
+}  // namespace phi
diff --git a/paddle/phi/ops/compat/activation_sig.cc b/paddle/phi/ops/compat/activation_sig.cc
index 7ae0dc45c5e1be09a31821c171b84fbb47fe1c9e..8b4884e35b608c1d60645e5d2b59131f240d8be3 100644
--- a/paddle/phi/ops/compat/activation_sig.cc
+++ b/paddle/phi/ops/compat/activation_sig.cc
@@ -57,6 +57,10 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold");
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", );  // NOLINT
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", );               // NOLINT
 DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LogSigmoid, "logsigmoid", );   // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log, "log", );                 // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log2, "log2", );               // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log10, "log10", );             // NOLINT
+DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log1p, "log1p", );             // NOLINT
 
 DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", );        // NOLINT
 DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", );        // NOLINT
@@ -125,6 +129,12 @@ KernelSignature EluDoubleGradOpArgumentMapping(
       "elu_double_grad", {"X", "DOut", "DDX"}, {"alpha"}, {"DX", "DDOut"});
 }
 
+KernelSignature LogDoubleGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  return KernelSignature(
+      "log_double_grad", {"X", "DOut", "DDX"}, {}, {"DX", "DDOut"});
+}
+
 }  // namespace phi
 
 PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad);
@@ -134,6 +144,7 @@ PD_REGISTER_BASE_KERNEL_NAME(softshrink, soft_shrink);
 PD_REGISTER_BASE_KERNEL_NAME(softshrink_grad, soft_shrink_grad);
 PD_REGISTER_BASE_KERNEL_NAME(elu_grad_grad, elu_double_grad);
 PD_REGISTER_BASE_KERNEL_NAME(sigmoid_grad_grad, sigmoid_double_grad);
+PD_REGISTER_BASE_KERNEL_NAME(log_grad_grad, log_double_grad);
 
 PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping);
@@ -181,3 +192,8 @@ PD_REGISTER_ARG_MAPPING_FN(logsigmoid_grad,
                            phi::LogSigmoidGradOpArgumentMapping);
 PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid_grad,
                            phi::HardSigmoidGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(log_grad, phi::LogGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(log_grad_grad, phi::LogDoubleGradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(log2_grad, phi::Log2GradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(log10_grad, phi::Log10GradOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(log1p_grad, phi::Log1pGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/fill_constant_batch_size_like_sig.cc b/paddle/phi/ops/compat/fill_constant_batch_size_like_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..444c0ec5b16fe639ee8ff36c22e38a2aa77db8fa
--- /dev/null
+++ b/paddle/phi/ops/compat/fill_constant_batch_size_like_sig.cc
@@ -0,0 +1,43 @@
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+// Argument mapping for fill_constant_batch_size_like, which is registered
+// below against the phi kernel "full_batch_size_like".
+//
+// Both signatures share every argument except the one carrying the fill
+// value: the "value" attribute is used when "str_value" is empty, and
+// "str_value" itself otherwise.
+KernelSignature FillConstantBatchSizeLikeOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  const auto& str_value = paddle::any_cast<std::string>(ctx.Attr("str_value"));
+  // Name of the attribute that supplies the fill value.
+  const char* value_attr = str_value.empty() ? "value" : "str_value";
+  return KernelSignature(
+      "full_batch_size_like",
+      {"Input"},
+      {"shape", value_attr, "dtype", "input_dim_idx", "output_dim_idx"},
+      {"Out"});
+}
+
+}  // namespace phi
+
+PD_REGISTER_BASE_KERNEL_NAME(fill_constant_batch_size_like,
+                             full_batch_size_like);
+
+PD_REGISTER_ARG_MAPPING_FN(fill_constant_batch_size_like,
+                           phi::FillConstantBatchSizeLikeOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/squeeze_sig.cc b/paddle/phi/ops/compat/squeeze_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..276246533e89e29f1a5d38fd4f3d831a044b5535
--- /dev/null
+++ b/paddle/phi/ops/compat/squeeze_sig.cc
@@ -0,0 +1,36 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+// squeeze2 -> "squeeze": input X, attribute axes, outputs XShape and Out.
+KernelSignature SqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  return KernelSignature("squeeze", {"X"}, {"axes"}, {"XShape", "Out"});
+}
+
+// squeeze2_grad -> "squeeze_grad": inputs XShape and the gradient of Out.
+KernelSignature SqueezeGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  const auto dout = GradVarName("Out");
+  const auto dx = GradVarName("X");
+  return KernelSignature("squeeze_grad", {"XShape", dout}, {"axes"}, {dx});
+}
+}  // namespace phi
+PD_REGISTER_BASE_KERNEL_NAME(squeeze2, squeeze);
+PD_REGISTER_BASE_KERNEL_NAME(squeeze2_grad, squeeze_grad);
+PD_REGISTER_ARG_MAPPING_FN(squeeze2, phi::SqueezeOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(squeeze2_grad, phi::SqueezeGradOpArgumentMapping);
diff --git a/paddle/phi/ops/compat/unsqueeze_sig.cc b/paddle/phi/ops/compat/unsqueeze_sig.cc
new file mode 100644
index 0000000000000000000000000000000000000000..20cd9701e83e5ecf3563eeccb5e1b4b2923bd65a
--- /dev/null
+++ b/paddle/phi/ops/compat/unsqueeze_sig.cc
@@ -0,0 +1,46 @@
+
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/phi/core/compat/op_utils.h"
+
+namespace phi {
+
+// unsqueeze2 -> "unsqueeze". The axes argument is taken from the first
+// available source: input AxesTensorList, input AxesTensor, else "axes".
+KernelSignature UnsqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) {
+  const char* axes_arg = "axes";
+  if (ctx.InputSize("AxesTensorList") > 0) {
+    axes_arg = "AxesTensorList";
+  } else if (ctx.InputSize("AxesTensor") > 0) {
+    axes_arg = "AxesTensor";
+  }
+  VLOG(2) << "unsqueeze2 in " << axes_arg;
+  return KernelSignature("unsqueeze", {"X"}, {axes_arg}, {"XShape", "Out"});
+}
+
+// unsqueeze2_grad -> "unsqueeze_grad": no attributes are forwarded.
+KernelSignature UnsqueezeGradOpArgumentMapping(
+    const ArgumentMappingContext& ctx) {
+  const auto dout = GradVarName("Out");
+  const auto dx = GradVarName("X");
+  return KernelSignature("unsqueeze_grad", {"XShape", dout}, {}, {dx});
+}
+}  // namespace phi
+PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2, unsqueeze);
+PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2_grad, unsqueeze_grad);
+
+PD_REGISTER_ARG_MAPPING_FN(unsqueeze2, phi::UnsqueezeOpArgumentMapping);
+PD_REGISTER_ARG_MAPPING_FN(unsqueeze2_grad,
+                           phi::UnsqueezeGradOpArgumentMapping);
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index 39676b916e50470ac9774f3564b4bdc3a8fcb20f..bc19b50616d139e2a2db83ad51f602dff0f0fa7a 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -774,12 +774,12 @@ set +x
         get_precision_ut_mac
         ut_actual_total_startTime_s=`date +%s`
         if [[ "$on_precision" == "0" ]];then
-            ctest -E "$disable_ut_quickly" -LE ${nightly_label} --output-on-failure -j $2 | tee $tmpfile
+            ctest -E "$disable_ut_quickly" -LE ${nightly_label} --timeout 120 --output-on-failure -j $2 | tee $tmpfile
         else
-            ctest -R "$UT_list_prec" -E "$disable_ut_quickly" -LE ${nightly_label} --output-on-failure -j $2 | tee $tmpfile
+            ctest -R "$UT_list_prec" -E "$disable_ut_quickly" -LE ${nightly_label} --timeout 120 --output-on-failure -j $2 | tee $tmpfile
             tmpfile_rand=`date +%s%N`
             tmpfile=$tmp_dir/$tmpfile_rand
-            ctest -R "$UT_list_prec_1" -E "$disable_ut_quickly" -LE ${nightly_label} --output-on-failure -j $2 | tee $tmpfile
+            ctest -R "$UT_list_prec_1" -E "$disable_ut_quickly" -LE ${nightly_label} --timeout 120 --output-on-failure -j $2 | tee $tmpfile
         fi
         ut_total_endTime_s=`date +%s`
         echo "TestCases Total Time: $[ $ut_total_endTime_s - $ut_actual_total_startTime_s ]s"
@@ -848,7 +848,7 @@ set +x
                                 fi
                             done
                         failed_test_lists=''
-                        ctest -R "$retry_unittests_regular" --output-on-failure -j 2 | tee $tmpfile
+                        ctest -R "$retry_unittests_regular" --timeout 120 --output-on-failure -j 2 | tee $tmpfile
                         collect_failed_tests
                         rm -f $tmp_dir/*
                         exec_times=$[$exec_times+1]
diff --git a/python/paddle/fluid/tests/unittests/test_scope.py b/python/paddle/fluid/tests/unittests/test_scope.py
index aa093069c49ecffd402eae018f2acbe30117a52e..805aabd393e49bc0fcf92c15d9c9de0c456770de 100644
--- a/python/paddle/fluid/tests/unittests/test_scope.py
+++ b/python/paddle/fluid/tests/unittests/test_scope.py
@@ -59,6 +59,13 @@ class TestScope(unittest.TestCase):
             # It is not allowed to delete a nonexistent scope.
             scope._remove_from_pool()
 
+    def test_size(self):
+        paddle_c = paddle.fluid.core
+        scope = paddle_c.Scope()
+        scope.var("var_a")  # return value unused; only the scope is checked
+        self.assertEqual(scope.size(), 1)
+        self.assertIsNotNone(scope.find_var("var_a"))
+
 
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/utils/code_gen/api_gen.py b/python/paddle/utils/code_gen/api_gen.py
index 07baa9b51de391721bc5c33745c77f7f56c4f974..cf9cb65f6d1f4c4464d084c36cc09e23d531f78c 100644
--- a/python/paddle/utils/code_gen/api_gen.py
+++ b/python/paddle/utils/code_gen/api_gen.py
@@ -148,7 +148,6 @@ def source_include(header_file_path):
 #include "paddle/phi/infermeta/nullary.h"
 #include "paddle/phi/infermeta/unary.h"
 #include "paddle/phi/infermeta/ternary.h"
-#include "paddle/phi/kernels/declarations.h"
 
 #include "paddle/fluid/platform/profiler/event_tracing.h"
 """
diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh
index 9c802a56a7b6e29bc89ad164a15f2f6d4749734e..d2892d13fc401c069065675dbbb8f00bfa372797 100644
--- a/tools/check_file_diff_approvals.sh
+++ b/tools/check_file_diff_approvals.sh
@@ -231,6 +231,12 @@ if [ "${HAS_MODIFIED_ALLOCATION}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
     check_approval 1 6888866 39303645
   fi
 
+HAS_MODIFIED_DECLARATIONS=`git diff --name-only upstream/$BRANCH | grep -F "paddle/phi/kernels/declarations.h" || true`
+if [ "${HAS_MODIFIED_DECLARATIONS}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
+    echo_line="You must be approved by chenwhql for any use of paddle/phi/kernels/declarations.h. Thanks!\n"
+    check_approval 1 22561442
+  fi
+
 ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true`
 if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
     echo_line="PADDLE_ENFORCE is not recommended. Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_GPU_SUCCESS instead, see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\nYou must have one RD (chenwhql (Recommend) , luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n${ALL_PADDLE_ENFORCE}\n"