Commit f3a5d012 authored by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_some_yaml_config

@@ -118,7 +118,7 @@ function(kernel_library TARGET)
   if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
     list(APPEND common_srcs ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
   endif()
-  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
+  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc AND NOT WITH_XPU_KP)
     list(APPEND cpu_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
   endif()
   if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/selected_rows/${TARGET}.cc)
@@ -151,6 +151,9 @@ function(kernel_library TARGET)
       file(RENAME ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.cu ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
       list(APPEND kps_srcs ${CMAKE_CURRENT_BINARY_DIR}/kps/${TARGET}.kps)
     endif()
+    if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc )
+      list(APPEND kps_srcs ${CMAKE_CURRENT_SOURCE_DIR}/cpu/${TARGET}.cc)
+    endif()
   endif()
 else()
   # TODO(chenweihang): impl compile by source later
......
@@ -35,6 +35,8 @@ PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sigmoid, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sigmoid_grad, CPU, ALL_LAYOUT);
 
 namespace egr {
......
@@ -32,6 +32,8 @@ PD_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(matmul_grad, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sigmoid, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sigmoid_grad, CPU, ALL_LAYOUT);
 
 namespace egr {
......
@@ -25,6 +25,12 @@
 #include "paddle/fluid/framework/naive_executor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/phi/core/kernel_registry.h"
+
+PD_DECLARE_KERNEL(conv2d_transpose, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(batch_norm, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(gelu, CPU, ALL_LAYOUT);
 
 USE_OP_ITSELF(batch_norm);
 USE_OP_DEVICE_KERNEL(batch_norm, MKLDNN);
......
@@ -23,7 +23,7 @@ cc_library(standalone_executor SRCS standalone_executor.cc DEPS interpretercore)
 # cc_binary(standalone_executor_test SRCS standalone_executor_test.cc DEPS interpretercore standalone_executor operator op_registry executor ${GLOB_OP_LIB} ${GLOB_OPERATOR_DEPS} profiler)
 # skip win32 since wget is not installed by default on windows machine.
 # skip COVERAGE_CI since the test runs slowly because of instrumentation.
-if (WITH_CUDA AND WITH_TESTING AND NOT WIN32 AND NOT WITH_COVERAGE AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
+if (WITH_GPU AND WITH_TESTING AND NOT WIN32 AND NOT WITH_COVERAGE AND NOT "$ENV{CI_SKIP_CPP_TEST}" STREQUAL "ON")
   add_custom_target(
     download_program
     COMMAND wget -nc https://paddle-ci.gz.bcebos.com/new_exec/lm_main_program
......
@@ -41,6 +41,7 @@ namespace paddle {
 namespace framework {
 // NOTE(Aurelius84): Need a better strategy to determine it.
 static constexpr size_t kHostNumThreads = 4;
+static constexpr size_t kDeviceNumThreads = 1;
 
 bool IsInterpretercoreFastGCEnabled() {
   return FLAGS_fast_eager_deletion_mode && FLAGS_use_stream_safe_cuda_allocator;
@@ -54,8 +55,8 @@ InterpreterCore::InterpreterCore(const platform::Place& place,
       global_scope_(global_scope),
       stream_analyzer_(place) {
   is_build_ = false;
-  async_work_queue_.reset(
-      new interpreter::AsyncWorkQueue(kHostNumThreads, &main_thread_blocker_));
+  async_work_queue_.reset(new interpreter::AsyncWorkQueue(
+      kHostNumThreads, kDeviceNumThreads, &main_thread_blocker_));
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
   if (IsInterpretercoreFastGCEnabled()) {
@@ -271,6 +272,10 @@ void InterpreterCore::Convert(
   if (FLAGS_new_executor_use_inplace) {
     BuildInplace();
   }
+
+  // prepare for the first time.
+  async_work_queue_->PrepareAtomicDeps(dependecy_count_);
+  async_work_queue_->PrepareAtomicVarRef(vec_meta_info);
 }
 
 bool InterpreterCore::BuildInplaceCheckVarIsOnlyInput(size_t var_index) {
@@ -388,18 +393,18 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
                         : global_scope_->GetMutableScope();
   auto op_with_kernel = dynamic_cast<const framework::OperatorWithKernel*>(op);
   {
-    platform::RecordEvent infershape_event(
-        "infer_shape", platform::TracerEventType::OperatorInner, 1,
-        platform::EventRole::kInnerOp);
-    // If it is OperatorBase, InferShape do nothing.
-    if (op_with_kernel != nullptr)
-      op_with_kernel->Info().infer_shape_(
-          instr_node.InnerInferShapeContext().get());
+    if (op_with_kernel != nullptr) {
+      platform::RecordEvent infershape_event(
+          "infer_shape", platform::TracerEventType::OperatorInner, 1,
+          platform::EventRole::kInnerOp);
+      // If it is OperatorBase, InferShape do nothing.
+      op_with_kernel->Info().infer_shape_(
+          instr_node.InnerInferShapeContext().get());
+    }
   }
 
-  if (op_with_kernel != nullptr &&
-      FLAGS_new_executor_use_inplace) {  // TODO(xiongkun03) Does operator
-                                         // base support inplace ?
+  if (op_with_kernel != nullptr && FLAGS_new_executor_use_inplace) {
+    // TODO(xiongkun03) Does operator base support inplace ?
     for (auto& pair : instr_node.InplaceInfo()) {
       const auto& in = paddle::framework::details::GetTensorFromVar(pair.first);
       auto* out =
@@ -409,6 +414,7 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
       }
     }
   }
+
   {
     platform::RecordEvent compute_event(
         "compute", platform::TracerEventType::OperatorInner, 1,
@@ -458,16 +464,24 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
 
 void InterpreterCore::ExecuteInstructionList(
     const std::vector<Instruction>& vec_instr) {
+  // NOTE(zhiqiu): get the prepared deps from std::future, and async prepare
+  // those for the next step
+  auto atomic_deps = async_work_queue_->AtomicDeps();
+  auto atomic_var_ref = async_work_queue_->AtomicVarRef();
+
   async_work_queue_->PrepareAtomicDeps(dependecy_count_);
   async_work_queue_->PrepareAtomicVarRef(global_scope_->VecMetaInfo());
+
   unfinished_op_numer_ = vec_instr.size();
   exception_holder_.Clear();
 
   for (size_t i = 0; i < dependecy_count_.size(); ++i) {
     if (dependecy_count_[i] == 0) {
-      async_work_queue_->AddTask(vec_instr.at(i).KernelType(),
-                                 [&, i] { RunInstructionAsync(i); });
+      async_work_queue_->AddTask(vec_instr.at(i).KernelType(), [
+        this, i, atomic_deps = atomic_deps.get(),
+        atomic_var_ref = atomic_var_ref.get()
+      ] { RunInstructionAsync(i, atomic_deps, atomic_var_ref); });
     }
   }
 
@@ -490,11 +504,16 @@ void InterpreterCore::ExecuteInstructionList(
 }
 
 void InterpreterCore::RunNextInstructions(
-    const Instruction& instr, std::queue<size_t>* reserved_next_ops) {
+    const Instruction& instr, std::queue<size_t>* reserved_next_ops,
+    std::vector<std::atomic<size_t>>* atomic_deps,
+    std::vector<std::atomic<size_t>>* atomic_var_ref) {
+  VLOG(4) << "atomic 1:" << atomic_deps;
   auto& next_instr = instr.NextInstructions();
-  auto& atomic_deps = async_work_queue_->AtomicDeps();
-  auto IsReady = [&](size_t next_id) {
-    return atomic_deps[next_id]->fetch_sub(1, std::memory_order_relaxed) == 1;
+
+  auto IsReady = [atomic_deps](size_t next_id) {
+    VLOG(4) << "atomic:" << atomic_deps << " " << &(*atomic_deps)[next_id]
+            << " " << next_id;
+    return (*atomic_deps)[next_id].fetch_sub(1, std::memory_order_relaxed) == 1;
   };
 
   if (instr.KernelType() == OpFuncType::kQueueAsync) {
@@ -503,7 +522,9 @@ void InterpreterCore::RunNextInstructions(
       if (IsReady(next_id)) {
         async_work_queue_->AddTask(
             vec_instruction_[next_id].KernelType(),
-            [&, next_id] { RunInstructionAsync(next_id); });
+            [this, next_id, atomic_deps, atomic_var_ref]() {
+              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
+            });
       }
     }
     // keep all async_ops running in current thread
@@ -523,7 +544,9 @@ void InterpreterCore::RunNextInstructions(
       if (IsReady(next_id)) {
         async_work_queue_->AddTask(
             vec_instruction_[next_id].KernelType(),
-            [&, next_id] { RunInstructionAsync(next_id); });
+            [this, next_id, atomic_deps, atomic_var_ref] {
+              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
+            });
       }
     }
     auto direct_run_ops = interpreter::merge_vector(next_instr.SyncRunIds(),
@@ -539,14 +562,18 @@ void InterpreterCore::RunNextInstructions(
         // move rest ops into other threads
         async_work_queue_->AddTask(
             vec_instruction_[next_id].KernelType(),
-            [&, next_id] { RunInstructionAsync(next_id); });
+            [this, next_id, atomic_deps, atomic_var_ref] {
+              RunInstructionAsync(next_id, atomic_deps, atomic_var_ref);
+            });
       }
     }
     if (first_op != 0) reserved_next_ops->push(first_op);
   }
 }
 
-void InterpreterCore::RunInstructionAsync(size_t instr_id) {
+void InterpreterCore::RunInstructionAsync(
+    size_t instr_id, std::vector<std::atomic<size_t>>* atomic_deps,
+    std::vector<std::atomic<size_t>>* atomic_var_ref) {
   std::queue<size_t> ready_ops;
   ready_ops.push(instr_id);
   while (!ready_ops.empty()) {
@@ -571,7 +598,7 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       RecordStreamForGC(instr_node);
 #endif
-      CheckGC(instr_node);
+      CheckGC(instr_node, atomic_var_ref);
     } catch (platform::EnforceNotMet& ex) {
       framework::InsertCallStackInfo(op->Type(), op->Attrs(), &ex);
       exception_holder_.Catch(std::make_exception_ptr(std::move(ex)));
@@ -605,7 +632,7 @@ void InterpreterCore::RunInstructionAsync(size_t instr_id) {
     interpreter::RecordEvent(instr_node, place_);
 
-    RunNextInstructions(instr_node, &ready_ops);
+    RunNextInstructions(instr_node, &ready_ops, atomic_deps, atomic_var_ref);
   }
 }
 
@@ -703,17 +730,19 @@ void InterpreterCore::RecordStreamForGC(const Instruction& instr) {
 }
 #endif
 
-void InterpreterCore::CheckGC(const Instruction& instr) {
+void InterpreterCore::CheckGC(
+    const Instruction& instr,
+    std::vector<std::atomic<size_t>>* atomic_var_ref) {
   size_t instr_id = instr.Id();
   auto& var_scope = *global_scope_;
-  auto& atomic_var_ref = async_work_queue_->AtomicVarRef();
 
   for (auto var_id : instr.GCCheckVars()) {
     VLOG(4) << "GC " << global_scope_->GetNameById(var_id) << " "
            << var_scope.VarDesc(var_id);
+    VLOG(4) << "atomic:" << atomic_var_ref << " " << &(*atomic_var_ref)[var_id]
+            << " " << var_id;
     bool is_ready =
-        atomic_var_ref[var_id]->fetch_sub(1, std::memory_order_relaxed) == 1;
+        (*atomic_var_ref)[var_id].fetch_sub(1, std::memory_order_relaxed) == 1;
     // ignore all persistable var while GC
     if (var_scope.VarDesc(var_id) && var_scope.VarDesc(var_id)->Persistable()) {
       continue;
......
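Editor's note: the hunks above thread the atomic dependency counters through the lambdas instead of reading them from the work queue. A minimal, self-contained sketch of that countdown idea (not Paddle code; the toy graph and names are illustrative): each instruction owns an atomic counter, and a successor becomes ready when fetch_sub returns 1, mirroring the IsReady lambda.

// Standalone sketch of the atomic dependency countdown used by
// RunNextInstructions; names and the toy graph are illustrative only.
#include <atomic>
#include <cstdio>
#include <queue>
#include <vector>

int main() {
  // dep_count[i] = number of predecessors of instruction i (toy graph).
  std::vector<size_t> dep_count = {0, 1, 2};
  std::vector<std::atomic<size_t>> deps(dep_count.size());
  for (size_t i = 0; i < dep_count.size(); ++i) deps[i] = dep_count[i];

  // successors[i] = instructions that may only run after instruction i.
  std::vector<std::vector<size_t>> successors = {{1, 2}, {2}, {}};

  auto IsReady = [&deps](size_t next_id) {
    // The last finished predecessor sees the counter drop from 1 to 0.
    return deps[next_id].fetch_sub(1, std::memory_order_relaxed) == 1;
  };

  std::queue<size_t> ready;
  ready.push(0);  // instruction 0 has no predecessors
  while (!ready.empty()) {
    size_t id = ready.front();
    ready.pop();
    std::printf("run instruction %zu\n", id);
    for (size_t next : successors[id])
      if (IsReady(next)) ready.push(next);
  }
}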
@@ -76,11 +76,16 @@ class InterpreterCore {
   void RecordStreamForGC(const Instruction& instr);
 #endif
 
-  void CheckGC(const Instruction& instr);
+  void CheckGC(const Instruction& instr,
+               std::vector<std::atomic<size_t>>* atomic_var_ref);
 
-  void RunInstructionAsync(size_t instr_id);
+  void RunInstructionAsync(size_t instr_id,
+                           std::vector<std::atomic<size_t>>* atomic_deps,
+                           std::vector<std::atomic<size_t>>* atomic_var_ref);
   void RunNextInstructions(const Instruction& instr_id,
-                           std::queue<size_t>* reserved_next_ops);
+                           std::queue<size_t>* reserved_next_ops,
+                           std::vector<std::atomic<size_t>>* atomic_deps,
+                           std::vector<std::atomic<size_t>>* atomic_var_ref);
 
   void BuildSkipShareLoDInfo();
......
@@ -44,32 +44,37 @@ void AsyncWorkQueue::AddTask(const OpFuncType& op_func_type,
 
 using VariableIdMap = std::map<std::string, std::vector<int>>;
 
-AtomicVectorSizeT& AsyncWorkQueue::PrepareAtomicDeps(
+void AsyncWorkQueue::PrepareAtomicDeps(
     const std::vector<size_t>& dependecy_count) {
-  if (atomic_deps_.size() != dependecy_count.size()) {
-    atomic_deps_.clear();
-    std::generate_n(std::back_inserter(atomic_deps_), dependecy_count.size(),
-                    [] { return std::make_unique<std::atomic<size_t>>(0); });
-  }
-
-  for (size_t i = 0; i < dependecy_count.size(); ++i) {
-    atomic_deps_[i]->store(dependecy_count[i]);
-  }
-  return atomic_deps_;
+  VLOG(4) << "PrepareAtomicDeps";
+  auto p = std::make_shared<
+      std::promise<std::unique_ptr<std::vector<std::atomic<size_t>>>>>();
+  atomic_deps_ = p->get_future();
+  queue_group_->AddTask(2, [&dependecy_count, p] {
+    auto* op_deps =
+        new std::vector<std::atomic<size_t>>(dependecy_count.size());
+    for (size_t i = 0; i < dependecy_count.size(); ++i) {
+      (*op_deps)[i] = dependecy_count[i];
+    }
+    VLOG(4) << "AtomicDeps:" << op_deps << " " << (*op_deps).size();
+    p->set_value(std::unique_ptr<std::vector<std::atomic<size_t>>>(op_deps));
+  });
 }
 
-AtomicVectorSizeT& AsyncWorkQueue::PrepareAtomicVarRef(
+void AsyncWorkQueue::PrepareAtomicVarRef(
     const std::vector<VariableMetaInfo>& vec_meta_info) {
-  if (atomic_var_ref_.size() != vec_meta_info.size()) {
-    atomic_var_ref_.clear();
-    std::generate_n(std::back_inserter(atomic_var_ref_), vec_meta_info.size(),
-                    [] { return std::make_unique<std::atomic<size_t>>(0); });
-  }
-
-  for (size_t i = 0; i < vec_meta_info.size(); ++i) {
-    atomic_var_ref_[i]->store(vec_meta_info[i].var_ref_count_);
-  }
-  return atomic_var_ref_;
+  VLOG(4) << "PrepareAtomicVarRef";
+  auto p = std::make_shared<
+      std::promise<std::unique_ptr<std::vector<std::atomic<size_t>>>>>();
+  atomic_var_ref_ = p->get_future();
+  queue_group_->AddTask(2, [&vec_meta_info, p] {
+    auto* var_ref = new std::vector<std::atomic<size_t>>(vec_meta_info.size());
+    for (size_t i = 0; i < vec_meta_info.size(); ++i) {
+      (*var_ref)[i] = vec_meta_info[i].var_ref_count_;
+    }
+    VLOG(4) << "AtomicVarRef:" << var_ref << " " << (*var_ref).size();
+    p->set_value(std::unique_ptr<std::vector<std::atomic<size_t>>>(var_ref));
+  });
 }
 
 bool var_can_be_deleted(const std::string& name, const BlockDesc& block) {
......
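Editor's note: PrepareAtomicDeps now builds the counter vector on a background queue and hands it to the consumer through a std::future. A self-contained sketch of that promise/future hand-off, outside Paddle (the detached thread stands in for queue_group_->AddTask, and all names are illustrative):

#include <atomic>
#include <future>
#include <memory>
#include <thread>
#include <vector>

using AtomicVec = std::vector<std::atomic<size_t>>;

// Build the counters asynchronously; the caller keeps only a future.
std::future<std::unique_ptr<AtomicVec>> PrepareDeps(
    std::vector<size_t> dep_count) {
  auto p = std::make_shared<std::promise<std::unique_ptr<AtomicVec>>>();
  std::future<std::unique_ptr<AtomicVec>> fut = p->get_future();
  // A detached worker thread plays the role of the "Prepare" work queue.
  std::thread([dep_count = std::move(dep_count), p] {
    auto deps = std::make_unique<AtomicVec>(dep_count.size());
    for (size_t i = 0; i < dep_count.size(); ++i) (*deps)[i] = dep_count[i];
    p->set_value(std::move(deps));
  }).detach();
  return fut;
}

int main() {
  auto fut = PrepareDeps({0, 1, 2});
  std::unique_ptr<AtomicVec> deps = fut.get();  // blocks until prepared
  return (*deps)[2].load() == 2 ? 0 : 1;
}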
@@ -50,11 +50,13 @@ namespace framework {
 namespace interpreter {
 
-using AtomicVectorSizeT = std::vector<std::unique_ptr<std::atomic<size_t>>>;
+using AtomicVectorSizeT =
+    std::future<std::unique_ptr<std::vector<std::atomic<size_t>>>>;
 
 class AsyncWorkQueue {
  public:
-  AsyncWorkQueue(size_t host_num_threads, EventsWaiter* waiter)
+  AsyncWorkQueue(size_t host_num_threads, size_t deivce_num_threads,
+                 EventsWaiter* waiter)
       : host_num_thread_(host_num_threads) {
     std::vector<WorkQueueOptions> group_options;
     // for execute host Kernel
@@ -66,6 +68,13 @@ class AsyncWorkQueue {
                                /*events_waiter*/ waiter);
     // for launch device Kernel
     group_options.emplace_back(/*name*/ "DeviceKernelLaunch",
+                               /*num_threads*/ deivce_num_threads,
+                               /*allow_spinning*/ true,
+                               /*track_task*/ false,
+                               /*detached*/ true,
+                               /*events_waiter*/ waiter);
+    // for prepare deps and others
+    group_options.emplace_back(/*name*/ "Prepare",
                                /*num_threads*/ 1,
                                /*allow_spinning*/ true,
                                /*track_task*/ false,
@@ -74,10 +83,8 @@ class AsyncWorkQueue {
     queue_group_ = CreateWorkQueueGroup(group_options);
   }
 
-  AtomicVectorSizeT& PrepareAtomicDeps(
-      const std::vector<size_t>& dependecy_count);
-  AtomicVectorSizeT& PrepareAtomicVarRef(
-      const std::vector<VariableMetaInfo>& vec_meta_info);
+  void PrepareAtomicDeps(const std::vector<size_t>& dependecy_count);
+  void PrepareAtomicVarRef(const std::vector<VariableMetaInfo>& vec_meta_info);
 
   // void WaitEmpty() { queue_group_->WaitQueueGroupEmpty(); }
 
@@ -85,8 +92,12 @@ class AsyncWorkQueue {
   void Cancel() { queue_group_->Cancel(); }
 
-  AtomicVectorSizeT& AtomicDeps() { return atomic_deps_; }
-  AtomicVectorSizeT& AtomicVarRef() { return atomic_var_ref_; }
+  std::unique_ptr<std::vector<std::atomic<size_t>>> AtomicDeps() {
+    return atomic_deps_.get();
+  }
+  std::unique_ptr<std::vector<std::atomic<size_t>>> AtomicVarRef() {
+    return atomic_var_ref_.get();
+  }
 
  private:
   size_t host_num_thread_;
......
@@ -20,45 +20,65 @@
 // #include "gperftools/profiler.h"
 
 #include "paddle/fluid/framework/new_executor/standalone_executor.h"
+#include "paddle/phi/core/kernel_registry.h"
 
 USE_OP_ITSELF(fill_constant);
-USE_OP(uniform_random);
+USE_OP_ITSELF(uniform_random);
 USE_OP(lookup_table);
-USE_OP(transpose2);
+USE_OP_ITSELF(transpose2);
 USE_OP_ITSELF(reshape2);
-USE_OP(split);
-USE_OP(slice);
-USE_OP(concat);
-USE_OP(matmul);
+USE_OP_ITSELF(split);
+USE_OP_ITSELF(slice);
+USE_OP_ITSELF(concat);
+USE_OP_ITSELF(matmul);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_ITSELF(sigmoid);
 USE_OP_ITSELF(tanh);
-USE_OP(elementwise_mul);
+USE_OP_ITSELF(elementwise_mul);
 USE_OP(softmax_with_cross_entropy);
 USE_OP_ITSELF(reduce_mean);
 USE_OP_ITSELF(reduce_sum);
 USE_OP_ITSELF(reduce_sum_grad);
 USE_OP_ITSELF(reduce_mean_grad);
 USE_OP_ITSELF(reshape2_grad);
-USE_OP(softmax_with_cross_entropy_grad);
+USE_OP_ITSELF(softmax_with_cross_entropy_grad);
 USE_OP_ITSELF(elementwise_add_grad);
-USE_OP(matmul_grad);
-USE_OP(square);
-USE_OP(transpose2_grad);
+USE_OP_ITSELF(matmul_grad);
+USE_OP_ITSELF(square);
+USE_OP_ITSELF(transpose2_grad);
 USE_OP(concat_grad);
 USE_OP_ITSELF(elementwise_mul_grad);
 USE_OP_ITSELF(sigmoid_grad);
 USE_OP_ITSELF(tanh_grad);
 USE_OP(sum);
-USE_OP(slice_grad);
-USE_OP(lookup_table_grad);
+USE_OP_ITSELF(slice_grad);
+USE_OP_ITSELF(lookup_table_grad);
 USE_OP(sqrt);
 USE_OP(elementwise_max);
 USE_OP_ITSELF(elementwise_div);
-USE_OP(sgd);
+USE_OP_ITSELF(sgd);
 USE_OP(squared_l2_norm);
-USE_OP(memcpy_h2d);
-USE_OP(memcpy_d2h);
+USE_OP_ITSELF(memcpy_h2d);
+USE_OP_ITSELF(memcpy_d2h);
+
+PD_DECLARE_KERNEL(full, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(uniform_random_raw, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(transpose, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(reshape, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(split, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(concat, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add_raw, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(reshape_grad, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(matmul_grad, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(transpose_grad, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sum, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(max_raw, GPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sgd, GPU, ALL_LAYOUT);
 
 DECLARE_double(eager_delete_tensor_gb);
 
 namespace paddle {
......
@@ -1122,7 +1122,15 @@ static void CheckTensorNANOrInf(const std::string& op_type,
 
 bool OperatorWithKernel::SupportsMKLDNN(
     const proto::VarType::Type data_type) const {
-  auto& op_kernels = OperatorWithKernel::AllOpKernels().at(type_);
+  auto op_kernel_iter = OperatorWithKernel::AllOpKernels().find(type_);
+  if (op_kernel_iter == OperatorWithKernel::AllOpKernels().end()) {
+    VLOG(6) << "Warning: " << type_ << " don't find its MKLDNN Kernel in Fluid "
+               "Registered Kernels. And We don't "
+               "search its kernels in phi lib, "
+               "SupportsMKLDNN() return false.";
+    return false;
+  }
+  auto& op_kernels = op_kernel_iter->second;
   return std::any_of(op_kernels.begin(), op_kernels.end(),
                      [data_type](OpKernelMap::const_reference kern_pair) {
                        return platform::is_cpu_place(kern_pair.first.place_) &&
......
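Editor's note: the hunk above swaps an unchecked at() for find() so that a missing op type degrades to "not supported" instead of throwing. A generic sketch of the same lookup pattern (illustrative names, not the Paddle API):

#include <string>
#include <unordered_map>

// Returns false when the key is absent instead of letting at() throw.
bool SupportsFeature(const std::unordered_map<std::string, bool>& registry,
                     const std::string& op_type) {
  auto it = registry.find(op_type);
  if (it == registry.end()) {
    return false;  // unknown op type: report "not supported", don't throw
  }
  return it->second;
}

int main() {
  std::unordered_map<std::string, bool> registry = {{"conv2d", true}};
  return SupportsFeature(registry, "missing_op") ? 1 : 0;
}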
@@ -144,6 +144,9 @@ class Scope : public ScopeBase {
   void Rename(const std::string& origin_name,
               const std::string& new_name) const;
 
+  // Return the number of variables in scope
+  size_t Size() { return vars_.size(); }
+
   // Rename variable to a new name and return the new name
   std::string Rename(const std::string& origin_name) const;
......
@@ -32,6 +32,8 @@
 PD_DECLARE_KERNEL(add, CPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(add_grad, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sum, CPU, ALL_LAYOUT);
+PD_DECLARE_KERNEL(sum_grad, CPU, ALL_LAYOUT);
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 PD_DECLARE_KERNEL(add_grad, GPU, ALL_LAYOUT);
 PD_DECLARE_KERNEL(sum_grad, GPU, ALL_LAYOUT);
......
@@ -1496,6 +1496,9 @@ REGISTER_ACTIVATION_OP(hard_sigmoid, HardSigmoid, HardSigmoidFunctor,
                        HardSigmoidGradFunctor);
 REGISTER_ACTIVATION_OP(logsigmoid, LogSigmoid, LogSigmoidFunctor,
                        LogSigmoidGradFunctor);
+REGISTER_ACTIVATION_OP(log2, Log2, Log2Functor, Log2GradFunctor);
+REGISTER_ACTIVATION_OP(log10, Log10, Log10Functor, Log10GradFunctor);
+REGISTER_ACTIVATION_OP(log1p, Log1p, Log1pFunctor, Log1pGradFunctor);
 
 /* ========================== sigmoid register =============================
  */
@@ -1867,15 +1870,6 @@ REGISTER_OPERATOR(
     ops::ActivationOpDoubleGrad<ops::LogGradGradFunctor<float>::FwdDeps()>,
     ops::ActivationDoubleGradOpInplaceInferer);
 
-REGISTER_ACTIVATION_CPU_KERNEL(log, Log, LogFunctor, LogGradFunctor);
-
-REGISTER_OP_CPU_KERNEL(
-    log_grad_grad, ops::LogDoubleGradKernel<plat::CPUDeviceContext,
-                                            ops::LogGradGradFunctor<float>>,
-    ops::LogDoubleGradKernel<plat::CPUDeviceContext,
-                             ops::LogGradGradFunctor<double>>,
-    ops::LogDoubleGradKernel<plat::CPUDeviceContext,
-                             ops::LogGradGradFunctor<plat::float16>>);
-
 /* ========================================================================== */
 
 /* ========================== register checkpoint ===========================*/
......
@@ -281,6 +281,11 @@ USE_PHI_DOUBLE_GRAD_FUNCTOR(Sigmoid)
 USE_PHI_TRIPLE_GRAD_FUNCTOR(Sigmoid)
 USE_PHI_FUNCTOR(LogSigmoid)
 USE_PHI_FUNCTOR(HardSigmoid)
+USE_PHI_FUNCTOR(Log)
+USE_PHI_DOUBLE_GRAD_FUNCTOR(Log)
+USE_PHI_FUNCTOR(Log2)
+USE_PHI_FUNCTOR(Log10)
+USE_PHI_FUNCTOR(Log1p)
 
 template <typename T>
 using ELUGradNegativeAlphaFunctor = phi::funcs::ELUGradNegativeAlphaFunctor<T>;
 
@@ -448,88 +453,6 @@ struct ReciprocalGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
// log(x) = natural logarithm of x
template <typename T>
struct LogFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.log();
}
};
template <typename T>
struct LogGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * (static_cast<T>(1) / x);
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// log2(x) = logarithm to the base 2 of the elements of x
template <typename T>
struct Log2Functor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.log() / static_cast<T>(log(2));
}
};
// the gradient of log2(x) is 1/(x*ln(2))
template <typename T>
struct Log2GradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * static_cast<T>(1) / (x * static_cast<T>(log(2)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// log10(x) = logarithm to the base 10 of the elements of x
template <typename T>
struct Log10Functor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.log() / static_cast<T>(log(10));
}
};
// the gradient of log10(x) is 1/(x*ln(10))
template <typename T>
struct Log10GradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * static_cast<T>(1) / (x * static_cast<T>(log(10)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// log1p(x) = natural logarithm of x+1
template <typename T>
struct Log1pFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = (static_cast<T>(1) + x).log();
}
};
template <typename T>
struct Log1pGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * (static_cast<T>(1) / (x + static_cast<T>(1)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
 // square(x) = x^2
 template <typename T>
 struct SquareFunctor : public BaseActivationFunctor<T> {
@@ -1197,37 +1120,6 @@ class SquareDoubleGradKernel
   }
 };
 
template <typename DeviceContext, typename Functor>
class LogDoubleGradKernel
: public SquareDoubleGradKernel<DeviceContext, Functor> {};
template <typename DeviceContext, typename Functor>
class ELUDoubleGradKernel
: public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
public:
using T = typename Functor::ELEMENT_TYPE;
void Compute(const framework::ExecutionContext& ctx) const override {
const framework::Tensor *X, *ddX, *dOut;
X = ddX = dOut = nullptr;
framework::Tensor *dX, *ddOut;
dX = ddOut = nullptr;
ExtractDoubleGradTensorWithInputDOut(ctx, &X, &ddX, &dX, &dOut, &ddOut);
if (dX) dX->mutable_data<T>(X->dims(), ctx.GetPlace());
if (ddOut) ddOut->mutable_data<T>(ctx.GetPlace());
auto& place = ctx.template device_context<DeviceContext>();
Functor functor;
auto attrs = functor.GetAttrs();
for (auto& attr : attrs) {
*attr.second = ctx.Attr<float>(attr.first);
}
functor(place, X, ddX, ddOut, dOut, dX);
}
};
 template <typename DeviceContext, typename Functor>
 class CELUDoubleGradKernel
     : public framework::OpKernel<typename Functor::ELEMENT_TYPE> {
@@ -1522,36 +1414,6 @@ class LogitGradKernel : public framework::OpKernel<T> {
   }
 };
 
template <typename T>
struct LogGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev, const framework::Tensor* X,
const framework::Tensor* ddX, framework::Tensor* ddOut,
const framework::Tensor* dOut, framework::Tensor* dX) const {
auto* d = dev.eigen_device();
auto ddx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "LogGradGrad"));
auto x = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(X, "Input", "X", "LogGradGrad"));
// ddout = ddx / x; dx = -(dout / x) * (ddx / x)
// calculate dx first, so ddout can inplace ddx
if (dX) {
auto dout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Output", "DOut", "LogGradGrad"));
auto dx = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(dX, "Output", "DX", "LogGradGrad"));
dx.device(*d) = dout * static_cast<T>(-1) * ddx / (x * x);
}
if (ddOut) {
auto ddout = framework::EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "LogGradGrad"));
ddout.device(*d) = ddx * static_cast<T>(1) / x;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
 }  // namespace operators
 }  // namespace paddle
 
@@ -1560,9 +1422,6 @@ struct LogGradGradFunctor : public BaseActivationFunctor<T> {
   __macro(floor, Floor, FloorFunctor, ZeroGradFunctor);                      \
   __macro(round, Round, RoundFunctor, ZeroGradFunctor);                      \
   __macro(reciprocal, Reciprocal, ReciprocalFunctor, ReciprocalGradFunctor); \
-  __macro(log1p, Log1p, Log1pFunctor, Log1pGradFunctor);                     \
-  __macro(log2, Log2, Log2Functor, Log2GradFunctor);                         \
-  __macro(log10, Log10, Log10Functor, Log10GradFunctor);                     \
   __macro(soft_relu, SoftRelu, SoftReluFunctor, SoftReluGradFunctor);        \
   __macro(stanh, STanh, STanhFunctor, STanhGradFunctor);                     \
   __macro(softplus, Softplus, SoftplusFunctor, SoftplusGradFunctor);         \
......
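Editor's note: the functors removed above encode simple elementwise gradients; since their CPU definitions now come from the phi library, here is a standalone recap of the math they implemented (log, log2, log10, log1p), as a sketch rather than the phi implementation:

#include <cassert>
#include <cmath>

// dx values for dout = 1, matching the removed Log*GradFunctor definitions.
double log_grad(double x)   { return 1.0 / x; }                   // d/dx log(x)
double log2_grad(double x)  { return 1.0 / (x * std::log(2.0)); } // d/dx log2(x)
double log10_grad(double x) { return 1.0 / (x * std::log(10.0)); }
double log1p_grad(double x) { return 1.0 / (1.0 + x); }           // d/dx log(1+x)

int main() {
  assert(std::abs(log_grad(2.0) - 0.5) < 1e-12);
  assert(std::abs(log1p_grad(1.0) - 0.5) < 1e-12);
  (void)log2_grad(1.0);
  (void)log10_grad(1.0);
  return 0;
}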
@@ -131,27 +131,6 @@ struct CudaExpm1GradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
template <typename T>
struct CudaLogFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
// log(x) = log(x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(log(x));
}
};
template <typename T>
struct CudaLogGradFunctor : public BaseActivationFunctor<T> {
// dx = dout / x
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return dout / x;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
 template <typename T>
 struct CudaSquareFunctor : public BaseActivationFunctor<T> {
   // square(x) = x * x
@@ -220,78 +199,6 @@ struct CudaRsqrtGradFunctor : public BaseActivationFunctor<T> {
   }
 };
 
template <typename T>
struct CudaLog1pFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
// log1p(x) = log(1 + x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(log(one + x));
}
};
template <typename T>
struct CudaLog1pGradFunctor : public BaseActivationFunctor<T> {
T one = static_cast<T>(1.0f);
// dx = dout / (1 + x)
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return dout / (one + x);
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaLog2Functor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
// log2(x) = log2(x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(log2(x));
}
};
template <typename T>
struct CudaLog2GradFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
T log_two = static_cast<T>(log(static_cast<MPType>(2.0f)));
// dx = dout / (x * log(2))
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return dout / (x * log_two);
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaLog10Functor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
// log10(x) = log10(x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(log10(x));
}
};
template <typename T>
struct CudaLog10GradFunctor : public BaseActivationFunctor<T> {
using MPType = typename details::MPTypeTrait<T>::Type;
T log_ten = static_cast<T>(log(static_cast<MPType>(10.0f)));
// dx = dout / (x * log(10))
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return dout / (x * log_ten);
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
 template <typename T>
 struct CudaSoftReluFunctor : public BaseActivationFunctor<T> {
   using MPType = typename details::MPTypeTrait<T>::Type;
@@ -773,6 +680,10 @@ USE_PHI_FUNCTOR(CudaELU)
 USE_PHI_FUNCTOR(CudaSigmoid)
 USE_PHI_FUNCTOR(CudaLogSigmoid)
 USE_PHI_FUNCTOR(CudaHardSigmoid)
+USE_PHI_FUNCTOR(CudaLog)
+USE_PHI_FUNCTOR(CudaLog2)
+USE_PHI_FUNCTOR(CudaLog10)
+USE_PHI_FUNCTOR(CudaLog1p)
 
 template <typename T>
 using CudaELUGradNegativeAlphaFunctor =
@@ -975,18 +886,6 @@ REGISTER_OP_CUDA_KERNEL(
                               ops::CudaExpm1GradFunctor<plat::float16>>);
 /* ========================================================================== */
 
/* ========================== Log register ==================================*/
REGISTER_ACTIVATION_CUDA_KERNEL(log, Log, CudaLogFunctor, CudaLogGradFunctor);
REGISTER_OP_CUDA_KERNEL(
log_grad_grad, ops::LogDoubleGradKernel<plat::CUDADeviceContext,
ops::LogGradGradFunctor<float>>,
ops::LogDoubleGradKernel<plat::CUDADeviceContext,
ops::LogGradGradFunctor<double>>,
ops::LogDoubleGradKernel<plat::CUDADeviceContext,
ops::LogGradGradFunctor<plat::float16>>);
/* ========================================================================== */
 #define FOR_EACH_ACTIVATION_CUDA_OP(__macro)                                  \
   __macro(softshrink, SoftShrink, CudaSoftShrinkFunctor,                      \
           CudaSoftShrinkGradFunctor);                                         \
@@ -995,9 +894,6 @@ REGISTER_OP_CUDA_KERNEL(
   __macro(round, Round, CudaRoundFunctor, CudaZeroGradFunctor);               \
   __macro(reciprocal, Reciprocal, CudaReciprocalFunctor,                      \
           CudaReciprocalGradFunctor);                                         \
-  __macro(log1p, Log1p, CudaLog1pFunctor, CudaLog1pGradFunctor);              \
-  __macro(log2, Log2, CudaLog2Functor, CudaLog2GradFunctor);                  \
-  __macro(log10, Log10, CudaLog10Functor, CudaLog10GradFunctor);              \
   __macro(soft_relu, SoftRelu, CudaSoftReluFunctor, CudaSoftReluGradFunctor); \
   __macro(stanh, STanh, CudaSTanhFunctor, CudaSTanhGradFunctor);              \
   __macro(softplus, Softplus, CudaSoftplusFunctor, CudaSoftplusGradFunctor);  \
......
@@ -12,7 +12,6 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 
-#include "paddle/fluid/operators/fill_constant_batch_size_like_op.h"
 #include "paddle/fluid/operators/batch_size_like.h"
 
 namespace paddle {
@@ -23,9 +22,13 @@ class FillConstantBatchSizeLikeOp : public BatchSizeLikeOp {
   using BatchSizeLikeOp::BatchSizeLikeOp;
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(
+    framework::OpKernelType kernel_type = framework::OpKernelType(
         static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
         ctx.device_context());
+    if (ctx.Attr<bool>("force_cpu")) {
+      kernel_type.place_ = platform::CPUPlace();
+    }
+    return kernel_type;
   }
 };
 
@@ -64,15 +67,3 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
     ops::FillConstantBatchSizeLikeOpMaker,
     ops::BatchSizeLikeNoNeedBufferVarsInferer);
-
-REGISTER_OP_CPU_KERNEL(
-    fill_constant_batch_size_like,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
-                                           float>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
-                                           double>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
-                                           int>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
-                                           int64_t>,
-    ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CPUDeviceContext,
-                                           bool>);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
auto float_value = ctx.Attr<float>("value");
auto str_value = ctx.Attr<std::string>("str_value");
auto force_cpu = ctx.Attr<bool>("force_cpu");
auto *out = ctx.Output<framework::Tensor>("Out");
auto *in = ctx.Input<framework::LoDTensor>("Input");
if (in->lod().size() && ctx.Attr<int>("input_dim_idx") == 0) {
// set the correct batch size for the LoDTensor.
auto odims = out->dims();
int output_dim_idx = ctx.Attr<int>("output_dim_idx");
odims[output_dim_idx] = static_cast<int>(in->lod().back().size()) - 1;
out->mutable_data<T>(odims, ctx.GetPlace());
}
T value;
if (str_value.empty()) {
value = static_cast<T>(float_value);
} else {
std::stringstream convert_stream(str_value);
if (std::is_same<int64_t, T>::value) {
int64_t tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
} else {
double tmp_value;
convert_stream >> tmp_value;
value = static_cast<T>(tmp_value);
}
}
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace();
if (cpu_place) {
auto &dev_ctx = *pool.Get(platform::CPUPlace());
phi::funcs::SetConstant<platform::CPUDeviceContext, T> functor;
out->mutable_data(platform::CPUPlace(),
framework::TransToPhiDataType(data_type));
functor(reinterpret_cast<const platform::CPUDeviceContext &>(dev_ctx),
out, static_cast<T>(value));
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (!cpu_place) {
auto &dev_ctx = *pool.Get(ctx.GetPlace());
phi::funcs::SetConstant<platform::CUDADeviceContext, T> functor;
out->mutable_data(ctx.GetPlace(),
framework::TransToPhiDataType(data_type));
functor(reinterpret_cast<const platform::CUDADeviceContext &>(dev_ctx),
out, static_cast<T>(value));
}
#endif
}
};
} // namespace operators
} // namespace paddle
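Editor's note: the deleted FillConstantBatchSizeLikeOpKernel above resolves the fill value from either a float attribute or an optional string attribute before filling the tensor. A minimal standalone sketch of just that conversion path (illustrative, not the Paddle kernel):

#include <cassert>
#include <sstream>
#include <string>
#include <type_traits>

// Convert the fill value, preferring the string form when it is non-empty,
// as the removed kernel's str_value handling did.
template <typename T>
T ResolveFillValue(float float_value, const std::string& str_value) {
  if (str_value.empty()) return static_cast<T>(float_value);
  std::stringstream convert_stream(str_value);
  if (std::is_same<int64_t, T>::value) {
    int64_t tmp_value;
    convert_stream >> tmp_value;
    return static_cast<T>(tmp_value);
  }
  double tmp_value;
  convert_stream >> tmp_value;
  return static_cast<T>(tmp_value);
}

int main() {
  assert(ResolveFillValue<float>(1.5f, "") == 1.5f);
  assert(ResolveFillValue<int64_t>(0.f, "42") == 42);
  return 0;
}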
@@ -19,7 +19,9 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
 
+#include "paddle/fluid/framework/infershape_utils.h"
 #include "paddle/fluid/framework/op_registry.h"
+#include "paddle/phi/infermeta/unary.h"
 
 namespace paddle {
 namespace operators {
@@ -113,13 +115,13 @@ class SqueezeOp : public framework::OperatorWithKernel {
     auto input_data_type =
         framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
-    //#ifdef PADDLE_WITH_MKLDNN
+    // #ifdef PADDLE_WITH_MKLDNN
     //    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
     //      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
     //                                     framework::DataLayout::kMKLDNN,
     //                                     framework::LibraryType::kMKLDNN);
     //    }
-    //#endif
+    // #endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -140,13 +142,13 @@ class SqueezeGradOp : public framework::OperatorWithKernel {
     auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(
         ctx, framework::GradVarName("Out"));
-    //#ifdef PADDLE_WITH_MKLDNN
+    // #ifdef PADDLE_WITH_MKLDNN
    //    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
    //      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
    //                                     framework::DataLayout::kMKLDNN,
    //                                     framework::LibraryType::kMKLDNN);
    //    }
-    //#endif
+    // #endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -201,53 +203,18 @@ class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
 class Squeeze2Op : public framework::OperatorWithKernel {
  public:
  using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext *ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "Squeeze2");
OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out", "Squeeze2");
const auto &x_dims = ctx->GetInputDim("X");
// Check input tensor dims (<6) Eigen limit.
PADDLE_ENFORCE_LE(x_dims.size(), 6,
platform::errors::InvalidArgument(
"The dimensions of Input(X) "
"should be in the range of [1, 6] (Eigen limit)."
"But received X's dimensions = %d, X's shape = [%s].",
x_dims.size(), x_dims));
const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
auto out_dims = GetOutputShape(axes, x_dims, false);
ctx->SetOutputDim("Out", out_dims);
if (x_dims[0] == out_dims[0]) {
// Only pass LoD when the first dimension of output and Input(X)
// are the same.
ctx->ShareLoD("X", "Out");
}
if (!ctx->HasOutput("XShape")) return;
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims));
ctx->ShareLoD("X", /*->*/ "XShape");
}
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
     auto input_data_type =
         framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
-    //#ifdef PADDLE_WITH_MKLDNN
+    // #ifdef PADDLE_WITH_MKLDNN
     //    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
     //      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
     //                                     framework::DataLayout::kMKLDNN,
     //                                     framework::LibraryType::kMKLDNN);
     //    }
-    //#endif
+    // #endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -287,13 +254,13 @@ class Squeeze2GradOp : public framework::OperatorWithKernel {
     auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(
         ctx, framework::GradVarName("Out"));
-    //#ifdef PADDLE_WITH_MKLDNN
+    // #ifdef PADDLE_WITH_MKLDNN
     //    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
     //      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
     //                                     framework::DataLayout::kMKLDNN,
     //                                     framework::LibraryType::kMKLDNN);
     //    }
-    //#endif
+    // #endif
     return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }
 };
@@ -365,6 +332,10 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(SqueezeGradNoNeedBufferVarsInferer, "X");
 }  // namespace paddle
 
 namespace ops = paddle::operators;
+
+DECLARE_INFER_SHAPE_FUNCTOR(squeeze2, SqueezeInferShapeFunctor,
+                            PD_INFER_META(phi::SqueezeInferMeta));
+
 REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
                   ops::SqueezeGradOpMaker<paddle::framework::OpDesc>,
                   ops::SqueezeGradOpMaker<paddle::imperative::OpBase>);
@@ -376,7 +347,7 @@ REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp,
 REGISTER_OPERATOR(squeeze2, ops::Squeeze2Op, ops::Squeeze2OpMaker,
                   ops::Squeeze2GradOpMaker<paddle::framework::OpDesc>,
                   ops::Squeeze2GradOpMaker<paddle::imperative::OpBase>,
-                  ops::SqueezeInplaceInferer);
+                  ops::SqueezeInplaceInferer, SqueezeInferShapeFunctor);
 REGISTER_OPERATOR(squeeze2_grad, ops::Squeeze2GradOp,
                   ops::Squeeze2DoubleGradOpMaker<paddle::framework::OpDesc>,
                   ops::Squeeze2DoubleGradOpMaker<paddle::imperative::OpBase>,
@@ -411,34 +382,3 @@ REGISTER_OP_CPU_KERNEL(
                        paddle::platform::complex<double>>,
     ops::SqueezeGradKernel<paddle::platform::CPUDeviceContext,
                            paddle::platform::bfloat16>);
REGISTER_OP_CPU_KERNEL(
squeeze2, ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, float>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, double>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, bool>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, uint8_t>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>,
ops::Squeeze2Kernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>);
REGISTER_OP_CPU_KERNEL(
squeeze2_grad,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, float>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, double>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, bool>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, uint8_t>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>,
ops::Squeeze2GradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>);
...@@ -46,33 +46,3 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -46,33 +46,3 @@ REGISTER_OP_CUDA_KERNEL(
paddle::platform::complex<float>>, paddle::platform::complex<float>>,
ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext, ops::SqueezeGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>); paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
squeeze2, ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, float>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, double>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, plat::bfloat16>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, bool>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, uint8_t>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::Squeeze2Kernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
squeeze2_grad,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, float>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, double>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext,
plat::bfloat16>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, bool>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, uint8_t>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::Squeeze2GradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
...@@ -18,7 +18,9 @@ limitations under the License. */ ...@@ -18,7 +18,9 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/framework/infershape_utils.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/phi/infermeta/unary.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -251,19 +253,6 @@ class UnsqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker<T> { ...@@ -251,19 +253,6 @@ class UnsqueezeDoubleGradOpMaker : public framework::SingleGradOpMaker<T> {
class Unsqueeze2Op : public UnsqueezeOp { class Unsqueeze2Op : public UnsqueezeOp {
public: public:
using UnsqueezeOp::UnsqueezeOp; using UnsqueezeOp::UnsqueezeOp;
void InferShape(framework::InferShapeContext *ctx) const override {
UnsqueezeOp::InferShape(ctx);
const auto &x_dims = ctx->GetInputDim("X");
if (!ctx->HasOutput("XShape")) return;
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
ctx->SetOutputDim("XShape", phi::make_ddim(xshape_dims));
ctx->ShareLoD("X", /*->*/ "XShape");
}
}; };
class Unsqueeze2OpMaker : public UnsqueezeOpMaker { class Unsqueeze2OpMaker : public UnsqueezeOpMaker {
...@@ -339,10 +328,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnsqueezeGradOpNoNeedBufferVarInferer, "X"); ...@@ -339,10 +328,14 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERER(UnsqueezeGradOpNoNeedBufferVarInferer, "X");
} // namespace operators } // namespace operators
} // namespace paddle } // namespace paddle
DECLARE_INFER_SHAPE_FUNCTOR(unsqueeze2, Unsqueeze2InferShapeFunctor,
PD_INFER_META(phi::UnsqueezeInferMeta));
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker, REGISTER_OPERATOR(unsqueeze, ops::UnsqueezeOp, ops::UnsqueezeOpMaker,
ops::UnsqueezeGradOpMaker<paddle::framework::OpDesc>, ops::UnsqueezeGradOpMaker<paddle::framework::OpDesc>,
ops::UnsqueezeGradOpMaker<paddle::imperative::OpBase>); ops::UnsqueezeGradOpMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp, REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp,
ops::UnsqueezeDoubleGradOpMaker<paddle::framework::OpDesc>, ops::UnsqueezeDoubleGradOpMaker<paddle::framework::OpDesc>,
ops::UnsqueezeDoubleGradOpMaker<paddle::imperative::OpBase>, ops::UnsqueezeDoubleGradOpMaker<paddle::imperative::OpBase>,
...@@ -351,7 +344,8 @@ REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp, ...@@ -351,7 +344,8 @@ REGISTER_OPERATOR(unsqueeze_grad, ops::UnsqueezeGradOp,
REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker, REGISTER_OPERATOR(unsqueeze2, ops::Unsqueeze2Op, ops::Unsqueeze2OpMaker,
ops::Unsqueeze2GradOpMaker<paddle::framework::OpDesc>, ops::Unsqueeze2GradOpMaker<paddle::framework::OpDesc>,
ops::Unsqueeze2GradOpMaker<paddle::imperative::OpBase>, ops::Unsqueeze2GradOpMaker<paddle::imperative::OpBase>,
ops::UnsqueezeInplaceInferer); Unsqueeze2InferShapeFunctor, ops::UnsqueezeInplaceInferer);
REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp, REGISTER_OPERATOR(unsqueeze2_grad, ops::Unsqueeze2GradOp,
ops::Unsqueeze2DoubleGradOpMaker<paddle::framework::OpDesc>, ops::Unsqueeze2DoubleGradOpMaker<paddle::framework::OpDesc>,
ops::Unsqueeze2DoubleGradOpMaker<paddle::imperative::OpBase>, ops::Unsqueeze2DoubleGradOpMaker<paddle::imperative::OpBase>,
...@@ -388,34 +382,3 @@ REGISTER_OP_CPU_KERNEL( ...@@ -388,34 +382,3 @@ REGISTER_OP_CPU_KERNEL(
paddle::platform::complex<double>>, paddle::platform::complex<double>>,
ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext, ops::UnsqueezeGradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>); paddle::platform::bfloat16>);
REGISTER_OP_CPU_KERNEL(
unsqueeze2, ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, float>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, double>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, bool>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int16_t>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, uint8_t>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>,
ops::UnsqueezeKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>);
REGISTER_OP_CPU_KERNEL(
unsqueeze2_grad,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, float>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, double>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, bool>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int16_t>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, uint8_t>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext, int64_t>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<float>>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::complex<double>>,
ops::Unsqueeze2GradKernel<paddle::platform::CPUDeviceContext,
paddle::platform::bfloat16>);
...@@ -50,37 +50,3 @@ REGISTER_OP_CUDA_KERNEL( ...@@ -50,37 +50,3 @@ REGISTER_OP_CUDA_KERNEL(
paddle::platform::complex<float>>, paddle::platform::complex<float>>,
ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext, ops::UnsqueezeGradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>); paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
unsqueeze2,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, float>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, double>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, plat::float16>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, plat::bfloat16>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, bool>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int16_t>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, uint8_t>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::UnsqueezeKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
REGISTER_OP_CUDA_KERNEL(
unsqueeze2_grad,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, float>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, double>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext,
plat::bfloat16>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, bool>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int16_t>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, uint8_t>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<float>>,
ops::Unsqueeze2GradKernel<paddle::platform::CUDADeviceContext,
paddle::platform::complex<double>>);
...@@ -1761,6 +1761,7 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -1761,6 +1761,7 @@ All parameter, weight, gradient are variables in Paddle.
out (core.Variable|None): the found variable or None. out (core.Variable|None): the found variable or None.
)DOC", )DOC",
py::return_value_policy::reference) py::return_value_policy::reference)
.def("size", &Scope::Size)
.def("erase", &Scope::EraseVars, py::arg("names"), .def("erase", &Scope::EraseVars, py::arg("names"),
R"DOC( R"DOC(
Find variable named :code:`name` in the current scope or Find variable named :code:`name` in the current scope or
...@@ -2857,6 +2858,9 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -2857,6 +2858,9 @@ All parameter, weight, gradient are variables in Paddle.
.def("run", .def("run",
[](StandaloneExecutor &self, std::vector<std::string> feed_names, [](StandaloneExecutor &self, std::vector<std::string> feed_names,
std::vector<std::string> fetch_names) { std::vector<std::string> fetch_names) {
platform::RecordEvent record_event(
"StandaloneExecutor:run",
platform::TracerEventType::UserDefined, 1);
paddle::framework::FetchList ret; paddle::framework::FetchList ret;
{ {
pybind11::gil_scoped_release release; pybind11::gil_scoped_release release;
......
...@@ -42,6 +42,10 @@ const std::unordered_set<std::string> deprecated_op_names({"diag", ...@@ -42,6 +42,10 @@ const std::unordered_set<std::string> deprecated_op_names({"diag",
"flatten_grad", "flatten_grad",
"isinf", "isinf",
"isnan", "isnan",
"unsqueeze",
"unsqueeze_grad",
"squeeze",
"squeeze_grad",
"isfinite", "isfinite",
"matmul", "matmul",
"matmul_grad", "matmul_grad",
......
...@@ -24,6 +24,7 @@ limitations under the License. */ ...@@ -24,6 +24,7 @@ limitations under the License. */
#include "paddle/phi/core/infermeta_utils.h" #include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/kernels/funcs/pooling.h" #include "paddle/phi/kernels/funcs/pooling.h"
#include "paddle/phi/kernels/funcs/unfold_functor.h" #include "paddle/phi/kernels/funcs/unfold_functor.h"
#include "paddle/phi/kernels/funcs/unsqueeze.h"
namespace phi { namespace phi {
...@@ -1497,6 +1498,40 @@ void SplitInferMeta(const MetaTensor& x, ...@@ -1497,6 +1498,40 @@ void SplitInferMeta(const MetaTensor& x,
} }
} }
void SqueezeInferMeta(const MetaTensor& x,
const std::vector<int>& axes,
MetaTensor* xshape,
MetaTensor* out) {
const auto& x_dims = x.dims();
// Validity check: the rank of Input(X) must not exceed 6 (Eigen limit).
PADDLE_ENFORCE_LE(x_dims.size(),
6,
phi::errors::InvalidArgument(
"The dimensions of Input(X) "
"should be in the range of [1, 6] (Eigen limit)."
"But received X's dimensions = %d, X's shape = [%s].",
x_dims.size(),
x_dims));
auto out_dims = funcs::GetOutputSqueezeShape(axes, x_dims, false);
out->set_dims(out_dims);
if (x_dims[0] == out_dims[0]) {
// Only pass LoD when the first dimension of output and Input(X)
// are the same.
out->share_lod(x);
}
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
xshape->set_dims(phi::make_ddim(xshape_dims));
xshape->share_lod(x);
xshape->set_dtype(x.dtype());
out->set_dtype(x.dtype());
}
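Editorial sketch (not part of the patch), assuming a hypothetical input of shape [1, 3, 1, 5] and empty axes: every size-1 dimension is squeezed away, and xshape stores the original shape behind a leading 0 so the grad kernel can recover it later with slice_ddim. A standalone mirror of that rule:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int64_t> x_dims = {1, 3, 1, 5};  // hypothetical input shape
  std::vector<int64_t> out_dims;
  std::vector<int64_t> xshape_dims = {0};      // leading 0, as in SqueezeInferMeta above
  for (int64_t d : x_dims) {
    if (d != 1) out_dims.push_back(d);         // empty axes: drop every size-1 dim
    xshape_dims.push_back(d);
  }
  for (int64_t d : out_dims) std::cout << d << ' ';     // prints: 3 5
  std::cout << '\n';
  for (int64_t d : xshape_dims) std::cout << d << ' ';  // prints: 0 1 3 1 5
  std::cout << '\n';
  return 0;
}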
/* Why not use SumRawInferMeta directly? /* Why not use SumRawInferMeta directly?
Because we need to make InferMetaFunction's args follow the design of api.yaml Because we need to make InferMetaFunction's args follow the design of api.yaml
*/ */
...@@ -1982,6 +2017,41 @@ void UnfoldInferMeta(const MetaTensor& x, ...@@ -1982,6 +2017,41 @@ void UnfoldInferMeta(const MetaTensor& x,
out->set_dims(phi::make_ddim(out_dims)); out->set_dims(phi::make_ddim(out_dims));
} }
void UnsqueezeInferMeta(const MetaTensor& x,
const ScalarArray& axes,
MetaTensor* xshape,
MetaTensor* out) {
const auto& x_dims = x.dims();
// Validity check: the rank of Input(X) must not exceed 6 (Eigen limit).
PADDLE_ENFORCE_LE(x_dims.size(),
6,
phi::errors::InvalidArgument(
"Invalid "
"dimensions, the rank of Input(X) "
"should be in the range of [1, 6] (Eigen limit)"));
if (!axes.GetData().empty()) {
std::vector<int32_t> tmp;
tmp.reserve(axes.GetData().size());
std::for_each(axes.GetData().begin(),
axes.GetData().end(),
[&tmp](const int64_t& t) { tmp.push_back(t); });
auto out_dims = funcs::GetUnsqueezeShape(tmp, x_dims);
out->set_dims(out_dims);
if (x_dims[0] == out_dims[0]) {
out->share_lod(x);
}
}
std::vector<int64_t> xshape_dims(x_dims.size() + 1);
xshape_dims[0] = 0;
for (int i = 0; i < x_dims.size(); ++i) {
xshape_dims[i + 1] = x_dims[i];
}
xshape->set_dims(phi::make_ddim(xshape_dims));
xshape->share_lod(x);
out->set_dtype(x.dtype());
xshape->set_dtype(x.dtype());
}
void OneHotRawInferMeta(const MetaTensor& x, void OneHotRawInferMeta(const MetaTensor& x,
int32_t depth, int32_t depth,
DataType dtype, DataType dtype,
...@@ -1992,7 +2062,6 @@ void OneHotRawInferMeta(const MetaTensor& x, ...@@ -1992,7 +2062,6 @@ void OneHotRawInferMeta(const MetaTensor& x,
x_dims.size(), x_dims.size(),
1, 1,
phi::errors::InvalidArgument("Rank of Input(X) should be at least 1.")); phi::errors::InvalidArgument("Rank of Input(X) should be at least 1."));
auto out_dims_vec = phi::vectorize(x_dims); auto out_dims_vec = phi::vectorize(x_dims);
out_dims_vec.push_back(depth); out_dims_vec.push_back(depth);
auto out_dims = phi::make_ddim(out_dims_vec); auto out_dims = phi::make_ddim(out_dims_vec);
......
...@@ -229,6 +229,11 @@ void SplitInferMeta(const MetaTensor& x_meta, ...@@ -229,6 +229,11 @@ void SplitInferMeta(const MetaTensor& x_meta,
std::vector<MetaTensor*> out, std::vector<MetaTensor*> out,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void SqueezeInferMeta(const MetaTensor& x,
const std::vector<int>& axes,
MetaTensor* xshape,
MetaTensor* out);
void SumInferMeta(const MetaTensor& x, void SumInferMeta(const MetaTensor& x,
const std::vector<int64_t>& axis, const std::vector<int64_t>& axis,
DataType dtype, DataType dtype,
...@@ -290,6 +295,11 @@ void UnfoldInferMeta(const MetaTensor& x, ...@@ -290,6 +295,11 @@ void UnfoldInferMeta(const MetaTensor& x,
MetaTensor* out, MetaTensor* out,
MetaConfig config = MetaConfig()); MetaConfig config = MetaConfig());
void UnsqueezeInferMeta(const MetaTensor& x,
const ScalarArray& axes,
MetaTensor* xshape,
MetaTensor* out);
void OneHotRawInferMeta(const MetaTensor& x, void OneHotRawInferMeta(const MetaTensor& x,
int32_t depth, int32_t depth,
DataType dtype, DataType dtype,
......
...@@ -62,3 +62,6 @@ register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS $ ...@@ -62,3 +62,6 @@ register_kernels(EXCLUDES ${COMMON_BAISC_KERNELS} ${MANUAL_BUILD_KERNELS} DEPS $
add_subdirectory(sparse) add_subdirectory(sparse)
copy_if_different(${kernel_declare_file} ${kernel_declare_file_final}) copy_if_different(${kernel_declare_file} ${kernel_declare_file_final})
# 5. kernel autotune
add_subdirectory(autotune)
...@@ -135,6 +135,14 @@ void SigmoidTripleGradKernel(const Context& dev_ctx, ...@@ -135,6 +135,14 @@ void SigmoidTripleGradKernel(const Context& dev_ctx,
DenseTensor* d_dout, DenseTensor* d_dout,
DenseTensor* d_ddx); DenseTensor* d_ddx);
template <typename T, typename Context>
void LogDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
const DenseTensor& ddx,
DenseTensor* dx,
DenseTensor* ddout);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cos); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Cos);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Tan); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Tan);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acos); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Acos);
...@@ -149,6 +157,10 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atanh); ...@@ -149,6 +157,10 @@ DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Atanh);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Silu); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Silu);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid); DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log2);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log10);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPX(Log1p);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu); DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu);
DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh); DECLARE_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh);
......
...@@ -56,6 +56,10 @@ DECLARE_ACTIVATION_KERNEL(TanhShrink) ...@@ -56,6 +56,10 @@ DECLARE_ACTIVATION_KERNEL(TanhShrink)
DECLARE_ACTIVATION_KERNEL(Silu) DECLARE_ACTIVATION_KERNEL(Silu)
DECLARE_ACTIVATION_KERNEL(Sigmoid) DECLARE_ACTIVATION_KERNEL(Sigmoid)
DECLARE_ACTIVATION_KERNEL(LogSigmoid) DECLARE_ACTIVATION_KERNEL(LogSigmoid)
DECLARE_ACTIVATION_KERNEL(Log)
DECLARE_ACTIVATION_KERNEL(Log2)
DECLARE_ACTIVATION_KERNEL(Log10)
DECLARE_ACTIVATION_KERNEL(Log1p)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(LeakyRelu, alpha)
DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold) DECLARE_ACTIVATION_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, threshold)
......
if (WITH_GPU)
nv_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest)
elseif (WITH_ROCM)
hip_test(gpu_timer_test SRCS gpu_timer_test.cu DEPS gtest)
endif()
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/backends/gpu/gpu_decls.h"
#include "paddle/phi/core/enforce.h"
#include "paddle/phi/core/errors.h"
#ifdef PADDLE_WITH_CUDA
#include <cuda_runtime.h>
#endif
#ifdef PADDLE_WITH_HIP
#include <hip/hip_runtime.h>
#endif
namespace phi {
class GpuTimer {
public:
GpuTimer() {
#ifdef PADDLE_WITH_HIP
hipEventCreate(&start_);
hipEventCreate(&stop_);
#else
cudaEventCreate(&start_);
cudaEventCreate(&stop_);
#endif
PADDLE_ENFORCE_NOT_NULL(
start_, phi::errors::PreconditionNotMet("Start Event is not ready."));
PADDLE_ENFORCE_NOT_NULL(
stop_, phi::errors::PreconditionNotMet("Stop Event is not ready."));
}
~GpuTimer() {
#ifdef PADDLE_WITH_HIP
hipEventDestroy(start_);
hipEventDestroy(stop_);
#else
cudaEventDestroy(start_);
cudaEventDestroy(stop_);
#endif
}
void Start(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
hipEventRecord(start_, stream);
#else
cudaEventRecord(start_, stream);
#endif
}
void Stop(gpuStream_t stream) {
#ifdef PADDLE_WITH_HIP
hipEventRecord(stop_, stream);
#else
cudaEventRecord(stop_, stream);
#endif
}
float ElapsedTime() {
float milliseconds = 0;
#ifdef PADDLE_WITH_HIP
hipEventSynchronize(stop_);
hipEventElapsedTime(&milliseconds, start_, stop_);
#else
cudaEventSynchronize(stop_);
cudaEventElapsedTime(&milliseconds, start_, stop_);
#endif
return milliseconds;
}
private:
gpuEvent_t start_;
gpuEvent_t stop_;
};
} // namespace phi
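A minimal usage sketch for the timer above (editorial, not part of the patch; assumes a CUDA build and the header path used by the test that follows). Any asynchronous work enqueued on the stream between Start() and Stop() is measured; ElapsedTime() blocks on the stop event and reports milliseconds.

#include <cstdio>
#include <cuda_runtime.h>
#include "paddle/phi/kernels/autotune/gpu_timer.h"

int main() {
  float *buf = nullptr;
  cudaMalloc(&buf, 1 << 20);              // scratch buffer for the timed work
  phi::GpuTimer timer;
  timer.Start(0);                         // record the start event on the default stream
  cudaMemsetAsync(buf, 0, 1 << 20, 0);    // the asynchronous work being timed
  timer.Stop(0);                          // record the stop event on the default stream
  printf("memset took %f ms\n", timer.ElapsedTime());
  cudaFree(buf);
  return 0;
}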
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gtest/gtest.h>
#include <functional>
#include "glog/logging.h"
#include "paddle/phi/kernels/autotune/gpu_timer.h"
#include "paddle/phi/kernels/funcs/aligned_vector.h"
template <typename T, int VecSize>
__global__ void VecSum(T *x, T *y, int N) {
#ifdef __HIPCC__
int idx = hipBlockDim_x * hipBlockIdx_x + hipThreadIdx_x;
#else
int idx = blockDim.x * blockIdx.x + threadIdx.x;
#endif
using LoadT = phi::AlignedVector<T, VecSize>;
for (int i = idx * VecSize; i < N; i += blockDim.x * gridDim.x * VecSize) {
LoadT x_vec;
LoadT y_vec;
phi::Load<T, VecSize>(&x[i], &x_vec);
phi::Load<T, VecSize>(&y[i], &y_vec);
#pragma unroll
for (int j = 0; j < VecSize; j++) {
y_vec[j] = x_vec[j] + y_vec[j];
}
phi::Store<T, VecSize>(y_vec, &y[i]);
}
}
template <int Vecsize, int Threads, size_t Blocks>
void Algo(float *d_in, float *d_out, size_t N) {
#ifdef __HIPCC__
hipLaunchKernelGGL(HIP_KERNEL_NAME(VecSum<float, Vecsize>),
dim3(Blocks),
dim3(Threads),
0,
0,
d_in,
d_out,
N);
#else
VecSum<float, Vecsize><<<Blocks, Threads>>>(d_in, d_out, N);
#endif
}
TEST(GpuTimer, Sum) {
float *in1, *in2, *out;
float *d_in1, *d_in2;
size_t N = 1 << 20;
size_t size = sizeof(float) * N;
#ifdef __HIPCC__
hipMalloc(reinterpret_cast<void **>(&d_in1), size);
hipMalloc(reinterpret_cast<void **>(&d_in2), size);
#else
cudaMalloc(reinterpret_cast<void **>(&d_in1), size);
cudaMalloc(reinterpret_cast<void **>(&d_in2), size);
#endif
in1 = reinterpret_cast<float *>(malloc(size));
in2 = reinterpret_cast<float *>(malloc(size));
out = reinterpret_cast<float *>(malloc(size));
for (size_t i = 0; i < N; i++) {
in1[i] = 1.0f;
in2[i] = 2.0f;
}
#ifdef __HIPCC__
hipMemcpy(d_in1, in1, size, hipMemcpyHostToDevice);
hipMemcpy(d_in2, in2, size, hipMemcpyHostToDevice);
#else
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
cudaMemcpy(d_in2, in2, size, cudaMemcpyHostToDevice);
#endif
using Functor = std::function<void(float *, float *, size_t)>;
Functor algo0 = Algo<4, 256, 1024>;
Functor algo1 = Algo<1, 256, 1024>;
Functor algo2 = Algo<1, 256, 8>;
std::vector<Functor> algos = {algo0, algo1, algo2};
for (int j = 0; j < algos.size(); ++j) {
auto algo = algos[j];
phi::GpuTimer timer;
timer.Start(0);
algo(d_in1, d_in2, N);
timer.Stop(0);
VLOG(3) << "alog: " << j << " cost: " << timer.ElapsedTime() << "ms";
}
#ifdef __HIPCC__
hipMemcpy(out, d_in2, size, hipMemcpyDeviceToHost);
#else
cudaMemcpy(out, d_in2, size, cudaMemcpyDeviceToHost);
#endif
free(in1);
free(in2);
free(out);
#ifdef __HIPCC__
hipFree(d_in1);
hipFree(d_in2);
#else
cudaFree(d_in1);
cudaFree(d_in2);
#endif
}
...@@ -121,6 +121,10 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor); ...@@ -121,6 +121,10 @@ DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, AtanhGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, TanhShrinkGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, SiluGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, LogSigmoidGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, LogGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, Log2GradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log10, Log10GradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPX(Log1p, Log1pGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Relu, ReluGradFunctor);
DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor); DEFINE_CPU_ACTIVATION_GRAD_KERNEL_DEPOUT(Tanh, TanhGradFunctor);
...@@ -233,3 +237,8 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel) ...@@ -233,3 +237,8 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(log_grad, LogGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(log2_grad, Log2GradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(log10_grad, Log10GradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(log1p_grad, Log1pGradKernel)
PD_REGISTER_ACTIVATION_DOUBLE_GRAD_KERNEL(log_double_grad, LogDoubleGradKernel)
...@@ -74,6 +74,10 @@ DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor) ...@@ -74,6 +74,10 @@ DEFINE_CPU_ACTIVATION_KERNEL(TanhShrink, TanhShrinkFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Silu, SiluFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor) DEFINE_CPU_ACTIVATION_KERNEL(Sigmoid, SigmoidFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor) DEFINE_CPU_ACTIVATION_KERNEL(LogSigmoid, LogSigmoidFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Log, LogFunctor)
DEFINE_CPU_ACTIVATION_KERNEL(Log2, Log2Functor)
DEFINE_CPU_ACTIVATION_KERNEL(Log10, Log10Functor)
DEFINE_CPU_ACTIVATION_KERNEL(Log1p, Log1pFunctor)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha) DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, LeakyReluFunctor, alpha)
DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, DEFINE_CPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
...@@ -118,3 +122,7 @@ PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel) ...@@ -118,3 +122,7 @@ PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel) PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel) PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel)
PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel) PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)
PD_REGISTER_ACTIVATION_KERNEL(log, LogKernel)
PD_REGISTER_ACTIVATION_KERNEL(log2, Log2Kernel)
PD_REGISTER_ACTIVATION_KERNEL(log10, Log10Kernel)
PD_REGISTER_ACTIVATION_KERNEL(log1p, Log1pKernel)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/squeeze_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/squeeze_grad_kernel_impl.h"
PD_REGISTER_KERNEL(squeeze_grad,
CPU,
ALL_LAYOUT,
phi::SqueezeGradKernel,
float,
double,
phi::dtype::bfloat16,
bool,
int,
uint8_t,
int8_t,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/squeeze_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/squeeze_kernel_impl.h"
PD_REGISTER_KERNEL(squeeze,
CPU,
ALL_LAYOUT,
phi::SqueezeKernel,
float,
double,
phi::dtype::bfloat16,
bool,
int,
uint8_t,
int8_t,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/unsqueeze_grad_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unsqueeze_grad_kernel_impl.h"
PD_REGISTER_KERNEL(unsqueeze_grad,
CPU,
ALL_LAYOUT,
phi::UnsqueezeGradKernel,
phi::dtype::bfloat16,
bool,
int,
int16_t,
uint8_t,
int8_t,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/unsqueeze_kernel.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unsqueeze_kernel_impl.h"
PD_REGISTER_KERNEL(unsqueeze,
CPU,
ALL_LAYOUT,
phi::UnsqueezeKernel,
float,
double,
phi::dtype::bfloat16,
bool,
int,
int16_t,
uint8_t,
int8_t,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/phi/kernels/full_kernel.h"
#include "paddle/phi/core/kernel_registry.h"
namespace phi {
template <typename T, typename Context>
void FullBatchSizeLikeKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int>& shape,
const Scalar& val,
DataType dtype,
int x_batch_size_dim,
int out_batch_size_dim,
DenseTensor* out) {
if (x.lod().size() && x_batch_size_dim == 0) {
// set the correct batch size for the LoDTensor.
auto odims = out->dims();
odims[out_batch_size_dim] = static_cast<int>(x.lod().back().size()) - 1;
FullKernel<T, Context>(dev_ctx, phi::vectorize(odims), val, dtype, out);
}
FullLikeKernel<T, Context>(dev_ctx, x, val, dtype, out);
}
} // namespace phi
PD_REGISTER_KERNEL(full_batch_size_like,
CPU,
ALL_LAYOUT,
phi::FullBatchSizeLikeKernel,
float,
double,
int,
int64_t,
bool) {
kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PD_REGISTER_KERNEL(full_batch_size_like,
GPU,
ALL_LAYOUT,
phi::FullBatchSizeLikeKernel,
float,
double,
int,
int64_t,
bool,
phi::dtype::float16) {
kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
}
#endif
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
#pragma once #pragma once
#include <vector>
#include "paddle/phi/common/scalar.h" #include "paddle/phi/common/scalar.h"
#include "paddle/phi/common/scalar_array.h" #include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/dense_tensor.h" #include "paddle/phi/core/dense_tensor.h"
...@@ -37,6 +39,18 @@ void FullLikeKernel(const Context& dev_ctx, ...@@ -37,6 +39,18 @@ void FullLikeKernel(const Context& dev_ctx,
DataType dtype, DataType dtype,
DenseTensor* out); DenseTensor* out);
// In order to be compatible with fill_constant_batch_size_like op
// that is still used in the 2.x APIs
template <typename T, typename Context>
void FullBatchSizeLikeKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int>& shape,
const Scalar& val,
DataType dtype,
int x_batch_size_dim,
int out_batch_size_dim,
DenseTensor* out);
template <typename T, typename Context> template <typename T, typename Context>
void Full(const Context& dev_ctx, void Full(const Context& dev_ctx,
const ScalarArray& shape, const ScalarArray& shape,
......
...@@ -1223,6 +1223,133 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> { ...@@ -1223,6 +1223,133 @@ struct HardSigmoidGradFunctor : public BaseActivationFunctor<T> {
} }
}; };
// log(x) = natural logarithm of x
template <typename T>
struct LogFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.log();
}
};
template <typename T>
struct LogGradFunctor : public BaseActivationFunctor<T> {
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * (static_cast<T>(1) / x);
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// log2(x) = logarithm to the base 2 of the elements of x
template <typename T>
struct Log2Functor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.log() / static_cast<T>(log(2));
}
};
// the gradient of log2(x) is 1/(x*ln(2))
template <typename T>
struct Log2GradFunctor : public BaseActivationFunctor<T> {
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * static_cast<T>(1) / (x * static_cast<T>(log(2)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// log10(x) = logarithm to the base 10 of the elements of x
template <typename T>
struct Log10Functor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = x.log() / static_cast<T>(log(10));
}
};
// the gradient of log10(x) is 1/(x*ln(10))
template <typename T>
struct Log10GradFunctor : public BaseActivationFunctor<T> {
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * static_cast<T>(1) / (x * static_cast<T>(log(10)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
// log1p(x) = natural logarithm of x+1
template <typename T>
struct Log1pFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out>
void operator()(Device d, X x, Out out) const {
out.device(d) = (static_cast<T>(1) + x).log();
}
};
template <typename T>
struct Log1pGradFunctor : public BaseActivationFunctor<T> {
template <typename Device,
typename X,
typename Out,
typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * (static_cast<T>(1) / (x + static_cast<T>(1)));
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct LogGradGradFunctor : public BaseActivationFunctor<T> {
template <typename Device>
void operator()(const Device& dev,
const DenseTensor* X,
const DenseTensor* ddX,
DenseTensor* ddOut,
const DenseTensor* dOut,
DenseTensor* dX) const {
auto* d = dev.eigen_device();
auto ddx = EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddX, "Input", "DDX", "LogGradGrad"));
auto x = EigenVector<T>::Flatten(
GET_DATA_SAFELY(X, "Input", "X", "LogGradGrad"));
// ddout = ddx / x; dx = -(dout / x) * (ddx / x)
// calculate dx first, so that ddout can safely reuse (inplace) ddx
if (dX) {
auto dout = EigenVector<T>::Flatten(
GET_DATA_SAFELY(dOut, "Output", "DOut", "LogGradGrad"));
auto dx = EigenVector<T>::Flatten(
GET_DATA_SAFELY(dX, "Output", "DX", "LogGradGrad"));
dx.device(*d) = dout * static_cast<T>(-1) * ddx / (x * x);
}
if (ddOut) {
auto ddout = EigenVector<T>::Flatten(
GET_DATA_SAFELY(ddOut, "Output", "DDOut", "LogGradGrad"));
ddout.device(*d) = ddx * static_cast<T>(1) / x;
}
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
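A quick sanity check on the double-grad formulas encoded above (editorial note, not from the patch): for y = log(x) the first backward pass produces dx = dout / x, and differentiating that expression once more gives exactly the two quantities LogGradGradFunctor writes:

\[
dx = \frac{dout}{x}, \qquad
ddout = \frac{\partial\, dx}{\partial\, dout}\cdot ddx = \frac{ddx}{x}, \qquad
dX = \frac{\partial\, dx}{\partial x}\cdot ddx = -\frac{dout\cdot ddx}{x^{2}}.
\]

The same pattern yields the first-order rules used by the functors above: d/dx log2(x) = 1/(x ln 2), d/dx log10(x) = 1/(x ln 10), and d/dx log1p(x) = 1/(1 + x).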
#if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__) #if defined(__NVCC__) || defined(__HIPCC__) || defined(__xpu__)
template <typename T> template <typename T>
struct CudaReluFunctor : public BaseActivationFunctor<T> { struct CudaReluFunctor : public BaseActivationFunctor<T> {
...@@ -1970,6 +2097,99 @@ struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor<T> { ...@@ -1970,6 +2097,99 @@ struct CudaHardSigmoidGradFunctor : public BaseActivationFunctor<T> {
} }
}; };
template <typename T>
struct CudaLogFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
// log(x) = natural logarithm of x
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(log(x));
}
};
template <typename T>
struct CudaLogGradFunctor : public BaseActivationFunctor<T> {
// dx = dout / x
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return dout / x;
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaLog1pFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
MPType one = static_cast<MPType>(1.0f);
// log1p(x) = log(1 + x)
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(log(one + x));
}
};
template <typename T>
struct CudaLog1pGradFunctor : public BaseActivationFunctor<T> {
T one = static_cast<T>(1.0f);
// dx = dout / (1 + x)
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return dout / (one + x);
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaLog2Functor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
// log2(x) = logarithm base 2 of x
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(log2(x));
}
};
template <typename T>
struct CudaLog2GradFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
T log_two = static_cast<T>(log(static_cast<MPType>(2.0f)));
// dx = dout / (x * log(2))
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return dout / (x * log_two);
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
template <typename T>
struct CudaLog10Functor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
// log10(x) = logarithm base 10 of x
__device__ __forceinline__ T operator()(const T arg_x) const {
MPType x = static_cast<MPType>(arg_x);
return static_cast<T>(log10(x));
}
};
template <typename T>
struct CudaLog10GradFunctor : public BaseActivationFunctor<T> {
using MPType = typename phi::dtype::MPTypeTrait<T>::Type;
T log_ten = static_cast<T>(log(static_cast<MPType>(10.0f)));
// dx = dout / (x * log(10))
__device__ __forceinline__ T operator()(const T dout, const T x) const {
return dout / (x * log_ten);
}
static constexpr ActBwdOpFwdDeps FwdDeps() { return ActBwdOpFwdDeps::kDepX; }
};
#endif #endif
} // namespace funcs } // namespace funcs
......
...@@ -21,6 +21,118 @@ ...@@ -21,6 +21,118 @@
namespace phi { namespace phi {
namespace funcs { namespace funcs {
inline DDim GetOutputSqueezeShape(const std::vector<int> squeeze_dims,
const DDim& in_dims,
bool is_runtime) {
size_t num_squeeze_dims = squeeze_dims.size();
std::vector<bool> should_squeeze(in_dims.size(), false);
// Mark the dimensions that need to be squeezed.
if (num_squeeze_dims == 0) {
for (int i = 0; i < in_dims.size(); ++i) {
if (in_dims[i] == 1) {
should_squeeze[i] = true;
}
}
} else {
for (size_t i = 0; i < num_squeeze_dims; ++i) {
int current = squeeze_dims[i] < 0 ? squeeze_dims[i] + in_dims.size()
: squeeze_dims[i];
PADDLE_ENFORCE_GE(
current,
0,
phi::errors::InvalidArgument(
"Each axis in Attr(axes) should be in the range of [%d, %d]"
"But current axis is:%d, input tensor's shape = [%s].",
-in_dims.size(),
in_dims.size() - 1,
current,
in_dims));
PADDLE_ENFORCE_LT(
current,
in_dims.size(),
phi::errors::InvalidArgument(
"Each axis in Attr(axes) should be in the range of [%d, %d]"
"But current axis is:%d, input tensor's shape = [%s].",
-in_dims.size(),
in_dims.size() - 1,
current,
in_dims));
if (!should_squeeze[current]) {
if (is_runtime) {
// At run time, only a dimension of size 1 may be squeezed.
if (in_dims[current] == 1) {
should_squeeze[current] = true;
}
} else {
// At compile time, a dimension of size 1 or -1 (unknown) may be squeezed.
if (in_dims[current] == 1 || in_dims[current] == -1) {
should_squeeze[current] = true;
}
}
}
}
}
// Make output dimensions
std::vector<int64_t> output_shape;
for (int i = 0; i < in_dims.size(); ++i) {
if (!should_squeeze[i]) {
output_shape.push_back(in_dims[i]);
}
}
return phi::make_ddim(output_shape);
}
inline DDim GetUnsqueezeShape(const std::vector<int> unsqz_dims,
const DDim& in_dims) {
int output_size = in_dims.size() + static_cast<int>(unsqz_dims.size());
int cur_output_size = in_dims.size();
std::vector<int64_t> output_shape(output_size, 0);
// Validity Check: rank range.
PADDLE_ENFORCE_LE(
output_size,
6,
phi::errors::InvalidArgument("The output "
"tensor's rank should be less than 6."));
for (int axis : unsqz_dims) {
int cur = axis < 0 ? axis + cur_output_size + 1 : axis;
// Validity Check: the axis bound
PADDLE_ENFORCE_GE(
cur,
0,
phi::errors::InvalidArgument("The insert dimension value should "
"not be less than 0"));
PADDLE_ENFORCE_LE(cur,
cur_output_size,
phi::errors::InvalidArgument(
"The insert dimension value shoule not be larger "
"than the dimension size of input tensor"));
// Move old axis, and insert new axis
for (int i = cur_output_size; i >= cur; --i) {
if (output_shape[i] == 1) {
// Move axis
output_shape[i + 1] = 1;
output_shape[i] = 0;
}
}
output_shape[cur] = 1;
// Add the output size.
cur_output_size++;
}
// Make output shape
for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) {
if (output_shape[out_idx] == 0) {
output_shape[out_idx] = in_dims[in_idx++];
}
}
return phi::make_ddim(output_shape);
}
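Editorial sketch (not part of the patch): the insertion rule above, mirrored standalone and traced on a hypothetical input of shape [3, 4] with axes {0, -1}. Negative axes are resolved against the already-grown rank, so the result is [1, 3, 4, 1]:

#include <cstdint>
#include <iostream>
#include <vector>

int main() {
  std::vector<int64_t> in_dims = {3, 4};   // hypothetical input shape
  std::vector<int> axes = {0, -1};         // hypothetical unsqueeze axes
  int output_size = static_cast<int>(in_dims.size() + axes.size());
  int cur_output_size = static_cast<int>(in_dims.size());
  std::vector<int64_t> output_shape(output_size, 0);
  for (int axis : axes) {
    int cur = axis < 0 ? axis + cur_output_size + 1 : axis;
    for (int i = cur_output_size; i >= cur; --i) {
      if (output_shape[i] == 1) {          // shift previously inserted 1s right
        output_shape[i + 1] = 1;
        output_shape[i] = 0;
      }
    }
    output_shape[cur] = 1;                 // insert the new axis
    ++cur_output_size;
  }
  for (int in_idx = 0, out_idx = 0; out_idx < output_size; ++out_idx) {
    if (output_shape[out_idx] == 0) output_shape[out_idx] = in_dims[in_idx++];
  }
  for (int64_t d : output_shape) std::cout << d << ' ';  // prints: 1 3 4 1
  std::cout << '\n';
  return 0;
}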
inline const DenseTensor Unsqueeze(const DenseTensor& x, int axis = 0) { inline const DenseTensor Unsqueeze(const DenseTensor& x, int axis = 0) {
// don't copy data, only change the dims // don't copy data, only change the dims
......
...@@ -177,6 +177,10 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor); ...@@ -177,6 +177,10 @@ DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Atanh, CudaAtanhGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(TanhShrink, CudaTanhShrinkGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Silu, CudaSiluGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor); DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(LogSigmoid, CudaLogSigmoidGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log, CudaLogGradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log2, CudaLog2GradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log10, CudaLog10GradFunctor);
DEFINE_GPU_ACTIVATION_GRAD_KERNEL_DEPX(Log1p, CudaLog1pGradFunctor);
DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu, DEFINE_GPU_ACT_GRAD_KERNEL_WITH_ONE_ATTRS_DEPX(LeakyRelu,
CudaLeakyReluGradFunctor, CudaLeakyReluGradFunctor,
...@@ -300,3 +304,14 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel) ...@@ -300,3 +304,14 @@ PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_double_grad, SigmoidDoubleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(sigmoid_triple_grad, SigmoidTripleGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(hard_sigmoid_grad, HardSigmoidGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel) PD_REGISTER_ACTIVATION_GRAD_KERNEL(logsigmoid_grad, LogSigmoidGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(log_grad, LogGradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(log2_grad, Log2GradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(log10_grad, Log10GradKernel)
PD_REGISTER_ACTIVATION_GRAD_KERNEL(log1p_grad, Log1pGradKernel)
PD_REGISTER_KERNEL(log_double_grad,
GPU,
ALL_LAYOUT,
phi::LogDoubleGradKernel,
float,
double,
phi::dtype::float16) {}
...@@ -19,7 +19,7 @@ limitations under the License. */ ...@@ -19,7 +19,7 @@ limitations under the License. */
#include "paddle/phi/common/float16.h" #include "paddle/phi/common/float16.h"
#include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h" #include "paddle/phi/kernels/funcs/elementwise_base.h"
#include "paddle/phi/kernels/impl/activation_grad_impl.h" #include "paddle/phi/kernels/impl/activation_impl.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
...@@ -93,6 +93,10 @@ DEFINE_GPU_ACTIVATION_KERNEL(TanhShrink, CudaTanhShrinkFunctor) ...@@ -93,6 +93,10 @@ DEFINE_GPU_ACTIVATION_KERNEL(TanhShrink, CudaTanhShrinkFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Silu, CudaSiluFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Silu, CudaSiluFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Sigmoid, CudaSigmoidFunctor) DEFINE_GPU_ACTIVATION_KERNEL(Sigmoid, CudaSigmoidFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(LogSigmoid, CudaLogSigmoidFunctor) DEFINE_GPU_ACTIVATION_KERNEL(LogSigmoid, CudaLogSigmoidFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Log, CudaLogFunctor)
DEFINE_GPU_ACTIVATION_KERNEL(Log2, CudaLog2Functor)
DEFINE_GPU_ACTIVATION_KERNEL(Log10, CudaLog10Functor)
DEFINE_GPU_ACTIVATION_KERNEL(Log1p, CudaLog1pFunctor)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha) DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(LeakyRelu, CudaLeakyReluFunctor, alpha)
DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu, DEFINE_GPU_ACT_KERNEL_WITH_ONE_ATTRS(ThresholdedRelu,
...@@ -164,3 +168,7 @@ PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel) ...@@ -164,3 +168,7 @@ PD_REGISTER_ACTIVATION_KERNEL(silu, SiluKernel)
PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel) PD_REGISTER_ACTIVATION_KERNEL(sigmoid, SigmoidKernel)
PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel) PD_REGISTER_ACTIVATION_KERNEL(logsigmoid, LogSigmoidKernel)
PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel) PD_REGISTER_ACTIVATION_KERNEL(hard_sigmoid, HardSigmoidKernel)
PD_REGISTER_ACTIVATION_KERNEL(log, LogKernel)
PD_REGISTER_ACTIVATION_KERNEL(log2, Log2Kernel)
PD_REGISTER_ACTIVATION_KERNEL(log10, Log10Kernel)
PD_REGISTER_ACTIVATION_KERNEL(log1p, Log1pKernel)
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/squeeze_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/squeeze_grad_kernel_impl.h"
PD_REGISTER_KERNEL(squeeze_grad,
GPU,
ALL_LAYOUT,
phi::SqueezeGradKernel,
float,
double,
phi::dtype::bfloat16,
phi::dtype::float16,
bool,
int,
uint8_t,
int8_t,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/squeeze_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/squeeze_kernel_impl.h"
PD_REGISTER_KERNEL(squeeze,
GPU,
ALL_LAYOUT,
phi::SqueezeKernel,
float,
double,
phi::dtype::bfloat16,
phi::dtype::float16,
bool,
int,
uint8_t,
int8_t,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/unsqueeze_grad_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unsqueeze_grad_kernel_impl.h"
PD_REGISTER_KERNEL(unsqueeze_grad,
GPU,
ALL_LAYOUT,
phi::UnsqueezeGradKernel,
phi::dtype::bfloat16,
phi::dtype::float16,
bool,
int,
int16_t,
uint8_t,
int8_t,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>,
float,
double) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/unsqueeze_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/unsqueeze_kernel_impl.h"
PD_REGISTER_KERNEL(unsqueeze,
GPU,
ALL_LAYOUT,
phi::UnsqueezeKernel,
float,
double,
phi::dtype::float16,
phi::dtype::bfloat16,
bool,
int,
int16_t,
uint8_t,
int8_t,
int64_t,
phi::dtype::complex<float>,
phi::dtype::complex<double>) {}
...@@ -275,4 +275,22 @@ void SigmoidTripleGradKernel(const Context& dev_ctx, ...@@ -275,4 +275,22 @@ void SigmoidTripleGradKernel(const Context& dev_ctx,
d_ddx); d_ddx);
} }
template <typename T, typename Context>
void LogDoubleGradKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& dout,
const DenseTensor& ddx,
DenseTensor* dx,
DenseTensor* ddout) {
if (dx) {
dx->Resize(x.dims());
dev_ctx.template Alloc<T>(dx);
}
if (ddout) {
dev_ctx.template Alloc<T>(ddout);
}
funcs::LogGradGradFunctor<T> functor;
functor(dev_ctx, &x, &ddx, ddout, &dout, dx);
}
} // namespace phi } // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
namespace phi {
template <typename T, typename Context>
void SqueezeGradKernel(const Context& dev_ctx,
const DenseTensor& xshape,
const DenseTensor& dout,
const std::vector<int>& axes,
DenseTensor* dx) {
auto xshape_dims = xshape.dims();
auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
dev_ctx.template Alloc<T>(dx);
phi::Copy(dev_ctx, dout, dev_ctx.GetPlace(), false, dx);
dx->Resize(x_dims);
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/unsqueeze.h"
namespace phi {
template <typename T, typename Context>
void SqueezeKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int>& axes,
DenseTensor* xshape,
DenseTensor* out) {
auto x_dims = x.dims();
auto out_dims = funcs::GetOutputSqueezeShape(axes, x_dims, true);
dev_ctx.template Alloc<T>(out);
phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
out->Resize(out_dims);
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
namespace phi {
template <typename T, typename Context>
void UnsqueezeGradKernel(const Context& dev_ctx,
const DenseTensor& x_shape,
const DenseTensor& dout,
DenseTensor* dx) {
auto xshape_dims = x_shape.dims();
auto x_dims = phi::slice_ddim(xshape_dims, 1, xshape_dims.size());
dev_ctx.template Alloc<T>(dx);
phi::Copy(dev_ctx, dout, dev_ctx.GetPlace(), true, dx);
dx->Resize(x_dims);
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/copy_kernel.h"
#include "paddle/phi/kernels/funcs/unsqueeze.h"
namespace phi {
template <typename T, typename Context>
void UnsqueezeKernel(const Context& dev_ctx,
const DenseTensor& x,
const ScalarArray& axes,
DenseTensor* xshape,
DenseTensor* out) {
auto x_dims = x.dims();
auto out_dims = out->dims();
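// When the axes are provided as a Tensor their values are int64_t; cast them
// to int32_t and recompute the output shape at runtime.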
if (axes.FromTensor()) {
std::vector<int32_t> tmp;
tmp.reserve(axes.GetData().size());
std::for_each(axes.GetData().begin(),
axes.GetData().end(),
[&tmp](const int64_t& t) { tmp.push_back(t); });
out_dims = funcs::GetUnsqueezeShape(tmp, x_dims);
}
out->Resize(out_dims);
dev_ctx.template Alloc<T>(out);
phi::Copy(dev_ctx, x, dev_ctx.GetPlace(), false, out);
out->Resize(out_dims); // copy will reset the dims.
}
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void SqueezeGradKernel(const Context& dev_ctx,
const DenseTensor& xshape,
const DenseTensor& dout,
const std::vector<int>& axes,
DenseTensor* dx);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void SqueezeKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int>& axes,
DenseTensor* xshape,
DenseTensor* out);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void UnsqueezeGradKernel(const Context& dev_ctx,
const DenseTensor& x_shape,
const DenseTensor& dout,
DenseTensor* dx);
} // namespace phi
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/phi/common/scalar_array.h"
#include "paddle/phi/core/dense_tensor.h"
namespace phi {
template <typename T, typename Context>
void UnsqueezeKernel(const Context& dev_ctx,
const DenseTensor& x,
const ScalarArray& axes,
DenseTensor* xshape,
DenseTensor* out);
} // namespace phi
...@@ -57,6 +57,10 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold"); ...@@ -57,6 +57,10 @@ DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(HardShrink, "hard_shrink", "threshold");
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(TanhShrink, "tanh_shrink", ); // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Silu, "silu", ); // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LogSigmoid, "logsigmoid", ); // NOLINT DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(LogSigmoid, "logsigmoid", ); // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log, "log", ); // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log2, "log2", ); // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log10, "log10", ); // NOLINT
DEFINE_ACT_GRAD_DEPX_OP_ARGMAP(Log1p, "log1p", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Relu, "relu", ); // NOLINT
DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT DEFINE_ACT_GRAD_DEPOUT_OP_ARGMAP(Tanh, "tanh", ); // NOLINT
...@@ -125,6 +129,12 @@ KernelSignature EluDoubleGradOpArgumentMapping( ...@@ -125,6 +129,12 @@ KernelSignature EluDoubleGradOpArgumentMapping(
"elu_double_grad", {"X", "DOut", "DDX"}, {"alpha"}, {"DX", "DDOut"}); "elu_double_grad", {"X", "DOut", "DDX"}, {"alpha"}, {"DX", "DDOut"});
} }
KernelSignature LogDoubleGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"log_double_grad", {"X", "DOut", "DDX"}, {}, {"DX", "DDOut"});
}
} // namespace phi } // namespace phi
PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad); PD_REGISTER_BASE_KERNEL_NAME(relu_grad_grad, relu_double_grad);
...@@ -134,6 +144,7 @@ PD_REGISTER_BASE_KERNEL_NAME(softshrink, soft_shrink); ...@@ -134,6 +144,7 @@ PD_REGISTER_BASE_KERNEL_NAME(softshrink, soft_shrink);
PD_REGISTER_BASE_KERNEL_NAME(softshrink_grad, soft_shrink_grad); PD_REGISTER_BASE_KERNEL_NAME(softshrink_grad, soft_shrink_grad);
PD_REGISTER_BASE_KERNEL_NAME(elu_grad_grad, elu_double_grad); PD_REGISTER_BASE_KERNEL_NAME(elu_grad_grad, elu_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(sigmoid_grad_grad, sigmoid_double_grad); PD_REGISTER_BASE_KERNEL_NAME(sigmoid_grad_grad, sigmoid_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(log_grad_grad, log_double_grad);
PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(cos_grad, phi::CosGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping); PD_REGISTER_ARG_MAPPING_FN(tan_grad, phi::TanGradOpArgumentMapping);
...@@ -181,3 +192,8 @@ PD_REGISTER_ARG_MAPPING_FN(logsigmoid_grad, ...@@ -181,3 +192,8 @@ PD_REGISTER_ARG_MAPPING_FN(logsigmoid_grad,
phi::LogSigmoidGradOpArgumentMapping); phi::LogSigmoidGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid_grad, PD_REGISTER_ARG_MAPPING_FN(hard_sigmoid_grad,
phi::HardSigmoidGradOpArgumentMapping); phi::HardSigmoidGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(log_grad, phi::LogGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(log_grad_grad, phi::LogDoubleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(log2_grad, phi::Log2GradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(log10_grad, phi::Log10GradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(log1p_grad, phi::Log1pGradOpArgumentMapping);
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. /* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License"); Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License. you may not use this file except in compliance with the License.
...@@ -12,21 +12,32 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,21 +12,32 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/fill_constant_batch_size_like_op.h" #include "paddle/phi/core/compat/op_utils.h"
#include "paddle/fluid/framework/op_registry.h"
namespace phi {
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL( KernelSignature FillConstantBatchSizeLikeOpArgumentMapping(
fill_constant_batch_size_like, const ArgumentMappingContext& ctx) {
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext, const auto& str_value = paddle::any_cast<std::string>(ctx.Attr("str_value"));
paddle::platform::float16>, if (str_value.empty()) {
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext, return KernelSignature(
float>, "full_batch_size_like",
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext, {"Input"},
double>, {"shape", "value", "dtype", "input_dim_idx", "output_dim_idx"},
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext, {"Out"});
int>, } else {
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext, return KernelSignature(
int64_t>, "full_batch_size_like",
ops::FillConstantBatchSizeLikeOpKernel<paddle::platform::CUDADeviceContext, {"Input"},
bool>); {"shape", "str_value", "dtype", "input_dim_idx", "output_dim_idx"},
{"Out"});
}
}
} // namespace phi
PD_REGISTER_BASE_KERNEL_NAME(fill_constant_batch_size_like,
full_batch_size_like);
PD_REGISTER_ARG_MAPPING_FN(fill_constant_batch_size_like,
phi::FillConstantBatchSizeLikeOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature SqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) {
return KernelSignature("squeeze", {"X"}, {"axes"}, {"XShape", "Out"});
}
KernelSignature SqueezeGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature("squeeze_grad",
{"XShape", GradVarName("Out")},
{"axes"},
{GradVarName("X")});
}
} // namespace phi
PD_REGISTER_BASE_KERNEL_NAME(squeeze2, squeeze);
PD_REGISTER_BASE_KERNEL_NAME(squeeze2_grad, squeeze_grad);
PD_REGISTER_ARG_MAPPING_FN(squeeze2, phi::SqueezeOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(squeeze2_grad, phi::SqueezeGradOpArgumentMapping);
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/core/compat/op_utils.h"
namespace phi {
KernelSignature UnsqueezeOpArgumentMapping(const ArgumentMappingContext& ctx) {
if (ctx.InputSize("AxesTensorList") > 0) {
VLOG(2) << "unsqueeze2 in AxesTensorList";
return KernelSignature(
"unsqueeze", {"X"}, {"AxesTensorList"}, {"XShape", "Out"});
} else if (ctx.InputSize("AxesTensor") > 0) {
VLOG(2) << "unsqueeze2 in AxesTensor";
return KernelSignature(
"unsqueeze", {"X"}, {"AxesTensor"}, {"XShape", "Out"});
} else {
VLOG(2) << "unsqueeze2 in axes";
return KernelSignature("unsqueeze", {"X"}, {"axes"}, {"XShape", "Out"});
}
}
KernelSignature UnsqueezeGradOpArgumentMapping(
const ArgumentMappingContext& ctx) {
return KernelSignature(
"unsqueeze_grad", {"XShape", GradVarName("Out")}, {}, {GradVarName("X")});
}
} // namespace phi
PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2, unsqueeze);
PD_REGISTER_BASE_KERNEL_NAME(unsqueeze2_grad, unsqueeze_grad);
PD_REGISTER_ARG_MAPPING_FN(unsqueeze2, phi::UnsqueezeOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(unsqueeze2_grad,
phi::UnsqueezeGradOpArgumentMapping);
...@@ -774,12 +774,12 @@ set +x ...@@ -774,12 +774,12 @@ set +x
get_precision_ut_mac get_precision_ut_mac
ut_actual_total_startTime_s=`date +%s` ut_actual_total_startTime_s=`date +%s`
if [[ "$on_precision" == "0" ]];then if [[ "$on_precision" == "0" ]];then
ctest -E "$disable_ut_quickly" -LE ${nightly_label} --output-on-failure -j $2 | tee $tmpfile ctest -E "$disable_ut_quickly" -LE ${nightly_label} --timeout 120 --output-on-failure -j $2 | tee $tmpfile
else else
ctest -R "$UT_list_prec" -E "$disable_ut_quickly" -LE ${nightly_label} --output-on-failure -j $2 | tee $tmpfile ctest -R "$UT_list_prec" -E "$disable_ut_quickly" -LE ${nightly_label} --timeout 120 --output-on-failure -j $2 | tee $tmpfile
tmpfile_rand=`date +%s%N` tmpfile_rand=`date +%s%N`
tmpfile=$tmp_dir/$tmpfile_rand tmpfile=$tmp_dir/$tmpfile_rand
ctest -R "$UT_list_prec_1" -E "$disable_ut_quickly" -LE ${nightly_label} --output-on-failure -j $2 | tee $tmpfile ctest -R "$UT_list_prec_1" -E "$disable_ut_quickly" -LE ${nightly_label} --timeout 120 --output-on-failure -j $2 | tee $tmpfile
fi fi
ut_total_endTime_s=`date +%s` ut_total_endTime_s=`date +%s`
echo "TestCases Total Time: $[ $ut_total_endTime_s - $ut_actual_total_startTime_s ]s" echo "TestCases Total Time: $[ $ut_total_endTime_s - $ut_actual_total_startTime_s ]s"
...@@ -848,7 +848,7 @@ set +x ...@@ -848,7 +848,7 @@ set +x
fi fi
done done
failed_test_lists='' failed_test_lists=''
ctest -R "$retry_unittests_regular" --output-on-failure -j 2 | tee $tmpfile ctest -R "$retry_unittests_regular" --timeout 120 --output-on-failure -j 2 | tee $tmpfile
collect_failed_tests collect_failed_tests
rm -f $tmp_dir/* rm -f $tmp_dir/*
exec_times=$[$exec_times+1] exec_times=$[$exec_times+1]
......
...@@ -59,6 +59,13 @@ class TestScope(unittest.TestCase): ...@@ -59,6 +59,13 @@ class TestScope(unittest.TestCase):
# It is not allowed to delete a nonexistent scope. # It is not allowed to delete a nonexistent scope.
scope._remove_from_pool() scope._remove_from_pool()
def test_size(self):
paddle_c = paddle.fluid.core
scope = paddle_c.Scope()
var_a = scope.var("var_a")
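# Creating "var_a" registers it in the scope, so size() should report 1.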
self.assertEqual(scope.size(), 1)
self.assertIsNotNone(scope.find_var('var_a'))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -148,7 +148,6 @@ def source_include(header_file_path): ...@@ -148,7 +148,6 @@ def source_include(header_file_path):
#include "paddle/phi/infermeta/nullary.h" #include "paddle/phi/infermeta/nullary.h"
#include "paddle/phi/infermeta/unary.h" #include "paddle/phi/infermeta/unary.h"
#include "paddle/phi/infermeta/ternary.h" #include "paddle/phi/infermeta/ternary.h"
#include "paddle/phi/kernels/declarations.h"
#include "paddle/fluid/platform/profiler/event_tracing.h" #include "paddle/fluid/platform/profiler/event_tracing.h"
""" """
......
...@@ -231,6 +231,12 @@ if [ "${HAS_MODIFIED_ALLOCATION}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then ...@@ -231,6 +231,12 @@ if [ "${HAS_MODIFIED_ALLOCATION}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
check_approval 1 6888866 39303645 check_approval 1 6888866 39303645
fi fi
HAS_MODIFIED_DECLARATIONS=`git diff --name-only upstream/$BRANCH | grep "paddle/phi/kernels/declarations.h" || true`
if [ "${HAS_MODIFIED_DECLARATIONS}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
echo_line="You must be approved by chenwhql for any use of paddle/phi/kernels/declarations.h. Thanks!\n"
check_approval 1 22561442
fi
ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true` ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true`
if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then
echo_line="PADDLE_ENFORCE is not recommended. Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_GPU_SUCCESS instead, see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\nYou must have one RD (chenwhql (Recommend) , luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n${ALL_PADDLE_ENFORCE}\n" echo_line="PADDLE_ENFORCE is not recommended. Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_GPU_SUCCESS instead, see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\nYou must have one RD (chenwhql (Recommend) , luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n${ALL_PADDLE_ENFORCE}\n"
......