From 09096aebd49b1c07b8c0fea29206413f1ca938cc Mon Sep 17 00:00:00 2001
From: Leo Chen
Date: Fri, 1 Jul 2022 21:25:21 -0500
Subject: [PATCH] unify cpu context (#43989)

* unify cpu context
* fix init()
* delete test_device_context
* fix test_scalar
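A quick orientation to the pattern applied across the files below (a sketch, not an excerpt from the patch): `paddle/fluid/platform/device_context.h` turns `CPUDeviceContext` into a plain alias of `phi::CPUContext`, the separate `CPUContext::Init()` entry point is folded into the `phi::CPUContext` constructors, and the duplicated forward declarations and explicit instantiations for `platform::CPUDeviceContext` are deleted so that only the `phi::CPUContext` versions remain. The template argument lists in the sketch are inferred, since the extracted diff below dropped the text inside angle brackets.

```cpp
// Sketch of the unification pattern (template arguments are inferred).
#include "paddle/phi/backends/cpu/cpu_context.h"

namespace paddle {
namespace platform {
// Replaces the old `class CPUDeviceContext : public phi::CPUContext` wrapper
// previously declared in paddle/fluid/platform/device_context.h.
using CPUDeviceContext = phi::CPUContext;
}  // namespace platform
}  // namespace paddle

// Before, CPU functors carried two identical explicit instantiations, e.g.
//   template class BeamSearchFunctor<platform::CPUDeviceContext, float>;
//   template class BeamSearchFunctor<phi::CPUContext, float>;
// After this patch only the phi::CPUContext instantiation remains, and it
// serves both names because the two types are now the same type.
```

The same collapse applies to the SetConstant/Transpose helpers, im2col/vol2col, the BLAS wrappers in blas_impl.h, and the other functors listed in the diffstat below.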
---
 .../elementwise/elementwise_add_op.cc | 6 -
 .../elementwise/elementwise_floordiv_op.cc | 3 -
 .../elementwise/elementwise_max_op.cc | 3 -
 .../elementwise/elementwise_min_op.cc | 3 -
 .../elementwise/elementwise_mod_op.cc | 3 -
 .../elementwise/elementwise_pow_op.cc | 3 -
 .../elementwise/elementwise_sub_op.cc | 3 -
 paddle/fluid/operators/increment_op.cc | 3 -
 paddle/fluid/operators/isfinite_op.cc | 3 -
 paddle/fluid/operators/isfinite_v2_op.cc | 3 -
 paddle/fluid/operators/label_smooth_op.cc | 3 -
 paddle/fluid/operators/math/beam_search.cc | 21 +-
 .../fluid/operators/math/concat_and_split.cc | 8 +-
 .../fluid/operators/math/context_project.cc | 11 +-
 .../fluid/operators/math/cos_sim_functor.cc | 7 -
 paddle/fluid/operators/math/cross_entropy.cc | 9 -
 paddle/fluid/operators/math/gru_compute.cc | 6 -
 paddle/fluid/operators/math/im2col.cc | 30 -
 paddle/fluid/operators/math/math_function.cc | 335 ----
 paddle/fluid/operators/math/maxouting.cc | 5 -
 paddle/fluid/operators/math/sample_prob.cc | 13 +-
 .../operators/math/selected_rows_functor.cc | 86 +--
 .../fluid/operators/math/sequence_padding.cc | 113 ----
 paddle/fluid/operators/math/sequence_scale.cc | 26 -
 paddle/fluid/operators/math/softmax.cc | 7 -
 paddle/fluid/operators/math/vol2col.cc | 250 -------
 .../fluid/operators/mkldnn/sum_mkldnn_op.cc | 1 -
 paddle/fluid/operators/rank_loss_op.cc | 3 -
 .../operators/reduce_ops/frobenius_norm_op.cc | 3 -
 .../operators/reduce_ops/reduce_all_op.cc | 3 -
 .../operators/reduce_ops/reduce_any_op.cc | 3 -
 .../operators/reduce_ops/reduce_prod_op.cc | 3 -
 .../operators/reduce_ops/reduce_sum_op.cc | 3 -
 paddle/fluid/operators/set_value_op.cc | 3 -
 paddle/fluid/platform/device_context.cc | 8 -
 paddle/fluid/platform/device_context.h | 9 +-
 paddle/fluid/platform/transform.h | 24 -
 paddle/infrt/kernel/phi/context_kernels.cc | 1 -
 .../infershaped/infershape_launchers_test.cc | 1 -
 paddle/phi/backends/cpu/cpu_context.cc | 10 +-
 paddle/phi/backends/cpu/cpu_context.h | 6 -
 paddle/phi/kernels/funcs/blas/blas_impl.h | 616 ------
 paddle/phi/kernels/funcs/fc_functor.cc | 2 -
 paddle/phi/kernels/funcs/for_range.h | 16 -
 paddle/phi/kernels/funcs/gru_compute.cc | 185 ------
 paddle/phi/kernels/funcs/lstm_compute.cc | 80 ---
 paddle/phi/kernels/funcs/math_function.cc | 134 ++--
 paddle/phi/kernels/funcs/matrix_inverse.cc | 4 -
 paddle/phi/tests/api/test_sparse_utils_api.cc | 1 -
 paddle/phi/tests/common/test_scalar.cu | 7 -
 paddle/phi/tests/core/CMakeLists.txt | 4 -
 paddle/phi/tests/core/test_device_context.cc | 54 --
 paddle/phi/tests/kernels/test_cast_dev_api.cc | 1 -
 .../phi/tests/kernels/test_concat_dev_api.cc | 1 -
 paddle/phi/tests/kernels/test_conj_dev_api.cc | 1 -
 paddle/phi/tests/kernels/test_copy_dev_api.cc | 1 -
 .../tests/kernels/test_creation_dev_api.cc | 4 -
 paddle/phi/tests/kernels/test_dot_dev_api.cc | 1 -
 .../tests/kernels/test_elementwise_dev_api.cc | 4 -
 .../phi/tests/kernels/test_flatten_dev_api.cc | 1 -
 .../phi/tests/kernels/test_math_function.cc | 1 -
 .../phi/tests/kernels/test_matmul_dev_api.cc | 1 -
 paddle/phi/tests/kernels/test_mean_dev_api.cc | 1 -
 .../phi/tests/kernels/test_reshape_dev_api.cc | 1 -
 .../phi/tests/kernels/test_scale_dev_api.cc | 2 -
 .../kernels/test_sparse_activation_dev_api.cc | 1 -
 .../kernels/test_sparse_conv3d_dev_api.cc | 1 -
 .../test_sparse_elementwise_dev_api.cc | 4 -
 .../tests/kernels/test_sparse_pool_dev_api.cc | 1 -
 .../kernels/test_sparse_utils_dev_api.cc | 6 -
 .../phi/tests/kernels/test_split_dev_api.cc | 1 -
 paddle/phi/tests/kernels/test_sum_dev_api.cc | 1 -
 72 files changed, 61 insertions(+), 2121 deletions(-)
 delete mode 100644 paddle/fluid/operators/math/math_function.cc
 delete mode 100644 paddle/phi/tests/core/test_device_context.cc

diff --git a/paddle/fluid/operators/elementwise/elementwise_add_op.cc b/paddle/fluid/operators/elementwise/elementwise_add_op.cc
index c71f6b7c3cd..0123df0006f 100644
--- a/paddle/fluid/operators/elementwise/elementwise_add_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_add_op.cc
@@ -20,12 +20,6 @@ namespace paddle {
 namespace framework {
 class OpDesc;
 } // namespace framework
-namespace imperative {
-class OpBase;
-} // namespace imperative
-namespace platform {
-class CPUDeviceContext;
-} // namespace platform
 } // namespace paddle

 namespace paddle {
diff --git a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
index 422cbd881d2..6a8c986a53c 100644
--- a/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_floordiv_op.cc
@@ -25,9 +25,6 @@ class EmptyGradOpMaker;
 namespace imperative {
 class OpBase;
 } // namespace imperative
-namespace platform {
-class CPUDeviceContext;
-} // namespace platform
 } // namespace paddle

 namespace paddle {
diff --git a/paddle/fluid/operators/elementwise/elementwise_max_op.cc b/paddle/fluid/operators/elementwise/elementwise_max_op.cc
index 58e9c6d7b4c..1911b5c2de6 100644
--- a/paddle/fluid/operators/elementwise/elementwise_max_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_max_op.cc
@@ -23,9 +23,6 @@ class OpDesc;
 namespace imperative {
 class OpBase;
 } // namespace imperative
-namespace platform {
-class CPUDeviceContext;
-} // namespace platform
 } // namespace paddle

 namespace paddle {
diff --git a/paddle/fluid/operators/elementwise/elementwise_min_op.cc b/paddle/fluid/operators/elementwise/elementwise_min_op.cc
index 8b967cb1fe1..9fd70754888 100644
--- a/paddle/fluid/operators/elementwise/elementwise_min_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_min_op.cc
@@ -23,9 +23,6 @@ class OpDesc;
 namespace imperative {
 class OpBase;
 } // namespace imperative
-namespace platform {
-class CPUDeviceContext;
-} // namespace platform
 } // namespace paddle

 namespace paddle {
diff --git a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
index ee67f7e4020..55d6e214d6c 100644
--- a/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_mod_op.cc
@@ -25,9 +25,6 @@ class EmptyGradOpMaker;
 namespace imperative {
 class OpBase;
 } // namespace imperative
-namespace platform {
-class CPUDeviceContext;
-} // namespace platform
 } // namespace paddle

 namespace paddle {
diff --git a/paddle/fluid/operators/elementwise/elementwise_pow_op.cc b/paddle/fluid/operators/elementwise/elementwise_pow_op.cc
index c13fba99bdb..fcfee9b4fca 100644
--- a/paddle/fluid/operators/elementwise/elementwise_pow_op.cc
+++ b/paddle/fluid/operators/elementwise/elementwise_pow_op.cc
@@ -20,9 +20,6 @@ class OpDesc;
 namespace imperative {
 class OpBase;
 } // namespace imperative
-namespace platform {
-class CPUDeviceContext;
-} // namespace platform
 } // namespace paddle

 namespace paddle {
diff --git
a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc index a9968906fb9..24f0228025f 100644 --- a/paddle/fluid/operators/elementwise/elementwise_sub_op.cc +++ b/paddle/fluid/operators/elementwise/elementwise_sub_op.cc @@ -23,9 +23,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/increment_op.cc b/paddle/fluid/operators/increment_op.cc index 7d62bf2d628..3ab6b9f9405 100644 --- a/paddle/fluid/operators/increment_op.cc +++ b/paddle/fluid/operators/increment_op.cc @@ -25,9 +25,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc index a7fc4865f78..bcab28df3a1 100644 --- a/paddle/fluid/operators/isfinite_op.cc +++ b/paddle/fluid/operators/isfinite_op.cc @@ -26,9 +26,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/isfinite_v2_op.cc b/paddle/fluid/operators/isfinite_v2_op.cc index 314bbf556ae..65857b6d87d 100644 --- a/paddle/fluid/operators/isfinite_v2_op.cc +++ b/paddle/fluid/operators/isfinite_v2_op.cc @@ -34,9 +34,6 @@ namespace operators { template class OverflowKernel; } // namespace operators -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/label_smooth_op.cc b/paddle/fluid/operators/label_smooth_op.cc index ccd4db816bd..873ab62a3d2 100644 --- a/paddle/fluid/operators/label_smooth_op.cc +++ b/paddle/fluid/operators/label_smooth_op.cc @@ -24,9 +24,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/math/beam_search.cc b/paddle/fluid/operators/math/beam_search.cc index cda085aabe9..2b607ade728 100644 --- a/paddle/fluid/operators/math/beam_search.cc +++ b/paddle/fluid/operators/math/beam_search.cc @@ -13,26 +13,19 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/operators/math/beam_search.h" - +#include "paddle/phi/backends/cpu/cpu_context.h" namespace phi { class DenseTensor; } // namespace phi -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { template -class BeamSearchFunctor { +class BeamSearchFunctor { public: - void operator()(const platform::CPUDeviceContext &context, + void operator()(const phi::CPUContext &context, const framework::LoDTensor *pre_ids, const framework::LoDTensor *pre_scores, const framework::LoDTensor *ids, @@ -308,10 +301,10 @@ class BeamSearchFunctor { } }; -template class BeamSearchFunctor; -template class BeamSearchFunctor; -template class BeamSearchFunctor; -template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; +template class BeamSearchFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/concat_and_split.cc b/paddle/fluid/operators/math/concat_and_split.cc index 4ce2db1e579..3df69e20019 100644 --- a/paddle/fluid/operators/math/concat_and_split.cc +++ b/paddle/fluid/operators/math/concat_and_split.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/concat_and_split.h" +#include "paddle/fluid/platform/device_context.h" #include "paddle/phi/kernels/funcs/concat_and_split_functor.h" #ifdef PADDLE_WITH_ASCEND_CL @@ -28,13 +29,6 @@ namespace phi { class DenseTensor; } // namespace phi -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/context_project.cc b/paddle/fluid/operators/math/context_project.cc index 927d610e2ce..beee93ae016 100644 --- a/paddle/fluid/operators/math/context_project.cc +++ b/paddle/fluid/operators/math/context_project.cc @@ -13,19 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/math/context_project.h" - -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle +#include "paddle/phi/backends/cpu/cpu_context.h" namespace paddle { namespace operators { namespace math { -template class ContextProjectFunctor; -template class ContextProjectFunctor; +template class ContextProjectFunctor; +template class ContextProjectFunctor; } // namespace math } // namespace operators diff --git a/paddle/fluid/operators/math/cos_sim_functor.cc b/paddle/fluid/operators/math/cos_sim_functor.cc index 4a3da2ef86d..85f012afb50 100644 --- a/paddle/fluid/operators/math/cos_sim_functor.cc +++ b/paddle/fluid/operators/math/cos_sim_functor.cc @@ -14,16 +14,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/cos_sim_functor.h" -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { - template struct CosSimDyFunctor { void operator()(const platform::CPUDeviceContext& ctx, diff --git a/paddle/fluid/operators/math/cross_entropy.cc b/paddle/fluid/operators/math/cross_entropy.cc index 45c7e47b810..17ff6aff6f9 100644 --- a/paddle/fluid/operators/math/cross_entropy.cc +++ b/paddle/fluid/operators/math/cross_entropy.cc @@ -17,12 +17,6 @@ limitations under the License. */ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/phi/backends/cpu/cpu_context.h" -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { @@ -129,9 +123,6 @@ void CrossEntropyFunctor::operator()( } } -template class CrossEntropyFunctor; -template class CrossEntropyFunctor; - template class CrossEntropyFunctor; template class CrossEntropyFunctor; } // namespace math diff --git a/paddle/fluid/operators/math/gru_compute.cc b/paddle/fluid/operators/math/gru_compute.cc index d8fa1b5a869..7e543a63afc 100644 --- a/paddle/fluid/operators/math/gru_compute.cc +++ b/paddle/fluid/operators/math/gru_compute.cc @@ -15,12 +15,6 @@ limitations under the License. */ #include "paddle/fluid/operators/math/detail/gru_kernel.h" #include "paddle/phi/kernels/funcs/blas/blas.h" -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { diff --git a/paddle/fluid/operators/math/im2col.cc b/paddle/fluid/operators/math/im2col.cc index e7ed2cbf675..9192badedcf 100644 --- a/paddle/fluid/operators/math/im2col.cc +++ b/paddle/fluid/operators/math/im2col.cc @@ -16,12 +16,6 @@ limitations under the License. */ #include "paddle/fluid/operators/math/im2col_cfo_cpu.h" -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace phi { class CPUContext; } // namespace phi @@ -166,24 +160,12 @@ class Col2ImFunctor; -template class Im2ColFunctor; template class Im2ColFunctor; template class Im2ColFunctor; -template class Col2ImFunctor; -template class Col2ImFunctor; template class Col2ImFunctor; @@ -353,24 +335,12 @@ class Col2ImFunctor; -template class Im2ColFunctor; template class Im2ColFunctor; template class Im2ColFunctor; -template class Col2ImFunctor; -template class Col2ImFunctor; template class Col2ImFunctor; diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc deleted file mode 100644 index 5eff0a5d457..00000000000 --- a/paddle/fluid/operators/math/math_function.cc +++ /dev/null @@ -1,335 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#include "paddle/fluid/operators/math/math_function.h" - -#ifdef PADDLE_WITH_MKLML -#include "paddle/fluid/platform/dynload/mklml.h" -#endif - -#ifdef PADDLE_USE_OPENBLAS -#include -#endif - -#include -#include -#include - -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/math/math_function_impl.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/phi/backends/cpu/cpu_context.h" -#include "paddle/phi/kernels/funcs/eigen/common.h" -#include "unsupported/Eigen/CXX11/Tensor" - -namespace paddle { -namespace operators { -namespace math { - -using float16 = paddle::platform::float16; - -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; - -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; - -#ifdef PADDLE_WITH_XPU -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; -#endif - -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose, \ - RANK>; \ - template struct Transpose, \ - RANK>; - -DEFINE_CPU_TRANS(1); -DEFINE_CPU_TRANS(2); -DEFINE_CPU_TRANS(3); -DEFINE_CPU_TRANS(4); -DEFINE_CPU_TRANS(5); -DEFINE_CPU_TRANS(6); - -template -struct TransposeNormal { - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& in, - framework::Tensor* out, - const std::vector& axis) { - const int rank = axis.size(); - auto in_stride = phi::stride(in.dims()); - auto out_stride = phi::stride(out->dims()); - const T* in_ptr = in.data(); - T* out_ptr = out->data(); - - auto transpose_helper = [&](int64_t beg, int64_t end) { - for (int64_t out_idx = beg; out_idx < end; ++out_idx) { - int64_t in_idx = 0; - int64_t tmp_idx = out_idx; - // calculate the input index - for (int i = 0; i < rank; ++i) { - const int64_t coordinate = tmp_idx / out_stride[i]; - tmp_idx -= coordinate * out_stride[i]; - in_idx += coordinate * in_stride[axis[i]]; - } - out_ptr[out_idx] = in_ptr[in_idx]; - } - }; - transpose_helper(0, out->numel()); - } -}; - -// define transpose normal -#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ - template struct TransposeNormal - -DEFINE_CPU_TRANS_NORMAL(platform::float16); -DEFINE_CPU_TRANS_NORMAL(platform::bfloat16); -DEFINE_CPU_TRANS_NORMAL(float); -DEFINE_CPU_TRANS_NORMAL(double); -DEFINE_CPU_TRANS_NORMAL(int); -DEFINE_CPU_TRANS_NORMAL(int64_t); -DEFINE_CPU_TRANS_NORMAL(bool); -DEFINE_CPU_TRANS_NORMAL(int16_t); -DEFINE_CPU_TRANS_NORMAL(uint8_t); 
-DEFINE_CPU_TRANS_NORMAL(int8_t); -DEFINE_CPU_TRANS_NORMAL(platform::complex); -DEFINE_CPU_TRANS_NORMAL(platform::complex); - -struct TensorSetConstantCPU { - TensorSetConstantCPU(framework::Tensor* tensor, float value) - : tensor_(tensor), value_(value) {} - template - void apply() const { - auto cpu = platform::CPUPlace(); - auto* begin = tensor_->mutable_data(cpu); - std::fill(begin, begin + tensor_->numel(), static_cast(value_)); - } - framework::Tensor* tensor_; - float value_; -}; - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - PADDLE_THROW( - platform::errors::Unimplemented("NPUPinnedPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - PADDLE_THROW(platform::errors::Unimplemented("MLUPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - PADDLE_THROW(platform::errors::Unimplemented("CustomPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); -} - -struct TensorSetConstantWithPlace : public boost::static_visitor { - TensorSetConstantWithPlace(const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) - : context_(context), tensor_(tensor), value_(value) {} - - template - void operator()(Place place) const { - set_constant_with_place(context_, tensor_, value_); - } - - const platform::DeviceContext& context_; - framework::Tensor* tensor_; - float value_; -}; - -void set_constant(const platform::DeviceContext& context, - framework::Tensor* tensor, - float value) { - TensorSetConstantWithPlace func(context, tensor, value); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - // tensor->place().apply_visitor(func); - paddle::platform::VisitPlace(tensor->place(), func); -#else - func(platform::CPUPlace()); -#endif -} - -template -struct RowwiseAdd { - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& vector, - framework::Tensor* output) { - auto in_dims = input.dims(); - auto out_dims = output->dims(); - auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ( - vector.numel(), - size, - platform::errors::InvalidArgument( - "The input vector size" - " should be equal to the size of each row of input tensor." 
- " Expected vector size=%d, but received %d", - size, - vector.numel())); - const char* in_dims_cstr = in_dims.to_str().c_str(); - const char* out_dims_cstr = out_dims.to_str().c_str(); - PADDLE_ENFORCE_EQ(out_dims, - in_dims, - platform::errors::InvalidArgument( - "The output tensor shape should be same as the input" - " tensor shape. Expected output tensor shape: %s," - " but received %s", - in_dims_cstr, - out_dims_cstr)); - - auto in = framework::EigenMatrix::From(input); - auto vec = framework::EigenVector::Flatten(vector); - auto out = framework::EigenMatrix::From(*output); - - for (int64_t i = 0; i < in_dims[0]; ++i) { - out.chip(i, 0) = in.chip(i, 0) + vec; - } - } -}; - -template struct RowwiseAdd; -template struct RowwiseAdd; - -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; - -template struct RowwiseSum; -template struct RowwiseSum; - -template struct RowwiseMean; -template struct RowwiseMean; - -template -struct ElementwiseAddTo { - void operator()(platform::CPUDeviceContext* ctx, - const framework::Tensor& src, - framework::Tensor* dst) { - auto in = framework::EigenVector::Flatten(src); - auto out = framework::EigenVector::Flatten(*dst); - auto& place = *(ctx->eigen_device()); - out.device(place) = out + in; - } -}; - -template struct ElementwiseAddTo; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/maxouting.cc b/paddle/fluid/operators/math/maxouting.cc index 7729b86cc3e..2205ed51e19 100644 --- a/paddle/fluid/operators/math/maxouting.cc +++ b/paddle/fluid/operators/math/maxouting.cc @@ -109,11 +109,6 @@ void MaxOutGradFunctor::operator()( } } -template class MaxOutGradFunctor; -template class MaxOutGradFunctor; -template class MaxOutFunctor; -template class MaxOutFunctor; - template class MaxOutGradFunctor; template class MaxOutGradFunctor; template class MaxOutFunctor; diff --git a/paddle/fluid/operators/math/sample_prob.cc b/paddle/fluid/operators/math/sample_prob.cc index 16342493e45..18321cf9b9e 100644 --- a/paddle/fluid/operators/math/sample_prob.cc +++ b/paddle/fluid/operators/math/sample_prob.cc @@ -14,19 +14,8 @@ limitations under the License. */ #include "paddle/fluid/operators/math/sample_prob.h" -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { -namespace math { - -template class SampleWithProb; -template class SampleWithProb; - -} // namespace math +namespace math {} // namespace math } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 81b0e9102bb..399a1b6dc4c 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -276,51 +276,6 @@ struct SelectedRowsSumTo { template struct SelectedRowsSumTo; template struct SelectedRowsSumTo; -template -struct SelectedRowsAddToTensor { - void operator()(const platform::CPUDeviceContext& context, - const phi::SelectedRows& input1, - framework::Tensor* input2) { - if (UNLIKELY(input1.rows().size() == 0)) { - LOG(WARNING) << "input selected rows is empty!"; - return; - } - auto in1_height = input1.height(); - const auto& in2_dims = input2->dims(); - PADDLE_ENFORCE_EQ( - in1_height, - in2_dims[0], - platform::errors::InvalidArgument("The two inputs height must be equal." 
- "But received first input height = " - "[%d], second input height = [%d]", - in1_height, - in2_dims[0])); - - auto& in1_value = input1.value(); - auto& in1_rows = input1.rows(); - - int64_t in1_row_numel = in1_value.numel() / in1_rows.size(); - PADDLE_ENFORCE_EQ( - in1_row_numel, - input2->numel() / in1_height, - platform::errors::InvalidArgument( - "The two inputs width must be equal." - "But received first input width = [%d], second input width = [%d]", - in1_row_numel, - input2->numel() / in1_height)); - - auto* in1_data = in1_value.data(); - auto* input2_data = input2->data(); - - for (size_t i = 0; i < in1_rows.size(); i++) { - for (int64_t j = 0; j < in1_row_numel; j++) { - input2_data[in1_rows[i] * in1_row_numel + j] += - in1_data[i * in1_row_numel + j]; - } - } - } -}; - template struct SelectedRowsAddToTensor { void operator()(const phi::CPUContext& context, @@ -366,13 +321,6 @@ struct SelectedRowsAddToTensor { } }; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; -template struct SelectedRowsAddToTensor; - template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; template struct SelectedRowsAddToTensor; @@ -582,34 +530,6 @@ struct MergeAddImpl { } }; -template -struct MergeAdd { - // unary functor, merge by adding duplicated rows in - // the input SelectedRows object. - phi::SelectedRows operator()(const platform::CPUDeviceContext& context, - const phi::SelectedRows& input, - const bool sorted_result) { - return MergeAddImpl()( - context, input, sorted_result); - } - - void operator()(const platform::CPUDeviceContext& context, - const phi::SelectedRows& input, - phi::SelectedRows* output, - const bool sorted_result) { - MergeAddImpl()( - context, input, output, sorted_result); - } - - void operator()(const platform::CPUDeviceContext& context, - const std::vector& inputs, - phi::SelectedRows* output, - const bool sorted_result) { - MergeAddImpl()( - context, inputs, output, sorted_result); - } -}; - template struct MergeAdd { // unary functor, merge by adding duplicated rows in @@ -635,10 +555,8 @@ struct MergeAdd { } }; -#define TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(dtype) \ - template struct MergeAddImpl; \ - template struct MergeAddImpl; \ - template struct MergeAdd; \ +#define TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(dtype) \ + template struct MergeAddImpl; \ template struct MergeAdd; TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(float) diff --git a/paddle/fluid/operators/math/sequence_padding.cc b/paddle/fluid/operators/math/sequence_padding.cc index 1a952bbb62d..826eda5559a 100644 --- a/paddle/fluid/operators/math/sequence_padding.cc +++ b/paddle/fluid/operators/math/sequence_padding.cc @@ -20,13 +20,6 @@ namespace phi { class DenseTensor; } // namespace phi -namespace paddle { -namespace framework {} // namespace framework -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { @@ -101,66 +94,6 @@ static void fast_mem_init(void* dest, } } -template -class PaddingLoDTensorFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::LoDTensor& seq_tensor, - framework::LoDTensor* pad_tensor, - const framework::LoDTensor& pad_value, - int pad_seq_len = -1, - int lod_level = 0, - bool norm_by_times = false, - const PadLayout layout = kBatchLengthWidth) { - auto seq_lod = seq_tensor.lod(); - const auto 
seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level]; - const auto& seq_tensor_dims = seq_tensor.dims(); - const auto& pad_tensor_dims = pad_tensor->dims(); - if (pad_seq_len == -1) { - pad_seq_len = MaximumSequenceLength(seq_offsets); - } - int step_width = seq_tensor.numel() / seq_tensor_dims[0]; - - CheckDims(seq_tensor_dims, - pad_tensor_dims, - seq_offsets, - pad_seq_len, - step_width, - layout); - - PADDLE_ENFORCE_EQ( - pad_value.numel() == 1 || pad_value.numel() == step_width, - true, - platform::errors::InvalidArgument( - "The numel of 'pad_value' can only be 1 or be equal to the " - "'step_width', but got %ld != 1 and %ld. Please check the input " - "value.", - pad_value.numel(), - step_width)); - - // fill padding value - T* pad_data = pad_tensor->data(); - const T* pad_value_data = pad_value.data(); - if (pad_value.numel() == 1) { - fast_mem_init( - pad_data, pad_tensor->numel(), pad_value_data, sizeof(T)); - } else { - for (int i = 0; i < pad_tensor->numel(); i += step_width) { - memcpy(pad_data + i, pad_value_data, step_width * sizeof(T)); - } - } - - CopyValidData(pad_tensor, - &seq_tensor, - seq_offsets, - pad_seq_len, - step_width, - norm_by_times, - kSeqToPad, - layout); - } -}; - template class PaddingLoDTensorFunctor { public: @@ -221,42 +154,6 @@ class PaddingLoDTensorFunctor { } }; -template -class UnpaddingLoDTensorFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::LoDTensor& pad_tensor, - framework::LoDTensor* seq_tensor, - int pad_seq_len = -1, - int lod_level = 0, - bool norm_by_times = false, - const PadLayout layout = kBatchLengthWidth) { - auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level]; - const auto& seq_tensor_dims = seq_tensor->dims(); - const auto& pad_tensor_dims = pad_tensor.dims(); - if (pad_seq_len == -1) { - pad_seq_len = MaximumSequenceLength(seq_offsets); - } - int step_width = seq_tensor->numel() / seq_tensor_dims[0]; - - CheckDims(seq_tensor_dims, - pad_tensor_dims, - seq_offsets, - pad_seq_len, - step_width, - layout); - - CopyValidData(seq_tensor, - &pad_tensor, - seq_offsets, - pad_seq_len, - step_width, - norm_by_times, - kPadToSeq, - layout); - } -}; - template class UnpaddingLoDTensorFunctor { public: @@ -293,16 +190,6 @@ class UnpaddingLoDTensorFunctor { } }; -template class PaddingLoDTensorFunctor; -template class PaddingLoDTensorFunctor; -template class PaddingLoDTensorFunctor; -template class PaddingLoDTensorFunctor; - -template class UnpaddingLoDTensorFunctor; -template class UnpaddingLoDTensorFunctor; -template class UnpaddingLoDTensorFunctor; -template class UnpaddingLoDTensorFunctor; - template class PaddingLoDTensorFunctor; template class PaddingLoDTensorFunctor; template class PaddingLoDTensorFunctor; diff --git a/paddle/fluid/operators/math/sequence_scale.cc b/paddle/fluid/operators/math/sequence_scale.cc index cd91b2eb531..8faf9572bef 100644 --- a/paddle/fluid/operators/math/sequence_scale.cc +++ b/paddle/fluid/operators/math/sequence_scale.cc @@ -24,29 +24,6 @@ namespace paddle { namespace operators { namespace math { -template -class ScaleLoDTensorFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const T* scales, - framework::LoDTensor* seq) { - const size_t level = 0; - auto lod = seq->lod(); - const size_t num_seq = lod[level].size() - 1; - size_t seq_width = seq->dims()[1]; - framework::LoD abs_offset_lod = framework::ToAbsOffset(lod); - - T* seq_data = seq->mutable_data(context.GetPlace()); - for (size_t i = 0; i < 
num_seq; ++i) { - for (size_t j = lod[level][i] * seq_width; - j < lod[level][i + 1] * seq_width; - ++j) { - seq_data[j] *= scales[i]; - } - } - } -}; - template class ScaleLoDTensorFunctor { public: @@ -70,9 +47,6 @@ class ScaleLoDTensorFunctor { } }; -template class ScaleLoDTensorFunctor; -template class ScaleLoDTensorFunctor; - template class ScaleLoDTensorFunctor; template class ScaleLoDTensorFunctor; diff --git a/paddle/fluid/operators/math/softmax.cc b/paddle/fluid/operators/math/softmax.cc index adea86a6c5a..730dcbf59a6 100644 --- a/paddle/fluid/operators/math/softmax.cc +++ b/paddle/fluid/operators/math/softmax.cc @@ -21,13 +21,6 @@ namespace paddle { namespace operators { namespace math { -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxFunctor; -template class SoftmaxGradFunctor; -template class SoftmaxGradFunctor; - template class SoftmaxFunctor; template class SoftmaxFunctor; template class SoftmaxFunctor; diff --git a/paddle/fluid/operators/math/vol2col.cc b/paddle/fluid/operators/math/vol2col.cc index 7b687909306..36ce3e64742 100644 --- a/paddle/fluid/operators/math/vol2col.cc +++ b/paddle/fluid/operators/math/vol2col.cc @@ -16,12 +16,6 @@ limitations under the License. */ #include "paddle/phi/backends/cpu/cpu_context.h" -namespace paddle { -namespace platform { -class CPUDeviceContext; -} // namespace platform -} // namespace paddle - namespace paddle { namespace operators { namespace math { @@ -32,126 +26,6 @@ namespace math { * [input_channels, filter_depth, filter_height, filter_width, * output_depth, output_height, output_width] */ -template -class Vol2ColFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& vol, - const std::vector& dilations, - const std::vector& strides, - const std::vector& paddings, - framework::Tensor* col, - const DataLayout data_layout) const { - PADDLE_ENFORCE_EQ(vol.dims().size(), - 4, - platform::errors::InvalidArgument( - "The dimension of vol should be 4, but received %d.", - vol.dims().size())); - - PADDLE_ENFORCE_EQ(col->dims().size(), - 7, - platform::errors::InvalidArgument( - "The dimension of col should be 7, but received %d.", - col->dims().size())); - - int input_channels = - (data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]); - int input_depth = - (data_layout != DataLayout::kNHWC ? vol.dims()[1] : vol.dims()[0]); - int input_height = - (data_layout != DataLayout::kNHWC ? vol.dims()[2] : vol.dims()[1]); - int input_width = - (data_layout != DataLayout::kNHWC ? vol.dims()[3] : vol.dims()[2]); - int filter_depth = col->dims()[1]; - int filter_height = col->dims()[2]; - int filter_width = col->dims()[3]; - int output_depth = col->dims()[4]; - int output_height = col->dims()[5]; - int output_width = col->dims()[6]; - int channels_col = - input_channels * filter_depth * filter_height * filter_width; - - // changed - bool paddings_size_is_6 = (paddings.size() == 6); - int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; - int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; - int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; - int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; - int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; - int pad_w_right = paddings_size_is_6 ? 
paddings[5] : paddings[2]; - - auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1; - PADDLE_ENFORCE_EQ( - input_depth_tmp, - output_depth, - platform::errors::InvalidArgument( - "input_depth(%d) and output_depth(%d) are mismatching.", - input_depth_tmp, - output_depth)); - auto input_height_tmp = (input_height + pad_h_up + pad_h_down - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1; - PADDLE_ENFORCE_EQ( - input_height_tmp, - output_height, - platform::errors::InvalidArgument( - "input_height(%d) and output_height(%d) are mismatching.", - input_height_tmp, - output_height)); - auto input_width_tmp = (input_width + pad_w_left + pad_w_right - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1; - PADDLE_ENFORCE_EQ( - input_width_tmp, - output_width, - platform::errors::InvalidArgument( - "input_width(%d) and output_width(%d) are mismatching.", - input_width_tmp, - output_width)); - const T* vol_data = vol.data(); - T* col_data = col->data(); - - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int d_offset = (c / filter_width / filter_height) % filter_depth; - int c_in = c / filter_width / filter_height / filter_depth; - for (int d = 0; d < output_depth; ++d) { - int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; - for (int h = 0; h < output_height; ++h) { - int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; - for (int w = 0; w < output_width; ++w) { - int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; - - int col_idx = - ((c * output_depth + d) * output_height + h) * output_width + w; - int vol_idx; - if (data_layout != DataLayout::kNHWC) { - vol_idx = ((c_in * input_depth + d_pad) * input_height + h_pad) * - input_width + - w_pad; - } else { - vol_idx = ((d_pad * input_height + h_pad) * input_width + w_pad) * - input_channels + - c_in; - } - col_data[col_idx] = - (h_pad < 0 || h_pad >= input_height || w_pad < 0 || - w_pad >= input_width || d_pad < 0 || d_pad >= input_depth) - ? static_cast(0) - : vol_data[vol_idx]; - } - } - } - } - } -}; - template class Vol2ColFunctor { public: @@ -278,126 +152,6 @@ class Vol2ColFunctor { * [input_channels, filter_depth, filter_height, filter_width, * output_depth, output_height, output_width] */ -template -class Col2VolFunctor { - public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& col, - const std::vector& dilations, - const std::vector& strides, - const std::vector& paddings, - framework::Tensor* vol, - const DataLayout data_layout) const { - PADDLE_ENFORCE_EQ(vol->dims().size(), - 4, - platform::errors::InvalidArgument( - "The dimension of vol should be 4, but received %d.", - vol->dims().size())); - - PADDLE_ENFORCE_EQ(col.dims().size(), - 7, - platform::errors::InvalidArgument( - "The dimension of col should be 7, but received %d.", - col.dims().size())); - - int input_channels = - (data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]); - int input_depth = - (data_layout != DataLayout::kNHWC ? vol->dims()[1] : vol->dims()[0]); - int input_height = - (data_layout != DataLayout::kNHWC ? vol->dims()[2] : vol->dims()[1]); - int input_width = - (data_layout != DataLayout::kNHWC ? 
vol->dims()[3] : vol->dims()[2]); - int filter_depth = col.dims()[1]; - int filter_height = col.dims()[2]; - int filter_width = col.dims()[3]; - int output_depth = col.dims()[4]; - int output_height = col.dims()[5]; - int output_width = col.dims()[6]; - int channels_col = - input_channels * filter_depth * filter_height * filter_width; - - bool paddings_size_is_6 = (paddings.size() == 6); - int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0]; - int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0]; - int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1]; - int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1]; - int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2]; - int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2]; - - auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back - - ((dilations[0] * (filter_depth - 1) + 1))) / - strides[0] + - 1; - PADDLE_ENFORCE_EQ( - input_depth_tmp, - output_depth, - platform::errors::InvalidArgument( - "input_depth(%d) and output_depth(%d) are mismatching.", - input_depth_tmp, - output_depth)); - auto input_height_tmp = (input_height + pad_h_up + pad_h_down - - ((dilations[1] * (filter_height - 1) + 1))) / - strides[1] + - 1; - PADDLE_ENFORCE_EQ( - input_height_tmp, - output_height, - platform::errors::InvalidArgument( - "input_height(%d) and output_height(%d) are mismatching.", - input_height_tmp, - output_height)); - auto input_width_tmp = (input_width + pad_w_left + pad_w_right - - ((dilations[2] * (filter_width - 1) + 1))) / - strides[2] + - 1; - PADDLE_ENFORCE_EQ( - input_width_tmp, - output_width, - platform::errors::InvalidArgument( - "input_width(%d) and output_width(%d) are mismatching.", - input_width_tmp, - output_width)); - T* vol_data = vol->data(); - const T* col_data = col.data(); - - for (int c = 0; c < channels_col; ++c) { - int w_offset = c % filter_width; - int h_offset = (c / filter_width) % filter_height; - int d_offset = (c / filter_width / filter_height) % filter_depth; - int cIm = c / filter_width / filter_height / filter_depth; - for (int d = 0; d < output_depth; ++d) { - int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0]; - for (int h = 0; h < output_height; ++h) { - int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1]; - for (int w = 0; w < output_width; ++w) { - int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2]; - - if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 && - w_pad < input_width && d_pad >= 0 && d_pad < input_depth) { - int vol_idx; - if (data_layout != DataLayout::kNHWC) { - vol_idx = ((cIm * input_depth + d_pad) * input_height + h_pad) * - input_width + - w_pad; - } else { - vol_idx = - ((d_pad * input_height + h_pad) * input_width + w_pad) * - input_channels + - cIm; - } - int col_idx = - ((c * output_depth + d) * output_height + h) * output_width + - w; - vol_data[vol_idx] += col_data[col_idx]; - } - } - } - } - } - } -}; - template class Col2VolFunctor { public: @@ -518,13 +272,9 @@ class Col2VolFunctor { } }; -template class Vol2ColFunctor; -template class Vol2ColFunctor; template class Vol2ColFunctor; template class Vol2ColFunctor; -template class Col2VolFunctor; -template class Col2VolFunctor; template class Col2VolFunctor; template class Col2VolFunctor; diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index a92d9ec2f2b..bd6d55fb7b3 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ 
b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -34,7 +34,6 @@ class DenseTensor; namespace paddle { namespace framework {} // namespace framework namespace platform { -class CPUDeviceContext; class MKLDNNDeviceContext; } // namespace platform } // namespace paddle diff --git a/paddle/fluid/operators/rank_loss_op.cc b/paddle/fluid/operators/rank_loss_op.cc index 05f2fb7067e..49d6424394a 100644 --- a/paddle/fluid/operators/rank_loss_op.cc +++ b/paddle/fluid/operators/rank_loss_op.cc @@ -24,9 +24,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc index c7b0e8ced59..7fba45fa539 100644 --- a/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc +++ b/paddle/fluid/operators/reduce_ops/frobenius_norm_op.cc @@ -27,9 +27,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc index 6947ca5b71a..f0de9466635 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc @@ -27,9 +27,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle DECLARE_INFER_SHAPE_FUNCTOR(reduce_all, diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc index 85e262add2e..6634ccaaa01 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc @@ -26,9 +26,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle DECLARE_INFER_SHAPE_FUNCTOR(reduce_any, diff --git a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc index 1c88c4cb708..578954663c7 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_prod_op.cc @@ -25,9 +25,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace ops = paddle::operators; diff --git a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc index ca24cc9c634..d072dcfa5eb 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_sum_op.cc @@ -27,9 +27,6 @@ class OpDesc; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; -} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/operators/set_value_op.cc b/paddle/fluid/operators/set_value_op.cc index da2cf4c0dbe..074642e1b02 100644 --- a/paddle/fluid/operators/set_value_op.cc +++ b/paddle/fluid/operators/set_value_op.cc @@ -31,9 +31,6 @@ class EmptyGradOpMaker; namespace imperative { class OpBase; } // namespace imperative -namespace platform { -class CPUDeviceContext; 
-} // namespace platform } // namespace paddle namespace paddle { diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index 3ad22def690..ec7f46cd973 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -367,14 +367,6 @@ DeviceContextPool::DeviceContextPool( /*disable_setting_default_stream_for_allocator=*/false); } -CPUDeviceContext::CPUDeviceContext() : phi::CPUContext() { - phi::CPUContext::Init(); -} - -CPUDeviceContext::CPUDeviceContext(CPUPlace place) : phi::CPUContext(place) { - phi::CPUContext::Init(); -} - #ifdef PADDLE_WITH_IPU IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {} diff --git a/paddle/fluid/platform/device_context.h b/paddle/fluid/platform/device_context.h index c6cc29d9ca1..2c3bc017635 100644 --- a/paddle/fluid/platform/device_context.h +++ b/paddle/fluid/platform/device_context.h @@ -134,14 +134,7 @@ constexpr DeviceType kMLU = DeviceType::MLU; using DeviceContext = phi::DeviceContext; -// using CPUDeviceContext = phi::CPUContext; -// TODO(wilber): The place constructor is used in many places, it is more -// difficult to use CPUDeviceContext = phi::CPUContext directly. -class CPUDeviceContext : public phi::CPUContext { - public: - CPUDeviceContext(); - explicit CPUDeviceContext(CPUPlace place); -}; +using CPUDeviceContext = phi::CPUContext; template struct DefaultDeviceContextType; diff --git a/paddle/fluid/platform/transform.h b/paddle/fluid/platform/transform.h index 45756372e22..575415ef890 100644 --- a/paddle/fluid/platform/transform.h +++ b/paddle/fluid/platform/transform.h @@ -69,30 +69,6 @@ struct Transform { }; // NOTE: After the phi kernel is migrated, it needs to be deleted. -template <> -struct Transform { - template - void operator()(const platform::CPUDeviceContext& context, - InputIter first, - InputIter last, - OutputIter result, - UnaryOperation op) { - std::transform(first, last, result, op); - } - - template - void operator()(const platform::CPUDeviceContext& context, - InputIter1 first1, - InputIter1 last1, - InputIter2 first2, - OutputIter result, - BinaryOperation op) { - std::transform(first1, last1, first2, result, op); - } -}; template <> struct Transform { diff --git a/paddle/infrt/kernel/phi/context_kernels.cc b/paddle/infrt/kernel/phi/context_kernels.cc index 23d96aeb8d5..9c5ab13d17b 100644 --- a/paddle/infrt/kernel/phi/context_kernels.cc +++ b/paddle/infrt/kernel/phi/context_kernels.cc @@ -20,7 +20,6 @@ namespace phi { ::phi::CPUContext CreateCPUContext() { ::phi::CPUContext ctx{}; - ctx.Init(); auto allocator = new backends::CpuPhiAllocator{}; ctx.SetAllocator(allocator); ctx.SetHostAllocator(allocator); diff --git a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc index 5a314817c24..aa577da60c3 100644 --- a/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc +++ b/paddle/infrt/kernel/phi/infershaped/infershape_launchers_test.cc @@ -81,7 +81,6 @@ TEST(ElementwiseAdd, launcher_registry) { ::phi::CPUContext context; context.SetAllocator(alloc); - context.Init(); host_context::KernelFrameBuilder kernel_frame_builder; kernel_frame_builder.AddArgument(new host_context::Value(std::move(context))); diff --git a/paddle/phi/backends/cpu/cpu_context.cc b/paddle/phi/backends/cpu/cpu_context.cc index 42e19944b21..63b5d82f3bd 100644 --- a/paddle/phi/backends/cpu/cpu_context.cc +++ b/paddle/phi/backends/cpu/cpu_context.cc @@ -51,10 +51,14 @@ struct 
CPUContext::Impl { }; CPUContext::CPUContext() - : DeviceContext(), impl_(std::make_unique()) {} + : DeviceContext(), impl_(std::make_unique()) { + impl_->Init(); +} CPUContext::CPUContext(const Place& place) - : DeviceContext(), impl_(std::make_unique(place)) {} + : DeviceContext(), impl_(std::make_unique(place)) { + impl_->Init(); +} CPUContext::~CPUContext() = default; @@ -62,8 +66,6 @@ CPUContext::CPUContext(CPUContext&&) = default; CPUContext& CPUContext::operator=(CPUContext&&) = default; -void CPUContext::Init() { impl_->Init(); } - Eigen::DefaultDevice* CPUContext::eigen_device() const { return impl_->GetEigenDevice(); } diff --git a/paddle/phi/backends/cpu/cpu_context.h b/paddle/phi/backends/cpu/cpu_context.h index e482fdc9e04..58548b2e04e 100644 --- a/paddle/phi/backends/cpu/cpu_context.h +++ b/paddle/phi/backends/cpu/cpu_context.h @@ -34,12 +34,6 @@ class PADDLE_API CPUContext : public DeviceContext { Eigen::DefaultDevice* eigen_device() const; const Place& GetPlace() const override; - public: - // NOTE: DeviceContext hold resources. Used in training scenarios. - // The interface used by the training scene, DeviceContext will initialize - // all resources and delete them when destructing. - void Init(); - protected: // NOTE: External users manage resources. Used in inference scenarios. // The Set interface is for inference only, DeviceContext will mark the diff --git a/paddle/phi/kernels/funcs/blas/blas_impl.h b/paddle/phi/kernels/funcs/blas/blas_impl.h index db4796b3f61..a18ec953d0a 100644 --- a/paddle/phi/kernels/funcs/blas/blas_impl.h +++ b/paddle/phi/kernels/funcs/blas/blas_impl.h @@ -1003,12 +1003,6 @@ struct CBlas { #ifdef PADDLE_WITH_MKLML template <> template -T *Blas::GEMM_ALLOC( - const CBLAS_IDENTIFIER id, const int M, const int N, const int K) const { - return CBlas::GEMM_ALLOC(id, M, N, K); -} -template <> -template T *Blas::GEMM_ALLOC(const CBLAS_IDENTIFIER id, const int M, const int N, @@ -1016,20 +1010,6 @@ T *Blas::GEMM_ALLOC(const CBLAS_IDENTIFIER id, return CBlas::GEMM_ALLOC(id, M, N, K); } -template <> -template -void Blas::GEMM_PACK( - const CBLAS_IDENTIFIER id, - const CBLAS_TRANSPOSE trans, - int M, - int N, - int K, - const T alpha, - const T *src, - const int ld, - T *dst) const { - CBlas::GEMM_PACK(CblasRowMajor, id, trans, M, N, K, alpha, src, ld, dst); -} template <> template void Blas::GEMM_PACK(const CBLAS_IDENTIFIER id, @@ -1044,24 +1024,6 @@ void Blas::GEMM_PACK(const CBLAS_IDENTIFIER id, CBlas::GEMM_PACK(CblasRowMajor, id, trans, M, N, K, alpha, src, ld, dst); } -template <> -template -void Blas::GEMM_COMPUTE( - int transA, - int transB, - int M, - int N, - int K, - const T *A, - const int lda, - const T *B, - const int ldb, - T beta, - T *C, - const int ldc) const { - CBlas::GEMM_COMPUTE( - CblasRowMajor, transA, transB, M, N, K, A, lda, B, ldb, beta, C, ldc); -} template <> template void Blas::GEMM_COMPUTE(int transA, @@ -1080,11 +1042,6 @@ void Blas::GEMM_COMPUTE(int transA, CblasRowMajor, transA, transB, M, N, K, A, lda, B, ldb, beta, C, ldc); } -template <> -template -void Blas::GEMM_FREE(T *data) const { - CBlas::GEMM_FREE(data); -} template <> template void Blas::GEMM_FREE(T *data) const { @@ -1092,36 +1049,6 @@ void Blas::GEMM_FREE(T *data) const { } #endif -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? 
N : K; - int ldc = N; - CBlas::GEMM(CblasRowMajor, - transA, - transB, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} template <> template void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -1153,36 +1080,6 @@ void Blas::GEMM(CBLAS_TRANSPOSE transA, ldc); } -template <> -template -void Blas::GEMM(bool transA, - bool transB, - int M, - int N, - int K, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) const { - CBlas::GEMM(CblasRowMajor, - transA == false ? CblasNoTrans : CblasTrans, - transB == false ? CblasNoTrans : CblasTrans, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} template <> template void Blas::GEMM(bool transA, @@ -1214,36 +1111,6 @@ void Blas::GEMM(bool transA, ldc); } -template <> -template -void Blas::GEMM(CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - int lda, - const T *B, - int ldb, - T beta, - T *C, - int ldc) const { - CBlas::GEMM(CblasRowMajor, - transA, - transB, - M, - N, - K, - alpha, - A, - lda, - B, - ldb, - beta, - C, - ldc); -} template <> template void Blas::GEMM(CBLAS_TRANSPOSE transA, @@ -1323,50 +1190,18 @@ void Blas::MatMul(const phi::DenseTensor &mat_a, mat_out->data()); } -template <> -template -void Blas::AXPY(int n, - T alpha, - const T *x, - T *y) const { - CBlas::AXPY(n, alpha, x, 1, y, 1); -} template <> template void Blas::AXPY(int n, T alpha, const T *x, T *y) const { CBlas::AXPY(n, alpha, x, 1, y, 1); } -template <> -template -void Blas::VCOPY(int n, - const T *x, - T *y) const { - CBlas::VCOPY(n, x, 1, y, 1); -} template <> template void Blas::VCOPY(int n, const T *x, T *y) const { CBlas::VCOPY(n, x, 1, y, 1); } -template <> -template -void Blas::VADD(int n, - const T *x, - const T *y, - T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VADD(n, x, y, z); -#else - if (x == z) { - this->template AXPY(n, (T)(1.), y, z); - } else { - this->template VCOPY(n, y, z); - this->template AXPY(n, (T)(1.), x, z); - } -#endif -} template <> template void Blas::VADD(int n, const T *x, const T *y, T *z) const { @@ -1382,21 +1217,6 @@ void Blas::VADD(int n, const T *x, const T *y, T *z) const { #endif } -template <> -template -void Blas::VSUB(int n, - const T *x, - const T *y, - T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VSUB(n, x, y, z); -#else - // try to find if openblas support vsub - for (int i = 0; i < n; ++i) { - z[i] = x[i] - y[i]; - } -#endif -} template <> template void Blas::VSUB(int n, const T *x, const T *y, T *z) const { @@ -1410,21 +1230,6 @@ void Blas::VSUB(int n, const T *x, const T *y, T *z) const { #endif } -template <> -template -void Blas::VMUL(int n, - const T *x, - const T *y, - T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VMUL(n, x, y, z); -#else - // try to find if openblas support vmul - for (int i = 0; i < n; ++i) { - z[i] = x[i] * y[i]; - } -#endif -} template <> template void Blas::VMUL(int n, const T *x, const T *y, T *z) const { @@ -1438,21 +1243,6 @@ void Blas::VMUL(int n, const T *x, const T *y, T *z) const { #endif } -template <> -template -void Blas::VDIV(int n, - const T *x, - const T *y, - T *z) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VDIV(n, x, y, z); -#else - // try to find if openblas support vdiv - for (int i = 0; i < n; ++i) { - z[i] = x[i] / y[i]; - } -#endif -} template <> template void Blas::VDIV(int n, const T *x, const T *y, T *z) const { @@ -1466,20 +1256,6 @@ void Blas::VDIV(int n, const T *x, const T *y, T *z) const { #endif } -template <> -template -void 
Blas::VEXP(int n, - const T *x, - T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VEXP(n, x, y); -#else - // try to find if openblas support vexp - for (int i = 0; i < n; ++i) { - y[i] = std::exp(x[i]); - } -#endif -} template <> template void Blas::VEXP(int n, const T *x, T *y) const { @@ -1493,19 +1269,6 @@ void Blas::VEXP(int n, const T *x, T *y) const { #endif } -template <> -template -void Blas::VSQUARE(int n, - const T *x, - T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VSQUARE(n, x, y); -#else - for (int i = 0; i < n; ++i) { - y[i] = x[i] * x[i]; - } -#endif -} template <> template void Blas::VSQUARE(int n, const T *x, T *y) const { @@ -1518,20 +1281,6 @@ void Blas::VSQUARE(int n, const T *x, T *y) const { #endif } -template <> -template -void Blas::VPOW(int n, - const T *x, - T a, - T *y) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VPOW(n, x, a, y); -#else - for (int i = 0; i < n; ++i) { - y[i] = std::pow(x[i], a); - } -#endif -} template <> template void Blas::VPOW(int n, const T *x, T a, T *y) const { @@ -1544,22 +1293,6 @@ void Blas::VPOW(int n, const T *x, T a, T *y) const { #endif } -template <> -template -T Blas::DOT(int n, - const T *x, - const T *y) const { -#ifdef PADDLE_WITH_MKLML - return CBlas::DOT(n, x, 1, y, 1); -#else - // try to find if openblas support cblas_dot - T sum = 0; - for (int i = 0; i < n; ++i) { - sum += x[i] * y[i]; - } - return sum; -#endif -} template <> template T Blas::DOT(int n, const T *x, const T *y) const { @@ -1575,20 +1308,6 @@ T Blas::DOT(int n, const T *x, const T *y) const { #endif } -template <> -template -void Blas::SCAL(int n, - const T a, - T *x) const { -#ifdef PADDLE_WITH_MKLML - CBlas::SCAL(n, a, x, 1); -#else - // try to find if openblas support cblas_scal - for (int i = 0; i < n; ++i) { - x[i] = a * x[i]; - } -#endif -} template <> template void Blas::SCAL(int n, const T a, T *x) const { @@ -1602,20 +1321,6 @@ void Blas::SCAL(int n, const T a, T *x) const { #endif } -template <> -template -T Blas::ASUM(int n, T *x, int inc) const { - auto sum = static_cast(0.0); -#ifdef PADDLE_WITH_MKLML - sum = CBlas::ASUM(n, x, inc); -#else - // TODO(jczaja): check if openblas does provide cblas_sasum/cblas_dasum - for (int c = 0; c < n; ++c) { - sum += x[c]; - } -#endif - return sum; -} template <> template T Blas::ASUM(int n, T *x, int inc) const { @@ -1631,19 +1336,6 @@ T Blas::ASUM(int n, T *x, int inc) const { return sum; } -template <> -template -void Blas::GEMV(bool trans_a, - int M, - int N, - T alpha, - const T *A, - const T *B, - T beta, - T *C) const { - CBLAS_TRANSPOSE transA = !trans_a ? CblasNoTrans : CblasTrans; - CBlas::GEMV(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); -} template <> template void Blas::GEMV(bool trans_a, @@ -1658,66 +1350,6 @@ void Blas::GEMV(bool trans_a, CBlas::GEMV(CblasRowMajor, transA, M, N, alpha, A, N, B, 1, beta, C, 1); } -template <> -template -void Blas::BatchedGEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T *A, - const T *B, - T beta, - T *C, - int batchCount, - int64_t strideA, - int64_t strideB) const { - PADDLE_ENFORCE_NOT_NULL( - A, phi::errors::InvalidArgument("Pointer A should not be null.")); - PADDLE_ENFORCE_NOT_NULL( - B, phi::errors::InvalidArgument("Pointer B should not be null.")); - PADDLE_ENFORCE_NOT_NULL( - C, phi::errors::InvalidArgument("Pointer C should not be null.")); -#ifdef PADDLE_WITH_MKLML - int lda = (transA == CblasNoTrans) ? K : M; - int ldb = (transB == CblasNoTrans) ? 
N : K; - int ldc = N; - auto a_array = std::vector(batchCount); - auto b_array = std::vector(batchCount); - auto c_array = std::vector(batchCount); - for (int k = 0; k < batchCount; ++k) { - a_array[k] = &A[k * strideA]; - b_array[k] = &B[k * strideB]; - c_array[k] = &C[k * M * N]; - } - - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - &M, - &N, - &K, - &alpha, - a_array.data(), - &lda, - b_array.data(), - &ldb, - &beta, - c_array.data(), - &ldc, - 1 /* group_count */, - &batchCount); -#else - for (int k = 0; k < batchCount; ++k) { - auto *Ak = &A[k * strideA]; - auto *Bk = &B[k * strideB]; - auto *Ck = &C[k * M * N]; - this->template GEMM(transA, transB, M, N, K, alpha, Ak, Bk, beta, Ck); - } -#endif -} template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -1778,47 +1410,6 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, #endif } -template <> -template -void Blas::BatchedGEMM( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int M, - int N, - int K, - T alpha, - const T **A, - const T **B, - T beta, - T **C, - int batchCount) const { -#ifdef PADDLE_WITH_MKLML - const int lda = (std::max)((transA == CblasNoTrans) ? K : M, 1); - const int ldb = (std::max)((transB == CblasNoTrans) ? N : K, 1); - const int ldc = (std::max)(N, 1); - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - &M, - &N, - &K, - &alpha, - A, - &lda, - B, - &ldb, - &beta, - C, - &ldc, - 1 /* group_count */, - &batchCount); -#else - for (int k = 0; k < batchCount; ++k) { - this->template GEMM( - transA, transB, M, N, K, alpha, A[k], B[k], beta, C[k]); - } -#endif -} template <> template void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, @@ -1864,113 +1455,6 @@ void Blas::BatchedGEMM(CBLAS_TRANSPOSE transA, !defined(PADDLE_WITH_HIP) // @{ Group Blas MKLML: BatchedGEMMWithHead template <> template -void Blas::BatchedGEMMWithHead( - CBLAS_TRANSPOSE transA, - CBLAS_TRANSPOSE transB, - int W1, - int H1, - int W2, - int H2, - T alpha, - const T *A, - const T *B, - T beta, - T *C, - int batchCount, - int64_t strideA, - int64_t strideB, - int64_t head_number, - bool split_b_vertical) const { - int lda = (transA == CblasNoTrans) ? W1 : H1; - int ldb = (transB == CblasNoTrans) ? W2 : H2; - auto a_array = std::vector(batchCount); - auto b_array = std::vector(batchCount); - auto c_array = std::vector(batchCount); - - if (split_b_vertical) { - int ldc = W2; - int sub_width = W2 / head_number; - - for (int i = 0; i < head_number; i++) { - int sub_matA_offset = (transA == CblasNoTrans) - ? i * (W1 / head_number) - : i * (W1 / head_number) * H1; - int sub_matB_offset = (transB == CblasNoTrans) - ? i * (W2 / head_number) - : i * (W2 / head_number) * H2; - int sub_matC_offset = i * W2 / head_number; - for (int k = 0; k < batchCount; ++k) { - a_array[k] = &A[k * strideA] + sub_matA_offset; - b_array[k] = &B[k * strideB] + sub_matB_offset; - c_array[k] = &C[k * H1 * W2] + sub_matC_offset; - } - - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - &H1, - &sub_width, - &H2, - &alpha, - a_array.data(), - &lda, - b_array.data(), - &ldb, - &beta, - c_array.data(), - &ldc, - 1 /* group_count */, - &batchCount); - } - - } else { - PADDLE_ENFORCE_EQ( - W1, - H2, - phi::errors::InvalidArgument( - "The fisrt matrix width should be same as second matrix height," - "but received fisrt matrix width %d" - ", second matrix height %d", - W1, - H2)); - int ldc = W2 * head_number; - int sub_width = W1 / head_number; - - for (int i = 0; i < head_number; i++) { - int sub_matA_offset = (transA == CblasNoTrans) - ? 
i * (W1 / head_number) - : i * (W1 / head_number) * H1; - int sub_matB_offset = (transB == CblasNoTrans) - ? i * (W1 / head_number) * W2 - : i * (W1 / head_number); - int sub_matC_offset = i * W2; - for (int k = 0; k < batchCount; ++k) { - a_array[k] = &A[k * strideA] + sub_matA_offset; - b_array[k] = &B[k * strideB] + sub_matB_offset; - c_array[k] = &C[k * H1 * head_number * W2] + sub_matC_offset; - } - - CBlas::GEMM_BATCH(CblasRowMajor, - &transA, - &transB, - &H1, - &W2, - &sub_width, - &alpha, - a_array.data(), - &lda, - b_array.data(), - &ldb, - &beta, - c_array.data(), - &ldc, - 1 /* group_count */, - &batchCount); - } - } -} -template <> -template void Blas::BatchedGEMMWithHead(CBLAS_TRANSPOSE transA, CBLAS_TRANSPOSE transB, int W1, @@ -2097,43 +1581,6 @@ void Blas::MatMul( N); } -template <> -template -void Blas::MatMul( - const int M, const int N, const int K, const T *A, const T *B, T *C) const { -#ifdef PADDLE_WITH_LIBXSMM - // Refer to https://github.com/hfp/libxsmm/blob/master/README.md - // But the threshold is custom constexpr int LIBXSMM_THRESHOLD = 20 * 20 * 20; - - // Since the matrix is very small, - // so the unit of calculation is already very fast, - // and the if( M*N*K < LIBXSMM_THRESHOLD) would be overhead, - // use xsmm directly. - // Note: SMM use ColMajor - const char transa = 'N'; - const char transb = 'N'; - const T alpha = static_cast(1); - const T beta = static_cast(0); - CBlas::SMM_GEMM( - &transa, &transb, &N, &M, &K, &alpha, B, &N, A, &K, &beta, C, &N); - return; -#endif - - CBlas::GEMM(CblasRowMajor, - CblasNoTrans, - CblasNoTrans, - M, - N, - K, - static_cast(1), - A, - K, - B, - N, - static_cast(0), - C, - N); -} template <> template void Blas::MatMul( @@ -2425,20 +1872,6 @@ void Blas::VINV(int n, const T *a, T *y) const { #endif } -template <> -template -void Blas::VMERF(int n, - const T *a, - T *y, - int64_t mode) const { -#ifdef PADDLE_WITH_MKLML - CBlas::VMERF(n, a, y, mode); -#else - for (int i = 0; i < n; ++i) { - y[i] = std::erf(a[i]); - } -#endif -} template <> template void Blas::VMERF(int n, const T *a, T *y, int64_t mode) const { @@ -2454,39 +1887,6 @@ void Blas::VMERF(int n, const T *a, T *y, int64_t mode) const { #ifdef PADDLE_WITH_MKLML template <> template -void Blas::CSRMM(const char *transa, - const int *m, - const int *n, - const int *k, - const T *alpha, - const char *matdescra, - const T *val, - const int *indx, - const int *pntrb, - const int *pntre, - const T *b, - const int *ldb, - const T *beta, - T *c, - const int *ldc) const { - CBlas::CSRMM(transa, - m, - n, - k, - alpha, - matdescra, - val, - indx, - pntrb, - pntre, - b, - ldb, - beta, - c, - ldc); -} -template <> -template void Blas::CSRMM(const char *transa, const int *m, const int *n, @@ -2520,22 +1920,6 @@ void Blas::CSRMM(const char *transa, } #endif -template <> -template -void Blas::TRSM(CBLAS_SIDE side, - CBLAS_UPLO uplo, - CBLAS_TRANSPOSE transA, - CBLAS_DIAG diag, - int M, - int N, - T alpha, - const T *A, - int lda, - T *B, - int ldb) const { - CBlas::TRSM( - CblasRowMajor, side, uplo, transA, diag, M, N, alpha, A, lda, B, ldb); -} template <> template void Blas::TRSM(CBLAS_SIDE side, diff --git a/paddle/phi/kernels/funcs/fc_functor.cc b/paddle/phi/kernels/funcs/fc_functor.cc index 0fb38c971ab..0434483be13 100644 --- a/paddle/phi/kernels/funcs/fc_functor.cc +++ b/paddle/phi/kernels/funcs/fc_functor.cc @@ -96,8 +96,6 @@ void FCFunctor::operator()(const DeviceContext& context, } } -template class FCFunctor; -template class FCFunctor; template class FCFunctor; 
template class FCFunctor; diff --git a/paddle/phi/kernels/funcs/for_range.h b/paddle/phi/kernels/funcs/for_range.h index bf0888c301f..78066ce5b2f 100644 --- a/paddle/phi/kernels/funcs/for_range.h +++ b/paddle/phi/kernels/funcs/for_range.h @@ -41,22 +41,6 @@ struct ForRange { size_t limit_; }; -// NOTE: After the pten kernel is migrated, it needs to be deleted. -template <> -struct ForRange { - ForRange(const paddle::platform::CPUDeviceContext& dev_ctx, size_t limit) - : dev_ctx_(dev_ctx), limit_(limit) {} - - template - void operator()(Function func) const { - phi::funcs::ForRange for_range(dev_ctx_, limit_); - for_range(func); - } - - const paddle::platform::CPUDeviceContext& dev_ctx_; - size_t limit_; -}; - #if defined(__NVCC__) || defined(__HIPCC__) template diff --git a/paddle/phi/kernels/funcs/gru_compute.cc b/paddle/phi/kernels/funcs/gru_compute.cc index 8cda2e9062a..c081a9ed97d 100644 --- a/paddle/phi/kernels/funcs/gru_compute.cc +++ b/paddle/phi/kernels/funcs/gru_compute.cc @@ -179,60 +179,6 @@ struct GRUUnitGradFunctor { } }; -template -struct GRUUnitFunctorV2 { - static void compute(const paddle::platform::CPUDeviceContext &context, - GRUMetaValue value, - int frame_size, - int batch_size, - const phi::funcs::detail::ActivationType active_node, - const phi::funcs::detail::ActivationType active_gate) { -#if !defined(__NVCC__) && !defined(__HIPCC___) - auto blas = - phi::funcs::GetBlas(context); - if (value.prev_out_value) { - blas.GEMM(CblasNoTrans, - CblasTrans, - batch_size, - frame_size, - frame_size, - 1, - value.prev_out_value, - value.state_weight, - 0, - value.reset_output_value); - } - detail::forward_reset_output( - phi::funcs::detail::forward::gru_resetOutput(), - value, - frame_size, - batch_size, - active_gate, - false, - &context); - - T *cell_state_value = value.gate_value + 2 * frame_size; - T *reset_output_value = value.reset_output_value; - for (int b = 0; b < batch_size; ++b) { - blas.VADD( - frame_size, cell_state_value, reset_output_value, cell_state_value); - cell_state_value += frame_size * 3; - reset_output_value += frame_size; - } - - detail::forward_final_output( - phi::funcs::detail::forward::gru_finalOutput(), - value, - frame_size, - batch_size, - active_node, - true, - false, - &context); -#endif - } -}; - template struct GRUUnitFunctorV2 { static void compute(const CPUContext &context, @@ -286,131 +232,6 @@ struct GRUUnitFunctorV2 { } }; -template -struct GRUUnitGradFunctorV2 { - static void compute(const paddle::platform::CPUDeviceContext &context, - GRUMetaValue value, - GRUMetaGrad grad, - int frame_size, - int batch_size, - const phi::funcs::detail::ActivationType active_node, - const phi::funcs::detail::ActivationType active_gate) { -#if !defined(__NVCC__) && !defined(__HIPCC___) - // calculate grad_update_gate, grad_frame_state, - // grad_reset_output, grad_reset_gate - detail::cpu_gru_backward(context, - phi::funcs::detail::backward::gru(), - value, - grad, - frame_size, - batch_size, - active_node, - active_gate); - auto blas = - phi::funcs::GetBlas(context); - if (grad.prev_out_grad && value.prev_out_value) { - // update prev_out_grad - blas.GEMM(false, - false, - batch_size, - frame_size, - frame_size, - 1, - grad.gate_grad, - frame_size * 3, - value.gate_weight, - frame_size, - 1, - grad.prev_out_grad, - frame_size); - blas.GEMM(false, - false, - batch_size, - frame_size, - frame_size, - 1, - grad.gate_grad + frame_size, - frame_size * 3, - value.gate_weight + frame_size * frame_size, - frame_size, - 1, - grad.prev_out_grad, - frame_size); 
- blas.GEMM(false, - false, - batch_size, - frame_size, - frame_size, - 1, - grad.reset_output_grad, - frame_size, - value.state_weight, - frame_size, - 1, - grad.prev_out_grad, - frame_size); - // update weight_hh_grad - if (grad.gate_weight_grad) { - // reset gate - blas.GEMM(true, - false, - frame_size, - frame_size, - batch_size, - 1, - grad.gate_grad, - frame_size * 3, - value.prev_out_value, - frame_size, - 1, - grad.gate_weight_grad, - frame_size); - // update gate - blas.GEMM(true, - false, - frame_size, - frame_size, - batch_size, - 1, - grad.gate_grad + frame_size, - frame_size * 3, - value.prev_out_value, - frame_size, - 1, - grad.gate_weight_grad + frame_size * frame_size, - frame_size); - // cell state - blas.GEMM(true, - false, - frame_size, - frame_size, - batch_size, - 1, - grad.reset_output_grad, - frame_size, - value.prev_out_value, - frame_size, - 1, - grad.state_weight_grad, - frame_size); - } - } - // update bias_hh_grad - T *gate_grad = grad.gate_grad; - T *bias_hh_grad = grad.bias_hh_grad; - T *state_bias_grad = grad.bias_hh_grad + 2 * frame_size; - T *reset_output_grad = grad.reset_output_grad; - for (int b = 0; b < batch_size; ++b) { - blas.VADD(2 * frame_size, bias_hh_grad, gate_grad, bias_hh_grad); - blas.VADD( - frame_size, state_bias_grad, reset_output_grad, state_bias_grad); - gate_grad += 3 * frame_size; - reset_output_grad += frame_size; - } -#endif - } -}; - template struct GRUUnitGradFunctorV2 { static void compute(const CPUContext &context, @@ -540,12 +361,6 @@ template struct GRUUnitFunctor; template struct GRUUnitGradFunctor; template struct GRUUnitGradFunctor; -template struct GRUUnitFunctorV2; -template struct GRUUnitFunctorV2; -template struct GRUUnitGradFunctorV2; -template struct GRUUnitGradFunctorV2; - template struct GRUUnitFunctorV2; template struct GRUUnitFunctorV2; template struct GRUUnitGradFunctorV2; diff --git a/paddle/phi/kernels/funcs/lstm_compute.cc b/paddle/phi/kernels/funcs/lstm_compute.cc index 45d0b2e40b4..e4b8a6961fd 100644 --- a/paddle/phi/kernels/funcs/lstm_compute.cc +++ b/paddle/phi/kernels/funcs/lstm_compute.cc @@ -21,38 +21,6 @@ limitations under the License. 
*/ namespace phi { namespace funcs { -template -struct LstmUnitFunctor { - static void compute(const paddle::platform::CPUDeviceContext& context, - LstmMetaValue value, - int frame_size, - int batch_size, - T cell_clip, - const phi::funcs::detail::ActivationType& gate_act, - const phi::funcs::detail::ActivationType& cell_act, - const phi::funcs::detail::ActivationType& cand_act, - bool old_api_version = true) { - for (int b = 0; b < batch_size; b++) { - detail::cpu_lstm_forward(context, - phi::funcs::detail::forward::lstm(), - value, - frame_size, - cell_clip, - cand_act, - gate_act, - cell_act, - old_api_version); - value.gate_value += frame_size * 4; - value.state_value += frame_size; - value.state_active_value += frame_size; - value.output_value += frame_size; - if (value.prev_state_value) { - value.prev_state_value += frame_size; - } - } - } -}; - template struct LstmUnitFunctor { static void compute(const CPUContext& context, @@ -85,49 +53,6 @@ struct LstmUnitFunctor { } }; -template -struct LstmUnitGradFunctor { - static void compute(const paddle::platform::CPUDeviceContext& context, - LstmMetaValue value, - LstmMetaGrad grad, - int frame_size, - int batch_size, - T cell_clip, - const phi::funcs::detail::ActivationType& gate_act, - const phi::funcs::detail::ActivationType& cell_act, - const phi::funcs::detail::ActivationType& cand_act, - bool old_api_version = true) { - for (int b = 0; b < batch_size; b++) { - detail::cpu_lstm_backward(context, - phi::funcs::detail::backward::lstm(), - value, - grad, - frame_size, - cell_clip, - cand_act, - gate_act, - cell_act, - old_api_version); - - value.gate_value += frame_size * 4; - value.state_value += frame_size; - value.state_active_value += frame_size; - value.output_value += frame_size; - if (value.prev_state_value) { - value.prev_state_value += frame_size; - } - - grad.gate_grad += frame_size * 4; - grad.state_grad += frame_size; - grad.state_active_grad += frame_size; - grad.output_grad += frame_size; - if (grad.prev_state_grad) { - grad.prev_state_grad += frame_size; - } - } - } -}; - template struct LstmUnitGradFunctor { static void compute(const CPUContext& context, @@ -171,11 +96,6 @@ struct LstmUnitGradFunctor { } }; -template class LstmUnitFunctor; -template class LstmUnitFunctor; -template class LstmUnitGradFunctor; -template class LstmUnitGradFunctor; - template class LstmUnitFunctor; template class LstmUnitFunctor; template class LstmUnitGradFunctor; diff --git a/paddle/phi/kernels/funcs/math_function.cc b/paddle/phi/kernels/funcs/math_function.cc index 033c50e537d..042b333ad45 100644 --- a/paddle/phi/kernels/funcs/math_function.cc +++ b/paddle/phi/kernels/funcs/math_function.cc @@ -39,22 +39,6 @@ namespace funcs { using float16 = phi::dtype::float16; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; - template struct SetConstant; template struct SetConstant; template struct SetConstant; @@ -85,46 +69,20 @@ template struct SetConstant>; #endif -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ 
- template struct Transpose; \ - template struct Transpose, \ - RANK>; \ - template struct Transpose, \ - RANK>; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose, \ - RANK>; \ +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose, \ + RANK>; \ template struct Transpose, RANK>; DEFINE_CPU_TRANS(1); @@ -163,8 +121,7 @@ void TransposeNormal::operator()( } // define transpose normal -#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ - template struct TransposeNormal; \ +#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ template struct TransposeNormal DEFINE_CPU_TRANS_NORMAL(phi::dtype::float16); @@ -291,6 +248,31 @@ void set_constant(const paddle::platform::DeviceContext& context, #endif } +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; + +template struct RowwiseMean; +template struct RowwiseMean; + +template +struct ElementwiseAddTo { + void operator()(paddle::platform::CPUDeviceContext* ctx, + const paddle::framework::Tensor& src, + paddle::framework::Tensor* dst) { + auto in = paddle::framework::EigenVector::Flatten(src); + auto out = paddle::framework::EigenVector::Flatten(*dst); + auto& place = *(ctx->eigen_device()); + out.device(place) = out + in; + } +}; + +template struct ElementwiseAddTo; +template struct ElementwiseAddTo; + template struct RowwiseAdd { void operator()(const paddle::platform::CPUDeviceContext& context, @@ -333,41 +315,5 @@ struct RowwiseAdd { template struct RowwiseAdd; template struct RowwiseAdd; -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; - -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; - -template struct RowwiseSum; -template struct RowwiseSum; - -template struct RowwiseMean; -template struct RowwiseMean; - -template struct RowwiseMean; -template struct RowwiseMean; - -template -struct ElementwiseAddTo { - void operator()(paddle::platform::CPUDeviceContext* ctx, - const paddle::framework::Tensor& src, - paddle::framework::Tensor* dst) { - auto in = paddle::framework::EigenVector::Flatten(src); - auto out = paddle::framework::EigenVector::Flatten(*dst); - auto& place = *(ctx->eigen_device()); - out.device(place) = out + in; - } -}; - -template struct ElementwiseAddTo; -template struct ElementwiseAddTo; - } // namespace funcs } // namespace phi diff --git a/paddle/phi/kernels/funcs/matrix_inverse.cc b/paddle/phi/kernels/funcs/matrix_inverse.cc index c95e97f8ea8..c316970e6a5 100644 --- a/paddle/phi/kernels/funcs/matrix_inverse.cc +++ b/paddle/phi/kernels/funcs/matrix_inverse.cc @@ -29,9 +29,5 @@ void MatrixInverseFunctor::operator()(const Context& dev_ctx, template class MatrixInverseFunctor; template class MatrixInverseFunctor; -// TODO(chenweihang): remove these instantiations later -template class MatrixInverseFunctor; -template class MatrixInverseFunctor; - } // namespace funcs } 
// namespace phi diff --git a/paddle/phi/tests/api/test_sparse_utils_api.cc b/paddle/phi/tests/api/test_sparse_utils_api.cc index e0201755511..d5891baaf10 100644 --- a/paddle/phi/tests/api/test_sparse_utils_api.cc +++ b/paddle/phi/tests/api/test_sparse_utils_api.cc @@ -48,7 +48,6 @@ TEST(API, to_sparse_coo) { std::copy(&dense_data[0][0], &dense_data[0][0] + 9, dense_x_data); phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); // 1. test dense_to_sparse_coo paddle::experimental::Tensor x(dense_x); diff --git a/paddle/phi/tests/common/test_scalar.cu b/paddle/phi/tests/common/test_scalar.cu index 50b9e198da0..95334ac36a6 100644 --- a/paddle/phi/tests/common/test_scalar.cu +++ b/paddle/phi/tests/common/test_scalar.cu @@ -47,7 +47,6 @@ TEST(Scalar, ConstructFromDenseTensor1) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); dense_x_data[0] = 1; @@ -67,7 +66,6 @@ TEST(Scalar, ConstructFromDenseTensor2) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); dense_x_data[0] = 1; @@ -87,7 +85,6 @@ TEST(Scalar, ConstructFromDenseTensor3) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); dense_x_data[0] = 1; @@ -107,7 +104,6 @@ TEST(Scalar, ConstructFromDenseTensor4) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); dense_x_data[0] = true; @@ -127,7 +123,6 @@ TEST(Scalar, ConstructFromDenseTensor5) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); dense_x_data[0] = 1; @@ -148,7 +143,6 @@ TEST(Scalar, ConstructFromDenseTensor6) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); dense_x_data[0] = 1; @@ -170,7 +164,6 @@ TEST(Scalar, ConstructFromDenseTensor7) { .GetAllocator(phi::GPUPlace()) .get()); dev_ctx.Init(); - auto* dense_x_data = dev_ctx.Alloc(&dense_x); FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data); dev_ctx.Wait(); diff --git a/paddle/phi/tests/core/CMakeLists.txt b/paddle/phi/tests/core/CMakeLists.txt index c299559da59..3d549aa5f16 100644 --- a/paddle/phi/tests/core/CMakeLists.txt +++ b/paddle/phi/tests/core/CMakeLists.txt @@ -24,10 +24,6 @@ cc_test( test_op_utils SRCS test_op_utils.cc DEPS op_compat_infos) -cc_test( - test_phi_device_context - SRCS test_device_context.cc - DEPS phi_context cpu_context) cc_test( test_meta_fn_utils SRCS test_meta_fn_utils.cc diff --git a/paddle/phi/tests/core/test_device_context.cc b/paddle/phi/tests/core/test_device_context.cc deleted file mode 100644 index 844330ee097..00000000000 --- a/paddle/phi/tests/core/test_device_context.cc +++ /dev/null @@ -1,54 +0,0 @@ -/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include "gtest/gtest.h" - -// TODO(wilber): will remove after the cpu, gpu context megre. -#include "paddle/phi/backends/cpu/cpu_context.h" -// #include "paddle/phi/backends/all_context.h" - -// NOTE: The paddle framework should add WITH_EIGEN option to support compile -// without eigen. -#include "unsupported/Eigen/CXX11/Tensor" - -namespace phi { -namespace tests { - -class InferenceCPUContext : public CPUContext { - public: - void SetEigenDevice(Eigen::DefaultDevice* eigen_device) { - CPUContext::SetEigenDevice(eigen_device); - } -}; - -TEST(DeviceContext, cpu_context) { - std::cout << "test training scenarios" << std::endl; - { - phi::CPUContext ctx; - ctx.Init(); - EXPECT_TRUE(ctx.eigen_device() != nullptr); - } - - std::cout << "test inference scenarios" << std::endl; - Eigen::DefaultDevice* device = new Eigen::DefaultDevice(); - { - InferenceCPUContext ctx; - ctx.SetEigenDevice(device); - EXPECT_TRUE(ctx.eigen_device() != nullptr); - } - delete device; -} - -} // namespace tests -} // namespace phi diff --git a/paddle/phi/tests/kernels/test_cast_dev_api.cc b/paddle/phi/tests/kernels/test_cast_dev_api.cc index 179e44f0f0f..d43cd075ed5 100644 --- a/paddle/phi/tests/kernels/test_cast_dev_api.cc +++ b/paddle/phi/tests/kernels/test_cast_dev_api.cc @@ -52,7 +52,6 @@ TEST(DEV_API, cast) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); phi::DataType out_dtype = phi::DataType::FLOAT64; // 2. test API diff --git a/paddle/phi/tests/kernels/test_concat_dev_api.cc b/paddle/phi/tests/kernels/test_concat_dev_api.cc index 0dd58b1bba9..9283fcd0b65 100644 --- a/paddle/phi/tests/kernels/test_concat_dev_api.cc +++ b/paddle/phi/tests/kernels/test_concat_dev_api.cc @@ -60,7 +60,6 @@ TEST(DEV_API, concat) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Concat(dev_ctx, inputs, 0); // 3. check result diff --git a/paddle/phi/tests/kernels/test_conj_dev_api.cc b/paddle/phi/tests/kernels/test_conj_dev_api.cc index 5ac676ffcbc..2f7ab838373 100644 --- a/paddle/phi/tests/kernels/test_conj_dev_api.cc +++ b/paddle/phi/tests/kernels/test_conj_dev_api.cc @@ -48,7 +48,6 @@ TEST(DEV_API, conj) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); // 2. 
test API auto out = phi::Conj(dev_ctx, dense_x); diff --git a/paddle/phi/tests/kernels/test_copy_dev_api.cc b/paddle/phi/tests/kernels/test_copy_dev_api.cc index 1c9b17ed613..c2df0a8acdc 100644 --- a/paddle/phi/tests/kernels/test_copy_dev_api.cc +++ b/paddle/phi/tests/kernels/test_copy_dev_api.cc @@ -65,7 +65,6 @@ TEST(DEV_API, copy) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); phi::Copy( dev_ctx, *(dense_src.get()), phi::CPUPlace(), false, dense_dst.get()); diff --git a/paddle/phi/tests/kernels/test_creation_dev_api.cc b/paddle/phi/tests/kernels/test_creation_dev_api.cc index 2dcd8739991..5685c3a2a0b 100644 --- a/paddle/phi/tests/kernels/test_creation_dev_api.cc +++ b/paddle/phi/tests/kernels/test_creation_dev_api.cc @@ -36,7 +36,6 @@ TEST(DEV_API, empty) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); // 2. test API auto out = phi::Empty(dev_ctx, {3, 2}); @@ -66,7 +65,6 @@ TEST(DEV_API, empty_like) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::EmptyLike(dev_ctx, dense_x); // 3. check result @@ -86,7 +84,6 @@ TEST(DEV_API, full) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Full(dev_ctx, {3, 2}, val); // 3. check result @@ -119,7 +116,6 @@ TEST(DEV_API, full_like) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); // 2. test API auto out = phi::FullLike(dev_ctx, dense_x, val); diff --git a/paddle/phi/tests/kernels/test_dot_dev_api.cc b/paddle/phi/tests/kernels/test_dot_dev_api.cc index de20907cadf..a2af0471df0 100644 --- a/paddle/phi/tests/kernels/test_dot_dev_api.cc +++ b/paddle/phi/tests/kernels/test_dot_dev_api.cc @@ -61,7 +61,6 @@ TEST(DEV_API, dot) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Dot(dev_ctx, dense_x, dense_y); // 3. check result diff --git a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc index 63f8b86a534..4100889d3ac 100644 --- a/paddle/phi/tests/kernels/test_elementwise_dev_api.cc +++ b/paddle/phi/tests/kernels/test_elementwise_dev_api.cc @@ -66,7 +66,6 @@ TEST(DEV_API, add) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto dense_out = phi::Add(dev_ctx, dense_x, dense_y); // 3. check result @@ -118,7 +117,6 @@ TEST(DEV_API, subtract) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto dense_out = phi::Subtract(dev_ctx, dense_x, dense_y); // 3. check result @@ -170,7 +168,6 @@ TEST(DEV_API, divide) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto dense_out = phi::Divide(dev_ctx, dense_x, dense_y); // 3. 
check result @@ -222,7 +219,6 @@ TEST(DEV_API, multiply) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto dense_out = phi::Multiply(dev_ctx, dense_x, dense_y); // 3. check result diff --git a/paddle/phi/tests/kernels/test_flatten_dev_api.cc b/paddle/phi/tests/kernels/test_flatten_dev_api.cc index fb1cdee7e5f..860af4c4a4d 100644 --- a/paddle/phi/tests/kernels/test_flatten_dev_api.cc +++ b/paddle/phi/tests/kernels/test_flatten_dev_api.cc @@ -52,7 +52,6 @@ TEST(DEV_API, flatten) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); // 2. test API auto out = phi::Flatten(dev_ctx, dense_x, start_axis, stop_axis); diff --git a/paddle/phi/tests/kernels/test_math_function.cc b/paddle/phi/tests/kernels/test_math_function.cc index 29f33c555d1..a13a8cb564f 100644 --- a/paddle/phi/tests/kernels/test_math_function.cc +++ b/paddle/phi/tests/kernels/test_math_function.cc @@ -273,7 +273,6 @@ TEST(math_funciton, set_constant) { t.Resize({10, 10}); t.mutable_data(paddle::platform::CPUPlace()); auto* ctx = new paddle::platform::CPUDeviceContext(); - ctx->Init(); phi::funcs::set_constant(*ctx, &t, 10); for (int64_t i = 0; i < t.numel(); ++i) { PADDLE_ENFORCE_EQ(10, diff --git a/paddle/phi/tests/kernels/test_matmul_dev_api.cc b/paddle/phi/tests/kernels/test_matmul_dev_api.cc index f25acaf9bcc..374a05fc5e4 100644 --- a/paddle/phi/tests/kernels/test_matmul_dev_api.cc +++ b/paddle/phi/tests/kernels/test_matmul_dev_api.cc @@ -58,7 +58,6 @@ TEST(DEV_API, dot) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = Matmul(dev_ctx, dense_x, dense_y, false, false); // 3. check result diff --git a/paddle/phi/tests/kernels/test_mean_dev_api.cc b/paddle/phi/tests/kernels/test_mean_dev_api.cc index 6f3f91a7dbe..1c791503913 100644 --- a/paddle/phi/tests/kernels/test_mean_dev_api.cc +++ b/paddle/phi/tests/kernels/test_mean_dev_api.cc @@ -51,7 +51,6 @@ TEST(DEV_API, mean) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Mean(dev_ctx, dense_x, dims, false); // 3. check result diff --git a/paddle/phi/tests/kernels/test_reshape_dev_api.cc b/paddle/phi/tests/kernels/test_reshape_dev_api.cc index f0f521d57db..708b31cb9a9 100644 --- a/paddle/phi/tests/kernels/test_reshape_dev_api.cc +++ b/paddle/phi/tests/kernels/test_reshape_dev_api.cc @@ -54,7 +54,6 @@ TEST(DEV_API, reshape) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Reshape(dev_ctx, dense_x, shape); // 3. 
check result std::vector expect_shape = {12, 3}; diff --git a/paddle/phi/tests/kernels/test_scale_dev_api.cc b/paddle/phi/tests/kernels/test_scale_dev_api.cc index eff18bdeeca..57e186ab393 100644 --- a/paddle/phi/tests/kernels/test_scale_dev_api.cc +++ b/paddle/phi/tests/kernels/test_scale_dev_api.cc @@ -51,7 +51,6 @@ TEST(DEV_API, scale) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Scale(dev_ctx, dense_x, scale, bias, bias_after_scale); @@ -93,7 +92,6 @@ TEST(DEV_API, scale_host) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto out = phi::Scale(dev_ctx, dense_x, scale, bias, bias_after_scale); diff --git a/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc index d1c464e4b1c..51d1e67f5af 100644 --- a/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_activation_dev_api.cc @@ -42,7 +42,6 @@ TEST(DEV_API, sparse_relu) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx_cpu.Init(); DenseTensor dense_x = phi::Empty(dev_ctx_cpu, diff --git a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc index bb84690cd07..f08c7b0872b 100644 --- a/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_conv3d_dev_api.cc @@ -75,7 +75,6 @@ void TestConv3dBase(const std::vector& indices, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx_cpu.Init(); const int in_channels = kernel_dims[3]; const int out_channels = kernel_dims[4]; diff --git a/paddle/phi/tests/kernels/test_sparse_elementwise_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_elementwise_dev_api.cc index 50848ae5f1c..cbac854d48e 100644 --- a/paddle/phi/tests/kernels/test_sparse_elementwise_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_elementwise_dev_api.cc @@ -113,7 +113,6 @@ TEST(DEV_API, sparse_elementwise_coo_kernel_double) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx_cpu.Init(); auto coo_x = sparse::DenseToSparseCoo(dev_ctx_cpu, dense_x, sparse_dim); auto coo_y = sparse::DenseToSparseCoo(dev_ctx_cpu, dense_y, sparse_dim); @@ -159,7 +158,6 @@ TEST(DEV_API, sparse_elementwise_csr_kernel_float) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx_cpu.Init(); auto csr_x = sparse::DenseToSparseCsr(dev_ctx_cpu, dense_x); auto csr_y = sparse::DenseToSparseCsr(dev_ctx_cpu, dense_y); @@ -357,7 +355,6 @@ TEST(DEV_API, sparse_elementwise_csr_grad_kernel_float) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx_cpu.Init(); auto csr_x = sparse::DenseToSparseCsr(dev_ctx_cpu, dense_x); auto csr_y = sparse::DenseToSparseCsr(dev_ctx_cpu, dense_y); @@ -404,7 +401,6 @@ TEST(DEV_API, sparse_elementwise_coo_grad_kernel_double) { paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx_cpu.Init(); auto csr_x = sparse::DenseToSparseCoo(dev_ctx_cpu, dense_x, sparse_dim); auto csr_y = 
sparse::DenseToSparseCoo(dev_ctx_cpu, dense_y, sparse_dim); diff --git a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc index 7d7cd1ceaf5..460dca59c71 100644 --- a/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_pool_dev_api.cc @@ -60,7 +60,6 @@ void TestMaxPoolBase(const std::vector& indices, paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) .get()); - dev_ctx_cpu.Init(); const int in_channels = x_dims[4]; const int out_channels = in_channels; diff --git a/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc b/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc index d4f1d6efb5d..70c9f4cfc61 100644 --- a/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sparse_utils_dev_api.cc @@ -88,7 +88,6 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x, paddle::platform::CPUPlace()); phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) @@ -307,7 +306,6 @@ void TestSparseCsrToCoo(const DDim& dense_dims, // 1. test cpu phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) @@ -489,7 +487,6 @@ void TestCooToCsr(const DDim& dense_dims, // 1. test cpu phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) @@ -588,7 +585,6 @@ void TestDenseToSparseCsr(const DenseTensor& dense_x, const auto alloc = std::make_shared( paddle::platform::CPUPlace()); phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) @@ -701,7 +697,6 @@ void TestSparseCooToDense(const DDim& dense_dims, const int64_t non_zero_num, const int64_t sparse_dim) { phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) @@ -879,7 +874,6 @@ void TestSparseCsrToDense(const DDim& dense_dims, // 1. test cpu phi::CPUContext dev_ctx_cpu; - dev_ctx_cpu.Init(); dev_ctx_cpu.SetAllocator( paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(phi::CPUPlace()) diff --git a/paddle/phi/tests/kernels/test_split_dev_api.cc b/paddle/phi/tests/kernels/test_split_dev_api.cc index a358fcdf28d..0389ab7afba 100644 --- a/paddle/phi/tests/kernels/test_split_dev_api.cc +++ b/paddle/phi/tests/kernels/test_split_dev_api.cc @@ -40,7 +40,6 @@ TEST(DEV_API, split) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); auto* dense_x_data = dev_ctx.Alloc(&dense_x); for (size_t i = 0; i < 4; ++i) { diff --git a/paddle/phi/tests/kernels/test_sum_dev_api.cc b/paddle/phi/tests/kernels/test_sum_dev_api.cc index 2cd677373f4..20e934eb692 100644 --- a/paddle/phi/tests/kernels/test_sum_dev_api.cc +++ b/paddle/phi/tests/kernels/test_sum_dev_api.cc @@ -49,7 +49,6 @@ TEST(DEV_API, sum) { dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance() .GetAllocator(paddle::platform::CPUPlace()) .get()); - dev_ctx.Init(); // 2. test API auto out = -- GitLab
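
Editor's note (appended after the patch, not part of the original commit): the sketch below illustrates the behavioral change this commit makes to phi::CPUContext, reconstructed from the hunks above — the constructor now calls impl_->Init() itself, the public Init() method is removed, and the deleted test_device_context.cc showed the training vs. inference setup. This is a hedged usage sketch under those assumptions, not code taken from the repository; the function names and the InferenceCPUContext helper are illustrative.

#include <iostream>

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "unsupported/Eigen/CXX11/Tensor"

// Training scenario: after this patch the constructor initializes the
// context's resources, so no explicit Init() call is needed (or possible,
// since the public Init() interface was removed from cpu_context.h).
void TrainingScenario() {
  phi::CPUContext ctx;
  std::cout << "eigen_device ready: " << (ctx.eigen_device() != nullptr)
            << std::endl;  // expected: 1, as the deleted test asserted
}

// Inference scenario: external users still inject resources through the
// protected Set* hooks, mirroring the subclass in the deleted
// test_device_context.cc.
class InferenceCPUContext : public phi::CPUContext {
 public:
  void SetEigenDevice(Eigen::DefaultDevice* eigen_device) {
    phi::CPUContext::SetEigenDevice(eigen_device);
  }
};

void InferenceScenario() {
  Eigen::DefaultDevice eigen_device;
  InferenceCPUContext ctx;
  ctx.SetEigenDevice(&eigen_device);
  std::cout << "eigen_device ready: " << (ctx.eigen_device() != nullptr)
            << std::endl;  // expected: 1
}

In the kernel and test files touched above, the visible consequence is simply that the scattered dev_ctx.Init() / dev_ctx_cpu.Init() calls following SetAllocator(...) are dropped, while the SetAllocator(...) calls themselves are unchanged.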