Unverified commit 09096aeb, authored by Leo Chen, committed by GitHub

unify cpu context (#43989)

* unify cpu context

* fix init()

* delete test_device_context

* fix test_scalar
Parent 8d9f00a8
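
The diff below repeats one pattern across many files: forward declarations of platform::CPUDeviceContext are removed, template specializations and explicit instantiations keep only their phi::CPUContext form, and the fluid name becomes a plain alias of the phi context. A minimal usage sketch, assuming the Paddle headers touched by this commit, is:

// Sketch only; assumes the headers changed in this commit are available.
#include "paddle/fluid/platform/device_context.h"  // now contains: using CPUDeviceContext = phi::CPUContext;
#include "paddle/phi/backends/cpu/cpu_context.h"

void UseUnifiedCpuContext() {
  phi::CPUContext ctx;  // the constructor now runs Init() itself (see cpu_context.cc below)
  // The old fluid name is only an alias, so existing call sites keep compiling.
  paddle::platform::CPUDeviceContext& same_ctx = ctx;
  same_ctx.eigen_device();  // usable immediately, no separate Init() call
}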
......@@ -20,12 +20,6 @@ namespace paddle {
namespace framework {
class OpDesc;
} // namespace framework
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -25,9 +25,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -23,9 +23,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -23,9 +23,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -25,9 +25,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -20,9 +20,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -23,9 +23,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -25,9 +25,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -26,9 +26,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -34,9 +34,6 @@ namespace operators {
template <typename DeviceContext, typename T, typename Functor>
class OverflowKernel;
} // namespace operators
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace plat = paddle::platform;
......
......@@ -24,9 +24,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -13,26 +13,19 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/beam_search.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
namespace phi {
class DenseTensor;
} // namespace phi
namespace paddle {
namespace framework {} // namespace framework
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
template <typename T>
class BeamSearchFunctor<platform::CPUDeviceContext, T> {
class BeamSearchFunctor<phi::CPUContext, T> {
public:
void operator()(const platform::CPUDeviceContext &context,
void operator()(const phi::CPUContext &context,
const framework::LoDTensor *pre_ids,
const framework::LoDTensor *pre_scores,
const framework::LoDTensor *ids,
......@@ -308,10 +301,10 @@ class BeamSearchFunctor<platform::CPUDeviceContext, T> {
}
};
template class BeamSearchFunctor<platform::CPUDeviceContext, int>;
template class BeamSearchFunctor<platform::CPUDeviceContext, int64_t>;
template class BeamSearchFunctor<platform::CPUDeviceContext, float>;
template class BeamSearchFunctor<platform::CPUDeviceContext, double>;
template class BeamSearchFunctor<phi::CPUContext, int>;
template class BeamSearchFunctor<phi::CPUContext, int64_t>;
template class BeamSearchFunctor<phi::CPUContext, float>;
template class BeamSearchFunctor<phi::CPUContext, double>;
} // namespace math
} // namespace operators
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/concat_and_split.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#ifdef PADDLE_WITH_ASCEND_CL
......@@ -28,13 +29,6 @@ namespace phi {
class DenseTensor;
} // namespace phi
namespace paddle {
namespace framework {} // namespace framework
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
......
......@@ -13,19 +13,14 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/context_project.h"
namespace paddle {
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
#include "paddle/phi/backends/cpu/cpu_context.h"
namespace paddle {
namespace operators {
namespace math {
template class ContextProjectFunctor<platform::CPUDeviceContext, float>;
template class ContextProjectFunctor<platform::CPUDeviceContext, double>;
template class ContextProjectFunctor<phi::CPUContext, float>;
template class ContextProjectFunctor<phi::CPUContext, double>;
} // namespace math
} // namespace operators
......
......@@ -14,16 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/math/cos_sim_functor.h"
namespace paddle {
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
template <typename T>
struct CosSimDyFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx,
......
......@@ -17,12 +17,6 @@ limitations under the License. */
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
namespace paddle {
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
......@@ -129,9 +123,6 @@ void CrossEntropyFunctor<DeviceContext, T>::operator()(
}
}
template class CrossEntropyFunctor<platform::CPUDeviceContext, float>;
template class CrossEntropyFunctor<platform::CPUDeviceContext, double>;
template class CrossEntropyFunctor<phi::CPUContext, float>;
template class CrossEntropyFunctor<phi::CPUContext, double>;
} // namespace math
......
......@@ -15,12 +15,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/detail/gru_kernel.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
namespace paddle {
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
......
......@@ -16,12 +16,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/im2col_cfo_cpu.h"
namespace paddle {
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace phi {
class CPUContext;
} // namespace phi
......@@ -166,24 +160,12 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
}
};
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUDeviceContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUDeviceContext,
double>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
phi::CPUContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
phi::CPUContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUDeviceContext,
float>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
platform::CPUDeviceContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kCFO,
phi::CPUContext,
float>;
......@@ -353,24 +335,12 @@ class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
}
};
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUDeviceContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUDeviceContext,
double>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
phi::CPUContext,
float>;
template class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
phi::CPUContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUDeviceContext,
float>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
platform::CPUDeviceContext,
double>;
template class Col2ImFunctor<paddle::operators::math::ColFormat::kOCF,
phi::CPUContext,
float>;
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
#ifdef PADDLE_USE_OPENBLAS
#include <cblas.h>
#endif
#include <memory>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/operators/math/math_function_impl.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace paddle {
namespace operators {
namespace math {
using float16 = paddle::platform::float16;
template struct SetConstant<platform::CPUDeviceContext, platform::float16>;
template struct SetConstant<platform::CPUDeviceContext, platform::bfloat16>;
template struct SetConstant<platform::CPUDeviceContext, float>;
template struct SetConstant<platform::CPUDeviceContext, double>;
template struct SetConstant<platform::CPUDeviceContext, int16_t>;
template struct SetConstant<platform::CPUDeviceContext, int>;
template struct SetConstant<platform::CPUDeviceContext, int64_t>;
template struct SetConstant<platform::CPUDeviceContext, bool>;
template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
template struct SetConstant<platform::CPUDeviceContext,
platform::complex<float>>;
template struct SetConstant<platform::CPUDeviceContext,
platform::complex<double>>;
template struct SetConstant<phi::CPUContext, platform::float16>;
template struct SetConstant<phi::CPUContext, platform::bfloat16>;
template struct SetConstant<phi::CPUContext, float>;
template struct SetConstant<phi::CPUContext, double>;
template struct SetConstant<phi::CPUContext, int16_t>;
template struct SetConstant<phi::CPUContext, int>;
template struct SetConstant<phi::CPUContext, int64_t>;
template struct SetConstant<phi::CPUContext, bool>;
template struct SetConstant<phi::CPUContext, uint8_t>;
template struct SetConstant<phi::CPUContext, platform::complex<float>>;
template struct SetConstant<phi::CPUContext, platform::complex<double>>;
#ifdef PADDLE_WITH_XPU
template struct SetConstant<platform::XPUDeviceContext, platform::float16>;
template struct SetConstant<platform::XPUDeviceContext, platform::bfloat16>;
template struct SetConstant<platform::XPUDeviceContext, float>;
template struct SetConstant<platform::XPUDeviceContext, double>;
template struct SetConstant<platform::XPUDeviceContext, uint8_t>;
template struct SetConstant<platform::XPUDeviceContext, int16_t>;
template struct SetConstant<platform::XPUDeviceContext, int>;
template struct SetConstant<platform::XPUDeviceContext, int64_t>;
template struct SetConstant<platform::XPUDeviceContext, bool>;
template struct SetConstant<platform::XPUDeviceContext,
platform::complex<float>>;
template struct SetConstant<platform::XPUDeviceContext,
platform::complex<double>>;
#endif
#define DEFINE_CPU_TRANS(RANK) \
template struct Transpose<platform::CPUDeviceContext, \
platform::float16, \
RANK>; \
template struct Transpose<platform::CPUDeviceContext, \
platform::bfloat16, \
RANK>; \
template struct Transpose<platform::CPUDeviceContext, float, RANK>; \
template struct Transpose<platform::CPUDeviceContext, double, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, bool, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, int8_t, RANK>; \
template struct Transpose<platform::CPUDeviceContext, \
platform::complex<float>, \
RANK>; \
template struct Transpose<platform::CPUDeviceContext, \
platform::complex<double>, \
RANK>;
DEFINE_CPU_TRANS(1);
DEFINE_CPU_TRANS(2);
DEFINE_CPU_TRANS(3);
DEFINE_CPU_TRANS(4);
DEFINE_CPU_TRANS(5);
DEFINE_CPU_TRANS(6);
template <typename T>
struct TransposeNormal<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& in,
framework::Tensor* out,
const std::vector<int>& axis) {
const int rank = axis.size();
auto in_stride = phi::stride(in.dims());
auto out_stride = phi::stride(out->dims());
const T* in_ptr = in.data<T>();
T* out_ptr = out->data<T>();
auto transpose_helper = [&](int64_t beg, int64_t end) {
for (int64_t out_idx = beg; out_idx < end; ++out_idx) {
int64_t in_idx = 0;
int64_t tmp_idx = out_idx;
// calculate the input index
for (int i = 0; i < rank; ++i) {
const int64_t coordinate = tmp_idx / out_stride[i];
tmp_idx -= coordinate * out_stride[i];
in_idx += coordinate * in_stride[axis[i]];
}
out_ptr[out_idx] = in_ptr[in_idx];
}
};
transpose_helper(0, out->numel());
}
};
// define transpose normal
#define DEFINE_CPU_TRANS_NORMAL(TYPE) \
template struct TransposeNormal<platform::CPUDeviceContext, TYPE>
DEFINE_CPU_TRANS_NORMAL(platform::float16);
DEFINE_CPU_TRANS_NORMAL(platform::bfloat16);
DEFINE_CPU_TRANS_NORMAL(float);
DEFINE_CPU_TRANS_NORMAL(double);
DEFINE_CPU_TRANS_NORMAL(int);
DEFINE_CPU_TRANS_NORMAL(int64_t);
DEFINE_CPU_TRANS_NORMAL(bool);
DEFINE_CPU_TRANS_NORMAL(int16_t);
DEFINE_CPU_TRANS_NORMAL(uint8_t);
DEFINE_CPU_TRANS_NORMAL(int8_t);
DEFINE_CPU_TRANS_NORMAL(platform::complex<float>);
DEFINE_CPU_TRANS_NORMAL(platform::complex<double>);
struct TensorSetConstantCPU {
TensorSetConstantCPU(framework::Tensor* tensor, float value)
: tensor_(tensor), value_(value) {}
template <typename T>
void apply() const {
auto cpu = platform::CPUPlace();
auto* begin = tensor_->mutable_data<T>(cpu);
std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
}
framework::Tensor* tensor_;
float value_;
};
template <>
void set_constant_with_place<platform::XPUPlace>(
const platform::DeviceContext& context,
framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::NPUPlace>(
const platform::DeviceContext& context,
framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::NPUPinnedPlace>(
const platform::DeviceContext& context,
framework::Tensor* tensor,
float value) {
PADDLE_THROW(
platform::errors::Unimplemented("NPUPinnedPlace is not supported"));
}
template <>
void set_constant_with_place<platform::IPUPlace>(
const platform::DeviceContext& context,
framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CPUPlace>(
const platform::DeviceContext& context,
framework::Tensor* tensor,
float value) {
framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value));
}
template <>
void set_constant_with_place<platform::MLUPlace>(
const platform::DeviceContext& context,
framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("MLUPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CustomPlace>(
const platform::DeviceContext& context,
framework::Tensor* tensor,
float value) {
PADDLE_THROW(platform::errors::Unimplemented("CustomPlace is not supported"));
}
template <>
void set_constant_with_place<platform::CUDAPinnedPlace>(
const platform::DeviceContext& context,
framework::Tensor* tensor,
float value) {
framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value));
}
struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
TensorSetConstantWithPlace(const platform::DeviceContext& context,
framework::Tensor* tensor,
float value)
: context_(context), tensor_(tensor), value_(value) {}
template <typename Place>
void operator()(Place place) const {
set_constant_with_place<Place>(context_, tensor_, value_);
}
const platform::DeviceContext& context_;
framework::Tensor* tensor_;
float value_;
};
void set_constant(const platform::DeviceContext& context,
framework::Tensor* tensor,
float value) {
TensorSetConstantWithPlace func(context, tensor, value);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
// tensor->place().apply_visitor(func);
paddle::platform::VisitPlace(tensor->place(), func);
#else
func(platform::CPUPlace());
#endif
}
template <typename T>
struct RowwiseAdd<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& input,
const framework::Tensor& vector,
framework::Tensor* output) {
auto in_dims = input.dims();
auto out_dims = output->dims();
auto size = input.numel() / in_dims[0];
PADDLE_ENFORCE_EQ(
vector.numel(),
size,
platform::errors::InvalidArgument(
"The input vector size"
" should be equal to the size of each row of input tensor."
" Expected vector size=%d, but received %d",
size,
vector.numel()));
const char* in_dims_cstr = in_dims.to_str().c_str();
const char* out_dims_cstr = out_dims.to_str().c_str();
PADDLE_ENFORCE_EQ(out_dims,
in_dims,
platform::errors::InvalidArgument(
"The output tensor shape should be same as the input"
" tensor shape. Expected output tensor shape: %s,"
" but received %s",
in_dims_cstr,
out_dims_cstr));
auto in = framework::EigenMatrix<T>::From(input);
auto vec = framework::EigenVector<T>::Flatten(vector);
auto out = framework::EigenMatrix<T>::From(*output);
for (int64_t i = 0; i < in_dims[0]; ++i) {
out.chip(i, 0) = in.chip(i, 0) + vec;
}
}
};
template struct RowwiseAdd<platform::CPUDeviceContext, float>;
template struct RowwiseAdd<platform::CPUDeviceContext, double>;
template struct ColwiseSum<platform::CPUDeviceContext, float>;
template struct ColwiseSum<platform::CPUDeviceContext, double>;
template struct ColwiseSum<platform::CPUDeviceContext, int>;
template struct ColwiseSum<platform::CPUDeviceContext, int64_t>;
template struct RowwiseSum<platform::CPUDeviceContext, float>;
template struct RowwiseSum<platform::CPUDeviceContext, double>;
template struct RowwiseMean<platform::CPUDeviceContext, float>;
template struct RowwiseMean<platform::CPUDeviceContext, double>;
template <typename T>
struct ElementwiseAddTo<platform::CPUDeviceContext, T> {
void operator()(platform::CPUDeviceContext* ctx,
const framework::Tensor& src,
framework::Tensor* dst) {
auto in = framework::EigenVector<T>::Flatten(src);
auto out = framework::EigenVector<T>::Flatten(*dst);
auto& place = *(ctx->eigen_device());
out.device(place) = out + in;
}
};
template struct ElementwiseAddTo<platform::CPUDeviceContext, platform::float16>;
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -109,11 +109,6 @@ void MaxOutGradFunctor<DeviceContext, T>::operator()(
}
}
template class MaxOutGradFunctor<platform::CPUDeviceContext, float>;
template class MaxOutGradFunctor<platform::CPUDeviceContext, double>;
template class MaxOutFunctor<platform::CPUDeviceContext, float>;
template class MaxOutFunctor<platform::CPUDeviceContext, double>;
template class MaxOutGradFunctor<phi::CPUContext, float>;
template class MaxOutGradFunctor<phi::CPUContext, double>;
template class MaxOutFunctor<phi::CPUContext, float>;
......
......@@ -14,19 +14,8 @@ limitations under the License. */
#include "paddle/fluid/operators/math/sample_prob.h"
namespace paddle {
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
template class SampleWithProb<platform::CPUDeviceContext, float>;
template class SampleWithProb<platform::CPUDeviceContext, double>;
} // namespace math
namespace math {} // namespace math
} // namespace operators
} // namespace paddle
......@@ -276,51 +276,6 @@ struct SelectedRowsSumTo<platform::CPUDeviceContext, T> {
template struct SelectedRowsSumTo<platform::CPUDeviceContext, float>;
template struct SelectedRowsSumTo<platform::CPUDeviceContext, double>;
template <typename T>
struct SelectedRowsAddToTensor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& context,
const phi::SelectedRows& input1,
framework::Tensor* input2) {
if (UNLIKELY(input1.rows().size() == 0)) {
LOG(WARNING) << "input selected rows is empty!";
return;
}
auto in1_height = input1.height();
const auto& in2_dims = input2->dims();
PADDLE_ENFORCE_EQ(
in1_height,
in2_dims[0],
platform::errors::InvalidArgument("The two inputs height must be equal."
"But received first input height = "
"[%d], second input height = [%d]",
in1_height,
in2_dims[0]));
auto& in1_value = input1.value();
auto& in1_rows = input1.rows();
int64_t in1_row_numel = in1_value.numel() / in1_rows.size();
PADDLE_ENFORCE_EQ(
in1_row_numel,
input2->numel() / in1_height,
platform::errors::InvalidArgument(
"The two inputs width must be equal."
"But received first input width = [%d], second input width = [%d]",
in1_row_numel,
input2->numel() / in1_height));
auto* in1_data = in1_value.data<T>();
auto* input2_data = input2->data<T>();
for (size_t i = 0; i < in1_rows.size(); i++) {
for (int64_t j = 0; j < in1_row_numel; j++) {
input2_data[in1_rows[i] * in1_row_numel + j] +=
in1_data[i * in1_row_numel + j];
}
}
}
};
template <typename T>
struct SelectedRowsAddToTensor<phi::CPUContext, T> {
void operator()(const phi::CPUContext& context,
......@@ -366,13 +321,6 @@ struct SelectedRowsAddToTensor<phi::CPUContext, T> {
}
};
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, float>;
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, double>;
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int>;
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext, int64_t>;
template struct SelectedRowsAddToTensor<platform::CPUDeviceContext,
platform::bfloat16>;
template struct SelectedRowsAddToTensor<phi::CPUContext, float>;
template struct SelectedRowsAddToTensor<phi::CPUContext, double>;
template struct SelectedRowsAddToTensor<phi::CPUContext, int>;
......@@ -582,34 +530,6 @@ struct MergeAddImpl {
}
};
template <typename T>
struct MergeAdd<platform::CPUDeviceContext, T> {
// unary functor, merge by adding duplicated rows in
// the input SelectedRows object.
phi::SelectedRows operator()(const platform::CPUDeviceContext& context,
const phi::SelectedRows& input,
const bool sorted_result) {
return MergeAddImpl<platform::CPUDeviceContext, T>()(
context, input, sorted_result);
}
void operator()(const platform::CPUDeviceContext& context,
const phi::SelectedRows& input,
phi::SelectedRows* output,
const bool sorted_result) {
MergeAddImpl<platform::CPUDeviceContext, T>()(
context, input, output, sorted_result);
}
void operator()(const platform::CPUDeviceContext& context,
const std::vector<const phi::SelectedRows*>& inputs,
phi::SelectedRows* output,
const bool sorted_result) {
MergeAddImpl<platform::CPUDeviceContext, T>()(
context, inputs, output, sorted_result);
}
};
template <typename T>
struct MergeAdd<phi::CPUContext, T> {
// unary functor, merge by adding duplicated rows in
......@@ -635,10 +555,8 @@ struct MergeAdd<phi::CPUContext, T> {
}
};
#define TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(dtype) \
template struct MergeAddImpl<platform::CPUDeviceContext, dtype>; \
template struct MergeAddImpl<phi::CPUContext, dtype>; \
template struct MergeAdd<platform::CPUDeviceContext, dtype>; \
#define TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(dtype) \
template struct MergeAddImpl<phi::CPUContext, dtype>; \
template struct MergeAdd<phi::CPUContext, dtype>;
TEMPLATE_SPECIALIZED_FOR_MERGEADD_CPU(float)
......
......@@ -20,13 +20,6 @@ namespace phi {
class DenseTensor;
} // namespace phi
namespace paddle {
namespace framework {} // namespace framework
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
......@@ -101,66 +94,6 @@ static void fast_mem_init(void* dest,
}
}
template <typename T>
class PaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::LoDTensor& seq_tensor,
framework::LoDTensor* pad_tensor,
const framework::LoDTensor& pad_value,
int pad_seq_len = -1,
int lod_level = 0,
bool norm_by_times = false,
const PadLayout layout = kBatchLengthWidth) {
auto seq_lod = seq_tensor.lod();
const auto seq_offsets = framework::ToAbsOffset(seq_lod)[lod_level];
const auto& seq_tensor_dims = seq_tensor.dims();
const auto& pad_tensor_dims = pad_tensor->dims();
if (pad_seq_len == -1) {
pad_seq_len = MaximumSequenceLength(seq_offsets);
}
int step_width = seq_tensor.numel() / seq_tensor_dims[0];
CheckDims(seq_tensor_dims,
pad_tensor_dims,
seq_offsets,
pad_seq_len,
step_width,
layout);
PADDLE_ENFORCE_EQ(
pad_value.numel() == 1 || pad_value.numel() == step_width,
true,
platform::errors::InvalidArgument(
"The numel of 'pad_value' can only be 1 or be equal to the "
"'step_width', but got %ld != 1 and %ld. Please check the input "
"value.",
pad_value.numel(),
step_width));
// fill padding value
T* pad_data = pad_tensor->data<T>();
const T* pad_value_data = pad_value.data<T>();
if (pad_value.numel() == 1) {
fast_mem_init<T>(
pad_data, pad_tensor->numel(), pad_value_data, sizeof(T));
} else {
for (int i = 0; i < pad_tensor->numel(); i += step_width) {
memcpy(pad_data + i, pad_value_data, step_width * sizeof(T));
}
}
CopyValidData<T>(pad_tensor,
&seq_tensor,
seq_offsets,
pad_seq_len,
step_width,
norm_by_times,
kSeqToPad,
layout);
}
};
template <typename T>
class PaddingLoDTensorFunctor<phi::CPUContext, T> {
public:
......@@ -221,42 +154,6 @@ class PaddingLoDTensorFunctor<phi::CPUContext, T> {
}
};
template <typename T>
class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::LoDTensor& pad_tensor,
framework::LoDTensor* seq_tensor,
int pad_seq_len = -1,
int lod_level = 0,
bool norm_by_times = false,
const PadLayout layout = kBatchLengthWidth) {
auto seq_offsets = framework::ToAbsOffset(seq_tensor->lod())[lod_level];
const auto& seq_tensor_dims = seq_tensor->dims();
const auto& pad_tensor_dims = pad_tensor.dims();
if (pad_seq_len == -1) {
pad_seq_len = MaximumSequenceLength(seq_offsets);
}
int step_width = seq_tensor->numel() / seq_tensor_dims[0];
CheckDims(seq_tensor_dims,
pad_tensor_dims,
seq_offsets,
pad_seq_len,
step_width,
layout);
CopyValidData<T>(seq_tensor,
&pad_tensor,
seq_offsets,
pad_seq_len,
step_width,
norm_by_times,
kPadToSeq,
layout);
}
};
template <typename T>
class UnpaddingLoDTensorFunctor<phi::CPUContext, T> {
public:
......@@ -293,16 +190,6 @@ class UnpaddingLoDTensorFunctor<phi::CPUContext, T> {
}
};
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, int>;
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, int64_t>;
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, float>;
template class PaddingLoDTensorFunctor<platform::CPUDeviceContext, double>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, int>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, int64_t>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, float>;
template class UnpaddingLoDTensorFunctor<platform::CPUDeviceContext, double>;
template class PaddingLoDTensorFunctor<phi::CPUContext, int>;
template class PaddingLoDTensorFunctor<phi::CPUContext, int64_t>;
template class PaddingLoDTensorFunctor<phi::CPUContext, float>;
......
......@@ -24,29 +24,6 @@ namespace paddle {
namespace operators {
namespace math {
template <typename T>
class ScaleLoDTensorFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const T* scales,
framework::LoDTensor* seq) {
const size_t level = 0;
auto lod = seq->lod();
const size_t num_seq = lod[level].size() - 1;
size_t seq_width = seq->dims()[1];
framework::LoD abs_offset_lod = framework::ToAbsOffset(lod);
T* seq_data = seq->mutable_data<T>(context.GetPlace());
for (size_t i = 0; i < num_seq; ++i) {
for (size_t j = lod[level][i] * seq_width;
j < lod[level][i + 1] * seq_width;
++j) {
seq_data[j] *= scales[i];
}
}
}
};
template <typename T>
class ScaleLoDTensorFunctor<phi::CPUContext, T> {
public:
......@@ -70,9 +47,6 @@ class ScaleLoDTensorFunctor<phi::CPUContext, T> {
}
};
template class ScaleLoDTensorFunctor<platform::CPUDeviceContext, float>;
template class ScaleLoDTensorFunctor<platform::CPUDeviceContext, double>;
template class ScaleLoDTensorFunctor<phi::CPUContext, float>;
template class ScaleLoDTensorFunctor<phi::CPUContext, double>;
......
......@@ -21,13 +21,6 @@ namespace paddle {
namespace operators {
namespace math {
template class SoftmaxFunctor<platform::CPUDeviceContext, float, true>;
template class SoftmaxFunctor<platform::CPUDeviceContext, float, false>;
template class SoftmaxFunctor<platform::CPUDeviceContext, double, true>;
template class SoftmaxFunctor<platform::CPUDeviceContext, double, false>;
template class SoftmaxGradFunctor<platform::CPUDeviceContext, float>;
template class SoftmaxGradFunctor<platform::CPUDeviceContext, double>;
template class SoftmaxFunctor<phi::CPUContext, float, true>;
template class SoftmaxFunctor<phi::CPUContext, float, false>;
template class SoftmaxFunctor<phi::CPUContext, double, true>;
......
......@@ -16,12 +16,6 @@ limitations under the License. */
#include "paddle/phi/backends/cpu/cpu_context.h"
namespace paddle {
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
namespace operators {
namespace math {
......@@ -32,126 +26,6 @@ namespace math {
* [input_channels, filter_depth, filter_height, filter_width,
* output_depth, output_height, output_width]
*/
template <class T>
class Vol2ColFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& vol,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* col,
const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol.dims().size(),
4,
platform::errors::InvalidArgument(
"The dimension of vol should be 4, but received %d.",
vol.dims().size()));
PADDLE_ENFORCE_EQ(col->dims().size(),
7,
platform::errors::InvalidArgument(
"The dimension of col should be 7, but received %d.",
col->dims().size()));
int input_channels =
(data_layout != DataLayout::kNHWC ? vol.dims()[0] : vol.dims()[3]);
int input_depth =
(data_layout != DataLayout::kNHWC ? vol.dims()[1] : vol.dims()[0]);
int input_height =
(data_layout != DataLayout::kNHWC ? vol.dims()[2] : vol.dims()[1]);
int input_width =
(data_layout != DataLayout::kNHWC ? vol.dims()[3] : vol.dims()[2]);
int filter_depth = col->dims()[1];
int filter_height = col->dims()[2];
int filter_width = col->dims()[3];
int output_depth = col->dims()[4];
int output_height = col->dims()[5];
int output_width = col->dims()[6];
int channels_col =
input_channels * filter_depth * filter_height * filter_width;
// changed
bool paddings_size_is_6 = (paddings.size() == 6);
int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0];
int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0];
int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1];
int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1;
PADDLE_ENFORCE_EQ(
input_depth_tmp,
output_depth,
platform::errors::InvalidArgument(
"input_depth(%d) and output_depth(%d) are mismatching.",
input_depth_tmp,
output_depth));
auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1;
PADDLE_ENFORCE_EQ(
input_height_tmp,
output_height,
platform::errors::InvalidArgument(
"input_height(%d) and output_height(%d) are mismatching.",
input_height_tmp,
output_height));
auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1;
PADDLE_ENFORCE_EQ(
input_width_tmp,
output_width,
platform::errors::InvalidArgument(
"input_width(%d) and output_width(%d) are mismatching.",
input_width_tmp,
output_width));
const T* vol_data = vol.data<T>();
T* col_data = col->data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int d_offset = (c / filter_width / filter_height) % filter_depth;
int c_in = c / filter_width / filter_height / filter_depth;
for (int d = 0; d < output_depth; ++d) {
int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0];
for (int h = 0; h < output_height; ++h) {
int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1];
for (int w = 0; w < output_width; ++w) {
int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2];
int col_idx =
((c * output_depth + d) * output_height + h) * output_width + w;
int vol_idx;
if (data_layout != DataLayout::kNHWC) {
vol_idx = ((c_in * input_depth + d_pad) * input_height + h_pad) *
input_width +
w_pad;
} else {
vol_idx = ((d_pad * input_height + h_pad) * input_width + w_pad) *
input_channels +
c_in;
}
col_data[col_idx] =
(h_pad < 0 || h_pad >= input_height || w_pad < 0 ||
w_pad >= input_width || d_pad < 0 || d_pad >= input_depth)
? static_cast<T>(0)
: vol_data[vol_idx];
}
}
}
}
}
};
template <class T>
class Vol2ColFunctor<phi::CPUContext, T> {
public:
......@@ -278,126 +152,6 @@ class Vol2ColFunctor<phi::CPUContext, T> {
* [input_channels, filter_depth, filter_height, filter_width,
* output_depth, output_height, output_width]
*/
template <class T>
class Col2VolFunctor<platform::CPUDeviceContext, T> {
public:
void operator()(const platform::CPUDeviceContext& context,
const framework::Tensor& col,
const std::vector<int>& dilations,
const std::vector<int>& strides,
const std::vector<int>& paddings,
framework::Tensor* vol,
const DataLayout data_layout) const {
PADDLE_ENFORCE_EQ(vol->dims().size(),
4,
platform::errors::InvalidArgument(
"The dimension of vol should be 4, but received %d.",
vol->dims().size()));
PADDLE_ENFORCE_EQ(col.dims().size(),
7,
platform::errors::InvalidArgument(
"The dimension of col should be 7, but received %d.",
col.dims().size()));
int input_channels =
(data_layout != DataLayout::kNHWC ? vol->dims()[0] : vol->dims()[3]);
int input_depth =
(data_layout != DataLayout::kNHWC ? vol->dims()[1] : vol->dims()[0]);
int input_height =
(data_layout != DataLayout::kNHWC ? vol->dims()[2] : vol->dims()[1]);
int input_width =
(data_layout != DataLayout::kNHWC ? vol->dims()[3] : vol->dims()[2]);
int filter_depth = col.dims()[1];
int filter_height = col.dims()[2];
int filter_width = col.dims()[3];
int output_depth = col.dims()[4];
int output_height = col.dims()[5];
int output_width = col.dims()[6];
int channels_col =
input_channels * filter_depth * filter_height * filter_width;
bool paddings_size_is_6 = (paddings.size() == 6);
int pad_d_forth = paddings_size_is_6 ? paddings[0] : paddings[0];
int pad_d_back = paddings_size_is_6 ? paddings[1] : paddings[0];
int pad_h_up = paddings_size_is_6 ? paddings[2] : paddings[1];
int pad_h_down = paddings_size_is_6 ? paddings[3] : paddings[1];
int pad_w_left = paddings_size_is_6 ? paddings[4] : paddings[2];
int pad_w_right = paddings_size_is_6 ? paddings[5] : paddings[2];
auto input_depth_tmp = (input_depth + pad_d_forth + pad_d_back -
((dilations[0] * (filter_depth - 1) + 1))) /
strides[0] +
1;
PADDLE_ENFORCE_EQ(
input_depth_tmp,
output_depth,
platform::errors::InvalidArgument(
"input_depth(%d) and output_depth(%d) are mismatching.",
input_depth_tmp,
output_depth));
auto input_height_tmp = (input_height + pad_h_up + pad_h_down -
((dilations[1] * (filter_height - 1) + 1))) /
strides[1] +
1;
PADDLE_ENFORCE_EQ(
input_height_tmp,
output_height,
platform::errors::InvalidArgument(
"input_height(%d) and output_height(%d) are mismatching.",
input_height_tmp,
output_height));
auto input_width_tmp = (input_width + pad_w_left + pad_w_right -
((dilations[2] * (filter_width - 1) + 1))) /
strides[2] +
1;
PADDLE_ENFORCE_EQ(
input_width_tmp,
output_width,
platform::errors::InvalidArgument(
"input_width(%d) and output_width(%d) are mismatching.",
input_width_tmp,
output_width));
T* vol_data = vol->data<T>();
const T* col_data = col.data<T>();
for (int c = 0; c < channels_col; ++c) {
int w_offset = c % filter_width;
int h_offset = (c / filter_width) % filter_height;
int d_offset = (c / filter_width / filter_height) % filter_depth;
int cIm = c / filter_width / filter_height / filter_depth;
for (int d = 0; d < output_depth; ++d) {
int d_pad = d * strides[0] - pad_d_forth + d_offset * dilations[0];
for (int h = 0; h < output_height; ++h) {
int h_pad = h * strides[1] - pad_h_up + h_offset * dilations[1];
for (int w = 0; w < output_width; ++w) {
int w_pad = w * strides[2] - pad_w_left + w_offset * dilations[2];
if (h_pad >= 0 && h_pad < input_height && w_pad >= 0 &&
w_pad < input_width && d_pad >= 0 && d_pad < input_depth) {
int vol_idx;
if (data_layout != DataLayout::kNHWC) {
vol_idx = ((cIm * input_depth + d_pad) * input_height + h_pad) *
input_width +
w_pad;
} else {
vol_idx =
((d_pad * input_height + h_pad) * input_width + w_pad) *
input_channels +
cIm;
}
int col_idx =
((c * output_depth + d) * output_height + h) * output_width +
w;
vol_data[vol_idx] += col_data[col_idx];
}
}
}
}
}
}
};
template <class T>
class Col2VolFunctor<phi::CPUContext, T> {
public:
......@@ -518,13 +272,9 @@ class Col2VolFunctor<phi::CPUContext, T> {
}
};
template class Vol2ColFunctor<platform::CPUDeviceContext, float>;
template class Vol2ColFunctor<platform::CPUDeviceContext, double>;
template class Vol2ColFunctor<phi::CPUContext, float>;
template class Vol2ColFunctor<phi::CPUContext, double>;
template class Col2VolFunctor<platform::CPUDeviceContext, float>;
template class Col2VolFunctor<platform::CPUDeviceContext, double>;
template class Col2VolFunctor<phi::CPUContext, float>;
template class Col2VolFunctor<phi::CPUContext, double>;
......
......@@ -34,7 +34,6 @@ class DenseTensor;
namespace paddle {
namespace framework {} // namespace framework
namespace platform {
class CPUDeviceContext;
class MKLDNNDeviceContext;
} // namespace platform
} // namespace paddle
......
......@@ -24,9 +24,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -27,9 +27,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -27,9 +27,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
DECLARE_INFER_SHAPE_FUNCTOR(reduce_all,
......
......@@ -26,9 +26,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
DECLARE_INFER_SHAPE_FUNCTOR(reduce_any,
......
......@@ -25,9 +25,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace ops = paddle::operators;
......
......@@ -27,9 +27,6 @@ class OpDesc;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -31,9 +31,6 @@ class EmptyGradOpMaker;
namespace imperative {
class OpBase;
} // namespace imperative
namespace platform {
class CPUDeviceContext;
} // namespace platform
} // namespace paddle
namespace paddle {
......
......@@ -367,14 +367,6 @@ DeviceContextPool::DeviceContextPool(
/*disable_setting_default_stream_for_allocator=*/false);
}
CPUDeviceContext::CPUDeviceContext() : phi::CPUContext() {
phi::CPUContext::Init();
}
CPUDeviceContext::CPUDeviceContext(CPUPlace place) : phi::CPUContext(place) {
phi::CPUContext::Init();
}
#ifdef PADDLE_WITH_IPU
IPUDeviceContext::IPUDeviceContext(IPUPlace place) : place_(place) {}
......
......@@ -134,14 +134,7 @@ constexpr DeviceType kMLU = DeviceType::MLU;
using DeviceContext = phi::DeviceContext;
// using CPUDeviceContext = phi::CPUContext;
// TODO(wilber): The place constructor is used in many places, it is more
// difficult to use CPUDeviceContext = phi::CPUContext directly.
class CPUDeviceContext : public phi::CPUContext {
public:
CPUDeviceContext();
explicit CPUDeviceContext(CPUPlace place);
};
using CPUDeviceContext = phi::CPUContext;
template <typename Place>
struct DefaultDeviceContextType;
......
......@@ -69,30 +69,6 @@ struct Transform {
};
// NOTE: After the phi kernel is migrated, it needs to be deleted.
template <>
struct Transform<platform::CPUDeviceContext> {
template <typename InputIter, typename OutputIter, typename UnaryOperation>
void operator()(const platform::CPUDeviceContext& context,
InputIter first,
InputIter last,
OutputIter result,
UnaryOperation op) {
std::transform(first, last, result, op);
}
template <typename InputIter1,
typename InputIter2,
typename OutputIter,
typename BinaryOperation>
void operator()(const platform::CPUDeviceContext& context,
InputIter1 first1,
InputIter1 last1,
InputIter2 first2,
OutputIter result,
BinaryOperation op) {
std::transform(first1, last1, first2, result, op);
}
};
template <>
struct Transform<phi::CPUContext> {
......
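
Only the phi::CPUContext specialization of Transform is kept; on the CPU path it forwards directly to std::transform. A hedged usage sketch, assuming paddle/fluid/platform/transform.h and not part of the commit itself:

#include <vector>
#include "paddle/fluid/platform/transform.h"
#include "paddle/phi/backends/cpu/cpu_context.h"

// Illustrative sketch only.
void TransformOnCpuExample() {
  phi::CPUContext ctx;
  std::vector<float> in = {1.f, 2.f, 3.f};
  std::vector<float> out(in.size());
  paddle::platform::Transform<phi::CPUContext> trans;
  trans(ctx, in.begin(), in.end(), out.begin(),
        [](float x) { return 2.f * x; });  // the CPU specialization is just std::transform
}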
......@@ -20,7 +20,6 @@ namespace phi {
::phi::CPUContext CreateCPUContext() {
::phi::CPUContext ctx{};
ctx.Init();
auto allocator = new backends::CpuPhiAllocator{};
ctx.SetAllocator(allocator);
ctx.SetHostAllocator(allocator);
......
......@@ -81,7 +81,6 @@ TEST(ElementwiseAdd, launcher_registry) {
::phi::CPUContext context;
context.SetAllocator(alloc);
context.Init();
host_context::KernelFrameBuilder kernel_frame_builder;
kernel_frame_builder.AddArgument(new host_context::Value(std::move(context)));
......
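
The infrt and test call sites above can drop their explicit Init() because, as the next hunk shows, the phi::CPUContext constructors now initialize their Impl. A sketch of the resulting inference-style setup (the helper name is hypothetical, the calls mirror the snippets above):

#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/allocator.h"

// Sketch only; not part of the commit.
phi::CPUContext MakeCpuContextForInference(const phi::Allocator* alloc) {
  phi::CPUContext ctx;          // already initialized by the constructor
  ctx.SetAllocator(alloc);      // external resources come in through the Set* interface
  ctx.SetHostAllocator(alloc);
  return ctx;                   // move construction is defaulted (see cpu_context.cc)
}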
......@@ -51,10 +51,14 @@ struct CPUContext::Impl {
};
CPUContext::CPUContext()
: DeviceContext(), impl_(std::make_unique<CPUContext::Impl>()) {}
: DeviceContext(), impl_(std::make_unique<CPUContext::Impl>()) {
impl_->Init();
}
CPUContext::CPUContext(const Place& place)
: DeviceContext(), impl_(std::make_unique<CPUContext::Impl>(place)) {}
: DeviceContext(), impl_(std::make_unique<CPUContext::Impl>(place)) {
impl_->Init();
}
CPUContext::~CPUContext() = default;
......@@ -62,8 +66,6 @@ CPUContext::CPUContext(CPUContext&&) = default;
CPUContext& CPUContext::operator=(CPUContext&&) = default;
void CPUContext::Init() { impl_->Init(); }
Eigen::DefaultDevice* CPUContext::eigen_device() const {
return impl_->GetEigenDevice();
}
......
......@@ -34,12 +34,6 @@ class PADDLE_API CPUContext : public DeviceContext {
Eigen::DefaultDevice* eigen_device() const;
const Place& GetPlace() const override;
public:
// NOTE: DeviceContext hold resources. Used in training scenarios.
// The interface used by the training scene, DeviceContext will initialize
// all resources and delete them when destructing.
void Init();
protected:
// NOTE: External users manage resources. Used in inference scenarios.
// The Set interface is for inference only, DeviceContext will mark the
......
......@@ -96,8 +96,6 @@ void FCFunctor<DeviceContext, T>::operator()(const DeviceContext& context,
}
}
template class FCFunctor<paddle::platform::CPUDeviceContext, float>;
template class FCFunctor<paddle::platform::CPUDeviceContext, double>;
template class FCFunctor<CPUContext, float>;
template class FCFunctor<CPUContext, double>;
......
......@@ -41,22 +41,6 @@ struct ForRange<phi::CPUContext> {
size_t limit_;
};
// NOTE: After the pten kernel is migrated, it needs to be deleted.
template <>
struct ForRange<paddle::platform::CPUDeviceContext> {
ForRange(const paddle::platform::CPUDeviceContext& dev_ctx, size_t limit)
: dev_ctx_(dev_ctx), limit_(limit) {}
template <typename Function>
void operator()(Function func) const {
phi::funcs::ForRange<phi::CPUContext> for_range(dev_ctx_, limit_);
for_range(func);
}
const paddle::platform::CPUDeviceContext& dev_ctx_;
size_t limit_;
};
#if defined(__NVCC__) || defined(__HIPCC__)
template <typename Function>
......
......@@ -179,60 +179,6 @@ struct GRUUnitGradFunctor<paddle::platform::CPUDeviceContext, T> {
}
};
template <typename T>
struct GRUUnitFunctorV2<paddle::platform::CPUDeviceContext, T> {
static void compute(const paddle::platform::CPUDeviceContext &context,
GRUMetaValue<T> value,
int frame_size,
int batch_size,
const phi::funcs::detail::ActivationType active_node,
const phi::funcs::detail::ActivationType active_gate) {
#if !defined(__NVCC__) && !defined(__HIPCC___)
auto blas =
phi::funcs::GetBlas<paddle::platform::CPUDeviceContext, T>(context);
if (value.prev_out_value) {
blas.GEMM(CblasNoTrans,
CblasTrans,
batch_size,
frame_size,
frame_size,
1,
value.prev_out_value,
value.state_weight,
0,
value.reset_output_value);
}
detail::forward_reset_output(
phi::funcs::detail::forward::gru_resetOutput<T>(),
value,
frame_size,
batch_size,
active_gate,
false,
&context);
T *cell_state_value = value.gate_value + 2 * frame_size;
T *reset_output_value = value.reset_output_value;
for (int b = 0; b < batch_size; ++b) {
blas.VADD(
frame_size, cell_state_value, reset_output_value, cell_state_value);
cell_state_value += frame_size * 3;
reset_output_value += frame_size;
}
detail::forward_final_output(
phi::funcs::detail::forward::gru_finalOutput<T>(),
value,
frame_size,
batch_size,
active_node,
true,
false,
&context);
#endif
}
};
template <typename T>
struct GRUUnitFunctorV2<CPUContext, T> {
static void compute(const CPUContext &context,
......@@ -286,131 +232,6 @@ struct GRUUnitFunctorV2<CPUContext, T> {
}
};
template <typename T>
struct GRUUnitGradFunctorV2<paddle::platform::CPUDeviceContext, T> {
static void compute(const paddle::platform::CPUDeviceContext &context,
GRUMetaValue<T> value,
GRUMetaGrad<T> grad,
int frame_size,
int batch_size,
const phi::funcs::detail::ActivationType active_node,
const phi::funcs::detail::ActivationType active_gate) {
#if !defined(__NVCC__) && !defined(__HIPCC___)
// calculate grad_update_gate, grad_frame_state,
// grad_reset_output, grad_reset_gate
detail::cpu_gru_backward(context,
phi::funcs::detail::backward::gru<T>(),
value,
grad,
frame_size,
batch_size,
active_node,
active_gate);
auto blas =
phi::funcs::GetBlas<paddle::platform::CPUDeviceContext, T>(context);
if (grad.prev_out_grad && value.prev_out_value) {
// update prev_out_grad
blas.GEMM(false,
false,
batch_size,
frame_size,
frame_size,
1,
grad.gate_grad,
frame_size * 3,
value.gate_weight,
frame_size,
1,
grad.prev_out_grad,
frame_size);
blas.GEMM(false,
false,
batch_size,
frame_size,
frame_size,
1,
grad.gate_grad + frame_size,
frame_size * 3,
value.gate_weight + frame_size * frame_size,
frame_size,
1,
grad.prev_out_grad,
frame_size);
blas.GEMM(false,
false,
batch_size,
frame_size,
frame_size,
1,
grad.reset_output_grad,
frame_size,
value.state_weight,
frame_size,
1,
grad.prev_out_grad,
frame_size);
// update weight_hh_grad
if (grad.gate_weight_grad) {
// reset gate
blas.GEMM(true,
false,
frame_size,
frame_size,
batch_size,
1,
grad.gate_grad,
frame_size * 3,
value.prev_out_value,
frame_size,
1,
grad.gate_weight_grad,
frame_size);
// update gate
blas.GEMM(true,
false,
frame_size,
frame_size,
batch_size,
1,
grad.gate_grad + frame_size,
frame_size * 3,
value.prev_out_value,
frame_size,
1,
grad.gate_weight_grad + frame_size * frame_size,
frame_size);
// cell state
blas.GEMM(true,
false,
frame_size,
frame_size,
batch_size,
1,
grad.reset_output_grad,
frame_size,
value.prev_out_value,
frame_size,
1,
grad.state_weight_grad,
frame_size);
}
}
// update bias_hh_grad
T *gate_grad = grad.gate_grad;
T *bias_hh_grad = grad.bias_hh_grad;
T *state_bias_grad = grad.bias_hh_grad + 2 * frame_size;
T *reset_output_grad = grad.reset_output_grad;
for (int b = 0; b < batch_size; ++b) {
blas.VADD(2 * frame_size, bias_hh_grad, gate_grad, bias_hh_grad);
blas.VADD(
frame_size, state_bias_grad, reset_output_grad, state_bias_grad);
gate_grad += 3 * frame_size;
reset_output_grad += frame_size;
}
#endif
}
};
template <typename T>
struct GRUUnitGradFunctorV2<CPUContext, T> {
static void compute(const CPUContext &context,
......@@ -540,12 +361,6 @@ template struct GRUUnitFunctor<paddle::platform::CPUDeviceContext, double>;
template struct GRUUnitGradFunctor<paddle::platform::CPUDeviceContext, float>;
template struct GRUUnitGradFunctor<paddle::platform::CPUDeviceContext, double>;
template struct GRUUnitFunctorV2<paddle::platform::CPUDeviceContext, float>;
template struct GRUUnitFunctorV2<paddle::platform::CPUDeviceContext, double>;
template struct GRUUnitGradFunctorV2<paddle::platform::CPUDeviceContext, float>;
template struct GRUUnitGradFunctorV2<paddle::platform::CPUDeviceContext,
double>;
template struct GRUUnitFunctorV2<CPUContext, float>;
template struct GRUUnitFunctorV2<CPUContext, double>;
template struct GRUUnitGradFunctorV2<CPUContext, float>;
......
......@@ -21,38 +21,6 @@ limitations under the License. */
namespace phi {
namespace funcs {
template <class T>
struct LstmUnitFunctor<paddle::platform::CPUDeviceContext, T> {
static void compute(const paddle::platform::CPUDeviceContext& context,
LstmMetaValue<T> value,
int frame_size,
int batch_size,
T cell_clip,
const phi::funcs::detail::ActivationType& gate_act,
const phi::funcs::detail::ActivationType& cell_act,
const phi::funcs::detail::ActivationType& cand_act,
bool old_api_version = true) {
for (int b = 0; b < batch_size; b++) {
detail::cpu_lstm_forward(context,
phi::funcs::detail::forward::lstm<T>(),
value,
frame_size,
cell_clip,
cand_act,
gate_act,
cell_act,
old_api_version);
value.gate_value += frame_size * 4;
value.state_value += frame_size;
value.state_active_value += frame_size;
value.output_value += frame_size;
if (value.prev_state_value) {
value.prev_state_value += frame_size;
}
}
}
};
template <class T>
struct LstmUnitFunctor<CPUContext, T> {
static void compute(const CPUContext& context,
......@@ -85,49 +53,6 @@ struct LstmUnitFunctor<CPUContext, T> {
}
};
template <class T>
struct LstmUnitGradFunctor<paddle::platform::CPUDeviceContext, T> {
static void compute(const paddle::platform::CPUDeviceContext& context,
LstmMetaValue<T> value,
LstmMetaGrad<T> grad,
int frame_size,
int batch_size,
T cell_clip,
const phi::funcs::detail::ActivationType& gate_act,
const phi::funcs::detail::ActivationType& cell_act,
const phi::funcs::detail::ActivationType& cand_act,
bool old_api_version = true) {
for (int b = 0; b < batch_size; b++) {
detail::cpu_lstm_backward(context,
phi::funcs::detail::backward::lstm<T>(),
value,
grad,
frame_size,
cell_clip,
cand_act,
gate_act,
cell_act,
old_api_version);
value.gate_value += frame_size * 4;
value.state_value += frame_size;
value.state_active_value += frame_size;
value.output_value += frame_size;
if (value.prev_state_value) {
value.prev_state_value += frame_size;
}
grad.gate_grad += frame_size * 4;
grad.state_grad += frame_size;
grad.state_active_grad += frame_size;
grad.output_grad += frame_size;
if (grad.prev_state_grad) {
grad.prev_state_grad += frame_size;
}
}
}
};
template <class T>
struct LstmUnitGradFunctor<CPUContext, T> {
static void compute(const CPUContext& context,
......@@ -171,11 +96,6 @@ struct LstmUnitGradFunctor<CPUContext, T> {
}
};
template class LstmUnitFunctor<paddle::platform::CPUDeviceContext, float>;
template class LstmUnitFunctor<paddle::platform::CPUDeviceContext, double>;
template class LstmUnitGradFunctor<paddle::platform::CPUDeviceContext, float>;
template class LstmUnitGradFunctor<paddle::platform::CPUDeviceContext, double>;
template class LstmUnitFunctor<CPUContext, float>;
template class LstmUnitFunctor<CPUContext, double>;
template class LstmUnitGradFunctor<CPUContext, float>;
......
......@@ -39,22 +39,6 @@ namespace funcs {
using float16 = phi::dtype::float16;
template struct SetConstant<paddle::platform::CPUDeviceContext,
phi::dtype::float16>;
template struct SetConstant<paddle::platform::CPUDeviceContext,
phi::dtype::bfloat16>;
template struct SetConstant<paddle::platform::CPUDeviceContext, float>;
template struct SetConstant<paddle::platform::CPUDeviceContext, double>;
template struct SetConstant<paddle::platform::CPUDeviceContext, int16_t>;
template struct SetConstant<paddle::platform::CPUDeviceContext, int>;
template struct SetConstant<paddle::platform::CPUDeviceContext, int64_t>;
template struct SetConstant<paddle::platform::CPUDeviceContext, bool>;
template struct SetConstant<paddle::platform::CPUDeviceContext, uint8_t>;
template struct SetConstant<paddle::platform::CPUDeviceContext,
phi::dtype::complex<float>>;
template struct SetConstant<paddle::platform::CPUDeviceContext,
phi::dtype::complex<double>>;
template struct SetConstant<phi::CPUContext, phi::dtype::float16>;
template struct SetConstant<phi::CPUContext, phi::dtype::bfloat16>;
template struct SetConstant<phi::CPUContext, float>;
......@@ -85,46 +69,20 @@ template struct SetConstant<paddle::platform::XPUDeviceContext,
phi::dtype::complex<double>>;
#endif
#define DEFINE_CPU_TRANS(RANK) \
template struct Transpose<paddle::platform::CPUDeviceContext, \
phi::dtype::float16, \
RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, \
phi::dtype::bfloat16, \
RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, float, RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, double, RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, int, RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, \
int64_t, \
RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, bool, RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, \
int16_t, \
RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, \
uint8_t, \
RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, int8_t, RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, \
phi::dtype::complex<float>, \
RANK>; \
template struct Transpose<paddle::platform::CPUDeviceContext, \
phi::dtype::complex<double>, \
RANK>; \
template struct Transpose<phi::CPUContext, phi::dtype::float16, RANK>; \
template struct Transpose<phi::CPUContext, phi::dtype::bfloat16, RANK>; \
template struct Transpose<phi::CPUContext, float, RANK>; \
template struct Transpose<phi::CPUContext, double, RANK>; \
template struct Transpose<phi::CPUContext, int, RANK>; \
template struct Transpose<phi::CPUContext, int64_t, RANK>; \
template struct Transpose<phi::CPUContext, bool, RANK>; \
template struct Transpose<phi::CPUContext, int16_t, RANK>; \
template struct Transpose<phi::CPUContext, uint8_t, RANK>; \
template struct Transpose<phi::CPUContext, int8_t, RANK>; \
template struct Transpose<phi::CPUContext, \
phi::dtype::complex<float>, \
RANK>; \
#define DEFINE_CPU_TRANS(RANK) \
template struct Transpose<phi::CPUContext, phi::dtype::float16, RANK>; \
template struct Transpose<phi::CPUContext, phi::dtype::bfloat16, RANK>; \
template struct Transpose<phi::CPUContext, float, RANK>; \
template struct Transpose<phi::CPUContext, double, RANK>; \
template struct Transpose<phi::CPUContext, int, RANK>; \
template struct Transpose<phi::CPUContext, int64_t, RANK>; \
template struct Transpose<phi::CPUContext, bool, RANK>; \
template struct Transpose<phi::CPUContext, int16_t, RANK>; \
template struct Transpose<phi::CPUContext, uint8_t, RANK>; \
template struct Transpose<phi::CPUContext, int8_t, RANK>; \
template struct Transpose<phi::CPUContext, \
phi::dtype::complex<float>, \
RANK>; \
template struct Transpose<phi::CPUContext, phi::dtype::complex<double>, RANK>;
DEFINE_CPU_TRANS(1);
......@@ -163,8 +121,7 @@ void TransposeNormal<DeviceContext, T>::operator()(
}
// define transpose normal
#define DEFINE_CPU_TRANS_NORMAL(TYPE) \
template struct TransposeNormal<paddle::platform::CPUDeviceContext, TYPE>; \
#define DEFINE_CPU_TRANS_NORMAL(TYPE) \
template struct TransposeNormal<phi::CPUContext, TYPE>
DEFINE_CPU_TRANS_NORMAL(phi::dtype::float16);
......@@ -291,6 +248,31 @@ void set_constant(const paddle::platform::DeviceContext& context,
#endif
}
template struct ColwiseSum<phi::CPUContext, float>;
template struct ColwiseSum<phi::CPUContext, double>;
template struct ColwiseSum<phi::CPUContext, int>;
template struct ColwiseSum<phi::CPUContext, int64_t>;
template struct RowwiseMean<phi::CPUContext, float>;
template struct RowwiseMean<phi::CPUContext, double>;
template <typename T>
struct ElementwiseAddTo<paddle::platform::CPUDeviceContext, T> {
void operator()(paddle::platform::CPUDeviceContext* ctx,
const paddle::framework::Tensor& src,
paddle::framework::Tensor* dst) {
auto in = paddle::framework::EigenVector<T>::Flatten(src);
auto out = paddle::framework::EigenVector<T>::Flatten(*dst);
auto& place = *(ctx->eigen_device());
out.device(place) = out + in;
}
};
template struct ElementwiseAddTo<paddle::platform::CPUDeviceContext,
phi::dtype::float16>;
template struct ElementwiseAddTo<paddle::platform::CPUDeviceContext,
phi::dtype::bfloat16>;
template <typename T>
struct RowwiseAdd<paddle::platform::CPUDeviceContext, T> {
void operator()(const paddle::platform::CPUDeviceContext& context,
......@@ -333,41 +315,5 @@ struct RowwiseAdd<paddle::platform::CPUDeviceContext, T> {
template struct RowwiseAdd<paddle::platform::CPUDeviceContext, float>;
template struct RowwiseAdd<paddle::platform::CPUDeviceContext, double>;
template struct ColwiseSum<paddle::platform::CPUDeviceContext, float>;
template struct ColwiseSum<paddle::platform::CPUDeviceContext, double>;
template struct ColwiseSum<paddle::platform::CPUDeviceContext, int>;
template struct ColwiseSum<paddle::platform::CPUDeviceContext, int64_t>;
template struct ColwiseSum<phi::CPUContext, float>;
template struct ColwiseSum<phi::CPUContext, double>;
template struct ColwiseSum<phi::CPUContext, int>;
template struct ColwiseSum<phi::CPUContext, int64_t>;
template struct RowwiseSum<paddle::platform::CPUDeviceContext, float>;
template struct RowwiseSum<paddle::platform::CPUDeviceContext, double>;
template struct RowwiseMean<paddle::platform::CPUDeviceContext, float>;
template struct RowwiseMean<paddle::platform::CPUDeviceContext, double>;
template struct RowwiseMean<phi::CPUContext, float>;
template struct RowwiseMean<phi::CPUContext, double>;
template <typename T>
struct ElementwiseAddTo<paddle::platform::CPUDeviceContext, T> {
void operator()(paddle::platform::CPUDeviceContext* ctx,
const paddle::framework::Tensor& src,
paddle::framework::Tensor* dst) {
auto in = paddle::framework::EigenVector<T>::Flatten(src);
auto out = paddle::framework::EigenVector<T>::Flatten(*dst);
auto& place = *(ctx->eigen_device());
out.device(place) = out + in;
}
};
template struct ElementwiseAddTo<paddle::platform::CPUDeviceContext,
phi::dtype::float16>;
template struct ElementwiseAddTo<paddle::platform::CPUDeviceContext,
phi::dtype::bfloat16>;
} // namespace funcs
} // namespace phi
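The hunks above replace the paddle::platform::CPUDeviceContext instantiations with phi::CPUContext ones without changing how the functors are called. A minimal call-site sketch, not part of this change (the ZeroOut helper name and the header paths are assumptions; SetConstant<phi::CPUContext, float> itself is instantiated in the hunk above):

// Sketch only: using the unified CPU context with a functor instantiated above.
// Assumes paddle::framework::Tensor already aliases phi::DenseTensor, so a
// DenseTensor can be passed to SetConstant directly.
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/math_function.h"

void ZeroOut(const phi::CPUContext& ctx, phi::DenseTensor* t) {
  // Only the phi::CPUContext specialization remains after this change.
  phi::funcs::SetConstant<phi::CPUContext, float> set_zero;
  set_zero(ctx, t, 0.0f);
}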
......@@ -29,9 +29,5 @@ void MatrixInverseFunctor<Context, T>::operator()(const Context& dev_ctx,
template class MatrixInverseFunctor<CPUContext, float>;
template class MatrixInverseFunctor<CPUContext, double>;
// TODO(chenweihang): remove these instantiations later
template class MatrixInverseFunctor<paddle::platform::CPUDeviceContext, float>;
template class MatrixInverseFunctor<paddle::platform::CPUDeviceContext, double>;
} // namespace funcs
} // namespace phi
......@@ -48,7 +48,6 @@ TEST(API, to_sparse_coo) {
std::copy(&dense_data[0][0], &dense_data[0][0] + 9, dense_x_data);
phi::CPUContext dev_ctx_cpu;
dev_ctx_cpu.Init();
// 1. test dense_to_sparse_coo
paddle::experimental::Tensor x(dense_x);
......
......@@ -47,7 +47,6 @@ TEST(Scalar, ConstructFromDenseTensor1) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx.Init();
auto* dense_x_data = dev_ctx.Alloc<float16>(&dense_x);
dense_x_data[0] = 1;
......@@ -67,7 +66,6 @@ TEST(Scalar, ConstructFromDenseTensor2) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx.Init();
auto* dense_x_data = dev_ctx.Alloc<int16_t>(&dense_x);
dense_x_data[0] = 1;
......@@ -87,7 +85,6 @@ TEST(Scalar, ConstructFromDenseTensor3) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx.Init();
auto* dense_x_data = dev_ctx.Alloc<int8_t>(&dense_x);
dense_x_data[0] = 1;
......@@ -107,7 +104,6 @@ TEST(Scalar, ConstructFromDenseTensor4) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx.Init();
auto* dense_x_data = dev_ctx.Alloc<bool>(&dense_x);
dense_x_data[0] = true;
......@@ -127,7 +123,6 @@ TEST(Scalar, ConstructFromDenseTensor5) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx.Init();
auto* dense_x_data = dev_ctx.Alloc<complex64>(&dense_x);
dense_x_data[0] = 1;
......@@ -148,7 +143,6 @@ TEST(Scalar, ConstructFromDenseTensor6) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx.Init();
auto* dense_x_data = dev_ctx.Alloc<complex128>(&dense_x);
dense_x_data[0] = 1;
......@@ -170,7 +164,6 @@ TEST(Scalar, ConstructFromDenseTensor7) {
.GetAllocator(phi::GPUPlace())
.get());
dev_ctx.Init();
auto* dense_x_data = dev_ctx.Alloc<float>(&dense_x);
FillTensor<<<1, 1, 0, dev_ctx.stream()>>>(dense_x_data);
dev_ctx.Wait();
......
......@@ -24,10 +24,6 @@ cc_test(
test_op_utils
SRCS test_op_utils.cc
DEPS op_compat_infos)
cc_test(
test_phi_device_context
SRCS test_device_context.cc
DEPS phi_context cpu_context)
cc_test(
test_meta_fn_utils
SRCS test_meta_fn_utils.cc
......
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "gtest/gtest.h"
// TODO(wilber): will remove after the cpu, gpu context merge.
#include "paddle/phi/backends/cpu/cpu_context.h"
// #include "paddle/phi/backends/all_context.h"
// NOTE: The paddle framework should add WITH_EIGEN option to support compile
// without eigen.
#include "unsupported/Eigen/CXX11/Tensor"
namespace phi {
namespace tests {
class InferenceCPUContext : public CPUContext {
public:
void SetEigenDevice(Eigen::DefaultDevice* eigen_device) {
CPUContext::SetEigenDevice(eigen_device);
}
};
TEST(DeviceContext, cpu_context) {
std::cout << "test training scenarios" << std::endl;
{
phi::CPUContext ctx;
ctx.Init();
EXPECT_TRUE(ctx.eigen_device() != nullptr);
}
std::cout << "test inference scenarios" << std::endl;
Eigen::DefaultDevice* device = new Eigen::DefaultDevice();
{
InferenceCPUContext ctx;
ctx.SetEigenDevice(device);
EXPECT_TRUE(ctx.eigen_device() != nullptr);
}
delete device;
}
} // namespace tests
} // namespace phi
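Because test_device_context.cc is deleted outright, the coverage it provided is worth restating in one place: CPUContext either builds its own Eigen device through Init() (the training path) or accepts an externally owned one through SetEigenDevice() (the inference path). Below is a condensed restatement of the removed checks, for reference only; the CheckCpuContextEigenDevice name is illustrative, and the subclass mirrors the InferenceCPUContext trick from the deleted file.

// Condensed restatement of the deleted test; reference only, adds no behavior.
#include <cassert>
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "unsupported/Eigen/CXX11/Tensor"

namespace {
// Same trick as the deleted InferenceCPUContext: re-expose SetEigenDevice().
class InferenceCPUContext : public phi::CPUContext {
 public:
  void SetEigenDevice(Eigen::DefaultDevice* eigen_device) {
    phi::CPUContext::SetEigenDevice(eigen_device);
  }
};
}  // namespace

void CheckCpuContextEigenDevice() {
  // Training scenario: the context initializes its own Eigen device.
  phi::CPUContext ctx;
  ctx.Init();
  assert(ctx.eigen_device() != nullptr);

  // Inference scenario: an externally owned Eigen device is injected.
  Eigen::DefaultDevice device;
  InferenceCPUContext inference_ctx;
  inference_ctx.SetEigenDevice(&device);
  assert(inference_ctx.eigen_device() != nullptr);
}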
......@@ -52,7 +52,6 @@ TEST(DEV_API, cast) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
phi::DataType out_dtype = phi::DataType::FLOAT64;
// 2. test API
......
......@@ -60,7 +60,6 @@ TEST(DEV_API, concat) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = phi::Concat<float>(dev_ctx, inputs, 0);
// 3. check result
......
......@@ -48,7 +48,6 @@ TEST(DEV_API, conj) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
auto out = phi::Conj<paddle::complex64>(dev_ctx, dense_x);
......
......@@ -65,7 +65,6 @@ TEST(DEV_API, copy) {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
phi::Copy(
dev_ctx, *(dense_src.get()), phi::CPUPlace(), false, dense_dst.get());
......
......@@ -36,7 +36,6 @@ TEST(DEV_API, empty) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
auto out = phi::Empty<int>(dev_ctx, {3, 2});
......@@ -66,7 +65,6 @@ TEST(DEV_API, empty_like) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = phi::EmptyLike<float>(dev_ctx, dense_x);
// 3. check result
......@@ -86,7 +84,6 @@ TEST(DEV_API, full) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = phi::Full<float>(dev_ctx, {3, 2}, val);
// 3. check result
......@@ -119,7 +116,6 @@ TEST(DEV_API, full_like) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
auto out = phi::FullLike<float>(dev_ctx, dense_x, val);
......
......@@ -61,7 +61,6 @@ TEST(DEV_API, dot) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = phi::Dot<float>(dev_ctx, dense_x, dense_y);
// 3. check result
......
......@@ -66,7 +66,6 @@ TEST(DEV_API, add) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto dense_out = phi::Add<float>(dev_ctx, dense_x, dense_y);
// 3. check result
......@@ -118,7 +117,6 @@ TEST(DEV_API, subtract) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto dense_out = phi::Subtract<float>(dev_ctx, dense_x, dense_y);
// 3. check result
......@@ -170,7 +168,6 @@ TEST(DEV_API, divide) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto dense_out = phi::Divide<float>(dev_ctx, dense_x, dense_y);
// 3. check result
......@@ -222,7 +219,6 @@ TEST(DEV_API, multiply) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto dense_out = phi::Multiply<float>(dev_ctx, dense_x, dense_y);
// 3. check result
......
......@@ -52,7 +52,6 @@ TEST(DEV_API, flatten) {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
auto out = phi::Flatten<float>(dev_ctx, dense_x, start_axis, stop_axis);
......
......@@ -273,7 +273,6 @@ TEST(math_funciton, set_constant) {
t.Resize({10, 10});
t.mutable_data<int>(paddle::platform::CPUPlace());
auto* ctx = new paddle::platform::CPUDeviceContext();
ctx->Init();
phi::funcs::set_constant(*ctx, &t, 10);
for (int64_t i = 0; i < t.numel(); ++i) {
PADDLE_ENFORCE_EQ(10,
......
......@@ -58,7 +58,6 @@ TEST(DEV_API, dot) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = Matmul<float, CPUContext>(dev_ctx, dense_x, dense_y, false, false);
// 3. check result
......
......@@ -51,7 +51,6 @@ TEST(DEV_API, mean) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = phi::Mean<float>(dev_ctx, dense_x, dims, false);
// 3. check result
......
......@@ -54,7 +54,6 @@ TEST(DEV_API, reshape) {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = phi::Reshape<float>(dev_ctx, dense_x, shape);
// 3. check result
std::vector<int64_t> expect_shape = {12, 3};
......
......@@ -51,7 +51,6 @@ TEST(DEV_API, scale) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = phi::Scale<float>(dev_ctx, dense_x, scale, bias, bias_after_scale);
......@@ -93,7 +92,6 @@ TEST(DEV_API, scale_host) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto out = phi::Scale<float>(dev_ctx, dense_x, scale, bias, bias_after_scale);
......
......@@ -42,7 +42,6 @@ TEST(DEV_API, sparse_relu) {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx_cpu.Init();
DenseTensor dense_x =
phi::Empty(dev_ctx_cpu,
......
......@@ -75,7 +75,6 @@ void TestConv3dBase(const std::vector<IntT>& indices,
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx_cpu.Init();
const int in_channels = kernel_dims[3];
const int out_channels = kernel_dims[4];
......
......@@ -113,7 +113,6 @@ TEST(DEV_API, sparse_elementwise_coo_kernel_double) {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx_cpu.Init();
auto coo_x = sparse::DenseToSparseCoo<T>(dev_ctx_cpu, dense_x, sparse_dim);
auto coo_y = sparse::DenseToSparseCoo<T>(dev_ctx_cpu, dense_y, sparse_dim);
......@@ -159,7 +158,6 @@ TEST(DEV_API, sparse_elementwise_csr_kernel_float) {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx_cpu.Init();
auto csr_x = sparse::DenseToSparseCsr<T>(dev_ctx_cpu, dense_x);
auto csr_y = sparse::DenseToSparseCsr<T>(dev_ctx_cpu, dense_y);
......@@ -357,7 +355,6 @@ TEST(DEV_API, sparse_elementwise_csr_grad_kernel_float) {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx_cpu.Init();
auto csr_x = sparse::DenseToSparseCsr<T>(dev_ctx_cpu, dense_x);
auto csr_y = sparse::DenseToSparseCsr<T>(dev_ctx_cpu, dense_y);
......@@ -404,7 +401,6 @@ TEST(DEV_API, sparse_elementwise_coo_grad_kernel_double) {
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx_cpu.Init();
auto csr_x = sparse::DenseToSparseCoo<T>(dev_ctx_cpu, dense_x, sparse_dim);
auto csr_y = sparse::DenseToSparseCoo<T>(dev_ctx_cpu, dense_y, sparse_dim);
......
......@@ -60,7 +60,6 @@ void TestMaxPoolBase(const std::vector<IntT>& indices,
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
.get());
dev_ctx_cpu.Init();
const int in_channels = x_dims[4];
const int out_channels = in_channels;
......
......@@ -88,7 +88,6 @@ void TestDenseToSparseCoo(const DenseTensor& dense_x,
paddle::platform::CPUPlace());
phi::CPUContext dev_ctx_cpu;
dev_ctx_cpu.Init();
dev_ctx_cpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
......@@ -307,7 +306,6 @@ void TestSparseCsrToCoo(const DDim& dense_dims,
// 1. test cpu
phi::CPUContext dev_ctx_cpu;
dev_ctx_cpu.Init();
dev_ctx_cpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
......@@ -489,7 +487,6 @@ void TestCooToCsr(const DDim& dense_dims,
// 1. test cpu
phi::CPUContext dev_ctx_cpu;
dev_ctx_cpu.Init();
dev_ctx_cpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
......@@ -588,7 +585,6 @@ void TestDenseToSparseCsr(const DenseTensor& dense_x,
const auto alloc = std::make_shared<paddle::experimental::DefaultAllocator>(
paddle::platform::CPUPlace());
phi::CPUContext dev_ctx_cpu;
dev_ctx_cpu.Init();
dev_ctx_cpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
......@@ -701,7 +697,6 @@ void TestSparseCooToDense(const DDim& dense_dims,
const int64_t non_zero_num,
const int64_t sparse_dim) {
phi::CPUContext dev_ctx_cpu;
dev_ctx_cpu.Init();
dev_ctx_cpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
......@@ -879,7 +874,6 @@ void TestSparseCsrToDense(const DDim& dense_dims,
// 1. test cpu
phi::CPUContext dev_ctx_cpu;
dev_ctx_cpu.Init();
dev_ctx_cpu.SetAllocator(
paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(phi::CPUPlace())
......
......@@ -40,7 +40,6 @@ TEST(DEV_API, split) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
auto* dense_x_data = dev_ctx.Alloc<float>(&dense_x);
for (size_t i = 0; i < 4; ++i) {
......
......@@ -49,7 +49,6 @@ TEST(DEV_API, sum) {
dev_ctx.SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
.GetAllocator(paddle::platform::CPUPlace())
.get());
dev_ctx.Init();
// 2. test API
auto out =
......
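Every test hunk above makes the same edit: once the CPU context is unified, the explicit dev_ctx.Init() call is dropped and only the allocator wiring remains before Alloc<T>() is used. A minimal sketch of that post-change setup (the PrepareCpuTensor helper name and the header paths are assumptions; the SetAllocator/Alloc calls are copied from the hunks above):

// Sketch only: CPU test setup after this change, with no explicit Init() call.
#include "paddle/fluid/memory/allocation/allocator_facade.h"  // assumed path
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/dense_tensor.h"

float* PrepareCpuTensor(phi::CPUContext* dev_ctx, phi::DenseTensor* dense_x) {
  dev_ctx->SetAllocator(paddle::memory::allocation::AllocatorFacade::Instance()
                            .GetAllocator(phi::CPUPlace())
                            .get());
  return dev_ctx->Alloc<float>(dense_x);  // allocation works without Init()
}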