Unverified Commit 344b99e1 authored by H huangjiyi, committed by GitHub

[PHI decoupling] move softmax from fluid to phi and remove cpu_vec.h in fluid (#48970)

Parent 4672ea8e
......@@ -19,9 +19,9 @@
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/api/paddle_pass_builder.h"
#include "paddle/fluid/inference/utils/table_printer.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/utils/string/split.h"
#ifdef PADDLE_WITH_TENSORRT
......@@ -624,10 +624,11 @@ void AnalysisConfig::EnableMkldnnQuantizer() {
void AnalysisConfig::EnableMkldnnBfloat16() {
#ifdef PADDLE_WITH_MKLDNN
if (platform::MayIUse(platform::cpu_isa_t::avx512_core)) {
if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_core)) {
use_mkldnn_bfloat16_ = true;
LOG(INFO) << "Hardware support for BFLOAT16"
<< (platform::MayIUse(platform::cpu_isa_t::avx512_bf16)
<< (phi::backends::cpu::MayIUse(
phi::backends::cpu::cpu_isa_t::avx512_bf16)
? " is enabled"
: " is disabled. Simulation will be used");
} else {
......
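With this change the ISA probe goes through phi::backends::cpu instead of paddle::platform. As a hedged illustration of the relocated call pattern (a hypothetical helper, not part of this patch), mirroring the bf16 logic above:

#include "paddle/phi/backends/cpu/cpu_info.h"

// Hypothetical helper: true only when native AVX512-BF16 is usable;
// EnableMkldnnBfloat16 above falls back to simulation otherwise.
inline bool NativeBf16Available() {
  namespace cpu = phi::backends::cpu;
  return cpu::MayIUse(cpu::cpu_isa_t::avx512_core) &&
         cpu::MayIUse(cpu::cpu_isa_t::avx512_bf16);
}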
......@@ -29,7 +29,7 @@
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
#include "paddle/fluid/inference/utils/io_utils.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
DEFINE_string(dirname, "", "dirname to tests.");
......@@ -327,7 +327,7 @@ TEST(AnalysisPredictor, bf16_gpu_pass_strategy) {
config.EnableUseGpu(100, 0);
config.EnableMkldnnBfloat16();
#ifdef PADDLE_WITH_MKLDNN
if (platform::MayIUse(platform::cpu_isa_t::avx512_core))
if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_core))
ASSERT_EQ(config.mkldnn_bfloat16_enabled(), true);
else
ASSERT_EQ(config.mkldnn_bfloat16_enabled(), false);
......
......@@ -27,7 +27,7 @@
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
#include "paddle/fluid/inference/utils/io_utils.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
DEFINE_string(dirname, "", "dirname to tests.");
......
......@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
DEFINE_bool(enable_mkldnn, true, "Enable MKLDNN");
......@@ -47,7 +47,7 @@ TEST(Analyzer_bfloat16_image_classification, bfloat16) {
std::vector<std::vector<PaddleTensor>> input_slots_all;
SetInputs(&input_slots_all);
if (FLAGS_enable_mkldnn && FLAGS_enable_bf16 &&
platform::MayIUse(platform::cpu_isa_t::avx512_bf16)) {
phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_bf16)) {
b_cfg.EnableMkldnnBfloat16();
} else {
FLAGS_enable_bf16 = false;
......
......@@ -27,9 +27,9 @@ limitations under the License. */
#include "paddle/fluid/memory/allocation/memory_block.h"
#include "paddle/fluid/memory/allocation/system_allocator.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace memory {
......
......@@ -78,8 +78,8 @@ BuddyAllocator *GetCPUBuddyAllocator() {
std::call_once(init_flag, []() {
a = new detail::BuddyAllocator(
std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
platform::CpuMinChunkSize(),
platform::CpuMaxChunkSize());
phi::backends::cpu::CpuMinChunkSize(),
phi::backends::cpu::CpuMaxChunkSize());
});
return a;
......@@ -290,8 +290,8 @@ BuddyAllocator *GetNPUPinnedBuddyAllocator() {
std::call_once(init_flag, []() {
ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::NPUPinnedAllocator),
platform::NPUPinnedMinChunkSize(),
platform::NPUPinnedMaxChunkSize());
phi::backends::cpu::NPUPinnedMinChunkSize(),
phi::backends::cpu::NPUPinnedMaxChunkSize());
});
return ba;
......@@ -562,8 +562,8 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
std::call_once(init_flag, []() {
ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
new detail::CUDAPinnedAllocator),
platform::CUDAPinnedMinChunkSize(),
platform::CUDAPinnedMaxChunkSize());
phi::backends::cpu::CUDAPinnedMinChunkSize(),
phi::backends::cpu::CUDAPinnedMaxChunkSize());
});
return ba;
......
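Each getter above follows the same shape: a function-local pointer built exactly once under std::call_once from a backend-specific system allocator plus min/max chunk sizes. A self-contained sketch of that initialization pattern (generic names and placeholder sizes, not Paddle's):

#include <cstddef>
#include <mutex>

struct DummyAllocator {
  DummyAllocator(std::size_t min_chunk, std::size_t max_chunk)
      : min_chunk(min_chunk), max_chunk(max_chunk) {}
  std::size_t min_chunk;
  std::size_t max_chunk;
};

// Build the process-wide allocator exactly once, even if several threads race
// on the first call -- the pattern used by GetCPUBuddyAllocator and friends.
DummyAllocator* GetAllocatorSingleton() {
  static std::once_flag init_flag;
  static DummyAllocator* a = nullptr;
  std::call_once(init_flag, [] {
    a = new DummyAllocator(/*min_chunk=*/4096, /*max_chunk=*/1 << 26);  // placeholder sizes
  });
  return a;
}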
......@@ -28,10 +28,10 @@ limitations under the License. */
#endif
#include "gflags/gflags.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
......@@ -206,7 +206,7 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
// of host pinned allocation. Allocates too much would reduce
// the amount of memory available to the underlying system for paging.
size_t usable =
paddle::platform::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_;
phi::backends::cpu::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_;
if (size > usable) {
LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
......@@ -362,7 +362,7 @@ void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
if (size <= 0) return nullptr;
size_t usable =
paddle::platform::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;
phi::backends::cpu::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;
if (size > usable) {
LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
......
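Both pinned allocators above apply the same guard: the still-usable budget is the cap minus what has already been handed out, and a request larger than that is refused with a warning. A scalar sketch of that check (hypothetical helper name):

#include <cstddef>

// Return whether a pinned allocation of `request` bytes still fits under the
// cap, given the bytes already allocated -- the usable-size test used above.
inline bool FitsPinnedBudget(std::size_t request,
                             std::size_t already_allocated,
                             std::size_t cap) {
  std::size_t usable = cap > already_allocated ? cap - already_allocated : 0;
  return request <= usable;
}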
......@@ -18,9 +18,9 @@ limitations under the License. */
#include "paddle/fluid/memory/allocation/memory_block.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
// This unit test is an example comparing the performance between using pinned
// memory and not. In general, using pinned memory will be faster.
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/cpu_vec.h"
#include "paddle/phi/kernels/funcs/fc_functor.h"
......@@ -315,10 +315,10 @@ use lstm_x_t as input and compute as standard LSTM.
template <typename T>
inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
if (bias) {
phi::funcs::vec_add_bias<T, platform::avx>(n, *bias, x, y);
phi::funcs::vec_relu<T, platform::avx>(n, y, y);
phi::funcs::vec_add_bias<T, phi::backends::cpu::avx>(n, *bias, x, y);
phi::funcs::vec_relu<T, phi::backends::cpu::avx>(n, y, y);
} else {
phi::funcs::vec_relu<T, platform::avx>(n, x, y);
phi::funcs::vec_relu<T, phi::backends::cpu::avx>(n, x, y);
}
}
......@@ -329,8 +329,9 @@ inline void vec_softmax(const int n, const T* x, T* y) {
for (int i = 1; i < n; ++i) {
scalar = scalar < x[i] ? x[i] : scalar;
}
phi::funcs::vec_add_bias<T, platform::avx>(n, -scalar, x, y); // sub
phi::funcs::vec_exp<T>(n, y, y); // exp
phi::funcs::vec_add_bias<T, phi::backends::cpu::avx>(
n, -scalar, x, y); // sub
phi::funcs::vec_exp<T>(n, y, y); // exp
// sum
scalar = T(0);
for (int i = 0; i < n; ++i) {
......@@ -393,13 +394,13 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
if (platform::MayIUse(platform::avx)) {
phi::funcs::VecActivations<T, platform::avx> act_functor;
if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
phi::funcs::VecActivations<T, phi::backends::cpu::avx> act_functor;
act_gate = act_functor(act_gate_str);
act_cell = act_functor(act_cell_str);
act_cand = act_functor(act_cand_str);
} else {
phi::funcs::VecActivations<T, platform::isa_any> act_functor;
phi::funcs::VecActivations<T, phi::backends::cpu::isa_any> act_functor;
act_gate = act_functor(act_gate_str);
act_cell = act_functor(act_cell_str);
act_cand = act_functor(act_cand_str);
......
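vec_softmax above strings the phi::funcs primitives into the usual max-shifted softmax: find the row max, subtract it, exponentiate, sum, then scale by the reciprocal sum. A self-contained scalar reference of those steps (no AVX, illustrative only):

#include <algorithm>
#include <cmath>

// Scalar reference for the steps in vec_softmax: shift by the max for
// numerical stability, exponentiate, then normalize by the sum.
void softmax_ref(int n, const float* x, float* y) {
  float mx = *std::max_element(x, x + n);
  float sum = 0.f;
  for (int i = 0; i < n; ++i) {
    y[i] = std::exp(x[i] - mx);
    sum += y[i];
  }
  for (int i = 0; i < n; ++i) y[i] /= sum;
}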
......@@ -13,13 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/cross_entropy.h"
#include "paddle/phi/kernels/funcs/softmax_impl.h"
namespace paddle {
namespace operators {
......@@ -129,15 +131,15 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
softmax_2d.ShareDataWith(*softmax).Resize({N, D});
loss_2d.ShareDataWith(*loss).Resize({N, 1});
auto eigen_logits = math::EigenMatrix<T>::From(logits_2d);
auto eigen_softmax = math::EigenMatrix<T>::From(softmax_2d);
auto eigen_logits = phi::funcs::EigenMatrix<T>::From(logits_2d);
auto eigen_softmax = phi::funcs::EigenMatrix<T>::From(softmax_2d);
// step 1, obtain logit_max
phi::DenseTensor logits_max;
logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
void* logits_max_buff = logits_max.mutable_data<T>(place);
auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
auto eigen_logits_max = phi::funcs::EigenMatrix<T>::From(logits_max);
Eigen::DSizes<int, 1> along_axis(1);
eigen_logits_max.device(*dev_ctx.eigen_device()) =
eigen_logits.maximum(along_axis);
......@@ -158,7 +160,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
eigen_softmax.device(*dev_ctx.eigen_device()) =
(eigen_logits -
eigen_logits_max.reshape(batch_by_one).broadcast(one_by_class))
.unaryExpr(math::ValueClip<T>());
.unaryExpr(phi::funcs::ValueClip<T>());
// step 3, obtain predict target
phi::DenseTensor predicted_logits;
......@@ -217,7 +219,8 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
auto eigen_sum_exp_logits =
phi::funcs::EigenMatrix<T>::From(sum_exp_logits);
eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) =
eigen_softmax.sum(along_axis);
......@@ -231,8 +234,9 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
comm->comm(),
stream));
auto eigen_loss = math::EigenMatrix<T>::From(loss_2d);
auto eigen_predicted_logits = math::EigenMatrix<T>::From(predicted_logits);
auto eigen_loss = phi::funcs::EigenMatrix<T>::From(loss_2d);
auto eigen_predicted_logits =
phi::funcs::EigenMatrix<T>::From(predicted_logits);
eigen_loss.device(*dev_ctx.eigen_device()) =
(eigen_sum_exp_logits.log().unaryExpr(phi::funcs::TolerableValue<T>()) -
......@@ -281,14 +285,14 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
softmax_2d.ShareDataWith(*softmax).Resize({N, D});
loss_2d.ShareDataWith(*loss).Resize({N, 1});
auto eigen_logits = math::EigenMatrix<T>::From(logits_2d);
auto eigen_softmax = math::EigenMatrix<T>::From(softmax_2d);
auto eigen_logits = phi::funcs::EigenMatrix<T>::From(logits_2d);
auto eigen_softmax = phi::funcs::EigenMatrix<T>::From(softmax_2d);
// step 1, obtain logit_max
phi::DenseTensor logits_max;
logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
auto eigen_logits_max = phi::funcs::EigenMatrix<T>::From(logits_max);
Eigen::DSizes<int, 1> along_axis(1);
eigen_logits_max.device(*dev_ctx.eigen_device()) =
eigen_logits.maximum(along_axis);
......@@ -304,7 +308,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
eigen_softmax.device(*dev_ctx.eigen_device()) =
(eigen_logits -
eigen_logits_max.reshape(batch_by_one).broadcast(one_by_class))
.unaryExpr(math::ValueClip<T>());
.unaryExpr(phi::funcs::ValueClip<T>());
// step 3, obtain predict target
phi::DenseTensor predicted_logits;
......@@ -357,7 +361,8 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
auto eigen_sum_exp_logits =
phi::funcs::EigenMatrix<T>::From(sum_exp_logits);
eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) =
eigen_softmax.sum(along_axis);
......@@ -366,8 +371,9 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
opts.reduce_op = distributed::ReduceOp::SUM;
pg->AllReduce(in_out, in_out, opts)->Synchronize();
auto eigen_loss = math::EigenMatrix<T>::From(loss_2d);
auto eigen_predicted_logits = math::EigenMatrix<T>::From(predicted_logits);
auto eigen_loss = phi::funcs::EigenMatrix<T>::From(loss_2d);
auto eigen_predicted_logits =
phi::funcs::EigenMatrix<T>::From(predicted_logits);
eigen_loss.device(*dev_ctx.eigen_device()) =
(eigen_sum_exp_logits.log().unaryExpr(phi::funcs::TolerableValue<T>()) -
......
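The functor above evaluates softmax cross entropy in the shifted domain: per row, loss = log(sum_j exp(logit_j - max)) - (logit_label - max). A scalar sketch of that identity (self-contained; the all-reduce steps across ranks are omitted):

#include <algorithm>
#include <cmath>
#include <vector>

// Per-row softmax cross entropy written as in the Eigen expressions above:
// subtract the row max, accumulate the exp-sum, and take
// log(sum_exp) minus the shifted logit of the label class.
float softmax_xent_row(const std::vector<float>& logits, int label) {
  float mx = logits[0];
  for (float v : logits) mx = std::max(mx, v);
  float sum_exp = 0.f;
  for (float v : logits) sum_exp += std::exp(v - mx);
  return std::log(sum_exp) - (logits[label] - mx);
}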
......@@ -22,9 +22,9 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/kernels/funcs/cross_entropy.h"
#include "paddle/phi/kernels/funcs/softmax.h"
namespace paddle {
namespace operators {
......
......@@ -17,7 +17,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/kernels/elementwise_kernel.h"
namespace paddle {
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/cpu_vec.h"
#include "paddle/phi/kernels/funcs/sequence2batch.h"
......@@ -278,13 +278,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); \
auto& act_cell_str = ctx.Attr<std::string>("cell_activation"); \
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation"); \
if (platform::MayIUse(platform::avx)) { \
phi::funcs::VecActivations<T, platform::avx> act_functor; \
if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) { \
phi::funcs::VecActivations<T, phi::backends::cpu::avx> act_functor; \
act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \
} else { \
phi::funcs::VecActivations<T, platform::isa_any> act_functor; \
phi::funcs::VecActivations<T, phi::backends::cpu::isa_any> act_functor; \
act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \
......
......@@ -16,7 +16,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/cpu_vec.h"
#include "paddle/phi/kernels/funcs/fc_functor.h"
......@@ -225,11 +225,11 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
std::function<void(const int, const T*, T*)> fc_act;
auto& fc_act_str = ctx.Attr<std::string>("fc_activation");
if (platform::MayIUse(platform::avx)) {
phi::funcs::VecActivations<T, platform::avx> act_functor;
if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
phi::funcs::VecActivations<T, phi::backends::cpu::avx> act_functor;
fc_act = act_functor(fc_act_str);
} else {
phi::funcs::VecActivations<T, platform::isa_any> act_functor;
phi::funcs::VecActivations<T, phi::backends::cpu::isa_any> act_functor;
fc_act = act_functor(fc_act_str);
}
......
......@@ -9,7 +9,7 @@ file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n")
file(APPEND ${jit_file}
"\#include \"paddle/fluid/operators/jit/registry.h\"\n\n")
set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place xxhash)
set(JIT_KERNEL_DEPS device_context cblas gflags enforce place xxhash)
file(
GLOB jit_kernel_cc_srcs
......
......@@ -15,7 +15,7 @@
#include "paddle/fluid/operators/jit/gen/act.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -98,27 +98,27 @@ DECLARE_ACT_CREATOR(VTanh);
// TODO(TJ): tuning use me
bool VReluCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx);
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
}
bool VSquareCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx);
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
}
bool VIdentityCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx);
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
}
bool VExpCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx) && d < 32;
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && d < 32;
}
bool VSigmoidCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx);
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
}
bool VTanhCreator::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx);
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
}
size_t VReluCreator::CodeSize(const int& d) const {
......
......@@ -84,8 +84,8 @@ class VActFunc : public JitCode {
// compute EXP with ymm, xmm
template <typename JMM>
void exp_jmm(JMM& dst,
JMM& src,
void exp_jmm(JMM& dst, // NOLINT
JMM& src, // NOLINT
int src_idx = 11,
int fx_idx = 12, // NOLINT
int fy_idx = 13,
......@@ -144,10 +144,11 @@ class VActFunc : public JitCode {
vcvttps2dq(ymm_int, jmm_fx);
mov(reg_ptr_global, reinterpret_cast<size_t>(exp_int_0x7f));
vmovdqa(jmm_tmp, ptr[reg_ptr_global]);
if (MayIUse(avx2) || std::is_same<JMM, xmm_t>::value) {
if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx2) ||
std::is_same<JMM, xmm_t>::value) {
vpaddd(ymm_int, ymm_int, jmm_tmp);
vpslld(ymm_int, ymm_int, 23);
} else if (MayIUse(avx)) {
} else if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
xmm_t xtmp1 = xmm_t(ymm_int.getIdx());
xmm_t xtmp2 = xmm_t(jmm_tmp.getIdx());
reg64_t reg_ptr_tmp = reg_ptr_global;
......@@ -174,8 +175,8 @@ class VActFunc : public JitCode {
// compute SIGMOID with ymm, xmm
template <typename JMM>
void sigmoid_jmm(JMM& dst,
JMM& src,
void sigmoid_jmm(JMM& dst, // NOLINT
JMM& src, // NOLINT
int src_idx = 11, // NOLINT
int fx_idx = 12,
int fy_idx = 13,
......@@ -203,8 +204,8 @@ class VActFunc : public JitCode {
// compute TANH with ymm, xmm
template <typename JMM>
void tanh_jmm(JMM& dst,
JMM& src,
void tanh_jmm(JMM& dst, // NOLINT
JMM& src, // NOLINT
int src_idx = 11, // NOLINT
int fx_idx = 12,
int fy_idx = 13,
......
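Both the avx2 and avx branches of exp_jmm above rebuild 2^n from the integer produced by vcvttps2dq: add the exponent bias 0x7f, then shift it into the IEEE-754 exponent field at bit 23. A self-contained scalar restatement of that step (illustrative only; valid for n roughly in [-126, 127]):

#include <cstdint>
#include <cstring>

// Place (n + 127) into the float exponent field, yielding 2^n -- the same
// effect as the vpaddd(exp_int_0x7f) + vpslld(..., 23) pair in exp_jmm.
inline float pow2_from_int(int n) {
  uint32_t bits = static_cast<uint32_t>(n + 127) << 23;
  float out;
  std::memcpy(&out, &bits, sizeof(out));
  return out;
}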
......@@ -17,7 +17,7 @@
#include <stddef.h> // offsetof
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -132,7 +132,7 @@ void AdamJitCode::genCode() {
class AdamCreator : public JitCodeCreator<adam_attr_t> {
public:
bool CanBeUsed(const adam_attr_t& attr) const override {
return platform::MayIUse(platform::avx512f);
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f);
}
size_t CodeSize(const adam_attr_t& attr) const override {
return 96 + 32 * 8;
......
......@@ -17,7 +17,7 @@
#include <stddef.h> // offsetof
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -147,7 +147,7 @@ void AdamWJitCode::genCode() {
class AdamWCreator : public JitCodeCreator<int> {
public:
bool CanBeUsed(const int& attr) const override {
return platform::MayIUse(platform::avx512f);
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f);
}
size_t CodeSize(const int& attr) const override { return 96 + 32 * 8; }
std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override {
......
......@@ -16,7 +16,7 @@
#include "paddle/fluid/operators/jit/macro.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -145,7 +145,7 @@ void NCHW16CMulNCJitCode::genCode() {
class NCHW16CMulNCCreator : public JitCodeCreator<int> {
public:
bool CanBeUsed(const int& attr) const override {
return platform::MayIUse(platform::avx512f);
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f);
}
size_t CodeSize(const int& d) const override { return 256 * 1024; }
std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override {
......@@ -157,7 +157,8 @@ class NCHW16CMulNCCreator : public JitCodeCreator<int> {
class name##Creator : public JitCodeCreator<int> { \
public: \
bool CanBeUsed(const int& attr) const override { \
return platform::MayIUse(platform::avx) && attr <= 1024; \
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && \
attr <= 1024; \
} \
size_t CodeSize(const int& d) const override { \
return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \
......
......@@ -18,7 +18,7 @@
#include "paddle/fluid/operators/jit/macro.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -123,7 +123,7 @@ void EmbSeqPoolJitCode::genCode() {
class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
public:
bool CanBeUsed(const emb_seq_pool_attr_t& attr) const override {
return platform::MayIUse(platform::avx) &&
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) &&
attr.table_width % YMM_FLOAT_BLOCK == 0;
}
size_t CodeSize(const emb_seq_pool_attr_t& attr) const override {
......
......@@ -18,7 +18,7 @@
#include "paddle/fluid/operators/jit/macro.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -85,20 +85,21 @@ void GRUJitCode::genCode() {
ret();
}
#define DECLARE_GRU_CREATOR(name) \
class name##Creator : public JitCodeCreator<gru_attr_t> { \
public: \
/* TODO(TJ): enable more */ \
bool CanBeUsed(const gru_attr_t& attr) const override { \
return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
} \
size_t CodeSize(const gru_attr_t& attr) const override { \
return 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8; \
} \
std::unique_ptr<GenBase> CreateJitCode( \
const gru_attr_t& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \
} \
#define DECLARE_GRU_CREATOR(name) \
class name##Creator : public JitCodeCreator<gru_attr_t> { \
public: \
/* TODO(TJ): enable more */ \
bool CanBeUsed(const gru_attr_t& attr) const override { \
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && \
attr.d % 8 == 0; \
} \
size_t CodeSize(const gru_attr_t& attr) const override { \
return 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8; \
} \
std::unique_ptr<GenBase> CreateJitCode( \
const gru_attr_t& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \
} \
}
DECLARE_GRU_CREATOR(GRUH1);
......
......@@ -15,7 +15,7 @@
#include "paddle/fluid/operators/jit/gen/hopv.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -78,7 +78,7 @@ void HOPVJitCode::genCode() {
class name##Creator : public JitCodeCreator<int> { \
public: \
bool CanBeUsed(const int& attr) const override { \
return platform::MayIUse(platform::avx); \
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx); \
} \
size_t CodeSize(const int& d) const override { \
return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \
......
......@@ -18,7 +18,7 @@
#include <type_traits>
#include "paddle/fluid/operators/jit/gen_base.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#define XBYAK_USE_MMAP_ALLOCATOR
#include "xbyak/xbyak.h"
......@@ -92,7 +92,7 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
for (int i = 0; i < num_g_abi_regs; ++i) {
push(Xbyak::Reg64(g_abi_regs[i]));
}
if (platform::MayIUse(platform::avx512f)) {
if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt);
}
}
......
......@@ -18,7 +18,7 @@
#include "paddle/fluid/operators/jit/macro.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -113,20 +113,21 @@ void LSTMJitCode::genCode() {
}
}
#define DECLARE_LSTM_CREATOR(name) \
class name##Creator : public JitCodeCreator<lstm_attr_t> { \
public: \
/* TODO(TJ): enable more */ \
bool CanBeUsed(const lstm_attr_t& attr) const override { \
return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
} \
size_t CodeSize(const lstm_attr_t& attr) const override { \
return 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8; \
} \
std::unique_ptr<GenBase> CreateJitCode( \
const lstm_attr_t& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \
} \
#define DECLARE_LSTM_CREATOR(name) \
class name##Creator : public JitCodeCreator<lstm_attr_t> { \
public: \
/* TODO(TJ): enable more */ \
bool CanBeUsed(const lstm_attr_t& attr) const override { \
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && \
attr.d % 8 == 0; \
} \
size_t CodeSize(const lstm_attr_t& attr) const override { \
return 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8; \
} \
std::unique_ptr<GenBase> CreateJitCode( \
const lstm_attr_t& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \
} \
}
DECLARE_LSTM_CREATOR(LSTMCtHt);
......
......@@ -17,7 +17,7 @@
#include <stddef.h> // offsetof
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -110,12 +110,13 @@ void MatMulJitCode::genCode() {
class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
public:
bool CanBeUsed(const matmul_attr_t& attr) const override {
return attr.m == 1 && platform::MayIUse(platform::avx512f) &&
return attr.m == 1 &&
phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f) &&
attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512;
}
size_t CodeSize(const matmul_attr_t& attr) const override {
int block = YMM_FLOAT_BLOCK;
if (platform::MayIUse(platform::avx512f)) {
if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
block = ZMM_FLOAT_BLOCK;
}
return 96 + 4 * attr.k * (attr.n / block + 1) * 8;
......
......@@ -16,7 +16,7 @@
#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -59,7 +59,7 @@ void SeqPoolJitCode::genCode() {
class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
public:
bool CanBeUsed(const seq_pool_attr_t& attr) const override {
return platform::MayIUse(platform::avx);
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
}
size_t CodeSize(const seq_pool_attr_t& attr) const override {
return 96 + ((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) *
......
......@@ -17,7 +17,7 @@
#include <stddef.h> // offsetof
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -109,7 +109,7 @@ void SgdJitCode::genCode() {
class SgdCreator : public JitCodeCreator<sgd_attr_t> {
public:
bool CanBeUsed(const sgd_attr_t& attr) const override {
return platform::MayIUse(platform::avx) &&
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) &&
attr.grad_width % YMM_FLOAT_BLOCK == 0;
}
size_t CodeSize(const sgd_attr_t& attr) const override { return 96 + 32 * 8; }
......
......@@ -15,7 +15,7 @@
#include "paddle/fluid/operators/jit/gen/vbroadcast.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -69,7 +69,8 @@ void VBroadcastJitCode::genCode() {
class VBroadcastCreator : public JitCodeCreator<int64_t> {
public:
bool CanBeUsed(const int64_t& w) const override {
return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0;
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) &&
w % YMM_FLOAT_BLOCK == 0;
}
size_t CodeSize(const int64_t& w) const override {
return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8;
......
......@@ -17,8 +17,8 @@
#include <fstream>
#include "paddle/fluid/memory/allocation/cpu_allocator.h" // for posix_memalign
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#ifndef _WIN32
#define posix_memalign_free free
......@@ -66,7 +66,7 @@ void GenBase::operator delete(void* ptr) { posix_memalign_free(ptr); }
std::vector<int> packed_groups(int n, int k, int* block_out, int* rest_out) {
int block;
int max_num_regs;
if (platform::MayIUse(platform::avx512f)) {
if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
block = ZMM_FLOAT_BLOCK;
max_num_regs = 32;
} else {
......
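packed_groups above picks the packing width from the best ISA the machine reports: 16 floats per ZMM register when AVX-512F is usable, otherwise 8 per YMM. A hedged sketch of that selection as a standalone helper (hypothetical name):

#include "paddle/phi/backends/cpu/cpu_info.h"

// Hypothetical helper mirroring packed_groups: a ZMM register holds 16 floats
// under AVX-512F, a YMM register holds 8 under AVX.
inline int PackedFloatBlock() {
  namespace cpu = phi::backends::cpu;
  return cpu::MayIUse(cpu::avx512f) ? 16 : 8;
}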
......@@ -17,7 +17,7 @@
#include <limits>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -172,7 +172,7 @@ bool CRFDecodingKernel::CanBeUsed(const int& d) const {
#else
constexpr int block = YMM_FLOAT_BLOCK;
#endif
return platform::MayIUse(platform::avx) && d >= block;
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && d >= block;
}
} // namespace intrinsic
......
......@@ -17,7 +17,7 @@
#include <limits>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -179,7 +179,8 @@ void LayerNorm(float* x,
}
bool LayerNormKernel::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx) && d >= YMM_FLOAT_BLOCK;
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) &&
d >= YMM_FLOAT_BLOCK;
}
} // namespace intrinsic
......
......@@ -16,8 +16,8 @@
#include "paddle/fluid/operators/jit/refer/refer.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/dynload/mklml.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace operators {
......@@ -188,17 +188,17 @@ void StrideASum<double>(const double* x, double* res, int n, int stride) {
// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
template <>
bool VMulKernel<float>::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx512f) && d > 512;
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f) && d > 512;
}
template <>
bool VAddKernel<float>::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx) && d > 512;
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && d > 512;
}
template <>
bool VScalKernel<float>::CanBeUsed(const int& d) const {
return platform::MayIUse(platform::avx512f) && d > 512;
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f) && d > 512;
}
template <>
......@@ -274,7 +274,7 @@ bool SgdKernel<double>::CanBeUsed(const sgd_attr_t& attr) const {
template <>
bool MatMulKernel<float>::CanBeUsed(const matmul_attr_t& attr) const {
return platform::MayIUse(platform::avx);
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
}
template <>
......@@ -285,7 +285,7 @@ bool MatMulKernel<double>::CanBeUsed(const matmul_attr_t& attr) const {
template <>
bool SoftmaxKernel<float>::CanBeUsed(const int& d) const {
// tuned on avx2
return platform::MayIUse(platform::avx) && d < 60;
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && d < 60;
}
#define AWALYS_USE_ME_WITH_DOUBLE(func) \
......
......@@ -19,8 +19,8 @@ limitations under the License. */
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
DEFINE_double(acc, 1e-5, "Test accuracy threshold.");
......@@ -437,7 +437,7 @@ void TestKernelNCHW16CMulNC() {
EXPECT_TRUE(tgt != nullptr);
if (std::is_same<T, float>::value &&
paddle::platform::MayIUse(paddle::platform::avx512f)) {
phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
EXPECT_TRUE(jitcode != nullptr);
}
for (int ni = 0; ni < n; ni++) {
......@@ -1393,7 +1393,7 @@ TEST(JITKernel_helper, pack_weights) {
}
int block = 0;
std::vector<int> groups;
if (paddle::platform::MayIUse(paddle::platform::avx512f)) {
if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
block = ZMM_FLOAT_BLOCK;
groups.push_back(30);
} else {
......
......@@ -32,7 +32,6 @@ math_library(maxouting)
math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function jit_kernel_helper)
math_library(sequence_scale)
math_library(softmax DEPS math_function jit_kernel_helper)
if(WITH_ASCEND_CL)
math_library(beam_search DEPS math_function beam_search_npu)
elseif(WITH_XPU)
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cmath>
#include <functional>
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
namespace paddle {
namespace operators {
namespace math {
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define YMM_FLOAT_BLOCK 8
#define AVX_DOUBLE_BLOCK 4
#define YMM_FLOAT_BLOCK 8
#define AVX2_DOUBLE_BLOCK 4
#define ZMM_FLOAT_BLOCK 16
#define AVX512_DOUBLE_BLOCK 8
template <typename T>
inline void vec_exp(const int n, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = std::exp(x[i]);
}
}
template <typename T>
inline void vec_scal(const int n, const T a, T* x) {
for (int i = 0; i < n; ++i) {
x[i] = a * x[i];
}
}
#ifdef PADDLE_WITH_MKLML
template <>
inline void vec_exp<float>(const int n, const float* x, float* y) {
constexpr int small_enough = 128;
if (n < small_enough) {
for (int i = 0; i < n; ++i) {
y[i] = std::exp(x[i]);
}
} else {
platform::dynload::vsExp(n, x, y);
}
}
template <>
inline void vec_exp<double>(const int n, const double* x, double* y) {
platform::dynload::vdExp(n, x, y);
}
template <>
inline void vec_scal<float>(const int n, const float a, float* x) {
platform::dynload::cblas_sscal(n, a, x, 1);
}
template <>
inline void vec_scal<double>(const int n, const double a, double* x) {
platform::dynload::cblas_dscal(n, a, x, 1);
}
#endif
// MKL scal only support inplace, choose this if src and dst are not equal
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_scal(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = a * x[i];
}
}
template <>
inline void vec_scal<float, platform::avx>(const int n,
const float a,
const float* x,
float* y) {
#ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_scal<float, platform::isa_any>(n, a, x, y);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 scalar = _mm256_set1_ps(a);
__m256 tmp;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_mul_ps(tmp, scalar); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
#undef MOVE_ONE_STEP
if (rest == 0) {
return;
}
// can not continue move step if src and dst are inplace
for (i = n - rest; i < n; ++i) {
y[i] = a * x[i];
}
#else
vec_scal<float, platform::isa_any>(n, a, x, y);
#endif
}
template <>
inline void vec_scal<float, platform::avx2>(const int n,
const float a,
const float* x,
float* y) {
vec_scal<float, platform::avx>(n, a, x, y);
}
template <>
inline void vec_scal<float, platform::avx512f>(const int n,
const float a,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_scal<float, platform::avx2>(n, a, x, y);
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_sum(const size_t n, const T* x, T* s) {
s[0] = x[0];
for (size_t i = 1; i < n; ++i) {
s[0] += x[i];
}
}
template <>
inline void vec_sum<float, platform::avx>(const size_t n,
const float* x,
float* s) {
#ifdef __AVX__
constexpr unsigned int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_sum<float, platform::isa_any>(n, x, s);
return;
}
unsigned int i, end;
i = end = 0;
s[0] = 0.f;
end = n & ~(block - 1);
__m256 tmp = _mm256_setzero_ps();
for (i = 0; i < end; i += block) {
tmp = _mm256_add_ps(tmp, _mm256_loadu_ps(x + i));
}
__m256 hsum = _mm256_hadd_ps(tmp, tmp);
hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1));
_mm_store_ss(
s,
_mm_hadd_ps(_mm256_castps256_ps128(hsum), _mm256_castps256_ps128(hsum)));
for (; i < n; i++) {
s[0] += x[i];
}
#else
vec_sum<float, platform::isa_any>(n, x, s);
#endif
}
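// Hedged aside, not part of this header: the AVX branch above reduces a __m256
// to one float via hadd, a 128-bit lane swap, and a final hadd. The same
// reduction as a standalone helper (assumes __AVX__ and <immintrin.h>):
//   [a0..a7] -> hadd -> pairwise sums repeated within each 128-bit lane
//   -> add the swapped lanes -> every slot holds a sum of four inputs
//   -> the final _mm_hadd_ps leaves the full sum in lane 0.
#ifdef __AVX__
inline float avx_hsum_sketch(__m256 v) {
  __m256 h = _mm256_hadd_ps(v, v);
  h = _mm256_add_ps(h, _mm256_permute2f128_ps(h, h, 0x1));
  __m128 lo = _mm256_castps256_ps128(h);
  float out;
  _mm_store_ss(&out, _mm_hadd_ps(lo, lo));
  return out;
}
#endif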
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_mul(const size_t n, const T* x, const T* y, T* z) {
for (size_t i = 0; i < n; ++i) {
z[i] = x[i] * y[i];
}
}
template <>
inline void vec_mul<float, platform::avx>(const size_t n,
const float* x,
const float* y,
float* z) {
#ifdef __AVX__
constexpr unsigned int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_mul<float, platform::isa_any>(n, x, y, z);
return;
}
unsigned int i = 0, end = 0;
end = n & ~(block - 1);
for (i = 0; i < end; i += block) {
_mm256_storeu_ps(
z + i, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i)));
}
for (; i < n; i++) {
z[i] = x[i] * y[i];
}
#else
vec_mul<float, platform::isa_any>(n, x, y, z);
#endif
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_mul_reduce(const size_t n, const T* x, const T* y, T* z) {
z[0] = x[0] * y[0];
for (size_t i = 1; i < n; ++i) {
z[0] += x[i] * y[i];
}
}
template <>
inline void vec_mul_reduce<float, platform::avx>(const size_t n,
const float* x,
const float* y,
float* z) {
#ifdef __AVX__
constexpr unsigned int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_mul_reduce<float, platform::isa_any>(n, x, y, z);
return;
}
unsigned int i = 0, end = 0;
z[0] = 0.f;
end = n & ~(block - 1);
__m256 tmp = _mm256_setzero_ps();
for (i = 0; i < end; i += block) {
tmp = _mm256_add_ps(
tmp, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i)));
}
__m256 hsum = _mm256_hadd_ps(tmp, tmp);
hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1));
_mm_store_ss(
z,
_mm_hadd_ps(_mm256_castps256_ps128(hsum), _mm256_castps256_ps128(hsum)));
for (; i < n; i++) {
z[0] += x[i] * y[i];
}
#else
vec_mul_reduce<float, platform::isa_any>(n, x, y, z);
#endif
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = a - x[i];
}
}
template <>
inline void vec_bias_sub<float, platform::avx>(const int n,
const float a,
const float* x,
float* y) {
#ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_bias_sub<float, platform::isa_any>(n, a, x, y);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 bias = _mm256_set1_ps(a);
__m256 tmp;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_sub_ps(bias, tmp); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
#undef MOVE_ONE_STEP
if (rest == 0) {
return;
}
// can not continue move step if src and dst are inplace
for (i = n - rest; i < n; ++i) {
y[i] = a - x[i];
}
#else
vec_bias_sub<float, platform::isa_any>(n, a, x, y);
#endif
}
template <>
inline void vec_bias_sub<float, platform::avx2>(const int n,
const float a,
const float* x,
float* y) {
vec_bias_sub<float, platform::avx>(n, a, x, y);
}
template <>
inline void vec_bias_sub<float, platform::avx512f>(const int n,
const float a,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_bias_sub<float, platform::avx2>(n, a, x, y);
}
// out = x*y + (1-x)*z
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) {
for (int i = 0; i < n; ++i) {
out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i];
}
}
template <>
inline void vec_cross<float, platform::avx>(
const int n, const float* x, const float* y, const float* z, float* out) {
#ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_cross<float, platform::isa_any>(n, x, y, z, out);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 bias = _mm256_set1_ps(1.f);
__m256 tmpx, tmpy, tmpz;
for (i = 0; i < end; i += block) {
tmpx = _mm256_loadu_ps(x + i);
tmpy = _mm256_loadu_ps(y + i);
tmpz = _mm256_loadu_ps(z + i);
tmpy = _mm256_mul_ps(tmpx, tmpy);
tmpx = _mm256_sub_ps(bias, tmpx);
tmpz = _mm256_mul_ps(tmpx, tmpz);
tmpz = _mm256_add_ps(tmpy, tmpz);
_mm256_storeu_ps(out + i, tmpz);
}
if (rest == 0) {
return;
}
// can not continue move step if src and dst are inplace
for (i = n - rest; i < n; ++i) {
out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
}
#else
vec_cross<float, platform::isa_any>(n, x, y, z, out);
#endif
}
template <>
inline void vec_cross<float, platform::avx2>(
const int n, const float* x, const float* y, const float* z, float* out) {
vec_cross<float, platform::avx>(n, x, y, z, out);
}
template <>
inline void vec_cross<float, platform::avx512f>(
const int n, const float* x, const float* y, const float* z, float* out) {
// TODO(TJ): enable me
vec_cross<float, platform::avx>(n, x, y, z, out);
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_clip(const size_t n, const T a, const T* x, T* y) {
for (size_t i = 0; i < n; ++i) {
y[i] = x[i] < a ? a : x[i];
}
}
template <>
inline void vec_clip<float, platform::avx>(const size_t n,
const float a,
const float* x,
float* y) {
#ifdef __AVX__
constexpr unsigned int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_clip<float, platform::isa_any>(n, a, x, y);
return;
}
unsigned int i = 0, end = 0;
end = n & ~(block - 1);
__m256 threshold = _mm256_set1_ps(a);
for (i = 0; i < end; i += block) {
_mm256_storeu_ps(y + i, _mm256_max_ps(_mm256_loadu_ps(x + i), threshold));
}
for (; i < n; i++) {
y[i] = x[i] < a ? a : x[i];
}
#else
vec_clip<float, platform::isa_any>(n, a, x, y);
#endif
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = x[i] + a;
}
}
template <>
inline void vec_add_bias<float, platform::avx>(const int n,
const float a,
const float* x,
float* y) {
#ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_add_bias<float, platform::isa_any>(n, a, x, y);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 bias = _mm256_set1_ps(a);
__m256 tmp;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_add_ps(tmp, bias); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
#undef MOVE_ONE_STEP
if (rest == 0) {
return;
}
// can not continue move step if src and dst are inplace
for (i = n - rest; i < n; ++i) {
y[i] = x[i] + a;
}
#else
vec_add_bias<float, platform::isa_any>(n, a, x, y);
#endif
}
template <>
inline void vec_add_bias<float, platform::avx2>(const int n,
const float a,
const float* x,
float* y) {
vec_add_bias<float, platform::avx>(n, a, x, y);
}
template <>
inline void vec_add_bias<float, platform::avx512f>(const int n,
const float a,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_add_bias<float, platform::avx2>(n, a, x, y);
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_identity(const int n, const T* x, T* y) {
// do nothing
return;
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_sigmoid(const int n, const T* x, T* y) {
const T min = SIGMOID_THRESHOLD_MIN;
const T max = SIGMOID_THRESHOLD_MAX;
for (int i = 0; i < n; ++i) {
y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
y[i] = static_cast<T>(0) - y[i];
}
vec_exp<T>(n, y, y);
for (int i = 0; i < n; ++i) {
y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
}
}
template <>
inline void vec_sigmoid<float, platform::avx>(const int n,
const float* x,
float* y) {
#ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK;
if (n < block) {
vec_sigmoid<float, platform::isa_any>(n, x, y);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
__m256 zeros = _mm256_setzero_ps();
__m256 tmp;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_max_ps(tmp, min); \
tmp = _mm256_min_ps(tmp, max); \
tmp = _mm256_sub_ps(zeros, tmp); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
#undef MOVE_ONE_STEP
if (rest != 0) {
// can not continue move step since the src and dst address could be equal
const float xmin = SIGMOID_THRESHOLD_MIN;
const float xmax = SIGMOID_THRESHOLD_MAX;
for (i = n - rest; i < n; ++i) {
y[i] = 0.f - ((x[i] < xmin) ? xmin : ((x[i] > xmax) ? xmax : x[i]));
}
}
vec_exp<float>(n, y, y);
__m256 ones = _mm256_set1_ps(1.0f);
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(y + i); \
tmp = _mm256_add_ps(ones, tmp); \
tmp = _mm256_div_ps(ones, tmp); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
#undef MOVE_ONE_STEP
if (rest == 0) {
return;
}
// can not continue move step
for (i = n - rest; i < n; ++i) {
y[i] = 1.f / (1.f + y[i]);
}
#else
vec_sigmoid<float, platform::isa_any>(n, x, y);
#endif
}
template <>
inline void vec_sigmoid<float, platform::avx2>(const int n,
const float* x,
float* y) {
vec_sigmoid<float, platform::avx>(n, x, y);
}
template <>
inline void vec_sigmoid<float, platform::avx512f>(const int n,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_sigmoid<float, platform::avx2>(n, x, y);
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_tanh(const int n, const T* x, T* y) {
vec_scal<T, isa>(n, static_cast<T>(2), x, y);
vec_sigmoid<T, isa>(n, y, y);
vec_scal<T>(n, static_cast<T>(2), y);
vec_add_bias<T, isa>(n, static_cast<T>(-1), y, y);
}
// TODO(TJ): make relu clip
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_relu(const int n, const T* x, T* y) {
for (int i = 0; i < n; ++i) {
y[i] = x[i] > 0 ? x[i] : 0;
}
}
template <>
inline void vec_relu<float, platform::avx>(const int n,
const float* x,
float* y) {
#ifdef __AVX__
constexpr int block = YMM_FLOAT_BLOCK;
if (n < block * 4) {
vec_relu<float, platform::isa_any>(n, x, y);
return;
}
const int rest = n % block;
const int end = n - rest;
int i = 0;
__m256 zeros = _mm256_setzero_ps();
__m256 tmp;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_max_ps(tmp, zeros); \
_mm256_storeu_ps(y + i, tmp)
for (i = 0; i < end; i += block) {
MOVE_ONE_STEP;
}
if (rest == 0) {
return;
}
i = n - block;
MOVE_ONE_STEP;
#undef MOVE_ONE_STEP
#else
vec_relu<float, platform::isa_any>(n, x, y);
#endif
}
template <>
inline void vec_relu<float, platform::avx2>(const int n,
const float* x,
float* y) {
vec_relu<float, platform::avx>(n, x, y);
}
template <>
inline void vec_relu<float, platform::avx512f>(const int n,
const float* x,
float* y) {
// TODO(TJ): enable me
vec_relu<float, platform::avx2>(n, x, y);
}
// TODO(TJ): optimize double of sigmoid, tanh and relu if necessary
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
class VecActivations {
public:
std::function<void(const int, const T*, T*)> operator()(
const std::string& type) {
if (type == "sigmoid") {
return vec_sigmoid<T, isa>;
} else if (type == "relu") {
return vec_relu<T, isa>;
} else if (type == "tanh") {
return vec_tanh<T, isa>;
} else if (type == "identity" || type == "") {
return vec_identity<T, isa>;
}
PADDLE_THROW(platform::errors::InvalidArgument(
"Expected type should be one of sigmod, relu, tanh, identity. But got "
"not support type: %s.",
type));
}
};
} // namespace math
} // namespace operators
} // namespace paddle
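vec_tanh in the listing above composes tanh from the sigmoid primitives via the identity tanh(x) = 2 * sigmoid(2x) - 1 (scale by 2, sigmoid, scale by 2, subtract 1). A self-contained scalar check of that identity:

#include <cassert>
#include <cmath>

// tanh(x) = 2 * sigmoid(2x) - 1, the identity vec_tanh builds on above.
float tanh_via_sigmoid(float x) {
  float s = 1.f / (1.f + std::exp(-2.f * x));
  return 2.f * s - 1.f;
}

int main() {
  for (float x = -5.f; x <= 5.f; x += 0.25f) {
    assert(std::fabs(tanh_via_sigmoid(x) - std::tanh(x)) < 1e-5f);
  }
  return 0;
}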
......@@ -21,9 +21,9 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/sample_logits_op.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
namespace paddle {
namespace operators {
......
......@@ -21,8 +21,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
namespace paddle {
namespace operators {
......
......@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
namespace paddle {
namespace operators {
......@@ -61,7 +61,7 @@ class SequenceSoftmaxCUDNNKernel : public framework::OpKernel<T> {
phi::make_ddim({1UL, end_pos - start_pos});
x_i.Resize(dims_i);
out_i.Resize(dims_i);
math::SoftmaxCUDNNFunctor<T, phi::GPUContext>()(
phi::funcs::SoftmaxCUDNNFunctor<T, phi::GPUContext>()(
ctx.template device_context<phi::GPUContext>(), &x_i, &out_i);
}
}
......@@ -95,7 +95,7 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
out_i.Resize(dims_i);
out_grad_i.Resize(dims_i);
x_grad_i.Resize(dims_i);
math::SoftmaxGradCUDNNFunctor<T, phi::GPUContext>()(
phi::funcs::SoftmaxGradCUDNNFunctor<T, phi::GPUContext>()(
ctx.template device_context<phi::GPUContext>(),
&out_i,
&out_grad_i,
......
......@@ -16,10 +16,10 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/cross_entropy.h"
#include "paddle/phi/kernels/funcs/softmax.h"
namespace paddle {
namespace operators {
......
......@@ -56,18 +56,10 @@ cc_test(
SRCS enforce_test.cc
DEPS enforce)
set(CPU_INFO_DEPS gflags glog enforce)
if(WITH_XBYAK)
list(APPEND CPU_INFO_DEPS xbyak)
endif()
cc_library(
cpu_info
SRCS cpu_info.cc
DEPS ${CPU_INFO_DEPS})
cc_test(
cpu_info_test
SRCS cpu_info_test.cc
DEPS cpu_info)
DEPS phi_backends)
cc_library(
os_info
SRCS os_info.cc
......@@ -194,7 +186,6 @@ cc_library(
phi_place
eigen3
cpu_helper
cpu_info
framework_proto
${IPU_CTX_DEPS}
${GPU_CTX_DEPS}
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stddef.h>
#ifdef _WIN32
#if defined(__AVX2__)
#include <immintrin.h> // avx2
#elif defined(__AVX__)
#include <intrin.h> // avx
#endif // AVX
#else // WIN32
#ifdef __AVX__
#include <immintrin.h>
#endif
#endif // WIN32
#if defined(_WIN32)
#define ALIGN32_BEG __declspec(align(32))
#define ALIGN32_END
#else
#define ALIGN32_BEG
#define ALIGN32_END __attribute__((aligned(32)))
#endif // _WIN32
#ifndef PADDLE_WITH_XBYAK
#ifdef _WIN32
#define cpuid(reg, x) __cpuidex(reg, x, 0)
#else
#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) && \
!defined(PADDLE_WITH_SW) && !defined(PADDLE_WITH_MIPS)
#include <cpuid.h>
inline void cpuid(int reg[4], int x) {
__cpuid_count(x, 0, reg[0], reg[1], reg[2], reg[3]);
}
#endif
#endif
#endif
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace platform {
size_t CpuTotalPhysicalMemory();
//! Get the maximum allocation size for a machine.
size_t CpuMaxAllocSize();
//! Get the maximum allocation size for a machine.
size_t CUDAPinnedMaxAllocSize();
using phi::backends::cpu::CpuMinChunkSize;
//! Get the maximum chunk size for buddy allocator.
size_t CpuMaxChunkSize();
//! Get the minimum chunk size for buddy allocator.
size_t CUDAPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t CUDAPinnedMaxChunkSize();
//! Get the maximum allocation size for a machine.
size_t NPUPinnedMaxAllocSize();
//! Get the minimum chunk size for buddy allocator.
size_t NPUPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t NPUPinnedMaxChunkSize();
using namespace phi::backends::cpu; // NOLINT
// May I use some instruction
bool MayIUse(const cpu_isa_t cpu_isa);
} // namespace platform
} // namespace paddle
......@@ -11,7 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include <sstream>
......@@ -23,7 +23,8 @@ DECLARE_double(fraction_of_cpu_memory_to_use);
TEST(CpuMemoryUsage, Print) {
std::stringstream ss;
size_t memory_size = paddle::platform::CpuMaxAllocSize() / 1024 / 1024 / 1024;
size_t memory_size =
phi::backends::cpu::CpuMaxAllocSize() / 1024 / 1024 / 1024;
float use_percent = FLAGS_fraction_of_cpu_memory_to_use * 100;
std::cout << paddle::string::Sprintf("\n%.2f %% of CPU Memory Usage: %d GB\n",
......
......@@ -16,9 +16,9 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/string/split.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
......
......@@ -29,7 +29,7 @@ cc_library(
cc_library(
cpu_utilization
SRCS cpu_utilization.cc
DEPS cpu_info os_info enforce glog)
DEPS phi_backends os_info enforce glog)
cc_library(
new_profiler
SRCS profiler.cc
......
......@@ -72,7 +72,6 @@ limitations under the License. */
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
......@@ -89,6 +88,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h"
......
......@@ -72,7 +72,6 @@ limitations under the License. */
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
......@@ -89,6 +88,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h"
......
......@@ -75,7 +75,6 @@ limitations under the License. */
#include "paddle/fluid/operators/ops_extra_info.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
......@@ -94,6 +93,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/io.h"
#include "paddle/fluid/pybind/jit.h"
#include "paddle/fluid/pybind/xpu_streams_py.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h"
......@@ -327,7 +327,7 @@ bool SupportsBfloat16() {
#ifndef PADDLE_WITH_MKLDNN
return false;
#else
if (platform::MayIUse(platform::cpu_isa_t::avx512_core))
if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_core))
return true;
else
return false;
......@@ -338,7 +338,7 @@ bool SupportsBfloat16FastPerformance() {
#ifndef PADDLE_WITH_MKLDNN
return false;
#else
if (platform::MayIUse(platform::cpu_isa_t::avx512_bf16))
if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_bf16))
return true;
else
return false;
......@@ -349,8 +349,8 @@ bool SupportsInt8() {
#ifndef PADDLE_WITH_MKLDNN
return false;
#else
return (platform::MayIUse(platform::cpu_isa_t::avx2) ||
platform::MayIUse(platform::cpu_isa_t::avx512f));
return (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx2) ||
phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512f));
#endif
}
......@@ -358,7 +358,8 @@ bool SupportsVNNI() {
#ifndef PADDLE_WITH_MKLDNN
return false;
#else
return platform::MayIUse(platform::cpu_isa_t::avx512_core_vnni);
return phi::backends::cpu::MayIUse(
phi::backends::cpu::cpu_isa_t::avx512_core_vnni);
#endif
}
......@@ -615,7 +616,7 @@ PYBIND11_MODULE(libpaddle, m) {
BindJit(&m);
// Not used, just make sure cpu_info.cc is linked.
paddle::platform::CpuTotalPhysicalMemory();
phi::backends::cpu::CpuTotalPhysicalMemory();
paddle::memory::allocation::UseAllocatorStrategyGFlag();
......
......@@ -72,7 +72,6 @@ limitations under the License. */
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
......@@ -89,6 +88,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h"
......
add_subdirectory(dynload)
add_subdirectory(gpu)
set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc)
set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc cpu/cpu_info.cc)
set(BACKENDS_DEPS enforce place flags eigen3 phi_device_context)
if(WITH_XBYAK)
list(APPEND BACKENDS_DEPS xbyak)
endif()
if(WITH_GPU OR WITH_ROCM)
list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc
......
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
......@@ -12,11 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/cpu_info.h"
#ifdef PADDLE_WITH_XBYAK
#include "xbyak/xbyak_util.h"
#endif
#include "paddle/phi/backends/cpu/cpu_info.h"
#ifdef __APPLE__
#include <sys/sysctl.h>
......@@ -30,6 +26,10 @@ limitations under the License. */
#include <unistd.h>
#endif // _WIN32
#ifdef PADDLE_WITH_XBYAK
#include "xbyak/xbyak_util.h"
#endif
#include <algorithm>
#include "paddle/phi/core/flags.h"
......@@ -47,8 +47,9 @@ PADDLE_DEFINE_EXPORTED_bool(use_pinned_memory,
true,
"If set, allocate cpu pinned memory.");
namespace paddle {
namespace platform {
namespace phi {
namespace backends {
namespace cpu {
size_t CpuTotalPhysicalMemory() {
#ifdef __APPLE__
......@@ -87,6 +88,11 @@ size_t CpuMaxChunkSize() {
static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
}
size_t CpuMinChunkSize() {
// Allow to allocate the minimum chunk size is 4 KB.
return 1 << 12;
}
size_t CUDAPinnedMaxAllocSize() {
// For distributed systems, it requires configuring and limiting
// the fraction of memory to use.
......@@ -206,5 +212,6 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
}
#endif
} // namespace platform
} // namespace paddle
} // namespace cpu
} // namespace backends
} // namespace phi
......@@ -36,15 +36,52 @@
#define ALIGN32_END __attribute__((aligned(32)))
#endif // _WIN32
#ifndef PADDLE_WITH_XBYAK
#ifdef _WIN32
#define cpuid(reg, x) __cpuidex(reg, x, 0)
#else
#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) && \
!defined(PADDLE_WITH_SW) && !defined(PADDLE_WITH_MIPS)
#include <cpuid.h>
inline void cpuid(int reg[4], int x) {
__cpuid_count(x, 0, reg[0], reg[1], reg[2], reg[3]);
}
#endif
#endif
#endif
namespace phi {
namespace backends {
namespace cpu {
size_t CpuTotalPhysicalMemory();
//! Get the maximum allocation size for a machine.
size_t CpuMaxAllocSize();
//! Get the maximum allocation size for a machine.
size_t CUDAPinnedMaxAllocSize();
//! Get the minimum chunk size for buddy allocator.
inline size_t CpuMinChunkSize() {
// Allow to allocate the minimum chunk size is 4 KB.
return 1 << 12;
}
size_t CpuMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t CpuMaxChunkSize();
//! Get the minimum chunk size for buddy allocator.
size_t CUDAPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t CUDAPinnedMaxChunkSize();
//! Get the maximum allocation size for a machine.
size_t NPUPinnedMaxAllocSize();
//! Get the minimum chunk size for buddy allocator.
size_t NPUPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t NPUPinnedMaxChunkSize();
typedef enum {
isa_any,
......@@ -59,6 +96,8 @@ typedef enum {
avx512_bf16,
} cpu_isa_t; // Instruction set architecture
// May I use some instruction
bool MayIUse(const cpu_isa_t cpu_isa);
} // namespace cpu
} // namespace backends
} // namespace phi
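// Illustrative call site only (not part of this diff): after the move, ISA
// queries go through the phi namespace, as in the call sites changed above.
// The helper name below is hypothetical.
#include "paddle/phi/backends/cpu/cpu_info.h"
static bool CanUseBf16Fast() {
  namespace cpu = phi::backends::cpu;
  // avx512_core gates BF16 support; avx512_bf16 enables the fast path.
  return cpu::MayIUse(cpu::cpu_isa_t::avx512_core) &&
         cpu::MayIUse(cpu::cpu_isa_t::avx512_bf16);
}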
......@@ -19,6 +19,7 @@ math_library(matrix_solve DEPS dense_tensor eigen3 blas math_function)
math_library(cross_entropy)
math_library(im2col)
math_library(vol2col)
math_library(softmax DEPS math_function)
cc_library(
phi_data_layout_transform
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <stdint.h>
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/eigen/extensions.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace phi {
......
......@@ -12,20 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/kernels/funcs/softmax_impl.h"
namespace paddle {
namespace operators {
namespace math {
namespace phi {
namespace funcs {
template class SoftmaxFunctor<phi::CPUContext, float>;
template class SoftmaxFunctor<phi::CPUContext, double>;
template class SoftmaxGradFunctor<phi::CPUContext, float>;
template class SoftmaxGradFunctor<phi::CPUContext, double>;
} // namespace math
} // namespace operators
} // namespace paddle
} // namespace funcs
} // namespace phi
......@@ -13,20 +13,19 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/funcs/softmax_impl.h"
namespace paddle {
namespace operators {
namespace math {
namespace phi {
namespace funcs {
using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
using DataLayout = platform::DataLayout;
using ScopedTensorDescriptor = phi::backends::gpu::ScopedTensorDescriptor;
using DataLayout = phi::backends::gpu::DataLayout;
template <typename T>
using CudnnDataType = platform::CudnnDataType<T>;
using CudnnDataType = phi::backends::gpu::CudnnDataType<T>;
template <typename T, typename DeviceContext>
void SoftmaxCUDNNFunctor<T, DeviceContext>::operator()(
......@@ -51,31 +50,31 @@ void SoftmaxCUDNNFunctor<T, DeviceContext>::operator()(
xDesc.descriptor<T>(layout, cudnn_tensor_dims);
miopenTensorDescriptor_t cudnn_y_desc =
xDesc.descriptor<T>(layout, cudnn_tensor_dims);
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2(
context.cudnn_handle(),
CudnnDataType<T>::kOne(),
cudnn_x_desc,
X->data<T>(),
CudnnDataType<T>::kZero(),
cudnn_y_desc,
Y->mutable_data<T>(context.GetPlace()),
MIOPEN_SOFTMAX_ACCURATE,
MIOPEN_SOFTMAX_MODE_INSTANCE));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenSoftmaxForward_V2(context.cudnn_handle(),
CudnnDataType<T>::kOne(),
cudnn_x_desc,
X->data<T>(),
CudnnDataType<T>::kZero(),
cudnn_y_desc,
context.template Alloc<T>(Y),
MIOPEN_SOFTMAX_ACCURATE,
MIOPEN_SOFTMAX_MODE_INSTANCE));
#else
cudnnTensorDescriptor_t cudnn_x_desc =
xDesc.descriptor<T>(layout, cudnn_tensor_dims);
cudnnTensorDescriptor_t cudnn_y_desc =
xDesc.descriptor<T>(layout, cudnn_tensor_dims);
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward(
context.cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_INSTANCE,
CudnnDataType<T>::kOne(),
cudnn_x_desc,
X->data<T>(),
CudnnDataType<T>::kZero(),
cudnn_y_desc,
Y->mutable_data<T>(context.GetPlace())));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::cudnnSoftmaxForward(context.cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_INSTANCE,
CudnnDataType<T>::kOne(),
cudnn_x_desc,
X->data<T>(),
CudnnDataType<T>::kZero(),
cudnn_y_desc,
context.template Alloc<T>(Y)));
#endif
}
......@@ -106,18 +105,18 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
miopenTensorDescriptor_t cudnn_ygrad_desc =
dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2(
context.cudnn_handle(),
CudnnDataType<T>::kOne(),
cudnn_y_desc,
Y->data<T>(),
cudnn_ygrad_desc,
YGrad->data<T>(),
CudnnDataType<T>::kZero(),
cudnn_xgrad_desc,
XGrad->mutable_data<T>(context.GetPlace()),
MIOPEN_SOFTMAX_ACCURATE,
MIOPEN_SOFTMAX_MODE_INSTANCE));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::miopenSoftmaxBackward_V2(context.cudnn_handle(),
CudnnDataType<T>::kOne(),
cudnn_y_desc,
Y->data<T>(),
cudnn_ygrad_desc,
YGrad->data<T>(),
CudnnDataType<T>::kZero(),
cudnn_xgrad_desc,
context.template Alloc<T>(XGrad),
MIOPEN_SOFTMAX_ACCURATE,
MIOPEN_SOFTMAX_MODE_INSTANCE));
#else
cudnnTensorDescriptor_t cudnn_y_desc =
yDesc.descriptor<T>(layout, cudnn_tensor_dims);
......@@ -125,28 +124,28 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
cudnnTensorDescriptor_t cudnn_ygrad_desc =
dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxBackward(
context.cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_INSTANCE,
CudnnDataType<T>::kOne(),
cudnn_y_desc,
Y->data<T>(),
cudnn_ygrad_desc,
YGrad->data<T>(),
CudnnDataType<T>::kZero(),
cudnn_xgrad_desc,
XGrad->mutable_data<T>(context.GetPlace())));
PADDLE_ENFORCE_GPU_SUCCESS(
phi::dynload::cudnnSoftmaxBackward(context.cudnn_handle(),
CUDNN_SOFTMAX_ACCURATE,
CUDNN_SOFTMAX_MODE_INSTANCE,
CudnnDataType<T>::kOne(),
cudnn_y_desc,
Y->data<T>(),
cudnn_ygrad_desc,
YGrad->data<T>(),
CudnnDataType<T>::kZero(),
cudnn_xgrad_desc,
context.template Alloc<T>(XGrad)));
#endif
}
template class SoftmaxCUDNNFunctor<float, phi::GPUContext>;
template class SoftmaxCUDNNFunctor<platform::float16, phi::GPUContext>;
template class SoftmaxCUDNNFunctor<phi::dtype::float16, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<float, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<platform::float16, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<phi::dtype::float16, phi::GPUContext>;
#if CUDNN_VERSION_MIN(8, 1, 0)
template class SoftmaxCUDNNFunctor<platform::bfloat16, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<platform::bfloat16, phi::GPUContext>;
template class SoftmaxCUDNNFunctor<phi::dtype::bfloat16, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<phi::dtype::bfloat16, phi::GPUContext>;
#endif
// MIOPEN do not support double
......@@ -155,15 +154,14 @@ template class SoftmaxCUDNNFunctor<double, phi::GPUContext>;
template class SoftmaxGradCUDNNFunctor<double, phi::GPUContext>;
#endif
template class SoftmaxFunctor<phi::GPUContext, platform::float16>;
template class SoftmaxFunctor<phi::GPUContext, platform::bfloat16>;
template class SoftmaxFunctor<phi::GPUContext, phi::dtype::float16>;
template class SoftmaxFunctor<phi::GPUContext, phi::dtype::bfloat16>;
template class SoftmaxFunctor<phi::GPUContext, float>;
template class SoftmaxFunctor<phi::GPUContext, double>;
template class SoftmaxGradFunctor<phi::GPUContext, float>;
template class SoftmaxGradFunctor<phi::GPUContext, double>;
template class SoftmaxGradFunctor<phi::GPUContext, platform::float16>;
template class SoftmaxGradFunctor<phi::GPUContext, platform::bfloat16>;
template class SoftmaxGradFunctor<phi::GPUContext, phi::dtype::float16>;
template class SoftmaxGradFunctor<phi::GPUContext, phi::dtype::bfloat16>;
} // namespace math
} // namespace operators
} // namespace paddle
} // namespace funcs
} // namespace phi
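// Note on the allocation change above (illustrative, simplified; these two
// lines summarize the pattern rather than quote the diff): the cuDNN/MIOpen
// wrappers stop using framework Tensor::mutable_data and allocate outputs
// through the phi device context instead.
//   before:  T* out_ptr = Y->mutable_data<T>(context.GetPlace());
//   after:   T* out_ptr = context.template Alloc<T>(Y);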
......@@ -13,11 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/framework/tensor.h"
#include "paddle/phi/core/dense_tensor.h"
namespace paddle {
namespace operators {
namespace math {
namespace phi {
namespace funcs {
template <typename DeviceContext, typename T, typename Enable = void>
class SoftmaxFunctor {
......@@ -58,6 +57,5 @@ class SoftmaxGradCUDNNFunctor {
#endif
} // namespace math
} // namespace operators
} // namespace paddle
} // namespace funcs
} // namespace phi
......@@ -15,24 +15,22 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/cpu_vec.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
namespace paddle {
namespace operators {
namespace math {
namespace phi {
namespace funcs {
template <typename T,
int MajorType = Eigen::RowMajor,
typename IndexType = Eigen::DenseIndex>
using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
using EigenMatrix = phi::EigenMatrix<T, MajorType, IndexType>;
template <typename T>
struct ValueClip {
......@@ -104,7 +102,7 @@ class SoftmaxEigen {
};
template <typename DeviceContext>
class SoftmaxEigen<DeviceContext, platform::float16> {
class SoftmaxEigen<DeviceContext, phi::dtype::float16> {
public:
void operator()(const DeviceContext& context,
const int axis_dim,
......@@ -114,8 +112,8 @@ class SoftmaxEigen<DeviceContext, platform::float16> {
constexpr int kClassDim = 1;
constexpr int kAxisDim = 1;
auto logits = EigenMatrix<platform::float16>::From(*X);
auto softmax = EigenMatrix<platform::float16>::From(*Y);
auto logits = EigenMatrix<phi::dtype::float16>::From(*X);
auto softmax = EigenMatrix<phi::dtype::float16>::From(*Y);
const int batch_size = logits.dimension(kBatchDim);
const int num_classes = logits.dimension(kClassDim);
......@@ -139,7 +137,7 @@ class SoftmaxEigen<DeviceContext, platform::float16> {
(logits - logits.maximum(along_axis)
.reshape(batch_by_one)
.broadcast(one_by_class))
.unaryExpr(ValueClip<platform::float16>());
.unaryExpr(ValueClip<phi::dtype::float16>());
} else {
// axis != -1, class dimension split into (axis, remain), max and sum
// should be calculated along axis dimension
......@@ -149,7 +147,7 @@ class SoftmaxEigen<DeviceContext, platform::float16> {
.reshape(batch_one_remain)
.broadcast(one_axis_one)
.reshape(batch_classes))
.unaryExpr(ValueClip<platform::float16>());
.unaryExpr(ValueClip<phi::dtype::float16>());
}
softmax.device(*context.eigen_device()) = softmax.exp();
......@@ -162,7 +160,7 @@ class SoftmaxEigen<DeviceContext, platform::float16> {
};
template <typename DeviceContext>
class SoftmaxEigen<DeviceContext, platform::bfloat16> {
class SoftmaxEigen<DeviceContext, phi::dtype::bfloat16> {
public:
void operator()(const DeviceContext& context,
const int axis_dim,
......@@ -172,8 +170,8 @@ class SoftmaxEigen<DeviceContext, platform::bfloat16> {
constexpr int kClassDim = 1;
constexpr int kAxisDim = 1;
auto logits = EigenMatrix<platform::bfloat16>::From(*X);
auto softmax = EigenMatrix<platform::bfloat16>::From(*Y);
auto logits = EigenMatrix<phi::dtype::bfloat16>::From(*X);
auto softmax = EigenMatrix<phi::dtype::bfloat16>::From(*Y);
const int batch_size = logits.dimension(kBatchDim);
const int num_classes = logits.dimension(kClassDim);
......@@ -197,7 +195,7 @@ class SoftmaxEigen<DeviceContext, platform::bfloat16> {
(logits - logits.maximum(along_axis)
.reshape(batch_by_one)
.broadcast(one_by_class))
.unaryExpr(ValueClip<platform::bfloat16>());
.unaryExpr(ValueClip<phi::dtype::bfloat16>());
} else {
// axis != -1, class dimension split into (axis, remain), max and sum
// should be calculated along axis dimension
......@@ -207,7 +205,7 @@ class SoftmaxEigen<DeviceContext, platform::bfloat16> {
.reshape(batch_one_remain)
.broadcast(one_axis_one)
.reshape(batch_classes))
.unaryExpr(ValueClip<platform::bfloat16>());
.unaryExpr(ValueClip<phi::dtype::bfloat16>());
}
softmax.device(*context.eigen_device()) = softmax.exp();
......@@ -247,21 +245,24 @@ class SoftmaxFunctor<DeviceContext, T, enable_if_CPU<DeviceContext>> {
const int batch_size = in_dims[kBatchDim];
const int num_remain = num_classes / axis_dim;
if (num_remain == 1 && platform::MayIUse(platform::avx)) {
if (num_remain == 1 &&
phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
const T* in_data = X->data<T>();
T* out_data = Y->data<T>();
for (int bs = 0; bs < batch_size; ++bs) {
T max_val = *std::max_element(in_data, in_data + num_classes);
max_val *= static_cast<T>(-1);
vec_add_bias<T, platform::avx>(num_classes, max_val, in_data, out_data);
vec_clip<T, platform::avx>(
vec_add_bias<T, phi::backends::cpu::avx>(
num_classes, max_val, in_data, out_data);
vec_clip<T, phi::backends::cpu::avx>(
num_classes, static_cast<T>(-64), out_data, out_data);
vec_exp<T>(num_classes, out_data, out_data);
T sum = 0;
vec_sum<T, platform::avx>(num_classes, out_data, &sum);
vec_sum<T, phi::backends::cpu::avx>(num_classes, out_data, &sum);
sum = static_cast<T>(1) / sum;
vec_scal<T, platform::avx>(num_classes, sum, out_data, out_data);
vec_scal<T, phi::backends::cpu::avx>(
num_classes, sum, out_data, out_data);
in_data += num_classes;
out_data += num_classes;
......@@ -308,16 +309,16 @@ class SoftmaxGradEigen {
};
template <typename DeviceContext>
class SoftmaxGradEigen<DeviceContext, platform::float16> {
class SoftmaxGradEigen<DeviceContext, phi::dtype::float16> {
public:
void operator()(const DeviceContext& context,
const int axis_dim,
const phi::DenseTensor* y,
const phi::DenseTensor* y_grad,
phi::DenseTensor* x_grad) {
auto softmax = EigenMatrix<platform::float16>::From(*y);
auto softmax_grad = EigenMatrix<platform::float16>::From(*y_grad);
auto logits_grad = EigenMatrix<platform::float16>::From(*x_grad);
auto softmax = EigenMatrix<phi::dtype::float16>::From(*y);
auto softmax_grad = EigenMatrix<phi::dtype::float16>::From(*y_grad);
auto logits_grad = EigenMatrix<phi::dtype::float16>::From(*x_grad);
constexpr int kBatchDim = 0;
constexpr int kClassDim = 1;
......@@ -342,16 +343,16 @@ class SoftmaxGradEigen<DeviceContext, platform::float16> {
};
template <typename DeviceContext>
class SoftmaxGradEigen<DeviceContext, platform::bfloat16> {
class SoftmaxGradEigen<DeviceContext, phi::dtype::bfloat16> {
public:
void operator()(const DeviceContext& context,
const int axis_dim,
const phi::DenseTensor* y,
const phi::DenseTensor* y_grad,
phi::DenseTensor* x_grad) {
auto softmax = EigenMatrix<platform::bfloat16>::From(*y);
auto softmax_grad = EigenMatrix<platform::bfloat16>::From(*y_grad);
auto logits_grad = EigenMatrix<platform::bfloat16>::From(*x_grad);
auto softmax = EigenMatrix<phi::dtype::bfloat16>::From(*y);
auto softmax_grad = EigenMatrix<phi::dtype::bfloat16>::From(*y_grad);
auto logits_grad = EigenMatrix<phi::dtype::bfloat16>::From(*x_grad);
constexpr int kBatchDim = 0;
constexpr int kClassDim = 1;
......@@ -400,17 +401,20 @@ class SoftmaxGradFunctor<DeviceContext, T, enable_if_CPU<DeviceContext>> {
const int batch_size = out_dims[kBatchDim];
const int num_remain = num_classes / axis_dim;
if (num_remain == 1 && platform::MayIUse(platform::avx)) {
if (num_remain == 1 &&
phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
const T* out_data = y->data<T>();
const T* out_grad = y_grad->data<T>();
T* in_grad = x_grad->data<T>();
for (int bs = 0; bs < batch_size; ++bs) {
T scalar;
vec_mul_reduce<T, platform::avx>(
vec_mul_reduce<T, phi::backends::cpu::avx>(
num_classes, out_grad, out_data, &scalar);
scalar *= static_cast<T>(-1);
vec_add_bias<T, platform::avx>(num_classes, scalar, out_grad, in_grad);
vec_mul<T, platform::avx>(num_classes, out_data, in_grad, in_grad);
vec_add_bias<T, phi::backends::cpu::avx>(
num_classes, scalar, out_grad, in_grad);
vec_mul<T, phi::backends::cpu::avx>(
num_classes, out_data, in_grad, in_grad);
out_data += num_classes;
out_grad += num_classes;
in_grad += num_classes;
......@@ -422,6 +426,5 @@ class SoftmaxGradFunctor<DeviceContext, T, enable_if_CPU<DeviceContext>> {
}
};
} // namespace math
} // namespace operators
} // namespace paddle
} // namespace funcs
} // namespace phi
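// Scalar reference for what the AVX fast path above computes per row
// (illustrative sketch only; the function and variable names below are
// hypothetical and not part of the phi codebase):
#include <algorithm>
#include <cmath>
template <typename T>
void SoftmaxRowReference(int n, const T* in, T* out) {
  // Subtract the row max for numerical stability and clip at -64,
  // matching vec_add_bias + vec_clip / ValueClip in the vectorized code.
  T max_val = *std::max_element(in, in + n);
  T sum = static_cast<T>(0);
  for (int i = 0; i < n; ++i) {
    T shifted = std::max(in[i] - max_val, static_cast<T>(-64));
    out[i] = std::exp(shifted);  // vec_exp
    sum += out[i];               // vec_sum
  }
  // Normalize so the row sums to 1, matching vec_scal with 1 / sum.
  for (int i = 0; i < n; ++i) out[i] /= sum;
}
// Gradient reference matching the SoftmaxGradFunctor fast path above:
//   dx = (dy - sum(dy * y)) * y, computed row by row via vec_mul_reduce,
//   vec_add_bias and vec_mul.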
......@@ -22,7 +22,6 @@ limitations under the License. */
namespace cub = hipcub;
#endif
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/common/amp_type_traits.h"
......@@ -32,6 +31,7 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
namespace phi {
......
......@@ -22,7 +22,6 @@ limitations under the License. */
namespace cub = hipcub;
#endif
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/common/amp_type_traits.h"
......@@ -33,6 +32,7 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/funcs/cross_entropy.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
namespace phi {
......@@ -1386,7 +1386,7 @@ void CrossEntropyWithSoftmaxCUDAKernel(const GPUContext& dev_ctx,
labels_2d.Resize({n, label.numel() / n});
DenseTensor loss_2d(*loss);
loss_2d.Resize({n, 1});
paddle::operators::math::SoftmaxCUDNNFunctor<T, GPUContext>()(
phi::funcs::SoftmaxCUDNNFunctor<T, GPUContext>()(
dev_ctx, &logits_2d, &softmax_2d);
phi::funcs::CrossEntropyFunctor<GPUContext, T>()(dev_ctx,
&loss_2d,
......
......@@ -14,11 +14,11 @@
#pragma once
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/funcs/softmax_impl.h"
namespace phi {
......@@ -50,7 +50,7 @@ void GumbelSoftmaxGradKernel(const Context& ctx,
dx_2d.Resize({size_to_axis, size_from_axis});
out_2d.Resize({size_to_axis, size_from_axis});
dout_2d.Resize({size_to_axis, size_from_axis});
paddle::operators::math::SoftmaxGradFunctor<Context, T>()(
phi::funcs::SoftmaxGradFunctor<Context, T>()(
ctx, axis_dim, &out_2d, &dout_2d, &dx_2d);
}
......
......@@ -16,12 +16,12 @@
#include <random>
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/funcs/softmax_impl.h"
namespace phi {
......@@ -87,8 +87,7 @@ void GumbelSoftmaxKernelHelper(const Context& ctx,
size_to_axis,
size_from_axis,
temperature);
paddle::operators::math::SoftmaxFunctor<Context, T>()(
ctx, axis_dim, &x_noise_2d, &out_2d);
phi::funcs::SoftmaxFunctor<Context, T>()(ctx, axis_dim, &x_noise_2d, &out_2d);
if (hard) {
OneHotGenerator<Context, T>::Transform(ctx, x, out, axis);
......
......@@ -14,9 +14,9 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/softmax_grad_kernel.h"
namespace phi {
......@@ -50,7 +50,7 @@ void SoftmaxGradKernel(const Context& dev_ctx,
Out_2d.ShareDataWith(out).Resize({n, d});
dOut_2d.ShareDataWith(out_grad).Resize({n, d});
paddle::operators::math::SoftmaxGradFunctor<Context, T>()(
phi::funcs::SoftmaxGradFunctor<Context, T>()(
dev_ctx, axis_dim, &Out_2d, &dOut_2d, &dX_2d);
}
......
......@@ -14,9 +14,9 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/softmax_kernel.h"
namespace phi {
......@@ -47,8 +47,7 @@ void SoftmaxKernel(const Context& dev_ctx,
DenseTensor X_2d, Out_2d;
X_2d.ShareDataWith(x).Resize({n, d});
Out_2d.ShareDataWith(*out).Resize({n, d});
paddle::operators::math::SoftmaxFunctor<Context, T>()(
dev_ctx, axis_dim, &X_2d, &Out_2d);
phi::funcs::SoftmaxFunctor<Context, T>()(dev_ctx, axis_dim, &X_2d, &Out_2d);
}
} // namespace phi
......@@ -22,7 +22,7 @@ endif()
cc_test(
test_cpu_vec
SRCS test_cpu_vec.cc
DEPS blas cpu_info)
DEPS blas phi_backends)
# For String Kernels
cc_test(
......