From d25a7f9ea7a171caa7b37fd9f624b28d25aa293a Mon Sep 17 00:00:00 2001
From: Feiyu Chan
Date: Fri, 11 Feb 2022 16:48:51 +0800
Subject: [PATCH] [Pten] move operators/math/math_function_* to
 pten/kernels/func (#39300)

* move operators/math/math_function_* to pten/kernels/func
* namespace from `paddle::operators::math` to `pten::funcs`
---
 .../ps/service/communicator/communicator.h | 4 +-
 .../ps/service/ps_service/graph_py_service.h | 3 +-
 .../test/brpc_service_dense_sgd_test.cc | 3 +-
 .../test/brpc_service_sparse_sgd_test.cc | 3 +-
 .../fluid/distributed/test/brpc_utils_test.cc | 9 +-
 .../distributed/test/graph_node_split_test.cc | 3 +-
 .../fluid/distributed/test/graph_node_test.cc | 3 +-
 paddle/fluid/eager/grad_tensor_holder.cc | 2 +-
 .../framework/data_device_transform_test.cu | 2 +-
 .../fluid/framework/data_layout_transform.cc | 4 +-
 paddle/fluid/framework/data_transform.h | 2 +-
 .../multi_devices_graph_pass.cc | 2 +-
 paddle/fluid/imperative/basic_engine.cc | 6 +-
 .../fluid/imperative/gradient_accumulator.cc | 12 +-
 paddle/fluid/imperative/layer.cc | 4 +-
 .../fluid/imperative/partial_grad_engine.cc | 4 +-
 paddle/fluid/imperative/reducer.cc | 2 +-
 paddle/fluid/imperative/reducer.h | 2 +-
 .../tests/test_gradient_accmulator.cc | 2 +-
 paddle/fluid/operators/addmm_op.h | 2 +-
 paddle/fluid/operators/affine_grid_op.cu | 2 +-
 paddle/fluid/operators/affine_grid_op.h | 8 +-
 .../check_finite_and_unscale_op_npu_test.cc | 3 +-
 paddle/fluid/operators/assign_op_npu_test.cc | 3 +-
 .../fluid/operators/average_accumulates_op.h | 4 +-
 paddle/fluid/operators/batch_norm_op.cc | 2 +-
 paddle/fluid/operators/batch_norm_op.cu | 5 +-
 paddle/fluid/operators/batch_norm_op.h | 2 +-
 paddle/fluid/operators/batch_size_like.h | 2 +-
 .../operators/bilinear_tensor_product_op.h | 2 +-
 paddle/fluid/operators/bincount_op.cu | 6 +-
 paddle/fluid/operators/bincount_op.h | 8 +-
 paddle/fluid/operators/bmm_op.h | 2 +-
 paddle/fluid/operators/bpr_loss_op.h | 2 +-
 paddle/fluid/operators/broadcast_tensors_op.h | 2 +-
 paddle/fluid/operators/coalesce_tensor_op.cc | 6 +-
 .../collective/c_allgather_op_npu_test.cc | 3 +-
 .../collective/c_allreduce_max_op_npu_test.cc | 3 +-
 .../collective/c_allreduce_sum_op_npu_test.cc | 3 +-
 .../collective/c_broadcast_op_npu_test.cc | 3 +-
 .../collective/c_reduce_sum_op_npu_test.cc | 3 +-
 .../collective/c_reducescatter_op_npu_test.cc | 3 +-
 .../c_sync_calc_stream_op_npu_test.cc | 3 +-
 .../c_sync_comm_stream_op_npu_test.cc | 3 +-
 .../collective/checknumeric_npu_test.cc | 3 +-
 .../collective/recv_v2_op_npu_test.cc | 3 +-
 .../collective/send_v2_op_npu_test.cc | 3 +-
 .../controlflow/conditional_block_op.cc | 4 +-
 paddle/fluid/operators/conv_cudnn_op.cu | 2 +-
 paddle/fluid/operators/conv_op.h | 6 +-
 paddle/fluid/operators/conv_shift_op.cu | 4 +-
 .../operators/conv_transpose_cudnn_op.cu | 6 +-
 paddle/fluid/operators/conv_transpose_op.h | 8 +-
 paddle/fluid/operators/cos_sim_op.h | 4 +-
 paddle/fluid/operators/crf_decoding_op.h | 4 +-
 paddle/fluid/operators/cross_entropy_op.h | 2 +-
 paddle/fluid/operators/ctc_align_op.cu | 2 +-
 paddle/fluid/operators/ctc_align_op.h | 2 +-
 paddle/fluid/operators/cudnn_lstm_op.cu.cc | 4 +-
 paddle/fluid/operators/cvm_op.cc | 2 +-
 .../operators/deformable_conv_filter.cu.h | 2 +-
 paddle/fluid/operators/deformable_conv_func.h | 2 +-
 paddle/fluid/operators/deformable_conv_op.cu | 4 +-
 paddle/fluid/operators/deformable_conv_op.h | 4 +-
 .../fluid/operators/deformable_conv_v1_op.cu | 4 +-
 .../fluid/operators/deformable_conv_v1_op.h | 4 +-
 .../operators/deformable_psroi_pooling_op.cu | 4 +-
 .../operators/deformable_psroi_pooling_op.h | 6 +-
 .../operators/detection/anchor_generator_op.h | 2 +-
 .../fluid/operators/detection/bbox_util.cu.h | 2 +-
 .../operators/detection/bipartite_match_op.cc | 6 +-
 .../fluid/operators/detection/box_clip_op.cu | 2 +-
 .../fluid/operators/detection/box_clip_op.h | 2 +-
 .../fluid/operators/detection/box_coder_op.h | 2 +-
 .../detection/box_decoder_and_assign_op.h | 2 +-
 .../detection/collect_fpn_proposals_op.cu | 2 +-
 .../detection/collect_fpn_proposals_op.h | 2 +-
 .../detection/distribute_fpn_proposals_op.cu | 4 +-
 .../detection/distribute_fpn_proposals_op.h | 2 +-
 .../detection/generate_mask_labels_op.cc | 6 +-
 .../detection/generate_proposal_labels_op.cc | 12 +-
 .../detection/generate_proposals_op.cc | 6 +-
 .../detection/generate_proposals_op.cu | 6 +-
 .../detection/generate_proposals_v2_op.cc | 6 +-
 .../detection/generate_proposals_v2_op.cu | 6 +-
 .../fluid/operators/detection/prior_box_op.h | 2 +-
 .../detection/roi_perspective_transform_op.cc | 2 +-
 .../detection/roi_perspective_transform_op.cu | 6 +-
 .../detection/rpn_target_assign_op.cc | 2 +-
 .../fluid/operators/detection/yolo_box_op.cu | 4 +-
 .../fluid/operators/detection/yolo_box_op.h | 2 +-
 .../operators/detection/yolov3_loss_op.h | 6 +-
 paddle/fluid/operators/determinant_op.h | 6 +-
 paddle/fluid/operators/dgc_op.h | 2 +-
 paddle/fluid/operators/diag_embed_op.h | 4 +-
 paddle/fluid/operators/diag_op.h | 4 +-
 paddle/fluid/operators/diag_v2_op.cc | 4 +-
 paddle/fluid/operators/diag_v2_op.cu | 2 +-
 paddle/fluid/operators/diag_v2_op.h | 2 +-
 paddle/fluid/operators/dist_op.h | 4 +-
 paddle/fluid/operators/dropout_op_test.cc | 3 +-
 paddle/fluid/operators/edit_distance_op.cu | 4 +-
 paddle/fluid/operators/eig_op.h | 2 +-
 .../elementwise/elementwise_op_function.h | 2 +-
 .../elementwise/elementwise_op_npu_test.cc | 3 +-
 paddle/fluid/operators/expand_op_npu_test.cc | 3 +-
 paddle/fluid/operators/exponential_op.h | 4 +-
 paddle/fluid/operators/eye_op.h | 4 +-
 paddle/fluid/operators/feed_forward_test.cu | 2 +-
 paddle/fluid/operators/fill_any_op.h | 6 +-
 .../fill_constant_batch_size_like_op.h | 6 +-
 .../fill_constant_batch_size_like_op_npu.cc | 2 +-
 paddle/fluid/operators/fill_constant_op.h | 10 +-
 paddle/fluid/operators/fill_zeros_like_op.h | 4 +-
 paddle/fluid/operators/flatten_op.h | 2 +-
 paddle/fluid/operators/fold_op.h | 4 +-
 paddle/fluid/operators/frame_op.h | 2 +-
 paddle/fluid/operators/fsp_op.h | 4 +-
 .../operators/fused/cudnn_bn_add_relu_test.cc | 2 +-
 .../operators/fused/cudnn_norm_conv_test.cc | 2 +-
 .../operators/fused/fused_attention_op.cu | 2 +-
 .../operators/fused/fused_bn_activation_op.cu | 5 +-
 .../fused/fused_bn_add_activation_op.cu | 2 +-
 .../operators/fused/fused_dropout_test.h | 2 +-
 paddle/fluid/operators/gather.cu.h | 4 +-
 paddle/fluid/operators/gather.h | 4 +-
 paddle/fluid/operators/gather_op_npu_test.cc | 3 +-
 paddle/fluid/operators/gelu_op_npu_test.cc | 3 +-
 paddle/fluid/operators/grid_sampler_op.cu | 6 +-
 paddle/fluid/operators/grid_sampler_op.h | 8 +-
 paddle/fluid/operators/group_norm_op.cu | 4 +-
 paddle/fluid/operators/group_norm_op.h | 4 +-
 paddle/fluid/operators/gru_op.cc | 2 +-
 paddle/fluid/operators/gru_op.cu.cc | 2 +-
 paddle/fluid/operators/gru_op.h | 6 +-
 paddle/fluid/operators/gumbel_softmax_op.cu | 2 +-
 paddle/fluid/operators/gumbel_softmax_op.h | 4 +-
 .../fluid/operators/hierarchical_sigmoid_op.h | 8 +-
 paddle/fluid/operators/histogram_op.cu | 2 +-
 paddle/fluid/operators/histogram_op.h | 4 +-
 paddle/fluid/operators/im2sequence_op.h | 2 +-
 .../fluid/operators/increment_op_npu_test.cc | 3 +-
 paddle/fluid/operators/index_sample_op.cu | 4 +-
 paddle/fluid/operators/index_select_op.h | 4 +-
 paddle/fluid/operators/inplace_abn_op.h | 2 +-
 paddle/fluid/operators/instance_norm_op.cc | 8 +-
 paddle/fluid/operators/instance_norm_op.cu | 13 +-
 paddle/fluid/operators/interpolate_op.cu | 6 +-
 paddle/fluid/operators/interpolate_op.h | 8 +-
 paddle/fluid/operators/interpolate_v2_op.cu | 6 +-
 paddle/fluid/operators/interpolate_v2_op.h | 8 +-
 paddle/fluid/operators/layer_norm_op.h | 10 +-
 paddle/fluid/operators/layout_utils.h | 14 +-
 paddle/fluid/operators/linear_chain_crf_op.h | 8 +-
 paddle/fluid/operators/linspace_op.h | 2 +-
 paddle/fluid/operators/lrn_op.cc | 4 +-
 paddle/fluid/operators/lrn_op.h | 2 +-
 paddle/fluid/operators/lstm_op.h | 6 +-
 paddle/fluid/operators/lstmp_op.h | 6 +-
 paddle/fluid/operators/lstsq_op.h | 2 +-
 paddle/fluid/operators/lu_op.h | 4 +-
 paddle/fluid/operators/lu_unpack_op.h | 2 +-
 .../operators/margin_cross_entropy_op.cu | 6 +-
 paddle/fluid/operators/math/CMakeLists.txt | 5 +-
 paddle/fluid/operators/math/blas_impl.cu.h | 2 +-
 paddle/fluid/operators/math/blas_impl.h | 2 +-
 paddle/fluid/operators/math/blas_impl.hip.h | 2 +-
 paddle/fluid/operators/math/depthwise_conv.cu | 10 +-
 paddle/fluid/operators/math/math_function.cc | 306 --------
 paddle/fluid/operators/math/math_function.cu | 322 ---------
 paddle/fluid/operators/math/math_function.h | 112 ------
 .../fluid/operators/math/matrix_solve.cu.cc | 6 +-
 paddle/fluid/operators/math/prelu.h | 2 +-
 paddle/fluid/operators/math/sample_prob.cu | 2 +-
 .../fluid/operators/math/segment_pooling.cu | 2 +-
 .../operators/math/selected_rows_functor.cc | 6 +-
 .../operators/math/selected_rows_functor.cu | 8 +-
 .../operators/math/selected_rows_functor.h | 2 +-
 .../math/selected_rows_functor_test.cc | 33 +-
 .../math/selected_rows_functor_test.cu.cc | 13 +-
 .../fluid/operators/math/sequence_pooling.cc | 6 +-
 .../fluid/operators/math/sequence_pooling.cu | 2 +-
 paddle/fluid/operators/math/softmax.cu | 2 +-
 paddle/fluid/operators/math/sparse_impl.cu.h | 2 +-
 paddle/fluid/operators/math/tree2col.cc | 4 +-
 paddle/fluid/operators/math/tree2col.cu | 6 +-
 paddle/fluid/operators/math/tree2col.h | 2 +-
 paddle/fluid/operators/matmul_op.cc | 2 +-
 paddle/fluid/operators/matrix_power_op.h | 2 +-
 paddle/fluid/operators/matrix_rank_op.cu | 2 +-
 paddle/fluid/operators/maxout_op.h | 4 +-
 paddle/fluid/operators/mean_iou_op.cu | 2 +-
 .../operators/mlu/activation_op_mlu_test.cc | 2 +-
 paddle/fluid/operators/mul_op.h | 2 +-
 paddle/fluid/operators/norm_utils.cu.h | 4 +-
 paddle/fluid/operators/one_hot_op.cu | 2 +-
 paddle/fluid/operators/one_hot_op.h | 4 +-
 paddle/fluid/operators/one_hot_v2_op.cu | 2 +-
 paddle/fluid/operators/one_hot_v2_op.h | 4 +-
 .../fluid/operators/optimizers/adagrad_op.cc | 2 +-
 .../fluid/operators/optimizers/adagrad_op.cu | 2 +-
 paddle/fluid/operators/overlap_add_op.h | 2 +-
 paddle/fluid/operators/p_norm_op.cu | 2 +-
 paddle/fluid/operators/p_norm_op.h | 4 +-
 paddle/fluid/operators/pad2d_op.cc | 4 +-
 paddle/fluid/operators/pad2d_op.cu | 4 +-
 paddle/fluid/operators/pad3d_op.cc | 4 +-
 paddle/fluid/operators/pad3d_op.cu | 4 +-
 paddle/fluid/operators/pixel_shuffle_op.h | 6 +-
 paddle/fluid/operators/poisson_op.h | 4 +-
 paddle/fluid/operators/pool_cudnn_op.cu.cc | 33 +-
 paddle/fluid/operators/pool_op.h | 4 +-
 paddle/fluid/operators/pool_with_index_op.h | 4 +-
 paddle/fluid/operators/prroi_pool_op.cu | 2 +-
 paddle/fluid/operators/prroi_pool_op.h | 4 +-
 .../pscore/distributed_lookup_table_op.cc | 2 +-
 .../pscore/distributed_lookup_table_op.h | 2 +-
 .../pscore/distributed_push_sparse_op.cc | 2 +-
 .../pscore/distributed_push_sparse_op.h | 2 +-
 paddle/fluid/operators/pscore/fake_init_op.cc | 2 +-
 paddle/fluid/operators/psroi_pool_op.cu | 2 +-
 paddle/fluid/operators/psroi_pool_op.h | 4 +-
 paddle/fluid/operators/put_along_axis_op.cu | 2 +-
 paddle/fluid/operators/put_along_axis_op.h | 2 +-
 paddle/fluid/operators/qr_op.h | 2 +-
 paddle/fluid/operators/range_op.h | 2 +-
 paddle/fluid/operators/range_op_npu_test.cc | 3 +-
 paddle/fluid/operators/rank_attention.cu.h | 2 +-
 .../reduce_ops/reduce_any_op_npu_test.cc | 3 +-
 paddle/fluid/operators/reduce_ops/reduce_op.h | 6 +-
 paddle/fluid/operators/repeat_interleave_op.h | 2 +-
 paddle/fluid/operators/rnn_op.cu.cc | 4 +-
 paddle/fluid/operators/rnn_op.h | 20 +-
 paddle/fluid/operators/roi_align_op.cu | 2 +-
 paddle/fluid/operators/roi_align_op.h | 14 +-
 paddle/fluid/operators/roi_align_op_npu.cc | 2 +-
 paddle/fluid/operators/roi_pool_op.cu | 2 +-
 paddle/fluid/operators/roi_pool_op.h | 4 +-
 paddle/fluid/operators/row_conv_op.cu | 4 +-
 paddle/fluid/operators/sample_logits_op.cu | 6 +-
 paddle/fluid/operators/sample_logits_op.h | 4 +-
 paddle/fluid/operators/scatter.cu.h | 2 +-
 paddle/fluid/operators/search_compute.h | 2 +-
 paddle/fluid/operators/seed_op.cu | 4 +-
 paddle/fluid/operators/segment_pool_op.h | 8 +-
 .../operators/sequence_ops/sequence_conv_op.h | 6 +-
 .../sequence_ops/sequence_expand_op.h | 6 +-
 .../operators/sequence_ops/sequence_pad_op.h | 2 +-
 .../operators/sequence_ops/sequence_pool_op.h | 2 +-
 .../sequence_ops/sequence_reshape_op.h | 2 +-
 .../sequence_ops/sequence_slice_op.h | 4 +-
 .../sequence_softmax_cudnn_op.cu.cc | 2 +-
 .../sequence_topk_avg_pooling_op.h | 4 +-
 .../sequence_ops/sequence_unpad_op.h | 4 +-
 paddle/fluid/operators/set_value_op.h | 2 +-
 .../fluid/operators/shrink_rnn_memory_op.cc | 6 +-
 paddle/fluid/operators/shuffle_channel_op.h | 2 +-
 paddle/fluid/operators/slice_op.h | 4 +-
 paddle/fluid/operators/softmax_op_npu_test.cc | 3 +-
 .../softmax_with_cross_entropy_op.cu | 6 +-
 paddle/fluid/operators/solve_op.h | 4 +-
 paddle/fluid/operators/spectral_norm_op.h | 10 +-
 paddle/fluid/operators/spp_op.h | 4 +-
 paddle/fluid/operators/squeeze_op.h | 2 +-
 paddle/fluid/operators/squeeze_op_npu_test.cc | 3 +-
 paddle/fluid/operators/strided_slice_op.h | 6 +-
 paddle/fluid/operators/sum_op.cu | 2 +-
 paddle/fluid/operators/sum_op.h | 4 +-
 paddle/fluid/operators/svd_helper.h | 14 +-
 paddle/fluid/operators/take_along_axis_op.cu | 2 +-
 paddle/fluid/operators/take_along_axis_op.h | 4 +-
 .../teacher_student_sigmoid_loss_op.cc | 2 +-
 paddle/fluid/operators/temporal_shift_op.h | 2 +-
 paddle/fluid/operators/transpose_op.h | 16 +-
 .../fluid/operators/transpose_op_npu_test.cc | 3 +-
 paddle/fluid/operators/tree_conv_op.h | 4 +-
 paddle/fluid/operators/unfold_op.h | 4 +-
 .../fluid/operators/unique_consecutive_op.h | 2 +-
 paddle/fluid/operators/unique_op.h | 2 +-
 .../fluid/operators/unique_with_counts_op.h | 2 +-
 paddle/fluid/operators/unpool_op.h | 10 +-
 paddle/fluid/operators/unsqueeze_op.h | 2 +-
 .../fluid/operators/unsqueeze_op_npu_test.cc | 3 +-
 paddle/fluid/operators/var_conv_2d_op.cc | 2 +-
 paddle/fluid/operators/viterbi_decode_op.h | 4 +-
 paddle/fluid/operators/warpctc_op.h | 6 +-
 paddle/fluid/operators/where_index_op.h | 2 +-
 paddle/fluid/operators/where_op.h | 2 +-
 paddle/pten/kernels/cpu/norm_grad_kernel.cc | 2 +-
 paddle/pten/kernels/cpu/norm_kernel.cc | 2 +-
 paddle/pten/kernels/funcs/CMakeLists.txt | 48 +++
 paddle/pten/kernels/funcs/elementwise_base.h | 4 +-
 paddle/pten/kernels/funcs/math_function.cc | 342 ++++++++++++++++
 paddle/pten/kernels/funcs/math_function.cu | 380 ++++++++++++++++++
 paddle/pten/kernels/funcs/math_function.h | 127 ++++++
 .../kernels/funcs}/math_function_impl.h | 179 +++++----
 .../kernels/funcs}/math_function_test.cc | 105 +++--
 .../kernels/funcs}/math_function_test.cu | 93 +++--
 paddle/pten/kernels/gpu/trace_kernel.cu | 2 +-
 paddle/pten/kernels/impl/trace_kernel_impl.h | 5 +-
 300 files changed, 1724 insertions(+), 1483 deletions(-)
 delete mode 100644 paddle/fluid/operators/math/math_function.cc
 delete mode 100644 paddle/fluid/operators/math/math_function.cu
 delete mode 100644 paddle/fluid/operators/math/math_function.h
 mode change 100755 => 100644 paddle/fluid/operators/squeeze_op.h
 create mode 100644 paddle/pten/kernels/funcs/math_function.cc
 create mode 100644 paddle/pten/kernels/funcs/math_function.cu
 create mode 100644 paddle/pten/kernels/funcs/math_function.h
 rename paddle/{fluid/operators/math => pten/kernels/funcs}/math_function_impl.h (54%)
 rename paddle/{fluid/operators/math => pten/kernels/funcs}/math_function_test.cc (69%)
 rename paddle/{fluid/operators/math => pten/kernels/funcs}/math_function_test.cu (90%)

diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.h b/paddle/fluid/distributed/ps/service/communicator/communicator.h
index da4e2f1a128..9f8c998d3a1 100644
--- a/paddle/fluid/distributed/ps/service/communicator/communicator.h
+++ b/paddle/fluid/distributed/ps/service/communicator/communicator.h
@@ -35,12 +35,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/split.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 #include "paddle/fluid/distributed/ps/service/ps_client.h"
 
@@ -180,7 +180,7 @@ inline void MergeVars(const std::string &var_name,
     // set output tensor to 0.
     paddle::platform::CPUDeviceContext cpu_ctx;
-    paddle::operators::math::SetConstant
+    pten::funcs::SetConstant
         constant_functor;
     constant_functor(cpu_ctx, out_t, static_cast(0));
     // sum all vars to out
diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h
index 71b44f36d01..5bbcdca88a1 100644
--- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h
+++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h
@@ -38,9 +38,10 @@
 #include "paddle/fluid/distributed/ps/service/ps_service/service.h"
 #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
+
 namespace paddle {
 namespace distributed {
 class GraphPyService {
diff --git a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
index d7d9d1ed1ba..dd79d67be75 100644
--- a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
+++ b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
@@ -21,8 +21,8 @@ limitations under the License. */
 #include "paddle/fluid/distributed/ps/service/brpc_ps_server.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace distributed {
@@ -42,7 +42,6 @@ class DenseTensor;
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace operators = paddle::operators;
-namespace math = paddle::operators::math;
 namespace memory = paddle::memory;
 namespace distributed = paddle::distributed;
 
diff --git a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
index 4f7b608c8bf..0dfaafb2581 100644
--- a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
+++ b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
@@ -22,8 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/distributed/ps/service/brpc_ps_server.h"
 #include "paddle/fluid/distributed/ps/service/env.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace distributed {
@@ -43,7 +43,6 @@ class DenseTensor;
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace operators = paddle::operators;
-namespace math = paddle::operators::math;
 namespace memory = paddle::memory;
 namespace distributed = paddle::distributed;
 
diff --git a/paddle/fluid/distributed/test/brpc_utils_test.cc b/paddle/fluid/distributed/test/brpc_utils_test.cc
index 608f647d148..7f18c86ac7e 100644
--- a/paddle/fluid/distributed/test/brpc_utils_test.cc
+++ b/paddle/fluid/distributed/test/brpc_utils_test.cc
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "gtest/gtest.h"
 
 #include "paddle/fluid/distributed/ps/service/brpc_utils.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace framework {
@@ -28,7 +28,6 @@ class Variable;
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace operators = paddle::operators;
-namespace math = paddle::operators::math;
 namespace memory = paddle::memory;
 namespace distributed = paddle::distributed;
 
@@ -42,7 +41,7 @@ void CreateVarsOnScope(framework::Scope* scope, platform::Place* place,
   lod1.push_back(framework::Vector({1, 3, 8}));
   tensor1->set_lod(lod1);
   tensor1->mutable_data(*place);
-  math::set_constant(ctx, tensor1, 31.9);
+  pten::funcs::set_constant(ctx, tensor1, 31.9);
 
   // var 2
   framework::Variable* var2 = scope->Var("x2");
@@ -52,7 +51,7 @@ void CreateVarsOnScope(framework::Scope* scope, platform::Place* place,
   lod2.push_back(framework::Vector({1, 1}));
   tensor2->set_lod(lod2);
   tensor2->mutable_data(*place);
-  math::set_constant(ctx, tensor2, 100);
+  pten::funcs::set_constant(ctx, tensor2, 100);
 
   // var 3
   framework::Variable* var3 = scope->Var("x3");
@@ -62,7 +61,7 @@ void CreateVarsOnScope(framework::Scope* scope, platform::Place* place,
   auto* rows = slr->mutable_rows();
   tensor3->Resize(framework::make_ddim({564, 128}));
   tensor3->mutable_data(*place);
-  math::set_constant(ctx, tensor3, 32.7);
+  pten::funcs::set_constant(ctx, tensor3, 32.7);
   for (int i = 0; i < 564; ++i) rows->push_back(i);
 }
 
diff --git a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc
index e808d2a8153..6bbcb1d3996 100644
--- a/paddle/fluid/distributed/test/graph_node_split_test.cc
+++ b/paddle/fluid/distributed/test/graph_node_split_test.cc
@@ -36,14 +36,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace operators = paddle::operators;
-namespace math = paddle::operators::math;
 namespace memory = paddle::memory;
 namespace distributed = paddle::distributed;
 
diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc
index 3243ebc389c..4aa2839c181 100644
--- a/paddle/fluid/distributed/test/graph_node_test.cc
+++ b/paddle/fluid/distributed/test/graph_node_test.cc
@@ -36,14 +36,13 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace operators = paddle::operators;
-namespace math = paddle::operators::math;
 namespace memory = paddle::memory;
 namespace distributed = paddle::distributed;
 
diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc
index 90ae91db5f5..8bfeaf47b23 100644
--- a/paddle/fluid/eager/grad_tensor_holder.cc
+++ b/paddle/fluid/eager/grad_tensor_holder.cc
@@ -16,7 +16,7 @@
 #include "paddle/fluid/imperative/gradient_accumulator.h"
 
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace egr {
 
diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu
index b364cf9b31d..316f8c4d90d 100644
--- a/paddle/fluid/framework/data_device_transform_test.cu
+++ b/paddle/fluid/framework/data_device_transform_test.cu
@@ -19,9 +19,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/init.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 #include "paddle/fluid/framework/pten_utils.h"
 
diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc
index 688835cc3c9..a014d34bcf5 100644
--- a/paddle/fluid/framework/data_layout_transform.cc
+++ b/paddle/fluid/framework/data_layout_transform.cc
@@ -14,7 +14,7 @@
 
 #include "paddle/fluid/framework/data_layout_transform.h"
 
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_reuse.h"
 #endif
@@ -42,7 +42,7 @@ void CastDataLayout::apply() {
   auto place = ctx_->GetPlace();
 
   if (platform::is_cpu_place(place)) {
-    operators::math::Transpose trans4;
+    pten::funcs::Transpose trans4;
     auto* context = static_cast(ctx_);
     trans4(*context, in_, out_, axis_);
   } else {
diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h
index 385a5ff704f..5c5d49f8fec 100644
--- a/paddle/fluid/framework/data_transform.h
+++ b/paddle/fluid/framework/data_transform.h
@@ -22,10 +22,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/macros.h"
 #include "paddle/fluid/platform/transform.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace framework {
 
diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
index 5dbc3e38ea1..cab7d5ddb8b 100644
--- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
@@ -33,7 +33,7 @@
 #include "paddle/fluid/framework/ir/node.h"
 #include "paddle/fluid/framework/op_info.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 #if defined(PADDLE_WITH_DGC)
 #include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h"
diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc
index 9d377926536..4c91ece0493 100644
--- a/paddle/fluid/imperative/basic_engine.cc
+++ b/paddle/fluid/imperative/basic_engine.cc
@@ -28,8 +28,8 @@
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/imperative/op_base.h"
 #include "paddle/fluid/imperative/tracer.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 DECLARE_bool(sort_sum_gradient);
 
@@ -103,7 +103,7 @@ void BasicEngine::Init(
       if (grad_tensor == nullptr) {
         grad_var->Resize(fwd_var.dims());
         grad_var->mutable_data(fwd_var.place(), fwd_var.type());
-        operators::math::set_constant(*dev_ctx, grad_var, 1.0);
+        pten::funcs::set_constant(*dev_ctx, grad_var, 1.0);
       } else {
         paddle::framework::TensorCopy(
             grad_tensor->Var().Get(), fwd_var.place(),
@@ -156,7 +156,7 @@ void BasicEngine::CheckBackwardInputs(const OpBase& op) {
       VLOG(6) << "Set ungenerated Grad: " << var->Name()
               << " as zero with dtype "
              << framework::DataTypeToString(var->ForwardDataType());
-      operators::math::set_constant(*dev_ctx, tensor, 0.0);
+      pten::funcs::set_constant(*dev_ctx, tensor, 0.0);
     }
   }
 }
diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc
index 75d4d8246e3..5eed7eca7a7 100644
--- a/paddle/fluid/imperative/gradient_accumulator.cc
+++ b/paddle/fluid/imperative/gradient_accumulator.cc
@@ -22,12 +22,12 @@
 #include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 #ifdef PADDLE_WITH_XPU
 #include "xpu/refactor/math.h"
 #endif
@@ -210,7 +210,7 @@ void TensorAddImpl(const framework::Tensor& src, framework::Tensor* dst,
   platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
   paddle::platform::DeviceContext* ctx = pool.Get(place);
   auto dev_ctx = dynamic_cast(ctx);
-  operators::math::ElementwiseAddTo func;
+  pten::funcs::ElementwiseAddTo func;
   func(dev_ctx, src, dst);
 }
 
@@ -703,12 +703,12 @@ void EagerGradientAccumulator::SumGrad(std::shared_ptr var,
                 << var->Var().Get().dims();
         tensor->Resize(var->Var().Get().dims());
         tensor->mutable_data(place, var->DataType());
-        operators::math::set_constant(*dev_ctx, tensor, 0.0);
+        pten::funcs::set_constant(*dev_ctx, tensor, 0.0);
       } else {
         auto* tensor =
             dst_var->MutableVar()->GetMutable();
         tensor->mutable_data(place, var->DataType());
-        operators::math::set_constant(*dev_ctx, tensor, 0.0);
+        pten::funcs::set_constant(*dev_ctx, tensor, 0.0);
       }
     }
   }
@@ -835,12 +835,12 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var,
                 << var->Var().Get().dims();
         tensor->Resize(var->Var().Get().dims());
         tensor->mutable_data(place, var->DataType());
-        operators::math::set_constant(*dev_ctx, tensor, 0.0);
+        pten::funcs::set_constant(*dev_ctx, tensor, 0.0);
       } else {
         auto* tensor =
             dst_var->MutableVar()->GetMutable();
         tensor->mutable_data(place, var->DataType());
-        operators::math::set_constant(*dev_ctx, tensor, 0.0);
+        pten::funcs::set_constant(*dev_ctx, tensor, 0.0);
       }
     }
     // looks like tmp_grad_vars will not have any member but just in case
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 5b8974b3348..60e1291a087 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -20,10 +20,10 @@
 #include "paddle/fluid/imperative/op_base.h"
 #include "paddle/fluid/imperative/prepared_operator.h"
 #include "paddle/fluid/imperative/var_helper.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
@@ -229,7 +229,7 @@ void VarBase::ClearGradient(bool set_to_zero) {
       if (set_to_zero) {
         auto* dev_ctx =
             platform::DeviceContextPool::Instance().Get(grad_t->place());
-        operators::math::set_constant(*dev_ctx, grad_t, 0.0);
+        pten::funcs::set_constant(*dev_ctx, grad_t, 0.0);
       } else {
         grad_t->clear();
       }
diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc
index 45756083c90..ed60a4dc084 100644
--- a/paddle/fluid/imperative/partial_grad_engine.cc
+++ b/paddle/fluid/imperative/partial_grad_engine.cc
@@ -28,10 +28,10 @@
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/imperative/op_base.h"
 #include "paddle/fluid/imperative/tracer.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/string_helper.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 DECLARE_bool(sort_sum_gradient);
 
@@ -316,7 +316,7 @@ static void FillConstantLike(const VariableWrapper &ref_var,
   } else {
     dst_tensor->mutable_data(place, ref_var.DataType());
   }
-  operators::math::set_constant(*dev_ctx, dst_tensor, value);
+  pten::funcs::set_constant(*dev_ctx, dst_tensor, value);
 }
 
 /**
diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc
index 54e27b2bd8c..361b9eb0fe6 100644
--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
@@ -755,7 +755,7 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) {
             {static_cast(length)});
       } else {
         group_tensor.Resize({static_cast(length)});
-        operators::math::set_constant(*dev_ctx, &group_tensor, 0.0);
+        pten::funcs::set_constant(*dev_ctx, &group_tensor, 0.0);
       }
 #endif
 }
diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h
index b99d7adc0c7..b0317fe33e2 100644
--- a/paddle/fluid/imperative/reducer.h
+++ b/paddle/fluid/imperative/reducer.h
@@ -29,8 +29,8 @@
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace imperative {
diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
index 6210cb108bd..e91b0b0a777 100644
--- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
+++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
@@ -20,7 +20,7 @@
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/imperative/gradient_accumulator.h"
 #include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace imperative = paddle::imperative;
 namespace platform = paddle::platform;
diff --git a/paddle/fluid/operators/addmm_op.h b/paddle/fluid/operators/addmm_op.h
index ecfd10d2fa6..8fe73d81b02 100644
--- a/paddle/fluid/operators/addmm_op.h
+++ b/paddle/fluid/operators/addmm_op.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
diff --git a/paddle/fluid/operators/affine_grid_op.cu b/paddle/fluid/operators/affine_grid_op.cu
index bcf7deefc98..d203dcb7b91 100644
--- a/paddle/fluid/operators/affine_grid_op.cu
+++ b/paddle/fluid/operators/affine_grid_op.cu
@@ -170,7 +170,7 @@ class AffineGridGradOpCUDAKernel : public framework::OpKernel {
       w = size_attr[3];
     }
     T* theta_grad_data = theta_grad->mutable_data({n, 2, 3}, ctx.GetPlace());
-    math::SetConstant()(
+    pten::funcs::SetConstant()(
         ctx.cuda_device_context(), theta_grad, static_cast(0));
 
     T h_step;
diff --git a/paddle/fluid/operators/affine_grid_op.h b/paddle/fluid/operators/affine_grid_op.h
index 50c9ebcd9c8..129c7a61a78 100644
--- a/paddle/fluid/operators/affine_grid_op.h
+++ b/paddle/fluid/operators/affine_grid_op.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -61,7 +61,7 @@ inline void GetIdxMap(int n, int h, int w, bool align_corners, Tensor* grid,
   Tensor ones;
   ones.mutable_data({h, w, 1}, ctx.GetPlace());
-  math::SetConstant()(
+  pten::funcs::SetConstant()(
       ctx.template device_context(), &ones, static_cast(1));
   auto ones_t = EigenTensor::From(ones);
   // Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and
@@ -115,7 +115,7 @@ class AffineGridOpKernel : public framework::OpKernel {
     }
     auto* output = ctx.Output("Output");
     output->mutable_data({n, h, w, 2}, ctx.GetPlace());
-    math::SetConstant()(
+    pten::funcs::SetConstant()(
         ctx.template device_context(), output, static_cast(0));
 
     Tensor grid;
@@ -158,7 +158,7 @@ class AffineGridGradOpKernel : public framework::OpKernel {
       w = size_attr[3];
     }
     theta_grad->mutable_data({n, 2, 3}, ctx.GetPlace());
-    math::SetConstant()(
+    pten::funcs::SetConstant()(
        ctx.template device_context(), theta_grad, static_cast(0));
 
     Tensor grid;
diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc
index a80b83f0cbe..6390a1f4738 100644
--- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc
+++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc
@@ -24,12 +24,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
-namespace m = paddle::operators::math;
 
 using Tensor = paddle::framework::Tensor;
 
diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc
index 049cfb8046f..4761ec61556 100644
--- a/paddle/fluid/operators/assign_op_npu_test.cc
+++ b/paddle/fluid/operators/assign_op_npu_test.cc
@@ -24,12 +24,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
-namespace m = paddle::operators::math;
 
 USE_OP(assign);
 USE_OP_DEVICE_KERNEL(assign, NPU);
diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h
index 6813f566758..3cd235d89a3 100644
--- a/paddle/fluid/operators/average_accumulates_op.h
+++ b/paddle/fluid/operators/average_accumulates_op.h
@@ -16,7 +16,7 @@ limitations under the License. */
 #include
 
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -78,7 +78,7 @@ class AverageAccumulatesKernel : public framework::OpKernel {
     // Compute
     auto& place = *ctx.template device_context().eigen_device();
-    math::SetConstant constant_functor;
+    pten::funcs::SetConstant constant_functor;
     ++num_updates;
     ++num_accumulates;
     out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor;
diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc
index 0a8e753c01d..8e960ff89bf 100644
--- a/paddle/fluid/operators/batch_norm_op.cc
+++ b/paddle/fluid/operators/batch_norm_op.cc
@@ -989,7 +989,7 @@ class BatchNormDoubleGradKernel
         (data_layout == DataLayout::kNCHW ? x_dims[1]
                                           : x_dims[x_dims.size() - 1]);
     const int sample_size = X->numel() / C;
-    math::SetConstant set_constant;
+    pten::funcs::SetConstant set_constant;
 
     const T *mean_data = Saved_mean->data();
     const T *inv_var_data = Saved_variance->data();
diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu
index 5f32d697bae..85bd8451b8d 100644
--- a/paddle/fluid/operators/batch_norm_op.cu
+++ b/paddle/fluid/operators/batch_norm_op.cu
@@ -25,9 +25,9 @@ namespace cub = hipcub;
 #endif
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/operators/batch_norm_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/norm_utils.cu.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 DECLARE_bool(cudnn_batchnorm_spatial_persistent);
 
@@ -967,7 +967,8 @@ class BatchNormGradKernel
       if (d_x) {
         framework::TensorCopy(*d_y, ctx.GetPlace(), d_x);
       }
-      math::SetConstant>
+      pten::funcs::SetConstant>
           functor;
       functor(dev_ctx, d_scale, static_cast>(0));
       functor(dev_ctx, d_bias, static_cast>(0));
diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h
index 32e956e1528..55f1964cf5c 100644
--- a/paddle/fluid/operators/batch_norm_op.h
+++ b/paddle/fluid/operators/batch_norm_op.h
@@ -20,8 +20,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/layout_utils.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/norm_utils.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h
index f24a3c316a0..1ee0e7002ab 100644
--- a/paddle/fluid/operators/batch_size_like.h
+++ b/paddle/fluid/operators/batch_size_like.h
@@ -16,7 +16,7 @@ limitations under the License. */
 #include
 #include
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.h b/paddle/fluid/operators/bilinear_tensor_product_op.h
index 8f6c9b60dca..c7eb70c290e 100644
--- a/paddle/fluid/operators/bilinear_tensor_product_op.h
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.h
@@ -111,7 +111,7 @@ class BilinearTensorProductGradKernel : public framework::OpKernel {
                         ctx.GetPlace());
     auto y_scale_mat = EigenMatrix::From(y_scale);
 
-    math::SetConstant set_zero;
+    pten::funcs::SetConstant set_zero;
 
     if (d_x) {
       d_x->mutable_data(ctx.GetPlace());
diff --git a/paddle/fluid/operators/bincount_op.cu b/paddle/fluid/operators/bincount_op.cu
index 5964b9e345e..dd7804625a7 100644
--- a/paddle/fluid/operators/bincount_op.cu
+++ b/paddle/fluid/operators/bincount_op.cu
@@ -105,7 +105,7 @@ void BincountCUDAInner(const framework::ExecutionContext& context) {
 
   if (!has_weights) {
     int64_t* output_data = output->mutable_data(context.GetPlace());
-    math::SetConstant()(
+    pten::funcs::SetConstant()(
        context.template device_context(), output, 0L);
 
     KernelBincount<<mutable_data(context.GetPlace());
-      math::SetConstant()(
+      pten::funcs::SetConstant()(
          context.template device_context(), output,
          static_cast(0));
@@ -125,7 +125,7 @@ void BincountCUDAInner(const framework::ExecutionContext& context) {
          input_data, input_numel, has_weights, weights_data, output_data);
    } else {
      double* output_data = output->mutable_data(context.GetPlace());
-      math::SetConstant()(
+      pten::funcs::SetConstant()(
          context.template device_context(), output,
          static_cast(0));
diff --git a/paddle/fluid/operators/bincount_op.h b/paddle/fluid/operators/bincount_op.h
index a142332bce2..3f4334099e2 100644
--- a/paddle/fluid/operators/bincount_op.h
+++ b/paddle/fluid/operators/bincount_op.h
@@ -18,7 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -64,7 +64,7 @@ void BincountInner(const framework::ExecutionContext& context) {
     const auto& weights_type = weights->type();
     if (weights_type == framework::proto::VarType::FP32) {
       float* output_data = output->mutable_data(context.GetPlace());
-      math::SetConstant()(
+      pten::funcs::SetConstant()(
           context.template device_context(), output,
           static_cast(0));
       for (int64_t i = 0; i < input_numel; i++) {
@@ -72,7 +72,7 @@
       }
     } else {
       double* output_data = output->mutable_data(context.GetPlace());
-      math::SetConstant()(
+      pten::funcs::SetConstant()(
          context.template device_context(), output,
          static_cast(0));
       for (int64_t i = 0; i < input_numel; i++) {
@@ -82,7 +82,7 @@
     }
   } else {
     int64_t* output_data = output->mutable_data(context.GetPlace());
-    math::SetConstant()(
+    pten::funcs::SetConstant()(
        context.template device_context(), output, 0L);
     for (int64_t i = 0; i < input_numel; i++) {
       output_data[input_data[i]] += 1L;
diff --git a/paddle/fluid/operators/bmm_op.h b/paddle/fluid/operators/bmm_op.h
index 15cd6de9136..7a0ddd45823 100644
--- a/paddle/fluid/operators/bmm_op.h
+++ b/paddle/fluid/operators/bmm_op.h
@@ -21,7 +21,7 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h
index bebaf6e3365..559d3e14edd 100644
--- a/paddle/fluid/operators/bpr_loss_op.h
+++ b/paddle/fluid/operators/bpr_loss_op.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/broadcast_tensors_op.h b/paddle/fluid/operators/broadcast_tensors_op.h
index 0eeb9234df0..4161b5879f6 100644
--- a/paddle/fluid/operators/broadcast_tensors_op.h
+++ b/paddle/fluid/operators/broadcast_tensors_op.h
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/lod_tensor_array.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 #define SWITCH_OUT_RANK_CASE(n) \
   case n: {                     \
diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc
index 5655fd25ec2..d71d6fc39b1 100644
--- a/paddle/fluid/operators/coalesce_tensor_op.cc
+++ b/paddle/fluid/operators/coalesce_tensor_op.cc
@@ -18,8 +18,8 @@
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/var_type.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_memory_aligment.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 #ifdef PADDLE_WITH_ASCEND_CL
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 #endif
@@ -65,11 +65,11 @@ struct FillConstantVisitor {
               .stream();
       runner.Run(stream);
     } else {
-      math::SetConstant set_constant;
+      pten::funcs::SetConstant set_constant;
       set_constant(dev_ctx_, tensor_, static_cast(value_));
     }
 #else
-    math::SetConstant set_constant;
+    pten::funcs::SetConstant set_constant;
     set_constant(dev_ctx_, tensor_, static_cast(value_));
 #endif
   }
diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
index ecf682aa524..a51e81a4279 100644
--- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
@@ -27,8 +27,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 #include "paddle/fluid/operators/collective/c_allgather_op.h"
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
@@ -43,7 +43,6 @@ limitations under the License. */
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
-namespace m = paddle::operators::math;
 
 USE_OP(c_allgather);
 USE_NO_KERNEL_OP(c_gen_hccl_id);
diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
index fa134b60e28..f273e31f6b0 100644
--- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
@@ -27,8 +27,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 #include "paddle/fluid/operators/collective/c_allgather_op.h"
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
@@ -43,7 +43,6 @@ limitations under the License. */
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
-namespace m = paddle::operators::math;
 
 USE_OP(c_allreduce_max);
 USE_NO_KERNEL_OP(c_gen_hccl_id);
diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
index 3e91220423e..66efcd2a490 100644
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
@@ -27,8 +27,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
@@ -45,7 +45,6 @@ limitations under the License. */
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
-namespace m = paddle::operators::math;
 
 USE_OP(c_allreduce_sum);
 USE_NO_KERNEL_OP(c_gen_hccl_id);
diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
index 1ea34c82003..acfdd42a41f 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
@@ -27,8 +27,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
@@ -40,7 +40,6 @@ limitations under the License. */
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
-namespace m = paddle::operators::math;
 
 USE_OP(c_broadcast);
 USE_NO_KERNEL_OP(c_gen_hccl_id);
diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
index d589d0a25e6..ee0463f84b1 100644
--- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
@@ -27,8 +27,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 #include "paddle/fluid/operators/collective/c_reduce_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
@@ -40,7 +40,6 @@ limitations under the License. */
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
-namespace m = paddle::operators::math;
 
 USE_OP(c_reduce_sum);
 USE_NO_KERNEL_OP(c_gen_hccl_id);
diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
index db78652f879..652bf0c1f2a 100644
--- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
@@ -27,8 +27,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 #include "paddle/fluid/operators/collective/c_allgather_op.h"
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
@@ -43,7 +43,6 @@ limitations under the License. */
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
-namespace m = paddle::operators::math;
 
 USE_OP(c_reducescatter);
 USE_NO_KERNEL_OP(c_gen_hccl_id);
diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc
index 5778a270f19..9d27d99b3ab 100644
--- a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc
@@ -26,12 +26,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
-namespace m = paddle::operators::math;
 
 USE_OP(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, NPU);
diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
index e7017835686..9d883786478 100644
--- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
@@ -27,8 +27,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
@@ -40,7 +40,6 @@ limitations under the License. */
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
-namespace m = paddle::operators::math;
 
 USE_OP(c_broadcast);
 USE_OP_DEVICE_KERNEL(c_sync_comm_stream, NPU);
diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc
index 2be37cc456b..18b75d8e685 100644
--- a/paddle/fluid/operators/collective/checknumeric_npu_test.cc
+++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc
@@ -28,8 +28,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
@@ -41,7 +41,6 @@ limitations under the License. */
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
-namespace m = paddle::operators::math;
 
 USE_OP(c_allreduce_sum);
 USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
index edd4b18b35a..bf96f48bc87 100644
--- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
@@ -27,8 +27,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
 #include "paddle/fluid/operators/collective/recv_v2_op.h"
@@ -40,7 +40,6 @@ limitations under the License. */
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
-namespace m = paddle::operators::math;
 
 USE_OP(recv_v2);
 USE_NO_KERNEL_OP(c_gen_hccl_id);
diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
index b2470ab4c05..748a4fb99b4 100644
--- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
@@ -26,8 +26,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
 #include "paddle/fluid/operators/collective/send_v2_op.h"
@@ -39,7 +39,6 @@ limitations under the License. */
 
 namespace f = paddle::framework;
 namespace p = paddle::platform;
-namespace m = paddle::operators::math;
 
 USE_OP(send_v2);
 USE_NO_KERNEL_OP(c_gen_hccl_id);
diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc
index eeb410eba2b..f961e479ce4 100644
--- a/paddle/fluid/operators/controlflow/conditional_block_op.cc
+++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc
@@ -15,7 +15,7 @@ limitations under the License. */
*/ #include "paddle/fluid/operators/controlflow/conditional_block_op.h" #include "paddle/fluid/operators/assign_op.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -245,7 +245,7 @@ class ConditionalBlockGradOp : public ConditionalOp { outside_tensor->mutable_data(place, input_tensor.type()); const platform::DeviceContext *dev_ctx = platform::DeviceContextPool::Instance().Get(place); - math::set_constant(*dev_ctx, outside_tensor, 0.0f); + pten::funcs::set_constant(*dev_ctx, outside_tensor, 0.0f); outside_tensor->set_lod(input_tensor.lod()); } }; diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 20b1afb42fe..3e85194908b 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -861,7 +861,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { auto dX = ctx.Output("DInput"); if (ddO) { ddO->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, ddO, static_cast(0)); } if (dW) { diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 94d1f707b74..fb22765d76e 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -485,7 +485,7 @@ class GemmConvGradKernel : public framework::OpKernel { col_matrix.Resize(col_matrix_shape); } - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto blas = math::GetBlas(dev_ctx); if (input_grad) { @@ -692,7 +692,7 @@ class GemmConvDoubleGradKernel : public framework::OpKernel { col_matrix.Resize(col_matrix_shape); } - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto blas = math::GetBlas(dev_ctx); // dx convolution double grad: gemm + col2im(col2vol) @@ -991,7 +991,7 @@ class DepthwiseConvGradKernel : public framework::OpKernel { paddings.erase(paddings.begin() + i + 1); } } - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = context.template device_context(); if (input_grad) { diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu index 2289104d2db..aca3bf9ae27 100644 --- a/paddle/fluid/operators/conv_shift_op.cu +++ b/paddle/fluid/operators/conv_shift_op.cu @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/conv_shift_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -163,7 +163,7 @@ class ConvShiftGradKernel auto &device_ctx = context.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; const int x_per_block = 256; int num_x_blocks = DivUp(x_width, x_per_block); diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index 19c0be44a1d..32792d6d47f 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -21,8 +21,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/conv_cudnn_helper.h" #endif #include "paddle/fluid/operators/conv_transpose_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/padding.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -34,7 +34,7 @@ static void DataTranspose(const framework::ExecutionContext& ctx, const Tensor* input, Tensor* output, const std::vector& axis, int flag = 0) { auto& dev_ctx = ctx.template device_context(); - math::Transpose transpose; + pten::funcs::Transpose transpose; auto in_dims = input->dims(); std::vector input_transpose_vec; for (size_t i = 0; i < axis.size(); ++i) { @@ -650,7 +650,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { if (ddO) { ddO->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, ddO, static_cast(0)); } if (dW) { diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index b8335c75064..7b1fb6901e3 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -226,7 +226,7 @@ class GemmConvTransposeKernel : public framework::OpKernel { filter.Resize(filter_matrix_shape); output->mutable_data(context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = context.template device_context(); auto blas = math::GetBlas(dev_ctx); set_zero(dev_ctx, output, static_cast(0)); @@ -437,7 +437,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { col_matrix.Resize(col_matrix_shape); Tensor filter_grad_; - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; math::Im2ColFunctor im2col; math::Vol2ColFunctor vol2col; @@ -628,7 +628,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel { output->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, output, static_cast(0)); math::DepthwiseConvInputGradFunctor @@ -690,7 +690,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel { } if (filter_grad) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; filter_grad->mutable_data(context.GetPlace()); set_zero(dev_ctx, filter_grad, static_cast(0)); diff --git a/paddle/fluid/operators/cos_sim_op.h b/paddle/fluid/operators/cos_sim_op.h index 0b4e3f77467..f8b984e1159 100644 --- a/paddle/fluid/operators/cos_sim_op.h +++ b/paddle/fluid/operators/cos_sim_op.h @@ -15,8 +15,8 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/cos_sim_functor.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -121,7 +121,7 @@ class CosSimGradKernel : public framework::OpKernel { if (out_grad_y) { out_grad_y->Resize(in_y->dims()); out_grad_y->mutable_data(context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = context.template device_context(); set_zero(dev_ctx, out_grad_y, static_cast(0)); diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h index 33108251b3b..8ca819de06c 100644 --- a/paddle/fluid/operators/crf_decoding_op.h +++ b/paddle/fluid/operators/crf_decoding_op.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/jit/kernels.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -36,7 +36,7 @@ class CRFDecodingOpKernel : public framework::OpKernel { auto* decoded_path = ctx.Output("ViterbiPath"); int64_t* path = decoded_path->mutable_data(platform::CPUPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), decoded_path, 0); bool has_length = ctx.HasInput("Length"); diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h index 8424fc4376f..19ab6afd7fb 100644 --- a/paddle/fluid/operators/cross_entropy_op.h +++ b/paddle/fluid/operators/cross_entropy_op.h @@ -17,8 +17,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math.h" #include "paddle/fluid/operators/math/cross_entropy.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu index 67bd71d4a1b..bd0b0ac0bc9 100644 --- a/paddle/fluid/operators/ctc_align_op.cu +++ b/paddle/fluid/operators/ctc_align_op.cu @@ -128,7 +128,7 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel { if (host_out_lod0.back() == 0) { output->Resize({1, 1}); output->mutable_data(ctx.GetPlace()); - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; set_constant(ctx.template device_context(), output, -1); } diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h index 662f899c0a5..b79c3aeac49 100644 --- a/paddle/fluid/operators/ctc_align_op.h +++ b/paddle/fluid/operators/ctc_align_op.h @@ -17,7 +17,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index 8adf556b4cd..5c899ac557f 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/pten/kernels/funcs/math_function.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/operators/cudnn_lstm_cache.h" #endif @@ -366,7 +366,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { } Tensor weight_grad; - math::SetConstant zero; + pten::funcs::SetConstant zero; weight_grad.mutable_data({weight_numel}, ctx.GetPlace()); zero(dev_ctx, &weight_grad, static_cast(0.0)); T *weight_grad_data = weight_grad.data(); diff --git a/paddle/fluid/operators/cvm_op.cc b/paddle/fluid/operators/cvm_op.cc index be7d4780f83..a84357b6e43 100644 --- a/paddle/fluid/operators/cvm_op.cc +++ b/paddle/fluid/operators/cvm_op.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/cvm_op.h" #include -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/deformable_conv_filter.cu.h b/paddle/fluid/operators/deformable_conv_filter.cu.h index f466d1803f8..75d16ae0d43 100644 --- a/paddle/fluid/operators/deformable_conv_filter.cu.h +++ b/paddle/fluid/operators/deformable_conv_filter.cu.h @@ -23,7 +23,7 @@ #pragma once #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" template __global__ void FilterGradAddupCUDAKernel(const int nthreads, const int n, diff --git a/paddle/fluid/operators/deformable_conv_func.h b/paddle/fluid/operators/deformable_conv_func.h index 99d1d7c4776..134a1ea06d9 100644 --- a/paddle/fluid/operators/deformable_conv_func.h +++ b/paddle/fluid/operators/deformable_conv_func.h @@ -23,8 +23,8 @@ #pragma once #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/pten/core/hostdevice.h" +#include "paddle/pten/kernels/funcs/math_function.h" template HOSTDEVICE T DmcnGetGradientWeight(T argmax_h, T argmax_w, const int h, diff --git a/paddle/fluid/operators/deformable_conv_op.cu b/paddle/fluid/operators/deformable_conv_op.cu index 924adafa4b8..97d2f71758f 100644 --- a/paddle/fluid/operators/deformable_conv_op.cu +++ b/paddle/fluid/operators/deformable_conv_op.cu @@ -26,8 +26,8 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/deformable_conv_op.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -623,7 +623,7 @@ class DeformableConvGradCUDAKernel : public framework::OpKernel { Tensor col_buffer_3d; col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto blas = math::GetBlas(dev_ctx); col_buffer.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/deformable_conv_op.h b/paddle/fluid/operators/deformable_conv_op.h index 4be98f3e6c0..a5c0404ed3a 100644 --- a/paddle/fluid/operators/deformable_conv_op.h +++ b/paddle/fluid/operators/deformable_conv_op.h @@ -27,7 +27,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/deformable_conv_func.h" #include "paddle/fluid/operators/math/blas.h" 
-#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -489,7 +489,7 @@ class DeformableConvGradCPUKernel : public framework::OpKernel { Tensor col_buffer_3d; col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto blas = math::GetBlas(dev_ctx); col_buffer.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/deformable_conv_v1_op.cu b/paddle/fluid/operators/deformable_conv_v1_op.cu index c252700528c..8f6c5a226bc 100644 --- a/paddle/fluid/operators/deformable_conv_v1_op.cu +++ b/paddle/fluid/operators/deformable_conv_v1_op.cu @@ -29,8 +29,8 @@ #include "paddle/fluid/operators/deformable_conv_func.h" #include "paddle/fluid/operators/deformable_conv_v1_op.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -489,7 +489,7 @@ class DeformableConvV1GradCUDAKernel : public framework::OpKernel { Tensor col_buffer_3d; col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto blas = math::GetBlas(dev_ctx); col_buffer.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/deformable_conv_v1_op.h b/paddle/fluid/operators/deformable_conv_v1_op.h index 92b19e39046..1ddc31c93ea 100644 --- a/paddle/fluid/operators/deformable_conv_v1_op.h +++ b/paddle/fluid/operators/deformable_conv_v1_op.h @@ -28,7 +28,7 @@ #include "paddle/fluid/operators/deformable_conv_func.h" #include "paddle/fluid/operators/deformable_conv_op.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -451,7 +451,7 @@ class DeformableConvV1GradCPUKernel : public framework::OpKernel { Tensor col_buffer_3d; col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto blas = math::GetBlas(dev_ctx); col_buffer.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index eeb2c7692b5..95f05963cd1 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -31,8 +31,8 @@ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/deformable_psroi_pooling_op.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -436,7 +436,7 @@ class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel { Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); Tensor* trans_grad = ctx.Output(framework::GradVarName("Trans")); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = ctx.cuda_device_context(); if (input_grad) { input_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.h b/paddle/fluid/operators/deformable_psroi_pooling_op.h index a986f915e26..08b8342a1fd 
100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.h +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.h @@ -27,7 +27,7 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -165,7 +165,7 @@ class DeformablePSROIPoolCPUKernel : public framework::OpKernel { auto* top_count = ctx.Output("TopCount"); top_count->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); set_zero(dev_ctx, out, static_cast(0)); set_zero(dev_ctx, top_count, static_cast(0)); @@ -421,7 +421,7 @@ class DeformablePSROIPoolGradCPUKernel : public framework::OpKernel { auto* top_count = ctx.Input("TopCount"); auto* output_grad = ctx.Input(framework::GradVarName("Output")); auto* input_grad = ctx.Output(framework::GradVarName("Input")); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); if (input_grad) { input_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/detection/anchor_generator_op.h b/paddle/fluid/operators/detection/anchor_generator_op.h index 599f6935736..f888787cf51 100644 --- a/paddle/fluid/operators/detection/anchor_generator_op.h +++ b/paddle/fluid/operators/detection/anchor_generator_op.h @@ -16,8 +16,8 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/transform.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h index c6754f62cc7..c4ae795a507 100644 --- a/paddle/fluid/operators/detection/bbox_util.cu.h +++ b/paddle/fluid/operators/detection/bbox_util.cu.h @@ -24,9 +24,9 @@ limitations under the License. */ namespace cub = hipcub; #endif #include "paddle/fluid/operators/gather.cu.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc index 5cd85375892..582f81d71aa 100644 --- a/paddle/fluid/operators/detection/bipartite_match_op.cc +++ b/paddle/fluid/operators/detection/bipartite_match_op.cc @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -205,9 +205,9 @@ class BipartiteMatchKernel : public framework::OpKernel { match_indices->mutable_data({n, col}, context.GetPlace()); match_dist->mutable_data({n, col}, context.GetPlace()); - math::SetConstant iset; + pten::funcs::SetConstant iset; iset(dev_ctx, match_indices, static_cast(-1)); - math::SetConstant tset; + pten::funcs::SetConstant tset; tset(dev_ctx, match_dist, static_cast(0)); int* indices = match_indices->data(); diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu index 53727d9d087..24f5f00b077 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cu +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -14,9 +14,9 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/box_clip_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/pten/core/hostdevice.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/box_clip_op.h b/paddle/fluid/operators/detection/box_clip_op.h index e24cefdcd7b..5c1870e9023 100644 --- a/paddle/fluid/operators/detection/box_clip_op.h +++ b/paddle/fluid/operators/detection/box_clip_op.h @@ -13,7 +13,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h index d120ebbeb4d..b4fe27401db 100644 --- a/paddle/fluid/operators/detection/box_coder_op.h +++ b/paddle/fluid/operators/detection/box_coder_op.h @@ -13,7 +13,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.h b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h index e66a8351f47..1fe05e6ebbf 100644 --- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.h +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu index eddb25d57b4..70cbd7a9dea 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu @@ -195,7 +195,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel { Tensor length_lod; int* length_lod_data = length_lod.mutable_data({lod_size}, dev_ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, &length_lod, static_cast(0)); int blocks = NumBlocks(real_post_num); diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h index 950b8b78933..984b6332918 100644 --- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h @@ -22,7 +22,7 @@ limitations under the License.*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu index 355a35d4dd2..84d564ac4e9 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu @@ -25,9 +25,9 @@ namespace cub = hipcub; #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h" #include "paddle/fluid/operators/gather.cu.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -121,7 +121,7 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel { Tensor sub_lod_list; sub_lod_list.Resize({num_level, lod_size}); int* sub_lod_list_data = sub_lod_list.mutable_data(dev_ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, &sub_lod_list, static_cast(0)); Tensor target_lvls; diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h index f1b454913f7..e96804ab6f6 100644 --- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h +++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h @@ -21,7 +21,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc index afa4ccf25d0..92dba742f4c 100644 --- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/operators/detection/mask_util.h" #include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -122,7 +122,7 @@ static inline void ExpandMaskTarget(const platform::CPUDeviceContext& ctx, int* mask_targets_data = mask_targets->mutable_data({num_mask, mask_dim}, ctx.GetPlace()); - math::set_constant(ctx, mask_targets, -1); + pten::funcs::set_constant(ctx, mask_targets, -1); for (int64_t mask_id = 0; mask_id < num_mask; ++mask_id) { int cls = mask_class_labels_data[mask_id]; int start = M * cls; @@ -271,7 +271,7 @@ std::vector SampleMaskForOneImage( } masks.mutable_data({bg_num, resolution * resolution}, ctx.GetPlace()); - math::set_constant(ctx, &masks, -1); + pten::funcs::set_constant(ctx, &masks, -1); int* mask_class_labels_data = mask_class_labels.mutable_data({bg_num, 1}, ctx.GetPlace()); mask_class_labels_data[0] = 0; diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc index 1b1fa7b064f..67a1d2c5acf 100644 --- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc +++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -289,7 +289,7 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, fg_labels.mutable_data({fg_num}, context.GetPlace()); CPUGather(context, gt_classes, gt_label_inds_t, &fg_labels); bg_labels.mutable_data({bg_num}, context.GetPlace()); - math::set_constant(context, &bg_labels, 0); + pten::funcs::set_constant(context, &bg_labels, 0); Concat(context, fg_labels, bg_labels, sampled_labels); Tensor fg_max_overlap, bg_max_overlap; @@ -328,7 +328,7 @@ std::vector SampleRoisForOneImage( Tensor roi_filter; // Tensor box_filter; if (keep.numel() == 0) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; roi_filter.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); set_zero(context, &roi_filter, static_cast(0)); } else { @@ -403,9 +403,9 @@ std::vector SampleRoisForOneImage( bbox_targets.mutable_data(bbox_expand_dim, context.GetPlace()); bbox_inside_weights.mutable_data(bbox_expand_dim, context.GetPlace()); bbox_outside_weights.mutable_data(bbox_expand_dim, context.GetPlace()); - math::set_constant(context, &bbox_targets, 0.0); - math::set_constant(context, &bbox_inside_weights, 0.0); - math::set_constant(context, &bbox_outside_weights, 0.0); + pten::funcs::set_constant(context, &bbox_targets, 0.0); + pten::funcs::set_constant(context, &bbox_inside_weights, 0.0); + pten::funcs::set_constant(context, &bbox_outside_weights, 0.0); auto* bbox_targets_single_data = bbox_targets_single.data(); auto* sampled_labels_data = sampled_labels.data(); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index bc48c3b5ba1..570720550bf 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -21,7 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/nms_util.h" #include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -112,7 +112,7 @@ class GenerateProposalsKernel : public framework::OpKernel { scores_swap.mutable_data({num, h_score, w_score, c_score}, dev_ctx.GetPlace()); - math::Transpose trans; + pten::funcs::Transpose trans; std::vector axis = {0, 2, 3, 1}; trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); trans(dev_ctx, *scores, &scores_swap, axis); @@ -211,7 +211,7 @@ class GenerateProposalsKernel : public framework::OpKernel { FilterBoxes(ctx, &proposals, min_size, im_info_slice, true, &keep); // Handle the case when there is no keep index left if (keep.numel() == 0) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; bbox_sel.mutable_data({1, 4}, ctx.GetPlace()); set_zero(ctx, &bbox_sel, static_cast(0)); Tensor scores_filter; diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 2de06e06d9a..f34b8e26c0d 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/bbox_util.cu.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -76,7 +76,7 @@ static std::pair ProposalForOneImage( Tensor scores_filter, proposals_filter; // Handle the case when there is no keep index left if (keep_num == 0) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; proposals_filter.mutable_data({1, 4}, ctx.GetPlace()); scores_filter.mutable_data({1, 1}, ctx.GetPlace()); set_zero(ctx, &proposals_filter, static_cast(0)); @@ -154,7 +154,7 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { scores_swap.mutable_data({num, h_score, w_score, c_score}, dev_ctx.GetPlace()); - math::Transpose trans; + pten::funcs::Transpose trans; std::vector axis = {0, 2, 3, 1}; trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); trans(dev_ctx, *scores, &scores_swap, axis); diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc index 44554a941dc..671a27429f2 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -21,7 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/nms_util.h" #include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -113,7 +113,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { scores_swap.mutable_data({num, h_score, w_score, c_score}, dev_ctx.GetPlace()); - math::Transpose trans; + pten::funcs::Transpose trans; std::vector axis = {0, 2, 3, 1}; trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); trans(dev_ctx, *scores, &scores_swap, axis); @@ -215,7 +215,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { pixel_offset); // Handle the case when there is no keep index left if (keep.numel() == 0) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; bbox_sel.mutable_data({1, 4}, ctx.GetPlace()); set_zero(ctx, &bbox_sel, static_cast(0)); Tensor scores_filter; diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu index cc2d4578e3e..98108a25dad 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/bbox_util.cu.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -77,7 +77,7 @@ static std::pair ProposalForOneImage( Tensor scores_filter, proposals_filter; // Handle the case when there is no keep index left if (keep_num == 0) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; proposals_filter.mutable_data({1, 4}, ctx.GetPlace()); scores_filter.mutable_data({1, 1}, ctx.GetPlace()); set_zero(ctx, &proposals_filter, static_cast(0)); @@ -157,7 +157,7 @@ class CUDAGenerateProposalsV2Kernel : public framework::OpKernel { scores_swap.mutable_data({num, h_score, w_score, c_score}, dev_ctx.GetPlace()); - math::Transpose trans; + pten::funcs::Transpose trans; std::vector axis = {0, 2, 3, 1}; trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); trans(dev_ctx, *scores, &scores_swap, axis); diff --git a/paddle/fluid/operators/detection/prior_box_op.h b/paddle/fluid/operators/detection/prior_box_op.h index 21ac74f25cb..94413c9c835 100644 --- a/paddle/fluid/operators/detection/prior_box_op.h +++ b/paddle/fluid/operators/detection/prior_box_op.h @@ -16,8 +16,8 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/transform.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc index 4d0c9da2eeb..777e69ab7b4 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu index fbf631f75b6..ff8da478a00 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu @@ -14,9 +14,9 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/kernels/funcs/math_function.h" using paddle::platform::PADDLE_CUDA_NUM_THREADS; using paddle::platform::float16; @@ -356,7 +356,7 @@ class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel { T* out2in_w_data = out2in_w->mutable_data({out->numel(), 4}, ctx.GetPlace()); - math::SetConstant init; + pten::funcs::SetConstant init; init(ctx.cuda_device_context(), out2in_idx, static_cast(-1)); auto transformed_height = ctx.Attr("transformed_height"); @@ -482,7 +482,7 @@ class CUDAROIPerspectiveTransformGradOpKernel : public framework::OpKernel { T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(ctx.cuda_device_context(), in_grad, static_cast(0)); const T* out_grad_data = out_grad->data(); diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc index 2a16e20c2a7..cf7afc3853d 100644 --- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc +++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc @@ -15,7 +15,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu index bfe4742c4b3..7cc66f2074d 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.cu +++ b/paddle/fluid/operators/detection/yolo_box_op.cu @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/detection/yolo_box_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -114,7 +114,7 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); T* scores_data = scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, boxes, static_cast(0)); set_zero(dev_ctx, scores, static_cast(0)); platform::GpuLaunchConfig config = diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h index 31a67ecc266..27fe31587e4 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.h +++ b/paddle/fluid/operators/detection/yolo_box_op.h @@ -13,8 +13,8 @@ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/pten/core/hostdevice.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.h b/paddle/fluid/operators/detection/yolov3_loss_op.h index 1acfb2cf4e5..1ab3039b2e8 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.h +++ b/paddle/fluid/operators/detection/yolov3_loss_op.h @@ -13,7 +13,7 @@ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -305,7 +305,7 @@ class Yolov3LossKernel : public framework::OpKernel { Tensor gtscore; if (!gt_score) { gtscore.mutable_data({n, b}, ctx.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), >score, static_cast(1.0)); gt_score = >score; @@ -461,7 +461,7 @@ class Yolov3LossGradKernel : public framework::OpKernel { Tensor gtscore; if (!gt_score) { gtscore.mutable_data({n, b}, ctx.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), >score, static_cast(1.0)); gt_score = >score; diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index 4c17869fb5d..90443e0928b 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -150,7 +150,7 @@ inline bool CheckMatrixInvertible(const framework::ExecutionContext& ctx, auto* data = dev_tensor.mutable_data({1}, ctx.GetPlace()); // set false - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(dev_ctx, &dev_tensor, false); // find whether zero @@ -208,7 +208,7 @@ class DeterminantGradKernel : public framework::OpKernel { VLOG(3) << "The input matrix not invertible!"; ddet->Resize(input->dims()); ddet->mutable_data(context.GetPlace()); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(dev_ctx, ddet, static_cast(0.0f)); return; } @@ -363,7 +363,7 @@ class SlogDeterminantGradKernel : public framework::OpKernel { VLOG(3) << "The input matrix not invertible!"; dslogdet->Resize(input->dims()); dslogdet->mutable_data(context.GetPlace()); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(dev_ctx, dslogdet, std::numeric_limits::quiet_NaN()); return; } diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h 
index 12ffc948336..4a81537b8c8 100644 --- a/paddle/fluid/operators/dgc_op.h +++ b/paddle/fluid/operators/dgc_op.h @@ -187,7 +187,7 @@ class DGCOpKernel : public framework::OpKernel { "V_out numel error, V_out numel is %d.", v_out->numel())); } - math::SetConstant tset; + pten::funcs::SetConstant tset; tset(dev_ctx, grad_out, static_cast(0)); } }; diff --git a/paddle/fluid/operators/diag_embed_op.h b/paddle/fluid/operators/diag_embed_op.h index aff7d7e48a8..922140b5b80 100644 --- a/paddle/fluid/operators/diag_embed_op.h +++ b/paddle/fluid/operators/diag_embed_op.h @@ -17,8 +17,8 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -70,7 +70,7 @@ class DiagEmbedKernel : public framework::OpKernel { auto* input_data = input->data(); T* out_data = out->mutable_data(context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = context.template device_context(); set_zero(dev_ctx, out, static_cast(0.0)); diff --git a/paddle/fluid/operators/diag_op.h b/paddle/fluid/operators/diag_op.h index f89415ae089..09723e6df6b 100644 --- a/paddle/fluid/operators/diag_op.h +++ b/paddle/fluid/operators/diag_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -45,7 +45,7 @@ class DiagKernel : public framework::OpKernel { auto* out = context.Output("Out"); T* out_data = out->mutable_data(context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = context.template device_context(); set_zero(dev_ctx, out, static_cast(0)); diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc index dd5ad739506..3e74c7aa810 100644 --- a/paddle/fluid/operators/diag_v2_op.cc +++ b/paddle/fluid/operators/diag_v2_op.cc @@ -15,7 +15,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/diag_v2_op.h" #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -109,7 +109,7 @@ class DiagV2Kernel : public framework::OpKernel { int64_t i; if (x_dims.size() == 1) { float padding_value = context.Attr("padding_value"); - math::SetConstant set_padding_value; + pten::funcs::SetConstant set_padding_value; auto& dev_ctx = context.template device_context(); set_padding_value(dev_ctx, out, static_cast(padding_value)); diff --git a/paddle/fluid/operators/diag_v2_op.cu b/paddle/fluid/operators/diag_v2_op.cu index 12ea31945f8..02e531765ce 100644 --- a/paddle/fluid/operators/diag_v2_op.cu +++ b/paddle/fluid/operators/diag_v2_op.cu @@ -72,7 +72,7 @@ class DiagV2CUDAKernel : public framework::OpKernel { if (x_dims.size() == 1) { float padding_value = context.Attr("padding_value"); - math::SetConstant set_padding_value; + pten::funcs::SetConstant set_padding_value; set_padding_value(dev_ctx, out, static_cast(padding_value)); auto x_length = x_dims[0]; diff --git a/paddle/fluid/operators/diag_v2_op.h b/paddle/fluid/operators/diag_v2_op.h index 7850def0611..0d1d6cd86e4 100644 --- a/paddle/fluid/operators/diag_v2_op.h +++ b/paddle/fluid/operators/diag_v2_op.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h index 6a34ef48a16..2d4620eca72 100644 --- a/paddle/fluid/operators/dist_op.h +++ b/paddle/fluid/operators/dist_op.h @@ -19,7 +19,7 @@ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -171,7 +171,7 @@ static void DistGradFunction(const framework::ExecutionContext& context) { // 1: Lp-norm(z), z = x-y, compute dz if (p == 0) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = context.template device_context(); set_zero(dev_ctx, &grad, static_cast(0)); } else if (p == INFINITY || p == -INFINITY) { diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc index 5c9be588419..a268ef95e33 100644 --- a/paddle/fluid/operators/dropout_op_test.cc +++ b/paddle/fluid/operators/dropout_op_test.cc @@ -25,12 +25,11 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(dropout); diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu index 3096795f3ea..be6534365e5 100644 --- a/paddle/fluid/operators/edit_distance_op.cu +++ b/paddle/fluid/operators/edit_distance_op.cu @@ -16,9 +16,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/edit_distance_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -118,7 +118,7 @@ class EditDistanceGPUKernel : public framework::OpKernel { } const size_t num_strs = hyp_lod.size() - 1; - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; set_constant(ctx.template device_context(), sequence_num, static_cast(num_strs)); diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index b9a3cb300b4..4dd5b7cfd84 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -19,11 +19,11 @@ #include #include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/math/lapack_function.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" #define EPSILON 1e-6 namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 34d40c741f0..57b47d436da 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -47,8 +47,8 @@ limitations under the License. */ #endif -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" #define DIVUP(x, y) (((x) + (y)-1) / (y)) diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index 3cd9729d344..63ec5bd4a28 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -25,12 +25,11 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, NPU); diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc index 7de2bf2e699..4e18cc73d29 100644 --- a/paddle/fluid/operators/expand_op_npu_test.cc +++ b/paddle/fluid/operators/expand_op_npu_test.cc @@ -25,12 +25,11 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(expand); USE_OP_DEVICE_KERNEL(expand, NPU); diff --git a/paddle/fluid/operators/exponential_op.h b/paddle/fluid/operators/exponential_op.h index d8cafb8ef7f..88c891d8bff 100644 --- a/paddle/fluid/operators/exponential_op.h +++ b/paddle/fluid/operators/exponential_op.h @@ -18,7 +18,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/distribution_helper.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -32,7 +32,7 @@ class ExponentialGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* dx = ctx.Output(framework::GradVarName("X")); dx->mutable_data(ctx.GetPlace()); - math::SetConstant functor; + pten::funcs::SetConstant functor; auto& dev_ctx = ctx.template device_context(); functor(dev_ctx, dx, static_cast(0)); } diff --git a/paddle/fluid/operators/eye_op.h b/paddle/fluid/operators/eye_op.h index d5ad27596d6..1aa22e74f75 100644 --- a/paddle/fluid/operators/eye_op.h +++ b/paddle/fluid/operators/eye_op.h @@ -17,8 +17,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -47,7 +47,7 @@ class EyeKernel : public framework::OpKernel { auto* out_tensor = ctx.Output("Out"); T* out_data = out_tensor->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); set_zero(dev_ctx, out_tensor, static_cast(0)); diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index dea427393b1..551d8ee6592 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -20,8 +20,8 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/fused/attn_feed_forward.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace framework = paddle::framework; namespace platform = paddle::platform; diff --git a/paddle/fluid/operators/fill_any_op.h b/paddle/fluid/operators/fill_any_op.h index f483e05a08f..a476b7a0a6e 100644 --- a/paddle/fluid/operators/fill_any_op.h +++ b/paddle/fluid/operators/fill_any_op.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -41,7 +41,7 @@ class FillAnyKernel : public framework::OpKernel { out->mutable_data(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); - math::SetConstant functor; + pten::funcs::SetConstant functor; functor(reinterpret_cast(dev_ctx), out, static_cast(fill_var)); } @@ -55,7 +55,7 @@ class FillAnyGradKernel : public framework::OpKernel { if (dx) { dx->mutable_data(ctx.GetPlace()); auto &dev_ctx = ctx.template device_context(); - math::SetConstant functor; + pten::funcs::SetConstant functor; functor(reinterpret_cast(dev_ctx), dx, T(0)); } } diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.h b/paddle/fluid/operators/fill_constant_batch_size_like_op.h index 4c90daa39f9..ed3a6618977 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op.h +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -60,7 +60,7 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel { bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace(); if (cpu_place) { auto &dev_ctx = *pool.Get(platform::CPUPlace()); - math::SetConstant functor; + pten::funcs::SetConstant functor; out->mutable_data(platform::CPUPlace(), data_type); functor(reinterpret_cast(dev_ctx), out, static_cast(value)); @@ -68,7 +68,7 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) if (!cpu_place) { auto &dev_ctx = *pool.Get(ctx.GetPlace()); - math::SetConstant functor; + pten::funcs::SetConstant functor; out->mutable_data(ctx.GetPlace(), data_type); functor(reinterpret_cast(dev_ctx), out, static_cast(value)); diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc index 6b07b021d13..98e03ea66d8 100644 --- a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc +++ b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc @@ -70,7 +70,7 @@ class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel { bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace(); if (cpu_place) { auto &dev_ctx = *pool.Get(platform::CPUPlace()); - math::SetConstant functor; + pten::funcs::SetConstant functor; out->mutable_data(platform::CPUPlace(), data_type); functor(reinterpret_cast(dev_ctx), out, static_cast(value)); diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h index c74cf2a824c..15c9241275d 100644 --- a/paddle/fluid/operators/fill_constant_op.h +++ b/paddle/fluid/operators/fill_constant_op.h @@ -21,8 +21,8 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -121,14 +121,14 @@ class FillConstantKernel : public framework::OpKernel { << ((data_type == framework::proto::VarType::BF16) ? 
"" : ""); tensor->mutable_data(platform::CPUPlace(), data_type); - math::SetConstant functor; + pten::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(platform::CPUPlace()); functor(reinterpret_cast(dev_ctx), tensor, static_cast(value)); } else if (actual_place == 1) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) tensor->mutable_data(ctx.GetPlace(), data_type); - math::SetConstant functor; + pten::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(ctx.GetPlace()); functor(reinterpret_cast(dev_ctx), tensor, static_cast(value)); @@ -139,7 +139,7 @@ class FillConstantKernel : public framework::OpKernel { } else if (actual_place == 2) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) tensor->mutable_data(platform::CUDAPinnedPlace(), data_type); - math::SetConstant functor; + pten::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(platform::CUDAPinnedPlace()); functor( reinterpret_cast(dev_ctx), @@ -151,7 +151,7 @@ class FillConstantKernel : public framework::OpKernel { } else if (actual_place == 3) { #ifdef PADDLE_WITH_XPU tensor->mutable_data(ctx.GetPlace(), data_type); - math::SetConstant functor; + pten::funcs::SetConstant functor; auto &dev_ctx = *pool.Get(ctx.GetPlace()); functor(reinterpret_cast(dev_ctx), tensor, static_cast(value)); diff --git a/paddle/fluid/operators/fill_zeros_like_op.h b/paddle/fluid/operators/fill_zeros_like_op.h index 4bbe0df6b68..c34358d9a3c 100644 --- a/paddle/fluid/operators/fill_zeros_like_op.h +++ b/paddle/fluid/operators/fill_zeros_like_op.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -26,7 +26,7 @@ class FillZerosLikeKernel : public framework::OpKernel { auto* out = context.Output("Out"); out->mutable_data(context.GetPlace()); - math::SetConstant setter; + pten::funcs::SetConstant setter; setter(context.template device_context(), out, static_cast(0)); } diff --git a/paddle/fluid/operators/flatten_op.h b/paddle/fluid/operators/flatten_op.h index 2a9c2b27d23..15e820a9ee3 100644 --- a/paddle/fluid/operators/flatten_op.h +++ b/paddle/fluid/operators/flatten_op.h @@ -17,12 +17,12 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/pten_utils.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/pten/kernels/empty_kernel.h" #include "paddle/pten/kernels/flatten_grad_kernel.h" #include "paddle/pten/kernels/flatten_kernel.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fold_op.h b/paddle/fluid/operators/fold_op.h index d37edbfe803..7f2f26b464f 100644 --- a/paddle/fluid/operators/fold_op.h +++ b/paddle/fluid/operators/fold_op.h @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -64,7 +64,7 @@ class FoldOpKernel : public framework::OpKernel { framework::DDim input_matrix_shape({input_dims[0], kernel_sizes[0], kernel_sizes[1], output_height, output_width}); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, output, static_cast(0)); for (int i = 0; i < batch_size; i++) { diff --git a/paddle/fluid/operators/frame_op.h b/paddle/fluid/operators/frame_op.h index 482c6411812..0f34e2f7fcc 100644 --- a/paddle/fluid/operators/frame_op.h +++ b/paddle/fluid/operators/frame_op.h @@ -18,11 +18,11 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/seq2col.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fsp_op.h b/paddle/fluid/operators/fsp_op.h index 55bd23784d4..999c3ae3747 100644 --- a/paddle/fluid/operators/fsp_op.h +++ b/paddle/fluid/operators/fsp_op.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -79,7 +79,7 @@ class FSPGradOpKernel : public framework::OpKernel { int64_t w = 0; auto blas = math::GetBlas(context); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; if (d_x != nullptr) { d_x->mutable_data(context.GetPlace()); set_zero(context.template device_context(), d_x, diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 74307c3ba79..cd88b67a563 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -22,8 +22,8 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h" #include "paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/kernels/funcs/math_function.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 425782d7900..bec44662a26 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -21,8 +21,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace framework = paddle::framework; namespace platform = paddle::platform; diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 581fc45e268..79569bb3a79 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/fluid/operators/fused/attention_layer_norm.h" #include "paddle/fluid/operators/fused/attn_gemm.h" diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu index 83328caf384..e825ad30782 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -20,10 +20,10 @@ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/fused/fused_bn_activation_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/norm_utils.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/kernels/funcs/math_function.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); @@ -256,7 +256,8 @@ class FusedBatchNormActGradKernel PADDLE_THROW( platform::errors::Unimplemented("Unsupported activation type")); } - math::SetConstant> + pten::funcs::SetConstant> functor; functor(dev_ctx, d_scale, static_cast>(0)); functor(dev_ctx, d_bias, static_cast>(0)); diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu index 7c124a0d6b6..c5bc5b17255 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -19,10 +19,10 @@ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/norm_utils.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/kernels/funcs/math_function.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index a0d1cd43404..59b997bb514 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -24,8 +24,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace framework = paddle::framework; namespace platform = paddle::platform; diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h index 8386896027f..739fcc9b184 100644 --- a/paddle/fluid/operators/gather.cu.h +++ b/paddle/fluid/operators/gather.cu.h @@ -18,10 +18,10 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/place.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -283,7 +283,7 @@ void GatherV2GradCUDAFunction(const Tensor* input, const Tensor* index, auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto out_dim = out->dims(); int64_t out_index_dim_size = out_dim[axis_index]; - operators::math::set_constant(*dev_ctx, out, 0.0); + pten::funcs::set_constant(*dev_ctx, out, 0.0); platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), input_size); diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h index 84ec587bede..bd339c4a085 100644 --- a/paddle/fluid/operators/gather.h +++ b/paddle/fluid/operators/gather.h @@ -20,8 +20,8 @@ limitations under the License. */ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/place.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -226,7 +226,7 @@ void GatherV2GradFunction(const Tensor* input, const Tensor* index, auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto out_dim = out->dims(); int64_t out_index_dim_size = out_dim[axis_index]; - operators::math::set_constant(*dev_ctx, out, 0.0); + pten::funcs::set_constant(*dev_ctx, out, 0.0); for (int64_t i = 0; i < inner_dim_size; i++) { for (int64_t j = 0; j < input_index_dim_size; j++) { diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc index f50c4f5528e..247ce8529c9 100644 --- a/paddle/fluid/operators/gather_op_npu_test.cc +++ b/paddle/fluid/operators/gather_op_npu_test.cc @@ -25,12 +25,11 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(gather); USE_OP_DEVICE_KERNEL(gather, NPU); diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc index f47250c9681..bcaf7b11feb 100644 --- a/paddle/fluid/operators/gelu_op_npu_test.cc +++ b/paddle/fluid/operators/gelu_op_npu_test.cc @@ -25,12 +25,11 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(gelu); USE_OP_DEVICE_KERNEL(gelu, NPU); diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu index 8e9f445f3b1..df70efcc6ff 100644 --- a/paddle/fluid/operators/grid_sampler_op.cu +++ b/paddle/fluid/operators/grid_sampler_op.cu @@ -292,7 +292,7 @@ class GridSampleOpCUDAKernel : public framework::OpKernel { auto* output_data = output->mutable_data(ctx.GetPlace()); VLOG(3) << "out dims: " << output->dims()[0] << "; " << output->dims()[1] << "; " << output->dims()[2] << "; " << output->dims()[3]; - math::SetConstant()( + pten::funcs::SetConstant()( dev_ctx, output, static_cast(0)); int count = static_cast(n * out_h * out_w); auto cu_stream = dev_ctx.stream(); @@ -459,7 +459,7 @@ class GridSampleGradOpCUDAKernel : public framework::OpKernel { auto* input_grad = ctx.Output(framework::GradVarName("X")); input_grad->mutable_data(ctx.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), input_grad, static_cast(0)); @@ -467,7 +467,7 @@ class GridSampleGradOpCUDAKernel : public framework::OpKernel { if (ctx.HasOutput(framework::GradVarName("Grid"))) { auto* grid_grad = ctx.Output(framework::GradVarName("Grid")); grid_grad_data = grid_grad->mutable_data(ctx.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), grid_grad, static_cast(0)); } diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h index a595e5078b2..874a8d8c2a2 100644 --- a/paddle/fluid/operators/grid_sampler_op.h +++ b/paddle/fluid/operators/grid_sampler_op.h @@ -19,8 +19,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/pten/core/hostdevice.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -520,7 +520,7 @@ class GridSampleOpKernel : public framework::OpKernel { auto* output = ctx.Output("Output"); output->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), output, static_cast(0)); @@ -563,7 +563,7 @@ class GridSampleGradOpKernel : public framework::OpKernel { auto* input_grad = ctx.Output(framework::GradVarName("X")); input_grad->mutable_data({n, c, in_h, in_w}, ctx.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), input_grad, static_cast(0)); @@ -571,7 +571,7 @@ class GridSampleGradOpKernel : public framework::OpKernel { if (ctx.HasOutput(framework::GradVarName("Grid"))) { grid_grad = ctx.Output(framework::GradVarName("Grid")); grid_grad->mutable_data({n, out_h, out_w, 2}, ctx.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), grid_grad, static_cast(0)); } diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 055fd791af5..584be96c659 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -153,7 +153,7 @@ class GroupNormKernel y->mutable_data(ctx.GetPlace()); mean->mutable_data(ctx.GetPlace()); var->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); Tensor temp_var; temp_var.mutable_data(var->dims(), ctx.GetPlace()); @@ -321,7 +321,7 @@ class GroupNormGradKernel : x_dims[x_dims.size() - 2]); d_x->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); Tensor temp_var; diff --git a/paddle/fluid/operators/group_norm_op.h b/paddle/fluid/operators/group_norm_op.h index 9cb451235f1..3fc2d413b6c 100644 --- a/paddle/fluid/operators/group_norm_op.h +++ b/paddle/fluid/operators/group_norm_op.h @@ -22,7 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -239,7 +239,7 @@ class GroupNormGradKernel : public framework::OpKernel { const int group_size = C / groups; d_x->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); auto* x_data = x->data(); diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index a2d61695649..20956e3cdbb 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -321,7 +321,7 @@ class GRUCPUKernel : public framework::OpKernel { to_batch(dev_ctx, *input, batch_gate, true, is_reverse); if (bias) { - math::RowwiseAdd add_bias; + pten::funcs::RowwiseAdd add_bias; add_bias(dev_ctx, *batch_gate, *bias, batch_gate); } diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc index ce3c8ac51c7..0f1db8de5a3 100644 --- a/paddle/fluid/operators/gru_op.cu.cc +++ b/paddle/fluid/operators/gru_op.cu.cc @@ -70,7 +70,7 @@ class GRUKernel : public framework::OpKernel { to_batch(dev_ctx, *input, batch_gate, true, is_reverse); if (bias) { - math::RowwiseAdd add_bias; + pten::funcs::RowwiseAdd add_bias; add_bias(dev_ctx, *batch_gate, *bias, batch_gate); } diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h index bcca992e2b4..e9d520dd9fc 100644 --- a/paddle/fluid/operators/gru_op.h +++ b/paddle/fluid/operators/gru_op.h @@ -18,8 +18,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/gru_compute.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -69,7 +69,7 @@ class GRUGradKernel : public framework::OpKernel { batch_gate_grad.mutable_data(gate_dims, context.GetPlace()); batch_reset_hidden_prev_grad.mutable_data(hidden_dims, context.GetPlace()); - math::SetConstant zero; + pten::funcs::SetConstant zero; auto& dev_ctx = context.template device_context(); zero(dev_ctx, &batch_hidden_grad, static_cast(0.0)); zero(dev_ctx, &batch_gate_grad, static_cast(0.0)); @@ -157,7 +157,7 @@ class GRUGradKernel : public framework::OpKernel { } if (bias_grad) { bias_grad->mutable_data(context.GetPlace()); - math::ColwiseSum col_sum; + pten::funcs::ColwiseSum col_sum; col_sum(dev_ctx, batch_gate_grad, bias_grad); } if (h0 && h0_grad) { diff --git a/paddle/fluid/operators/gumbel_softmax_op.cu b/paddle/fluid/operators/gumbel_softmax_op.cu index 63577ed1e0f..ba6ce141e81 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.cu +++ b/paddle/fluid/operators/gumbel_softmax_op.cu @@ -99,7 +99,7 @@ struct OneHotGenerator { Tensor input_tensor; input_tensor.mutable_data(Out->dims(), platform::CUDAPlace()); paddle::framework::TensorCopy(*Out, context.GetPlace(), &input_tensor); - math::set_constant(context, Out, 0.0); + pten::funcs::set_constant(context, Out, 0.0); OneHotCUDAKernel< T, thread_size><<>>( height, size_from_axis / size_out_axis, size_out_axis, diff --git a/paddle/fluid/operators/gumbel_softmax_op.h b/paddle/fluid/operators/gumbel_softmax_op.h 
index f95a4810f44..3cd211ccc3e 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.h +++ b/paddle/fluid/operators/gumbel_softmax_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/softmax.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -155,7 +155,7 @@ struct OneHotGenerator { #undef CALL_ARG_MINMAX_FUNCTOR } - math::set_constant(context, Out, 0.0); + pten::funcs::set_constant(context, Out, 0.0); for (int i = 0; i < size_to_axis; i++) { for (int j = 0; j < size_out_axis; j++) { *(Out->data() + i * size_from_axis + j + diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 17734b9c542..5734e247f4d 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -24,9 +24,9 @@ limitations under the License. */ #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/clip_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/matrix_bit_code.h" #include "paddle/fluid/platform/transform.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -81,10 +81,10 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { auto pre_out_mat = EigenMatrix::From(*pre_out); // Not all class(leaf) nodes' path lengths equal code_length, thus init as // 0s can avoid out of path's loss. - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(dev_ctx, pre_out, static_cast(0.0)); auto& place = *ctx.template device_context().eigen_device(); - math::RowwiseSum row_sum; + pten::funcs::RowwiseSum row_sum; std::unique_ptr> bit_code; if (!is_custom) { @@ -134,7 +134,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { auto* in_grad = ctx.Output(framework::GradVarName("X")); bool is_sparse = ctx.Attr("is_sparse"); auto& dev_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; auto& label = GET_DATA_SAFELY(ctx.Input("Label"), "Input", "Label", "HierarchicalSigmoidGrad"); auto& pre_out = GET_DATA_SAFELY(ctx.Input("PreOut"), "Input", diff --git a/paddle/fluid/operators/histogram_op.cu b/paddle/fluid/operators/histogram_op.cu index a34f4b8a22e..48a637e6c37 100644 --- a/paddle/fluid/operators/histogram_op.cu +++ b/paddle/fluid/operators/histogram_op.cu @@ -82,7 +82,7 @@ class HistogramCUDAKernel : public framework::OpKernel { const int input_numel = input->numel(); int64_t* out_data = output->mutable_data(context.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( context.template device_context(), output, static_cast(0)); diff --git a/paddle/fluid/operators/histogram_op.h b/paddle/fluid/operators/histogram_op.h index a6f4448cbcb..9e280336e49 100644 --- a/paddle/fluid/operators/histogram_op.h +++ b/paddle/fluid/operators/histogram_op.h @@ -18,7 +18,7 @@ limitations under the License. 
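The row/column reduction helpers move in the same sweep: RowwiseAdd (the bias add in the GRU forward above), ColwiseSum (the bias gradients), and RowwiseSum (the hierarchical-sigmoid loss). A sketch of the renamed call sites, template arguments assumed:

    pten::funcs::RowwiseAdd<DeviceContext, T> add_bias;
    add_bias(dev_ctx, *batch_gate, *bias, batch_gate);  // gate += bias, row by row

    pten::funcs::ColwiseSum<DeviceContext, T> col_sum;
    col_sum(dev_ctx, batch_gate_grad, bias_grad);  // d_bias = column sums of d_gate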
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -39,7 +39,7 @@ class HistogramKernel : public framework::OpKernel { auto input_numel = input->numel(); int64_t* out_data = output->mutable_data(context.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( context.template device_context(), output, static_cast(0)); diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h index 39ff7ea40aa..6eac1cc4e4c 100644 --- a/paddle/fluid/operators/im2sequence_op.h +++ b/paddle/fluid/operators/im2sequence_op.h @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc index ca9420c04a2..47e2f2c3cfc 100644 --- a/paddle/fluid/operators/increment_op_npu_test.cc +++ b/paddle/fluid/operators/increment_op_npu_test.cc @@ -25,12 +25,11 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(increment); USE_OP_DEVICE_KERNEL(increment, NPU); diff --git a/paddle/fluid/operators/index_sample_op.cu b/paddle/fluid/operators/index_sample_op.cu index 4c9dec14000..e145c555dc5 100644 --- a/paddle/fluid/operators/index_sample_op.cu +++ b/paddle/fluid/operators/index_sample_op.cu @@ -14,9 +14,9 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/index_sample_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" #define PREDEFINED_BLOCK_SIZE_X 512 #define PREDEFINED_BLOCK_SIZE 1024 @@ -177,7 +177,7 @@ class IndexSampleGradKernel (batch_size + block_dim.y - 1) / block_dim.y); LimitGridDim(ctx, &grid_dim); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); set_zero(dev_ctx, input_grad, static_cast(0)); diff --git a/paddle/fluid/operators/index_select_op.h b/paddle/fluid/operators/index_select_op.h index be76a66ef7c..b157f775d50 100644 --- a/paddle/fluid/operators/index_select_op.h +++ b/paddle/fluid/operators/index_select_op.h @@ -16,7 +16,7 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -159,7 +159,7 @@ void IndexSelectGradInner(const framework::ExecutionContext& context, auto output_dim = x_grad->dims(); auto& dev_ctx = context.template device_context(); - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; set_constant(dev_ctx, 
x_grad, static_cast(0.0)); auto slice_size = 1; diff --git a/paddle/fluid/operators/inplace_abn_op.h b/paddle/fluid/operators/inplace_abn_op.h index 9c3727ab903..142096eb34c 100644 --- a/paddle/fluid/operators/inplace_abn_op.h +++ b/paddle/fluid/operators/inplace_abn_op.h @@ -16,7 +16,7 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index cfdaacf8cb6..8c650c64376 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -208,7 +208,7 @@ class InstanceNormKernel Eigen::IndexList> rdims; #endif - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; saved_mean->mutable_data(ctx.GetPlace()); saved_variance->mutable_data(ctx.GetPlace()); @@ -356,7 +356,7 @@ class InstanceNormGradKernel NxC_shape.set(0, NxC); #endif - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; Tensor scale_data; if (!scale) { @@ -492,7 +492,7 @@ class InstanceNormDoubleGradKernel auto *ddY = ctx.Output("DDY"); auto &dev_ctx = ctx.template device_context(); - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; const auto &x_dims = X->dims(); int N, C, H, W, D; diff --git a/paddle/fluid/operators/instance_norm_op.cu b/paddle/fluid/operators/instance_norm_op.cu index e0401366693..a6c935074fe 100644 --- a/paddle/fluid/operators/instance_norm_op.cu +++ b/paddle/fluid/operators/instance_norm_op.cu @@ -25,8 +25,8 @@ namespace cub = hipcub; #endif #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/instance_norm_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -169,7 +169,7 @@ class InstanceNormKernel const int max_blocks = std::max(max_threads / block, 1); const int grid = std::min((NxC + block - 1) / block, max_blocks); - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; if (scale) { repeat_param<<>>( scale->data(), scale_tmp.data(), N, C); @@ -185,7 +185,7 @@ class InstanceNormKernel auto handle = dev_ctx.cudnn_handle(); - math::SetConstant> + pten::funcs::SetConstant> functor; auto *saved_mean = ctx.Output("SavedMean"); @@ -349,7 +349,7 @@ class InstanceNormGradKernel } auto &dev_ctx = ctx.template device_context(); - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; const int n = x->numel(); const int block = 512; @@ -379,7 +379,8 @@ class InstanceNormGradKernel if ((H * W * D) == 1) { framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); - math::SetConstant> + pten::funcs::SetConstant> functor; functor(dev_ctx, d_scale, static_cast>(0)); functor(dev_ctx, d_bias, static_cast>(0)); @@ -732,7 +733,7 @@ class InstanceNormDoubleGradKernel const T *variance_data = Saved_variance->data(); auto &dev_ctx = ctx.template device_context(); - math::SetConstant set_zero; + 
pten::funcs::SetConstant set_zero; auto &x_dims = X->dims(); int N, C, H, W, D; diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 3c857eb326a..eaf8a2f7d93 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -1159,7 +1159,7 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_w == out_w) { @@ -1241,7 +1241,7 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_h == out_h && in_w == out_w) { @@ -1348,7 +1348,7 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, } auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_d == out_d && in_h == out_h && in_w == out_w) { diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index 0c0dde6bd45..46353cfb2f2 100644 --- a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -14,8 +14,8 @@ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/pten/core/hostdevice.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -1057,7 +1057,7 @@ static void Interpolate1DCPUBwd(const framework::ExecutionContext& ctx, input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_w == out_w) { @@ -1126,7 +1126,7 @@ static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx, input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_h == out_h && in_w == out_w) { @@ -1213,7 +1213,7 @@ static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx, } input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_d == out_d && in_h == out_h && in_w == out_w) { diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index 8555cd14f42..8c157629586 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -1686,7 +1686,7 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - 
math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_w == out_w) { @@ -1808,7 +1808,7 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_h == out_h && in_w == out_w) { @@ -1993,7 +1993,7 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, } auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_d == out_d && in_h == out_h && in_w == out_w) { diff --git a/paddle/fluid/operators/interpolate_v2_op.h b/paddle/fluid/operators/interpolate_v2_op.h index 4d6189b57bf..400c94f48a5 100644 --- a/paddle/fluid/operators/interpolate_v2_op.h +++ b/paddle/fluid/operators/interpolate_v2_op.h @@ -14,8 +14,8 @@ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/pten/core/hostdevice.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -1276,7 +1276,7 @@ static void Interpolate1DCPUBwd(const framework::ExecutionContext& ctx, input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_w == out_w) { @@ -1383,7 +1383,7 @@ static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx, input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_h == out_h && in_w == out_w) { @@ -1527,7 +1527,7 @@ static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx, } input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_d == out_d && in_h == out_h && in_w == out_w) { diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h index ad7c0cc218b..b7916f44d3c 100644 --- a/paddle/fluid/operators/layer_norm_op.h +++ b/paddle/fluid/operators/layer_norm_op.h @@ -25,7 +25,7 @@ limitations under the License. 
*/ !defined(__OSX__) #include "paddle/fluid/operators/jit/kernels.h" #endif -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace platform { @@ -57,7 +57,7 @@ class RowwiseMean2D { : left_(left), right_(right) { framework::DDim ones_dim({right_}); divisor_.mutable_data(ones_dim, dev_ctx.GetPlace()); - math::set_constant(dev_ctx, &divisor_, 1.0 / right); + pten::funcs::set_constant(dev_ctx, &divisor_, 1.0 / right); } void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, framework::Tensor* out) { @@ -84,7 +84,7 @@ class RowwiseMean2D { } private: - math::RowwiseMean row_mean_; + pten::funcs::RowwiseMean row_mean_; }; template @@ -103,7 +103,7 @@ class ColwiseSum2D { : left_(left), right_(right) { framework::DDim ones_dim({left_}); divisor_.mutable_data(ones_dim, dev_ctx.GetPlace()); - math::set_constant(dev_ctx, &divisor_, 1.0); + pten::funcs::set_constant(dev_ctx, &divisor_, 1.0); } void operator()(const platform::CUDADeviceContext& context, @@ -131,7 +131,7 @@ class ColwiseSum2D { } private: - math::ColwiseSum col_wise_; + pten::funcs::ColwiseSum col_wise_; }; template diff --git a/paddle/fluid/operators/layout_utils.h b/paddle/fluid/operators/layout_utils.h index 52fa7fd1079..57c95afc102 100644 --- a/paddle/fluid/operators/layout_utils.h +++ b/paddle/fluid/operators/layout_utils.h @@ -20,7 +20,7 @@ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -112,18 +112,18 @@ inline void TransToChannelFirst(const framework::ExecutionContext& context, if (dim == 3) { auto& dev_ctx = context.template device_context(); std::vector axis{0, 4, 1, 2, 3}; - math::Transpose trans5; + pten::funcs::Transpose trans5; trans5(dev_ctx, *input, transformed_input, axis); } else if (dim == 2) { auto& dev_ctx = context.template device_context(); std::vector axis{0, 3, 1, 2}; - math::Transpose trans4; + pten::funcs::Transpose trans4; trans4(dev_ctx, *input, transformed_input, axis); } else if (dim == 1) { auto& dev_ctx = context.template device_context(); std::vector axis{0, 2, 1}; - math::Transpose trans3; + pten::funcs::Transpose trans3; trans3(dev_ctx, *input, transformed_input, axis); } } @@ -135,18 +135,18 @@ inline void TransToChannelLast(const framework::ExecutionContext& context, if (dim == 3) { auto& dev_ctx = context.template device_context(); std::vector axis{0, 2, 3, 4, 1}; - math::Transpose trans5; + pten::funcs::Transpose trans5; trans5(dev_ctx, *input, transformed_input, axis); } else if (dim == 2) { auto& dev_ctx = context.template device_context(); std::vector axis{0, 2, 3, 1}; - math::Transpose trans4; + pten::funcs::Transpose trans4; trans4(dev_ctx, *input, transformed_input, axis); } else if (dim == 1) { auto& dev_ctx = context.template device_context(); std::vector axis{0, 2, 1}; - math::Transpose trans3; + pten::funcs::Transpose trans3; trans3(dev_ctx, *input, transformed_input, axis); } } diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h index eacc5f467d2..c9a82dec724 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.h +++ b/paddle/fluid/operators/linear_chain_crf_op.h @@ -15,7 +15,7 @@ limitations under the License. 
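layer_norm_op.h carries the rename inside its RowwiseMean2D and ColwiseSum2D wrappers: the divisor tensor is seeded through the moved free function, and the fallback member functor is requalified. The two changed ingredients in a condensed sketch (template arguments assumed):

    // GPU specialization: mean per row is a matmul with a vector of 1/right
    pten::funcs::set_constant(dev_ctx, &divisor_, 1.0 / right);

    // CPU specialization keeps a member functor and forwards to it
    pten::funcs::RowwiseMean<platform::CPUDeviceContext, T> row_mean_;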
*/ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -102,8 +102,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel { label_tmp.Resize({batch_size, 1}); alpha_tmp.Resize({batch_size, tag_num}); emission_exps_tmp.Resize({batch_size, tag_num}); - math::set_constant(ctx.device_context(), emission_exps, 0.0); - math::set_constant(ctx.device_context(), alpha, 0.0); + pten::funcs::set_constant(ctx.device_context(), emission_exps, 0.0); + pten::funcs::set_constant(ctx.device_context(), alpha, 0.0); } else { in_lod = ctx.Input("Label")->lod(); PADDLE_ENFORCE_NE(in_lod.size(), 0, @@ -274,7 +274,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { // data reader operator, it can have no gradients. if (transition_grad) { transition_grad->mutable_data(platform::CPUPlace()); - math::set_constant(ctx.device_context(), transition_grad, 0.); + pten::funcs::set_constant(ctx.device_context(), transition_grad, 0.); } // Now, all the inputs and outputs should be on the CPU memory. auto emission_dims = emission_exps->dims(); diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h index d8e0fefe175..7e384f4b64b 100644 --- a/paddle/fluid/operators/linspace_op.h +++ b/paddle/fluid/operators/linspace_op.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index b7c28a0908d..bee8b5396af 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -36,7 +36,7 @@ struct LRNFunctor { T k, T alpha, T beta, const DataLayout data_layout) { auto place = ctx.GetPlace(); auto blas = math::GetBlas(ctx); - math::Transpose transpose; + pten::funcs::Transpose transpose; auto& dev_ctx = ctx.template device_context(); Tensor in_transpose, mid_transpose, out_transpose; // if channel_last, transpose to channel_first diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h index bdf3ad81ddb..a619d6c7237 100644 --- a/paddle/fluid/operators/lrn_op.h +++ b/paddle/fluid/operators/lrn_op.h @@ -18,7 +18,7 @@ limitations under the License. 
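layout_utils.h and lrn_op.cc above exercise the rank-templated Transpose functor, which also now lives in pten::funcs. A sketch of the NHWC-to-NCHW case from TransToChannelFirst (the trailing rank parameter and the device/type arguments are assumptions; the excerpt elides them):

    auto& dev_ctx = context.template device_context<DeviceContext>();
    std::vector<int> axis{0, 3, 1, 2};  // NHWC -> NCHW
    pten::funcs::Transpose<DeviceContext, T, 4> trans4;
    trans4(dev_ctx, *input, transformed_input, axis);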
*/ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h index c6f43b949a7..df94952a9a6 100644 --- a/paddle/fluid/operators/lstm_op.h +++ b/paddle/fluid/operators/lstm_op.h @@ -76,7 +76,7 @@ class LSTMKernel : public framework::OpKernel { Tensor b = *bias; b.Resize({bias->numel(), 1}); Tensor gate_bias = b.Slice(0, 4 * frame_size); - math::RowwiseAdd add_bias; + pten::funcs::RowwiseAdd add_bias; add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } @@ -210,7 +210,7 @@ class LSTMGradKernel : public framework::OpKernel { auto* c0_g = ctx.Output(framework::GradVarName("C0")); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; if (weight_g) { weight_g->mutable_data(ctx.GetPlace()); zero(device_ctx, weight_g, static_cast(0.0)); @@ -380,7 +380,7 @@ class LSTMGradKernel : public framework::OpKernel { Tensor b_g = *bias_g; b_g.Resize({bias_g->numel(), 1}); Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); - math::ColwiseSum col_sum; + pten::funcs::ColwiseSum col_sum; col_sum(device_ctx, batch_gate_g, &gate_bias_g); } diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h index 5a6ac42f457..c63184f76e7 100644 --- a/paddle/fluid/operators/lstmp_op.h +++ b/paddle/fluid/operators/lstmp_op.h @@ -133,7 +133,7 @@ class LSTMPKernel : public framework::OpKernel { Tensor b = *bias; b.Resize({bias->numel(), 1}); Tensor gate_bias = b.Slice(0, 4 * frame_size); - math::RowwiseAdd add_bias; + pten::funcs::RowwiseAdd add_bias; add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } @@ -304,7 +304,7 @@ class LSTMPGradKernel : public framework::OpKernel { auto* c0_g = ctx.Output(framework::GradVarName("C0")); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; if (weight_g) { weight_g->mutable_data(ctx.GetPlace()); zero(device_ctx, weight_g, static_cast(0.0)); @@ -514,7 +514,7 @@ class LSTMPGradKernel : public framework::OpKernel { Tensor b_g = *bias_g; b_g.Resize({bias_g->numel(), 1}); Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); - math::ColwiseSum col_sum; + pten::funcs::ColwiseSum col_sum; col_sum(device_ctx, batch_gate_g, &gate_bias_g); } diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h index be411232706..dd0cff5cc5f 100644 --- a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -21,12 +21,12 @@ #include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/math/eigen_values_vectors.h" #include "paddle/fluid/operators/math/lapack_function.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/operators/triangular_solve_op.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" #define EPSILON 1e-6 diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index c3b3552ba13..b3d79122bcd 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -455,7 +455,7 @@ void Unpack_Pivot(const 
DeviceContext& dev_ctx, const framework::Tensor& Pivot, auto Pdim = framework::make_ddim(Pdimvec); P->Resize(Pdim); auto pdata = P->mutable_data(dev_ctx.GetPlace()); - math::SetConstant setter; + pten::funcs::SetConstant setter; setter(dev_ctx, P, static_cast(0)); auto batchsize = product(framework::slice_ddim(dims, 0, prank - 1)); @@ -543,7 +543,7 @@ class LUGradKernel : public framework::OpKernel { Tensor_Add(dev_ctx, phi_L, phi_U, &phi); psi.Resize(xdims); psi.mutable_data(ctx.GetPlace()); - math::SetConstant setter; + pten::funcs::SetConstant setter; setter(dev_ctx, &psi, static_cast(0)); std::vector axes = {xrank - 2, xrank - 1}; diff --git a/paddle/fluid/operators/lu_unpack_op.h b/paddle/fluid/operators/lu_unpack_op.h index 115ab116fda..c245c7eb655 100644 --- a/paddle/fluid/operators/lu_unpack_op.h +++ b/paddle/fluid/operators/lu_unpack_op.h @@ -110,7 +110,7 @@ class LU_UnpackGradKernel : public framework::OpKernel { std::vector slice_ends(2, 0); auto valuedims = vectorize(xdims); - math::SetConstant setter; + pten::funcs::SetConstant setter; setter(dev_ctx, dx, static_cast(0)); if (m <= n) { slice_starts[0] = 0; diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index 51776f2166d..a59909644aa 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -22,11 +22,11 @@ namespace cub = hipcub; #include #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/margin_cross_entropy_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/softmax_impl.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/pten/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" @@ -341,8 +341,8 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { // step 6, prob = exp((logit - logit_max) - log(sum(exp(logit - // logit_max)))) // loss = -((logit_i - logit_max) - log(sum(exp(logit - logit_max)))) - math::SetConstant()(dev_ctx, loss, - static_cast(0.0)); + pten::funcs::SetConstant()( + dev_ctx, loss, static_cast(0.0)); if (label_type == framework::proto::VarType::INT32) { typedef int32_t LabelT; HardLabelSoftmaxWithCrossEntropyKernel< diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 65bf595bceb..a97e2ecfce7 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -61,7 +61,7 @@ math_library(gru_compute DEPS activation_functions math_function) math_library(lstm_compute DEPS activation_functions) cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) -math_library(math_function DEPS blas dense_tensor tensor) +# math_library(math_function DEPS blas dense_tensor tensor) math_library(maxouting) math_library(pooling) @@ -95,7 +95,6 @@ math_library(matrix_inverse) math_library(segment_pooling) math_library(matrix_solve) -cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col) @@ -103,11 +102,9 @@ cc_test(sequence_padding_test SRCS 
sequence_padding_test.cc DEPS sequence_paddin cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling) cc_test(beam_search_test SRCS beam_search_test.cc DEPS beam_search) if(WITH_GPU) - nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function) nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function) endif() if(WITH_ROCM) - hip_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor) hip_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function) endif() cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 7ffd2a7ab2d..f9a4e963c0c 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -14,8 +14,8 @@ #pragma once -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/pten/backends/gpu/gpu_context.h" diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 80b7acc6103..8e0075c42eb 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -22,9 +22,9 @@ #include #include -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/blas_impl.hip.h b/paddle/fluid/operators/math/blas_impl.hip.h index bf7d66f4853..980caa9cfe6 100644 --- a/paddle/fluid/operators/math/blas_impl.hip.h +++ b/paddle/fluid/operators/math/blas_impl.hip.h @@ -14,10 +14,10 @@ #pragma once -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/dynload/rocblas.h" #include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/kernels/funcs/math_function.h" DECLARE_bool(enable_cublas_tensor_op_math); diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index 6ff2ddaa338..117e6c47080 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -22,9 +22,9 @@ limitations under the License. 
*/ namespace cub = hipcub; #endif #include "paddle/fluid/operators/math/depthwise_conv.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -910,7 +910,7 @@ class DepthwiseConvFunctor(context.GetPlace()); std::vector perm_axis({2, 3, 0, 1}); - math::TransposeNormal trans; + pten::funcs::TransposeNormal trans; trans(context, filter, &filter_hwc, perm_axis); filter_data = filter_hwc.data(); } @@ -1053,7 +1053,7 @@ class DepthwiseConvInputGradFunctor(context.GetPlace()); std::vector perm_axis({2, 3, 0, 1}); - math::TransposeNormal trans; + pten::funcs::TransposeNormal trans; trans(context, filter, &filter_hwc, perm_axis); filter_data = filter_hwc.data(); } @@ -1215,7 +1215,7 @@ class DepthwiseConvFilterGradFunctordims()[0], filter_grad->dims()[1]}); \ filter_grad_hwc.Resize(filter_grad_hwc_dims); \ filter_grad_hwc.mutable_data(context.GetPlace()); \ - math::SetConstant set_zero; \ + pten::funcs::SetConstant set_zero; \ set_zero(context, &filter_grad_hwc, static_cast(0)); \ filter_grad_data = filter_grad_hwc.data(); \ } else { \ @@ -1240,7 +1240,7 @@ class DepthwiseConvFilterGradFunctor perm_axis({2, 3, 0, 1}); \ - math::TransposeNormal trans; \ + pten::funcs::TransposeNormal trans; \ trans(context, filter_grad_hwc, filter_grad, perm_axis); \ } \ } \ diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc deleted file mode 100644 index 2672d02db00..00000000000 --- a/paddle/fluid/operators/math/math_function.cc +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
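For permutations whose rank is only known at runtime, the depthwise-conv hunks above use TransposeNormal instead; its CPU implementation, part of the math_function.cc deleted below, walks the output linearly and maps each index back through the strides of the permuted axes. A sketch of the renamed call, template argument assumed:

    // permute the filter dims by (2, 3, 0, 1) into filter_hwc for the
    // channel-last path, as in the hunks above
    std::vector<int> perm_axis({2, 3, 0, 1});
    pten::funcs::TransposeNormal<platform::CUDADeviceContext, T> trans;
    trans(context, filter, &filter_hwc, perm_axis);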
*/ - -#include "paddle/fluid/operators/math/math_function.h" - -#ifdef PADDLE_WITH_MKLML -#include "paddle/fluid/platform/dynload/mklml.h" -#endif - -#ifdef PADDLE_USE_OPENBLAS -#include -#endif - -#include -#include -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/math/math_function_impl.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/pten/backends/cpu/cpu_context.h" -#include "paddle/pten/kernels/funcs/eigen/common.h" -#include "unsupported/Eigen/CXX11/Tensor" - -namespace paddle { -namespace operators { -namespace math { - -using float16 = paddle::platform::float16; - -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; - -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; - -#ifdef PADDLE_WITH_XPU -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; -#endif - -#define DEFINE_CPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose, RANK>; \ - template struct Transpose, RANK>; - -DEFINE_CPU_TRANS(1); -DEFINE_CPU_TRANS(2); -DEFINE_CPU_TRANS(3); -DEFINE_CPU_TRANS(4); -DEFINE_CPU_TRANS(5); -DEFINE_CPU_TRANS(6); - -template -struct TransposeNormal { - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& in, framework::Tensor* out, - const std::vector& axis) { - const int rank = axis.size(); - auto in_stride = framework::stride(in.dims()); - auto out_stride = framework::stride(out->dims()); - const T* in_ptr = in.data(); - T* out_ptr = out->data(); - - auto transpose_helper = [&](int64_t beg, int64_t end) { - for (int64_t out_idx = beg; out_idx < end; ++out_idx) { - int64_t in_idx = 0; - int64_t tmp_idx = out_idx; - // calculate the input index - for (int i = 0; i < rank; ++i) { - const int64_t coordinate = tmp_idx / out_stride[i]; - tmp_idx -= coordinate * out_stride[i]; - in_idx += coordinate * in_stride[axis[i]]; - } - out_ptr[out_idx] = in_ptr[in_idx]; - } - }; - transpose_helper(0, out->numel()); - } -}; - -// define transpose normal -#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ - template struct TransposeNormal - -DEFINE_CPU_TRANS_NORMAL(platform::float16); -DEFINE_CPU_TRANS_NORMAL(platform::bfloat16); -DEFINE_CPU_TRANS_NORMAL(float); -DEFINE_CPU_TRANS_NORMAL(double); -DEFINE_CPU_TRANS_NORMAL(int); -DEFINE_CPU_TRANS_NORMAL(int64_t); -DEFINE_CPU_TRANS_NORMAL(bool); -DEFINE_CPU_TRANS_NORMAL(int16_t); -DEFINE_CPU_TRANS_NORMAL(uint8_t); 
-DEFINE_CPU_TRANS_NORMAL(int8_t); -DEFINE_CPU_TRANS_NORMAL(platform::complex); -DEFINE_CPU_TRANS_NORMAL(platform::complex); - -struct TensorSetConstantCPU { - TensorSetConstantCPU(framework::Tensor* tensor, float value) - : tensor_(tensor), value_(value) {} - template - void apply() const { - auto cpu = platform::CPUPlace(); - auto* begin = tensor_->mutable_data(cpu); - std::fill(begin, begin + tensor_->numel(), static_cast(value_)); - } - framework::Tensor* tensor_; - float value_; -}; - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, framework::Tensor* tensor, - float value) { - PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, framework::Tensor* tensor, - float value) { - PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, framework::Tensor* tensor, - float value) { - PADDLE_THROW( - platform::errors::Unimplemented("NPUPinnedPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, framework::Tensor* tensor, - float value) { - PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, framework::Tensor* tensor, - float value) { - framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, framework::Tensor* tensor, - float value) { - PADDLE_THROW(platform::errors::Unimplemented("MLUPlace is not supported")); -} - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, framework::Tensor* tensor, - float value) { - framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value)); -} - -struct TensorSetConstantWithPlace : public boost::static_visitor { - TensorSetConstantWithPlace(const platform::DeviceContext& context, - framework::Tensor* tensor, float value) - : context_(context), tensor_(tensor), value_(value) {} - - template - void operator()(Place place) const { - set_constant_with_place(context_, tensor_, value_); - } - - const platform::DeviceContext& context_; - framework::Tensor* tensor_; - float value_; -}; - -void set_constant(const platform::DeviceContext& context, - framework::Tensor* tensor, float value) { - TensorSetConstantWithPlace func(context, tensor, value); -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - // tensor->place().apply_visitor(func); - paddle::platform::VisitPlace(tensor->place(), func); -#else - func(platform::CPUPlace()); -#endif -} - -template -struct RowwiseAdd { - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& vector, framework::Tensor* output) { - auto in_dims = input.dims(); - auto out_dims = output->dims(); - auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ( - vector.numel(), size, - platform::errors::InvalidArgument( - "The input vector size" - " should be equal to the size of each row of input tensor." 
- " Expected vector size=%d, but received %d", - size, vector.numel())); - const char* in_dims_cstr = in_dims.to_str().c_str(); - const char* out_dims_cstr = out_dims.to_str().c_str(); - PADDLE_ENFORCE_EQ(out_dims, in_dims, - platform::errors::InvalidArgument( - "The output tensor shape should be same as the input" - " tensor shape. Expected output tensor shape: %s," - " but received %s", - in_dims_cstr, out_dims_cstr)); - - auto in = framework::EigenMatrix::From(input); - auto vec = framework::EigenVector::Flatten(vector); - auto out = framework::EigenMatrix::From(*output); - - for (int64_t i = 0; i < in_dims[0]; ++i) { - out.chip(i, 0) = in.chip(i, 0) + vec; - } - } -}; - -template struct RowwiseAdd; -template struct RowwiseAdd; - -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; - -template struct RowwiseSum; -template struct RowwiseSum; - -template struct RowwiseMean; -template struct RowwiseMean; - -template -struct ElementwiseAddTo { - void operator()(platform::CPUDeviceContext* ctx, const framework::Tensor& src, - framework::Tensor* dst) { - auto in = framework::EigenVector::Flatten(src); - auto out = framework::EigenVector::Flatten(*dst); - auto& place = *(ctx->eigen_device()); - out.device(place) = out + in; - } -}; - -template struct ElementwiseAddTo; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu deleted file mode 100644 index f0ef692b99f..00000000000 --- a/paddle/fluid/operators/math/math_function.cu +++ /dev/null @@ -1,322 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#include -#include -#include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" -#include "paddle/fluid/operators/math/math_function_impl.h" -#include "paddle/fluid/platform/bfloat16.h" -#include "paddle/fluid/platform/float16.h" -#include "paddle/pten/backends/gpu/gpu_context.h" -#include "paddle/pten/kernels/funcs/eigen/common.h" - -namespace paddle { -namespace operators { -namespace math { - -using float16 = paddle::platform::float16; -using bfloat16 = paddle::platform::bfloat16; - -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; - -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; - -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant; -template struct SetConstant>; -template struct SetConstant>; - -#define DEFINE_GPU_TRANS(RANK) \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose; \ - template struct Transpose, RANK>; \ - template struct Transpose, RANK>; - -DEFINE_GPU_TRANS(1); -DEFINE_GPU_TRANS(2); -DEFINE_GPU_TRANS(3); -DEFINE_GPU_TRANS(4); -DEFINE_GPU_TRANS(5); -DEFINE_GPU_TRANS(6); - -#define REINTERPRET(T, DST_PTR, SRC_PTR) \ - T* DST_PTR = reinterpret_cast(SRC_PTR) - -template -__global__ void TransposeNormalKernel(const T* in_ptr, T* out_ptr, - int64_t element, - const int64_t* in_stride_ptr, - const int64_t* out_stride_ptr, - const int64_t* axis_ptr, int rank) { - CUDA_KERNEL_LOOP(out_idx, element) { - int64_t in_idx = 0; - int64_t tmp_idx = out_idx; - for (int i = 0; i < rank; ++i) { - const int64_t coordinate = tmp_idx / out_stride_ptr[i]; - tmp_idx -= coordinate * out_stride_ptr[i]; - in_idx += coordinate * in_stride_ptr[axis_ptr[i]]; - } - out_ptr[out_idx] = in_ptr[in_idx]; - } -} - -template -struct TransposeNormal { - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& in, framework::Tensor* out, - const std::vector& axis) { - const int rank = axis.size(); - auto in_stride = framework::stride(in.dims()); - auto out_stride = framework::stride(out->dims()); - auto* in_ptr = in.data(); - auto* out_ptr = out->data(); - - // copy in_stride, out_stride, axis to gpu device - const platform::CUDAPlace& cuda_place = context.GetPlace(); - platform::CPUPlace cpu_place = platform::CPUPlace(); - size_t size = 3 * rank * sizeof(int64_t); - auto cpu_buf_holder = memory::Alloc(cpu_place, size); - auto cuda_buf_holder = memory::Alloc(cuda_place, size); - REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr()); - 
REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr()); - for (int i = 0; i < rank; ++i) { - cpu_buf[i] = in_stride[i]; - cpu_buf[rank + i] = out_stride[i]; - cpu_buf[2 * rank + i] = axis[i]; - } - memory::Copy(cuda_place, cuda_buf, cpu_place, cpu_buf, size, - context.stream()); - REINTERPRET(const int64_t, in_stride_ptr, cuda_buf); - REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank); - REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank); - - const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock(); - const int MAX_GRID_DIM = - context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; - int64_t elements = in.numel(); - int block_size = (elements >= MAX_BLOCK_DIM) - ? MAX_BLOCK_DIM - : (1 << static_cast(std::log2(elements))); - int grid_size = elements / block_size; - grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size; - TransposeNormalKernel<<>>( - in_ptr, out_ptr, elements, in_stride_ptr, out_stride_ptr, axis_ptr, - rank); - } -}; - -// define transpose normal -#define DEFINE_GPU_TRANS_NORMAL(TYPE) \ - template struct TransposeNormal - -DEFINE_GPU_TRANS_NORMAL(float16); -DEFINE_GPU_TRANS_NORMAL(bfloat16); -DEFINE_GPU_TRANS_NORMAL(float); -DEFINE_GPU_TRANS_NORMAL(double); -DEFINE_GPU_TRANS_NORMAL(int); -DEFINE_GPU_TRANS_NORMAL(int64_t); -DEFINE_GPU_TRANS_NORMAL(bool); -DEFINE_GPU_TRANS_NORMAL(int16_t); -DEFINE_GPU_TRANS_NORMAL(uint8_t); -DEFINE_GPU_TRANS_NORMAL(int8_t); -DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex); -DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex); - -struct TensorSetConstantGPU { - TensorSetConstantGPU(const platform::DeviceContext& context, - framework::Tensor* tensor, float value) - : context_(context), tensor_(tensor), value_(value) {} - - template - void apply() const { - SetConstant functor; - functor(reinterpret_cast(context_), - tensor_, static_cast(value_)); - } - - const platform::DeviceContext& context_; - framework::Tensor* tensor_; - float value_; -}; - -template <> -void set_constant_with_place( - const platform::DeviceContext& context, framework::Tensor* tensor, - float value) { - framework::VisitDataType(tensor->type(), - TensorSetConstantGPU(context, tensor, value)); -} - -template -__global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int width, - int num) { - T tmp = 1.0 / width; - CUDA_KERNEL_LOOP(i, num) { - int h = i * tmp; - int w = i - h * width; - c[i] = a[i] + b[w]; - } -} - -template -struct RowwiseAdd { - void operator()(const platform::CUDADeviceContext& context, - const framework::Tensor& input, - const framework::Tensor& vector, framework::Tensor* output) { - auto in_dims = input.dims(); - auto out_dims = output->dims(); - auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ( - vector.numel(), size, - platform::errors::InvalidArgument( - "The input vector size" - " should be equal to the size of each row of input tensor." - " Expected vector size=%d, but received %d", - size, vector.numel())); - const char* in_dims_cstr = in_dims.to_str().c_str(); - const char* out_dims_cstr = out_dims.to_str().c_str(); - PADDLE_ENFORCE_EQ( - out_dims, in_dims, - platform::errors::InvalidArgument( - "The output tensor shape should be same as the input tensor" - " shape. 
Expected output tensor shape: %s," - " but received %s", - in_dims_cstr, out_dims_cstr)); - int blocks = 512; - int grids = (input.numel() + blocks - 1) / blocks; - RowwiseAddKernel<<>>( - input.data(), vector.data(), output->data(), - static_cast(in_dims[1]), static_cast(input.numel())); - } -}; - -template struct RowwiseAdd; -template struct RowwiseAdd; -template struct ColwiseSum; -template struct ColwiseSum; -template struct ColwiseSum; -// template struct ColwiseSum; -// The ColwiseSum failed in debug mode, -// and only failed for this case. So reimplemented it. -template <> -void ColwiseSum::operator()( - const platform::CUDADeviceContext& context, const framework::Tensor& input, - framework::Tensor* vector) { - auto in_dims = input.dims(); - auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector->numel(), size, - platform::errors::InvalidArgument( - "The size of input vector" - " should be equal to the size of input tensor column" - " dimension. Expected vector size=%d, but received %d", - size, vector->numel())); - framework::Tensor one; - one.mutable_data({in_dims[0]}, context.GetPlace()); - SetConstant set; - set(context, &one, static_cast(1.0)); - GetBlas(context).GEMV( - true, static_cast(in_dims[0]), static_cast(in_dims[1]), 1.0, - input.data(), one.data(), 0.0, vector->data()); -} - -template struct RowwiseSum; -// template struct RowwiseSum; -// TODO(zcd): Following ColwiseSum format, need to confirm. -// The RowwiseSum failed in debug mode, -// and only failed for this case. So reimplemented it. -template <> -void RowwiseSum::operator()( - const platform::CUDADeviceContext& context, const framework::Tensor& input, - framework::Tensor* vector) { - auto in_dims = input.dims(); - auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0], - platform::errors::InvalidArgument( - "The size of input vector" - " should be equal to the size of input tensor row" - " dimension. Expected vector size=%d, but received %d", - in_dims[0], vector->numel())); - framework::Tensor one; - one.mutable_data({size}, context.GetPlace()); - SetConstant set; - set(context, &one, static_cast(1.0)); - GetBlas(context).GEMV( - true, static_cast(in_dims[1]), static_cast(in_dims[0]), 1.0, - one.data(), input.data(), 0.0, vector->data()); -} - -template struct RowwiseMean; -template struct RowwiseMean; - -template -struct ElementwiseAddTo { - void operator()(platform::CUDADeviceContext* ctx, - const framework::Tensor& src, framework::Tensor* dst) { - auto in = framework::EigenVector::Flatten(src); - auto out = framework::EigenVector::Flatten(*dst); - auto& place = *(ctx->eigen_device()); - out.device(place) = out + in; - } -}; - -template struct ElementwiseAddTo; - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h deleted file mode 100644 index 9dbbf455f18..00000000000 --- a/paddle/fluid/operators/math/math_function.h +++ /dev/null @@ -1,112 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
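[Annotation] Two reductions in the CUDA file above are deliberately not Eigen-based: the comments record that the double-precision ColwiseSum and RowwiseSum specializations failed in debug mode and were rewritten as a BLAS GEMV against a vector of ones. The identity being used: for A of shape (m, n), colsum(A) = Aᵀ·1ₘ and rowsum(A) = A·1ₙ. A sketch of the column-sum call in the shape the deleted code uses it (the ones tensor is filled with SetConstant first; the Blas wrapper comes from math::GetBlas):

    // y = A^T * ones  ==>  y[j] = sum_i A[i][j], i.e. the j-th column sum.
    // GEMV(trans_a, m, n, alpha, A, x, beta, y)
    blas.GEMV(/*trans_a=*/true, /*m=*/in_dims[0], /*n=*/in_dims[1],
              /*alpha=*/1.0, input.data<double>(), one.data<double>(),
              /*beta=*/0.0, vector->data<double>());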
-See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once -#include -#include -#include - -#include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/platform/device_context.h" -#include "paddle/fluid/platform/enforce.h" -#include "paddle/pten/core/dense_tensor.h" - -namespace paddle { -namespace operators { -namespace math { - -template -struct TransposeNormal { - // for dims >= 7 situation - void operator()(const DeviceContext& context, const framework::Tensor& in, - framework::Tensor* out, const std::vector& axis); -}; - -template -struct Transpose { - void operator()(const DeviceContext& context, const framework::Tensor& in, - framework::Tensor* out, const std::vector& axis); -}; - -template -struct SetConstant { - void operator()(const DeviceContext& context, framework::Tensor* tensor, - T num); -}; - -template -void set_constant_with_place(const platform::DeviceContext& context, - framework::Tensor* tensor, float value); - -void set_constant(const platform::DeviceContext& context, - framework::Tensor* tensor, float value); - -template -struct RowwiseAdd { - void operator()(const DeviceContext& context, const framework::Tensor& input, - const framework::Tensor& vec, framework::Tensor* output); -}; - -template -struct ElementwiseAddTo { - // dst = dst + src - void operator()(DeviceContext* ctx, const framework::Tensor& src, - framework::Tensor* dst); -}; - -template -struct ColwiseSum { - void operator()(const DeviceContext& context, const framework::Tensor& input, - framework::Tensor* vec); -}; - -template -struct RowwiseSum { - void operator()(const DeviceContext& context, const framework::Tensor& input, - framework::Tensor* vec); -}; - -template -struct RowwiseMean { - void operator()(const DeviceContext& context, const framework::Tensor& input, - framework::Tensor* vec); -}; - -#ifdef PADDLE_WITH_XPU -template -struct TensorSetConstantXPU { - TensorSetConstantXPU(framework::Tensor* tensor, U value, - platform::Place place) - : tensor_(tensor), value_(value), place_(place) {} - template - void apply() const { - auto* begin = tensor_->mutable_data(place_); - int numel = tensor_->numel(); - std::unique_ptr data_cpu(new T[numel]); - std::fill(data_cpu.get(), data_cpu.get() + numel, static_cast(value_)); - memory::Copy(place_, begin, platform::CPUPlace(), - static_cast(data_cpu.get()), numel * sizeof(T)); - } - framework::Tensor* tensor_; - U value_; - platform::Place place_; -}; -#endif - -} // namespace math -} // namespace operators -} // namespace paddle diff --git a/paddle/fluid/operators/math/matrix_solve.cu.cc b/paddle/fluid/operators/math/matrix_solve.cu.cc index 8aaac0295c8..ee6610eae14 100644 --- a/paddle/fluid/operators/math/matrix_solve.cu.cc +++ b/paddle/fluid/operators/math/matrix_solve.cu.cc @@ -15,9 +15,9 @@ limitations under the License. 
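[Annotation] The deleted header is the complete inventory of what this patch relocates: TransposeNormal, Transpose, SetConstant, set_constant/set_constant_with_place, RowwiseAdd, ElementwiseAddTo, ColwiseSum, RowwiseSum, RowwiseMean, and the XPU helper TensorSetConstantXPU. For callers, every remaining hunk is the same mechanical edit — an include swap plus a namespace swap (sketch; template arguments reconstructed, as this flattened listing elides them):

    // Before, via paddle/fluid/operators/math/math_function.h:
    paddle::operators::math::SetConstant<platform::CPUDeviceContext, float> set_zero;
    // After, via paddle/pten/kernels/funcs/math_function.h:
    pten::funcs::SetConstant<platform::CPUDeviceContext, float> set_zero;
    set_zero(dev_ctx, &tensor, 0.0f);  // the call sites themselves are untouched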
*/ #include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/solve_op.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace platform { @@ -76,7 +76,7 @@ class MatrixSolveFunctor { const auto& new_dims_vec = getNewDimsVec(b_dims); tmp_b.Resize(framework::make_ddim(new_dims_vec)); tmp_b.mutable_data(context.GetPlace()); - math::TransposeNormal trans; + pten::funcs::TransposeNormal trans; std::vector new_axis = getNewAxis(b_rank); trans(context, b, &tmp_b, new_axis); @@ -149,7 +149,7 @@ class MatrixSolveFunctor { -host_info)); // transpose tmp_b to get the final result in row-major form. - math::TransposeNormal trans2; + pten::funcs::TransposeNormal trans2; trans2(context, tmp_b, out, new_axis); #else diff --git a/paddle/fluid/operators/math/prelu.h b/paddle/fluid/operators/math/prelu.h index 70aae2ba59e..24c8721656b 100644 --- a/paddle/fluid/operators/math/prelu.h +++ b/paddle/fluid/operators/math/prelu.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index f596c1bc3dc..edc61bc667f 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -22,9 +22,9 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sample_prob.h" #include "paddle/fluid/operators/math/sampler.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu index 0cbfaa4c5df..eaed2dc7d7e 100644 --- a/paddle/fluid/operators/math/segment_pooling.cu +++ b/paddle/fluid/operators/math/segment_pooling.cu @@ -14,10 +14,10 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/gather.cu.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/segment_pooling.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 8cd3e1367d8..b921e844c9f 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -129,7 +129,7 @@ struct SelectedRowsAddTensor { "But recieved input width = [%d], output width = [%d]", in1_row_numel, output->numel() / in1_height)); - SetConstant functor; + pten::funcs::SetConstant functor; functor(context, output, 0.0); auto* in1_data = in1_value.data(); @@ -461,7 +461,7 @@ struct MergeAdd { out.set_rows(merge_rows); - math::SetConstant constant_functor; + pten::funcs::SetConstant constant_functor; constant_functor(context, out.mutable_value(), static_cast(0.f)); std::unordered_map rows_to_id; @@ -689,7 +689,7 @@ struct MergeAverage { out.set_rows(merge_rows); - math::SetConstant constant_functor; + pten::funcs::SetConstant constant_functor; constant_functor(context, out.mutable_value(), 0.0); std::unordered_map rows_to_id; diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 2ae2aaebb6c..d2caf82c93a 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -15,10 +15,10 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -156,7 +156,7 @@ struct SelectedRowsAddTensor { auto* in2_data = input2.data(); auto* out_data = output->data(); - SetConstant functor; + pten::funcs::SetConstant functor; functor(context, output, static_cast(0)); const int block_size = 256; @@ -348,7 +348,7 @@ struct MergeAdd { {static_cast(merge_rows.size()), input_width}), context.GetPlace()); - math::SetConstant constant_functor; + pten::funcs::SetConstant constant_functor; constant_functor(context, out.mutable_value(), static_cast(0)); auto* out_data = out.mutable_value()->data(); @@ -411,7 +411,7 @@ struct MergeAdd { {static_cast(merge_rows.size()), input_width}), context.GetPlace()); - math::SetConstant constant_functor; + pten::funcs::SetConstant constant_functor; constant_functor(context, out.mutable_value(), static_cast(0)); auto* out_data = out.mutable_value()->data(); diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index 690082036c5..e0ac583f15b 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -19,8 +19,8 @@ limitations under the License. 
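[Annotation] In selected_rows_functor.cc every touched site is the zero-fill that precedes a scatter-accumulate. What MergeAdd computes after that fill, reduced to plain containers as a sketch (std::map stands in for the rows_to_id table and SelectedRows storage of the original):

    #include <cstdint>
    #include <map>
    #include <vector>

    // Rows that occur multiple times in a SelectedRows input are summed into
    // a single output row; untouched entries stay at the SetConstant zero.
    std::map<int64_t, std::vector<float>> MergeAdd(
        const std::vector<int64_t>& rows,
        const std::vector<std::vector<float>>& values) {
      std::map<int64_t, std::vector<float>> merged;
      for (size_t i = 0; i < rows.size(); ++i) {
        auto& dst = merged[rows[i]];
        if (dst.empty()) dst.assign(values[i].size(), 0.f);  // the zero-fill step
        for (size_t j = 0; j < values[i].size(); ++j) dst[j] += values[i][j];
      }
      return merged;
    }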
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/pten/kernels/funcs/math_function.h" #define INLINE_FOR2(sizei, sizej) \ for (int64_t i = 0; i < sizei; i++) \ diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index 19e70f924f1..9cb815e1611 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -15,14 +15,12 @@ limitations under the License. */ #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "gtest/gtest.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" TEST(selected_rows_functor, cpu_add) { paddle::platform::CPUPlace cpu_place; paddle::platform::CPUDeviceContext ctx(cpu_place); - paddle::operators::math::SetConstant - functor; + pten::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -122,9 +120,7 @@ TEST(selected_rows_functor, cpu_add) { TEST(selected_rows_functor, cpu_add_to) { paddle::platform::CPUPlace cpu_place; paddle::platform::CPUDeviceContext ctx(cpu_place); - paddle::operators::math::SetConstant - functor; + pten::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -221,9 +217,7 @@ TEST(selected_rows_functor, cpu_add_to) { TEST(selected_rows_functor, cpu_merge_average_float) { paddle::platform::CPUPlace cpu_place; paddle::platform::CPUDeviceContext ctx(cpu_place); - paddle::operators::math::SetConstant - functor; + pten::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -260,9 +254,7 @@ TEST(selected_rows_functor, cpu_merge_average_float) { TEST(selected_rows_functor, cpu_merge_add_float) { paddle::platform::CPUPlace cpu_place; paddle::platform::CPUDeviceContext ctx(cpu_place); - paddle::operators::math::SetConstant - functor; + pten::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -301,8 +293,7 @@ TEST(selected_rows_functor, cpu_merge_add_float) { TEST(selected_rows_functor, cpu_merge_add_int) { paddle::platform::CPUPlace cpu_place; paddle::platform::CPUDeviceContext ctx(cpu_place); - paddle::operators::math::SetConstant - functor; + pten::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -341,9 +332,7 @@ TEST(selected_rows_functor, cpu_merge_add_int) { TEST(selected_rows_functor, cpu_merge_add_multi) { paddle::platform::CPUPlace cpu_place; paddle::platform::CPUDeviceContext ctx(cpu_place); - paddle::operators::math::SetConstant - set_const; + pten::funcs::SetConstant set_const; int64_t height = 10; int64_t row_numel = 8; @@ -397,9 +386,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { paddle::platform::CPUPlace cpu_place; paddle::platform::CPUDeviceContext ctx(cpu_place); - paddle::operators::math::SetConstant - set_const; + pten::funcs::SetConstant set_const; int64_t height = 10; int64_t row_numel = 8; @@ -459,9 +446,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { TEST(selected_rows_functor, cpu_sum_to) { paddle::platform::CPUPlace cpu_place; paddle::platform::CPUDeviceContext ctx(cpu_place); - paddle::operators::math::SetConstant - functor; + pten::funcs::SetConstant functor; int64_t height = 
10; int64_t row_numel = 10; std::vector rows1{0, 4, 7}; diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index e826c2a7244..1bae95e1584 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "gtest/gtest.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" TEST(selected_rows_functor, gpu_add) { paddle::platform::CUDAPlace gpu_place(0); @@ -22,9 +22,7 @@ TEST(selected_rows_functor, gpu_add) { paddle::platform::CUDADeviceContext& ctx = *reinterpret_cast( paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); - paddle::operators::math::SetConstant - functor; + pten::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -144,9 +142,7 @@ TEST(selected_rows_functor, gpu_add_to) { paddle::platform::CUDADeviceContext& ctx = *reinterpret_cast( paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); - paddle::operators::math::SetConstant - functor; + pten::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -254,8 +250,7 @@ TEST(selected_rows_functor, gpu_merge_add) { paddle::platform::CUDADeviceContext& ctx = *reinterpret_cast( paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); - paddle::operators::math::SetConstant + pten::funcs::SetConstant set_const; int64_t height = 10; diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 2eee4d0a6c1..22cd4352973 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_pooling.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -191,7 +191,7 @@ class MaxSeqPoolGradFunctor { const int* max_index = index.data(); T* ig_data = in_grad->data(); - SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(context, in_grad, static_cast(0.0)); int64_t num_seq = og_dims[0]; int64_t dim = out_grad.numel() / num_seq; @@ -409,7 +409,7 @@ class SequencePoolGradFunctor { if (pooltype == "LAST" || pooltype == "FIRST") { // set X@Grad be zero at first when pooltype is LAST/FIRST - math::SetConstant functor; + pten::funcs::SetConstant functor; functor(context, in_grad, 0); } diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index b3e1922e106..3bf3b483e89 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -14,10 +14,10 @@ limitations under the License. 
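[Annotation] The sequence_pooling hunks guard two backward paths with the relocated SetConstant. For MAX pooling the zero-fill matters because the backward pass writes only to the argmax positions recorded by the forward index tensor; every other slot must already be zero. A simplified CPU sketch — the raw pointers and row-major indexing are my assumption, as the original iterates LoD sequences:

    // in_grad: (in_rows x dim), pre-zeroed; max_index[i*dim+j] is the input
    // row that produced output element (i, j).
    void MaxSeqPoolGrad(const float* out_grad, const int* max_index,
                        int64_t num_seq, int64_t dim, float* in_grad) {
      for (int64_t i = 0; i < num_seq; ++i)
        for (int64_t j = 0; j < dim; ++j)
          in_grad[max_index[i * dim + j] * dim + j] = out_grad[i * dim + j];
    }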
*/ #include #include -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_pooling.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index bc32e068f56..632fc1d4b29 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/operators/math/softmax_impl.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/sparse_impl.cu.h b/paddle/fluid/operators/math/sparse_impl.cu.h index 8ff2f4b27df..728cf0fcd0b 100644 --- a/paddle/fluid/operators/math/sparse_impl.cu.h +++ b/paddle/fluid/operators/math/sparse_impl.cu.h @@ -14,8 +14,8 @@ #pragma once -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/dynload/cusparse.h" +#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc index af5df27207a..85d71b369a1 100644 --- a/paddle/fluid/operators/math/tree2col.cc +++ b/paddle/fluid/operators/math/tree2col.cc @@ -91,7 +91,7 @@ class Tree2ColFunctor { std::vector> tr; auto feature_dims = node_features.dims(); auto cpu_place = context.GetPlace(); - math::SetConstant constant; + pten::funcs::SetConstant constant; int64_t feature_size = feature_dims[1]; size_t patch_elem_size = 3 * static_cast(feature_size); size_t node_count = 0, patch_count = 0, patch_size; @@ -144,7 +144,7 @@ class Col2TreeFunctor { std::vector> tr; auto output_dims = out_grad.dims(); auto cpu_place = context.GetPlace(); - math::SetConstant constant; + pten::funcs::SetConstant constant; int64_t output_size = output_dims[1]; size_t grad_elem_size = 3 * static_cast(output_size); size_t node_count = 0, grad_count = 0; diff --git a/paddle/fluid/operators/math/tree2col.cu b/paddle/fluid/operators/math/tree2col.cu index 4f3ab319165..4fcd1a1cf6b 100644 --- a/paddle/fluid/operators/math/tree2col.cu +++ b/paddle/fluid/operators/math/tree2col.cu @@ -13,8 +13,8 @@ // limitations under the License. 
#include -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/tree2col.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -56,7 +56,7 @@ class Tree2ColFunctor { auto cpu_place = platform::CPUPlace(); auto stream = context.stream(); auto feature_dims = node_features.dims(); - math::SetConstant constant; + pten::funcs::SetConstant constant; Tensor EdgeSet_cpu; framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu); @@ -128,7 +128,7 @@ class Col2TreeFunctor { auto cpu_place = platform::CPUPlace(); auto stream = context.stream(); auto output_dims = patch_grad.dims(); - math::SetConstant constant; + pten::funcs::SetConstant constant; Tensor EdgeSet_cpu; framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu); diff --git a/paddle/fluid/operators/math/tree2col.h b/paddle/fluid/operators/math/tree2col.h index 632777c9cd9..5cf7a93f4d4 100644 --- a/paddle/fluid/operators/math/tree2col.h +++ b/paddle/fluid/operators/math/tree2col.h @@ -18,7 +18,7 @@ #include #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 717c1b5c0ed..6b24f477844 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -128,7 +128,7 @@ static framework::Tensor FoldHeadAndLastDims(const DeviceContext &context, output.Resize({in_dims[1], in_dims[0], in_dims[2]}); output.mutable_data(context.GetPlace()); std::vector axis = {1, 0, 2}; - math::Transpose trans; + pten::funcs::Transpose trans; trans(context, input, &output, axis); output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); diff --git a/paddle/fluid/operators/matrix_power_op.h b/paddle/fluid/operators/matrix_power_op.h index 6c4b8860bf8..93755b22bf9 100644 --- a/paddle/fluid/operators/matrix_power_op.h +++ b/paddle/fluid/operators/matrix_power_op.h @@ -170,7 +170,7 @@ void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out, if (n == 0) { // \nabla X = O - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(dev_ctx, dX, static_cast(0)); return; } else if (n == 1) { diff --git a/paddle/fluid/operators/matrix_rank_op.cu b/paddle/fluid/operators/matrix_rank_op.cu index 7362d00afb7..d974d7c1b78 100644 --- a/paddle/fluid/operators/matrix_rank_op.cu +++ b/paddle/fluid/operators/matrix_rank_op.cu @@ -19,11 +19,11 @@ limitations under the License. */ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/complex_functors.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/matrix_rank_op.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/dynload/cusolver.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/maxout_op.h b/paddle/fluid/operators/maxout_op.h index 64b538fc5d5..d1c229342b9 100644 --- a/paddle/fluid/operators/maxout_op.h +++ b/paddle/fluid/operators/maxout_op.h @@ -15,8 +15,8 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/maxouting.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -57,7 +57,7 @@ class MaxOutGradKernel : public framework::OpKernel { } auto& device_ctx = context.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0.0)); diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu index 79aff52a16f..48b34e18b8f 100644 --- a/paddle/fluid/operators/mean_iou_op.cu +++ b/paddle/fluid/operators/mean_iou_op.cu @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/mean_iou_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc index 9da3a4c4872..555179e7cd1 100644 --- a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc +++ b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc @@ -15,9 +15,9 @@ limitations under the License. */ #include #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/mlu/device_context.h" #include "paddle/fluid/platform/place.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace fw = paddle::framework; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h index 0fb32cf4be8..6ea154c25db 100644 --- a/paddle/fluid/operators/mul_op.h +++ b/paddle/fluid/operators/mul_op.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index 241c634e3fc..562fe8a1bc8 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -25,8 +25,8 @@ limitations under the License. */ namespace cub = hipcub; #endif #include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/pten/kernels/funcs/math_function.h" #ifdef __HIPCC__ #define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim) @@ -405,7 +405,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data()); auto &dev_ctx = ctx.template device_context(); - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; auto &x_dims = X->dims(); const int C = (data_layout == DataLayout::kNCHW ? 
x_dims[1] diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu index 2b021748048..092ffe78f57 100644 --- a/paddle/fluid/operators/one_hot_op.cu +++ b/paddle/fluid/operators/one_hot_op.cu @@ -46,7 +46,7 @@ struct OneHotOpCUDAFunctor { auto numel = in_->numel(); auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); auto stream = ctx_.stream(); - math::set_constant(ctx_, out_, 0.0); + pten::funcs::set_constant(ctx_, out_, 0.0); FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, diff --git a/paddle/fluid/operators/one_hot_op.h b/paddle/fluid/operators/one_hot_op.h index e671a1e99e7..a5b3ff78e14 100644 --- a/paddle/fluid/operators/one_hot_op.h +++ b/paddle/fluid/operators/one_hot_op.h @@ -14,7 +14,7 @@ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -41,7 +41,7 @@ struct OneHotOpFunctor { auto* p_in_data = in_->data(); auto numel = in_->numel(); auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); - math::set_constant(ctx_, out_, 0.0); + pten::funcs::set_constant(ctx_, out_, 0.0); if (allow_out_of_range_) { for (int i = 0; i < numel; ++i) { diff --git a/paddle/fluid/operators/one_hot_v2_op.cu b/paddle/fluid/operators/one_hot_v2_op.cu index 115c9460846..d145455a1f1 100644 --- a/paddle/fluid/operators/one_hot_v2_op.cu +++ b/paddle/fluid/operators/one_hot_v2_op.cu @@ -47,7 +47,7 @@ struct OneHotV2OpCUDAFunctor { auto numel = in_->numel(); auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); auto stream = ctx_.stream(); - math::set_constant(ctx_, out_, 0.0); + pten::funcs::set_constant(ctx_, out_, 0.0); FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, diff --git a/paddle/fluid/operators/one_hot_v2_op.h b/paddle/fluid/operators/one_hot_v2_op.h index 221b8cf0e2a..c95909e3753 100644 --- a/paddle/fluid/operators/one_hot_v2_op.h +++ b/paddle/fluid/operators/one_hot_v2_op.h @@ -14,7 +14,7 @@ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -41,7 +41,7 @@ struct OneHotV2OpFunctor { auto* p_in_data = in_->data(); auto numel = in_->numel(); auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); - math::set_constant(ctx_, out_, 0.0); + pten::funcs::set_constant(ctx_, out_, 0.0); if (allow_out_of_range_) { for (int i = 0; i < numel; ++i) { diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cc b/paddle/fluid/operators/optimizers/adagrad_op.cc index 31d3e1208da..d865f7cff22 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/adagrad_op.cc @@ -17,8 +17,8 @@ limitations under the License. */ #include -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cu b/paddle/fluid/operators/optimizers/adagrad_op.cu index a7c32255bd1..5c970ceffb0 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/adagrad_op.cu @@ -11,10 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/optimizers/adagrad_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/overlap_add_op.h b/paddle/fluid/operators/overlap_add_op.h index 865659ee942..b69f99bc985 100644 --- a/paddle/fluid/operators/overlap_add_op.h +++ b/paddle/fluid/operators/overlap_add_op.h @@ -18,11 +18,11 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/seq2col.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index 7c8dfc7f647..ef885e3ae7a 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -180,7 +180,7 @@ class PnormGradCUDAKernel : public framework::OpKernel { auto& cuda_ctx = ctx.template device_context(); if (porder == 0) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(cuda_ctx, out_dx, static_cast(0)); } else if (porder == INFINITY || porder == -INFINITY) { AbsMaxAndMinGradFunctor functor; diff --git a/paddle/fluid/operators/p_norm_op.h b/paddle/fluid/operators/p_norm_op.h index 8fca6924a25..17d1240636f 100644 --- a/paddle/fluid/operators/p_norm_op.h +++ b/paddle/fluid/operators/p_norm_op.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -119,7 +119,7 @@ class PnormGradKernel : public framework::OpKernel { Eigen::DSizes bcast(1, n, 1); if (porder == 0) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); set_zero(dev_ctx, out_dx, static_cast(0)); } else if (porder == INFINITY || porder == -INFINITY) { diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index e50af02dcc4..3663cb95409 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -17,7 +17,7 @@ limitations under the License. 
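[Annotation] The one_hot/one_hot_v2 hunks above touch only the zero-fill, but it is load-bearing: the op is set_constant followed by exactly one scattered write per input element, so the fill must happen first. The CPU shape of it, simplified (the allow_out_of_range branch just skips invalid indices):

    // out: (numel x depth), pre-zeroed with pten::funcs::set_constant.
    void OneHot(const int64_t* in, int64_t numel, int depth, float* out) {
      for (int64_t i = 0; i < numel; ++i)
        if (in[i] >= 0 && in[i] < depth)
          out[i * depth + in[i]] = 1.f;
    }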
*/ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -415,7 +415,7 @@ class Pad2dGradCPUKernel : public framework::OpKernel { auto d_out_dims = d_out->dims(); const T* d_out_data = d_out->data(); T* d_in_data = d_in->mutable_data(context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(context.template device_context(), d_in, static_cast(0)); const int pad_top = pads[0]; diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu index a854fa6091a..0c9e6ed2b72 100644 --- a/paddle/fluid/operators/pad2d_op.cu +++ b/paddle/fluid/operators/pad2d_op.cu @@ -14,9 +14,9 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -395,7 +395,7 @@ class Pad2dGradCUDAKernel : public framework::OpKernel { const T* d_out_data = d_out->data(); T* d_in_data = d_in->mutable_data(context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(context.template device_context(), d_in, static_cast(0)); diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc index e84b5a9d9ba..e29718af894 100644 --- a/paddle/fluid/operators/pad3d_op.cc +++ b/paddle/fluid/operators/pad3d_op.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -621,7 +621,7 @@ class Pad3dGradCPUKernel : public framework::OpKernel { auto d_out_dims = d_out->dims(); const T* d_out_data = d_out->data(); T* d_in_data = d_in->mutable_data(context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(context.template device_context(), d_in, static_cast(0)); const int pad_left = pads[0]; diff --git a/paddle/fluid/operators/pad3d_op.cu b/paddle/fluid/operators/pad3d_op.cu index 15672512365..b7cf1be99fe 100644 --- a/paddle/fluid/operators/pad3d_op.cu +++ b/paddle/fluid/operators/pad3d_op.cu @@ -14,9 +14,9 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -697,7 +697,7 @@ class Pad3dGradCUDAKernel : public framework::OpKernel { const T* d_out_data = d_out->data(); T* d_in_data = d_in->mutable_data(context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(context.template device_context(), d_in, static_cast(0)); diff --git a/paddle/fluid/operators/pixel_shuffle_op.h b/paddle/fluid/operators/pixel_shuffle_op.h index b2a0db0f838..4ae138ac7af 100644 --- a/paddle/fluid/operators/pixel_shuffle_op.h +++ b/paddle/fluid/operators/pixel_shuffle_op.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -52,7 +52,7 @@ class PixelShuffleOpKernel : public framework::OpKernel { } else { o.Resize({in_dims[0], in_dims[1], factor, in_dims[2], factor, o_dims[3]}); } - math::Transpose trans; + pten::funcs::Transpose trans; auto& dev_ctx = ctx.template device_context(); trans(dev_ctx, t, &o, axis); out->Resize(o_dims); @@ -95,7 +95,7 @@ class PixelShuffleGradOpKernel : public framework::OpKernel { o.Resize( {do_dims[0], dx_dims[1], dx_dims[2], do_dims[3], factor, factor}); } - math::Transpose trans; + pten::funcs::Transpose trans; auto& dev_ctx = ctx.template device_context(); trans(dev_ctx, t, &o, axis); dx->Resize(dx_dims); diff --git a/paddle/fluid/operators/poisson_op.h b/paddle/fluid/operators/poisson_op.h index 2159637b290..d2deb215671 100644 --- a/paddle/fluid/operators/poisson_op.h +++ b/paddle/fluid/operators/poisson_op.h @@ -17,7 +17,7 @@ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -31,7 +31,7 @@ class PoissonGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* dx = ctx.Output(framework::GradVarName("X")); dx->mutable_data(ctx.GetPlace()); - math::SetConstant functor; + pten::funcs::SetConstant functor; auto& dev_ctx = ctx.template device_context(); functor(dev_ctx, dx, static_cast(0)); } diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc index bbe31740129..2b0300b87c2 100644 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -14,8 +14,8 @@ limitations under the License. 
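[Annotation] pixel_shuffle is the caller here that exercises the fixed-rank Transpose at rank 6: the op is reshape, permute, reshape. For NCHW with upscale factor r, (N, C·r·r, H, W) reshapes to (N, C, r, r, H, W), the permutation interleaves the two r dims with H and W, and the result reshapes to (N, C, H·r, W·r). The axis vector is elided by this flattened diff; the standard NCHW order is shown below as a hedged sketch:

    std::vector<int> axis = {0, 1, 4, 2, 5, 3};  // (N,C,r,r,H,W) -> (N,C,H,r,W,r)
    pten::funcs::Transpose<platform::CPUDeviceContext, T, 6> trans;
    trans(dev_ctx, t, &o, axis);  // t, o as in the kernel above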
*/ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/pool_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" #ifdef PADDLE_WITH_HIP #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/operator.h" @@ -114,7 +114,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel { transformed_input.Resize(framework::make_ddim(in_dims_vec)); transformed_input.mutable_data(ctx.GetPlace(), input->type()); - math::Transpose trans5; + pten::funcs::Transpose trans5; trans5(dev_ctx, *input, &transformed_input, axis); // output @@ -142,7 +142,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel { transformed_input.Resize(framework::make_ddim(in_dims_vec)); transformed_input.mutable_data(ctx.GetPlace(), input->type()); - math::Transpose trans; + pten::funcs::Transpose trans; trans(dev_ctx, *input, &transformed_input, axis); transformed_output.Resize(output->dims()); @@ -221,7 +221,8 @@ class PoolCUDNNOpKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); std::vector axis{0, 2, 3, 4, 1}; - math::Transpose trans5_v2; + pten::funcs::Transpose + trans5_v2; trans5_v2(dev_ctx, transformed_output, output, axis); } #ifdef PADDLE_WITH_HIP @@ -230,7 +231,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); std::vector axis{0, 2, 3, 1}; - math::Transpose trans; + pten::funcs::Transpose trans; trans(dev_ctx, transformed_output, output, axis); } #endif @@ -337,7 +338,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { transformed_input.Resize(framework::make_ddim(in_dims_vec)); transformed_input.mutable_data(ctx.GetPlace(), input->type()); - math::Transpose trans5; + pten::funcs::Transpose trans5; trans5(dev_ctx, *input, &transformed_input, axis); // output @@ -351,14 +352,16 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { transformed_output.mutable_data(ctx.GetPlace(), output->type()); - math::Transpose trans5_v2; + pten::funcs::Transpose + trans5_v2; trans5_v2(dev_ctx, *output, &transformed_output, axis); // output grad transformed_output_grad.Resize(framework::make_ddim(out_dims_vec)); transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type()); - math::Transpose trans5_v3; + pten::funcs::Transpose + trans5_v3; trans5_v3(dev_ctx, *output_grad, &transformed_output_grad, axis); // input grad @@ -381,7 +384,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { transformed_input.Resize(framework::make_ddim(in_dims_vec)); transformed_input.mutable_data(ctx.GetPlace(), input->type()); - math::Transpose trans4; + pten::funcs::Transpose trans4; trans4(dev_ctx, *input, &transformed_input, axis); // output @@ -394,14 +397,16 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { transformed_output.mutable_data(ctx.GetPlace(), output->type()); - math::Transpose trans4_v2; + pten::funcs::Transpose + trans4_v2; trans4_v2(dev_ctx, *output, &transformed_output, axis); // output grad transformed_output_grad.Resize(framework::make_ddim(out_dims_vec)); transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type()); - math::Transpose trans4_v3; + pten::funcs::Transpose + trans4_v3; trans4_v3(dev_ctx, *output_grad, &transformed_output_grad, axis); // input grad @@ -485,7 +490,8 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); std::vector axis{0, 2, 3, 4, 1}; - math::Transpose trans5_v4; + 
pten::funcs::Transpose + trans5_v4; trans5_v4(dev_ctx, transformed_input_grad, input_grad, axis); } #ifdef PADDLE_WITH_HIP @@ -494,7 +500,8 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); std::vector axis{0, 2, 3, 1}; - math::Transpose trans4_v4; + pten::funcs::Transpose + trans4_v4; trans4_v4(dev_ctx, transformed_input_grad, input_grad, axis); } #endif diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index 9e2f6cf223b..d220b13d18d 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -20,8 +20,8 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/pooling.h" +#include "paddle/pten/kernels/funcs/math_function.h" #if defined(__HIPCC__) || defined(__NVCC__) #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #endif @@ -299,7 +299,7 @@ class PoolGradKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); - paddle::operators::math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; set_constant(dev_ctx, in_x_grad, static_cast(0.0)); switch (ksize.size()) { diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h index 065d90704cf..d039598a8a0 100644 --- a/paddle/fluid/operators/pool_with_index_op.h +++ b/paddle/fluid/operators/pool_with_index_op.h @@ -17,8 +17,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/pooling.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -92,7 +92,7 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel { if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); auto& device_ctx = context.template device_context(); - math::set_constant(device_ctx, in_x_grad, 0); + pten::funcs::set_constant(device_ctx, in_x_grad, 0); switch (ksize.size()) { case 2: { diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu index 71aaf08c525..256bc0473b4 100644 --- a/paddle/fluid/operators/prroi_pool_op.cu +++ b/paddle/fluid/operators/prroi_pool_op.cu @@ -327,7 +327,7 @@ class GPUPRROIPoolGradOpKernel : public framework::OpKernel { dev_ctx.stream()); input_grad->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(ctx.cuda_device_context(), input_grad, static_cast(0)); input_roi_grad->mutable_data(ctx.GetPlace()); set_zero(ctx.cuda_device_context(), input_roi_grad, static_cast(0)); diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h index 38f8d6542ac..63f0047aa95 100644 --- a/paddle/fluid/operators/prroi_pool_op.h +++ b/paddle/fluid/operators/prroi_pool_op.h @@ -15,7 +15,7 @@ limitations under the License. 
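[Annotation] The pool_cudnn hunks are the densest cluster of Transpose call sites because the kernel shuttles tensors between channels-last and channels-first around the cuDNN call: input is permuted to NCHW/NCDHW going in, and the output (plus both gradients on the backward path) is permuted back coming out. The 5-D back-transform as the hunks use it, with template arguments inferred since the flattened hunk drops them:

    std::vector<int> axis{0, 2, 3, 4, 1};  // NCDHW -> NDHWC
    pten::funcs::Transpose<platform::CUDADeviceContext, T, 5> trans5_v2;
    trans5_v2(dev_ctx, transformed_output, output, axis);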
*/ #pragma once #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #endif @@ -500,7 +500,7 @@ class CPUPRROIPoolGradOpKernel : public framework::OpKernel { input_grad->mutable_data(ctx.GetPlace()); input_roi_grad->mutable_data(ctx.GetPlace()); // set gradient of X to be 0. before backpropagate. - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(ctx.template device_context(), input_grad, static_cast(0)); set_zero(ctx.template device_context(), input_roi_grad, diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc index 277c93fad6a..15b1aab8551 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc @@ -13,8 +13,8 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/pscore/distributed_lookup_table_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h index d715bf34a49..af423f71b0d 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h @@ -18,7 +18,7 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc index 3a1e2ea7861..b481235956d 100644 --- a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc +++ b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc @@ -13,8 +13,8 @@ limitations under the License. 
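// The pooling hunks above use both fill flavors that moved into pten::funcs:
// the typed SetConstant functor (pool_op.h, prroi_pool_op.h) and the
// dtype-dispatching set_constant free function (pool_with_index_op.h). A
// minimal sketch, assuming the post-#39300 tree and restoring the template
// arguments this rendering strips:
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/kernels/funcs/math_function.h"

// Typed: the element type T is fixed at compile time.
template <typename DeviceContext, typename T>
void ZeroOut(const DeviceContext& dev_ctx, paddle::framework::Tensor* t) {
  pten::funcs::SetConstant<DeviceContext, T> set_zero;
  set_zero(dev_ctx, t, static_cast<T>(0));
}

// Untyped: dispatches on the tensor's runtime dtype, so callers such as
// MaxPoolWithIndexGradKernel can pass a plain literal.
void ZeroOutDynamic(const paddle::platform::DeviceContext& dev_ctx,
                    paddle::framework::Tensor* t) {
  pten::funcs::set_constant(dev_ctx, t, 0.0f);
}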
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/pscore/distributed_push_sparse_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pscore/distributed_push_sparse_op.h b/paddle/fluid/operators/pscore/distributed_push_sparse_op.h index f19ba5f2e41..c07ffa4bd0e 100644 --- a/paddle/fluid/operators/pscore/distributed_push_sparse_op.h +++ b/paddle/fluid/operators/pscore/distributed_push_sparse_op.h @@ -18,7 +18,7 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pscore/fake_init_op.cc b/paddle/fluid/operators/pscore/fake_init_op.cc index b3a745fc995..d337aa8b010 100644 --- a/paddle/fluid/operators/pscore/fake_init_op.cc +++ b/paddle/fluid/operators/pscore/fake_init_op.cc @@ -11,7 +11,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu index efdcc59a5c4..9bca5d86d4a 100644 --- a/paddle/fluid/operators/psroi_pool_op.cu +++ b/paddle/fluid/operators/psroi_pool_op.cu @@ -317,7 +317,7 @@ class GPUPSROIPoolGradOpKernel : public framework::OpKernel { ctx.device_context(), &rois_batch_id_list_gpu); input_grad->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(ctx.cuda_device_context(), input_grad, static_cast(0)); int output_grad_size = output_grad->numel(); diff --git a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h index 4d7e9ce295f..ed5221648fd 100644 --- a/paddle/fluid/operators/psroi_pool_op.h +++ b/paddle/fluid/operators/psroi_pool_op.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -225,7 +225,7 @@ class CPUPSROIPoolGradOpKernel : public framework::OpKernel { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // set gradient of X to be 0. before backpropagate. - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(ctx.template device_context(), input_grad, static_cast(0)); diff --git a/paddle/fluid/operators/put_along_axis_op.cu b/paddle/fluid/operators/put_along_axis_op.cu index da36b564337..800da8a275c 100644 --- a/paddle/fluid/operators/put_along_axis_op.cu +++ b/paddle/fluid/operators/put_along_axis_op.cu @@ -17,8 +17,8 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/put_along_axis_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/put_along_axis_op.h b/paddle/fluid/operators/put_along_axis_op.h index f23ca177db9..0b4481ceacf 100644 --- a/paddle/fluid/operators/put_along_axis_op.h +++ b/paddle/fluid/operators/put_along_axis_op.h @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h index 1731aa9e072..c55619a4f76 100644 --- a/paddle/fluid/operators/qr_op.h +++ b/paddle/fluid/operators/qr_op.h @@ -142,7 +142,7 @@ class QrGradKernel : public framework::OpKernel { *ctx.Output(framework::GradVarName("X")); dA.mutable_data>(ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); - math::SetConstant()(dev_ctx, &dA, T(0)); + pten::funcs::SetConstant()(dev_ctx, &dA, T(0)); auto dito = math::DeviceIndependenceTensorOperations(ctx); diff --git a/paddle/fluid/operators/range_op.h b/paddle/fluid/operators/range_op.h index 5344147a906..aca9d50c327 100644 --- a/paddle/fluid/operators/range_op.h +++ b/paddle/fluid/operators/range_op.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc index 081cafdf67b..00486dbed8b 100644 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ b/paddle/fluid/operators/range_op_npu_test.cc @@ -25,12 +25,11 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(range); USE_OP_DEVICE_KERNEL(range, NPU); diff --git a/paddle/fluid/operators/rank_attention.cu.h b/paddle/fluid/operators/rank_attention.cu.h index 8ec138c8824..3eb4d8401ab 100644 --- a/paddle/fluid/operators/rank_attention.cu.h +++ b/paddle/fluid/operators/rank_attention.cu.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/dim.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc index 1eeeb5e1f8a..f8ed44267e9 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc @@ -27,12 +27,11 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; using Tensor = paddle::framework::Tensor; diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 667ffabbf40..4101c8b73e7 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -21,8 +21,8 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/cast_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/reduce_ops/reduce_op_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" @@ -102,7 +102,7 @@ void GetShuffledInput(const framework::ExecutionContext& context, shuffled_input->Resize(shuffled_dims); shuffled_input->mutable_data(context.GetPlace()); - math::TransposeNormal trans; + pten::funcs::TransposeNormal trans; trans(context.template device_context(), *input, shuffled_input, perm_axis); } @@ -166,7 +166,7 @@ void HandleLargeDimGrad(const framework::ExecutionContext& context, framework::TensorCopy(*dx, context.GetPlace(), &dx_tmp); dx_tmp.Resize(shuffled_dim); dx->Resize(x_dim); - math::TransposeNormal trans; + pten::funcs::TransposeNormal trans; trans(context.template device_context(), dx_tmp, dx, origin_axis); } diff --git a/paddle/fluid/operators/repeat_interleave_op.h b/paddle/fluid/operators/repeat_interleave_op.h index 1a38b0271dd..ca861696d71 100644 --- a/paddle/fluid/operators/repeat_interleave_op.h +++ b/paddle/fluid/operators/repeat_interleave_op.h @@ -16,7 +16,7 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/fluid/operators/index_select_op.h" namespace paddle { diff --git a/paddle/fluid/operators/rnn_op.cu.cc b/paddle/fluid/operators/rnn_op.cu.cc index 80a0ef10fa1..94becaa43f0 100644 --- a/paddle/fluid/operators/rnn_op.cu.cc +++ b/paddle/fluid/operators/rnn_op.cu.cc @@ -14,9 +14,9 @@ limitations under the License. */ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -719,7 +719,7 @@ class RNNGradCudnnKernel : public framework::OpKernel { } Tensor weight_grad; - math::SetConstant zero; + pten::funcs::SetConstant zero; weight_grad.mutable_data({weight_numel}, ctx.GetPlace()); zero(dev_ctx, &weight_grad, static_cast(0.0)); T *weight_grad_data = weight_grad.data(); diff --git a/paddle/fluid/operators/rnn_op.h b/paddle/fluid/operators/rnn_op.h index 5e19be5e4cf..b2c1b8b9895 100644 --- a/paddle/fluid/operators/rnn_op.h +++ b/paddle/fluid/operators/rnn_op.h @@ -25,9 +25,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/math/gru_compute.h" #include "paddle/fluid/operators/math/lstm_compute.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/unique_op.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -305,7 +305,7 @@ struct Layer { framework::TensorCopy(bias_hh, context.GetPlace(), dev_ctx, &bias_hh_tmp); bias_hh_tmp.Resize({3, bias_hh_tmp.numel() / 3}); auto bias_hh_tmp_unbind = Unbind(bias_hh_tmp); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(dev_ctx, &bias_hh_tmp_unbind[2], static_cast(0.0)); auto bias_hh_after_mask = framework::EigenMatrix::From( @@ -439,7 +439,7 @@ struct Layer { &weight_hh_tmp); weight_hh_tmp.Resize({3, weight_hh_tmp.numel() / 3}); auto weight_hh_tmp_unbind = Unbind(weight_hh_tmp); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(dev_ctx, &weight_hh_tmp_unbind[2], static_cast(0.0)); weight_hh_tmp.Resize(vec[1 + offset * 4].dims()); } @@ -585,7 +585,7 @@ struct Layer { &weight_hh_tmp); weight_hh_tmp.Resize({3, weight_hh_tmp.numel() / 3}); auto weight_hh_tmp_unbind = Unbind(weight_hh_tmp); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(dev_ctx, &weight_hh_tmp_unbind[2], static_cast(0.0)); weight_hh_tmp.Resize(vec[1 + offset * 4].dims()); } @@ -966,7 +966,7 @@ class RNNCPUKernel : public framework::OpKernel { dropout_mask->mutable_data(output->dims(), ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); - math::SetConstant ones; + pten::funcs::SetConstant ones; ones(dev_ctx, dropout_mask, static_cast(1)); // init the output and allocate the memory output->mutable_data(ctx.GetPlace()); @@ -1095,7 +1095,7 @@ struct GradLayer { Tensor c, d; Tensor* dynamic_grad_pre_h = &c; Tensor* dynamic_grad_pre_c = &d; - math::SetConstant zero; + pten::funcs::SetConstant zero; if (init_h_grad_unbind->size() > 0) { dynamic_grad_pre_h->ShareDataWith( (*init_h_grad_unbind)[current_layer_idx]); @@ -1293,7 +1293,7 @@ struct GradLayer { mat_dim_parameter, static_cast(1.0), input_grad, T(1)); // calc the gradient of Bias_hi, Bias_hh - math::ColwiseSum col_sum; + pten::funcs::ColwiseSum col_sum; Tensor tmp_grad_gate; tmp_grad_gate.ShareDataWith(grad_gate); tmp_grad_gate.Resize( @@ -1328,7 +1328,7 @@ struct SingleGradLayer : GradLayer { const int& gate_num) { auto& device_ctx = context.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); const bool& is_bidirec = context.Attr("is_bidirec"); @@ -1425,7 +1425,7 @@ struct BidirGradLayer : GradLayer { // split the output two tensor to output_forward, output_backward auto& device_ctx = context.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); std::vector output_vec; @@ -1675,7 +1675,7 @@ struct GRUGradCell : GradCell { backup_tensor(context, &grad_pre_hidden_bak, grad_pre_hidden); } // zero pre_hidden - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, grad_pre_hidden, static_cast(0.0)); math::GRUMetaValue gru_value; math::GRUMetaGrad gru_grad; diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index 520023229fe..5c9c8b78a4b 100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -395,7 +395,7 @@ class GPUROIAlignGradOpKernel : 
public framework::OpKernel { memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream()); in_grad->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, in_grad, static_cast(0)); int output_grad_size = out_grad->numel(); diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h index 1ab5ddc83fb..acae86bd1b3 100644 --- a/paddle/fluid/operators/roi_align_op.h +++ b/paddle/fluid/operators/roi_align_op.h @@ -15,7 +15,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -namespace { +namespace { // NOLINT constexpr size_t get_offset(size_t x, size_t y, size_t width) { return y * width + x; } @@ -41,7 +41,7 @@ struct offsets_and_ratios { xy_ratio(xy_ratio), xY_ratio(xY_ratio), Xy_ratio(Xy_ratio), - XY_ratio(XY_ratio){}; + XY_ratio(XY_ratio) {} std::size_t xy = 0; std::size_t xY = 0; @@ -128,10 +128,10 @@ std::vector> get_indexes_and_ratios( } } return interpolation_cords; -} +} // namespace template -void interpolate(std::vector& interpolated_values, +void interpolate(std::vector& interpolated_values, // NOLINT const std::vector>& interpolation_cords, const T* data) { for (auto& ic : interpolation_cords) { @@ -167,7 +167,7 @@ void avg_pool(const std::vector& interpolated_values, T* output_data, output_data[i] = sum * count; } } -} +} // NOLINT template void bilinear_interpolate_gradient(const int height, const int width, T y, T x, @@ -389,7 +389,7 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel { } in_grad->mutable_data(ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, in_grad, static_cast(0)); int output_grad_size = out_grad->numel(); diff --git a/paddle/fluid/operators/roi_align_op_npu.cc b/paddle/fluid/operators/roi_align_op_npu.cc index d6ccf84bbfb..7e19287d425 100644 --- a/paddle/fluid/operators/roi_align_op_npu.cc +++ b/paddle/fluid/operators/roi_align_op_npu.cc @@ -10,8 +10,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/roi_align_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index 16a8e2bf586..eafb7902851 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -274,7 +274,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { dev_ctx.stream()); x_grad->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, x_grad, static_cast(0)); int output_grad_size = out_grad->numel(); diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h index 40de6d0cf6a..531fe241c43 100644 --- a/paddle/fluid/operators/roi_pool_op.h +++ b/paddle/fluid/operators/roi_pool_op.h @@ -18,7 +18,7 @@ limitations under the License. 
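// Besides the namespace switch, the roi_align_op.h hunks above carry lint
// cleanups: the stray semicolon after the offsets_and_ratios constructor
// body goes away and the anonymous-namespace braces gain NOLINT/closing
// comments. A toy illustration of the semicolon fix (fields abbreviated;
// this demo struct is not the real one):
#include <cstddef>

struct offsets_and_ratios_demo {
  offsets_and_ratios_demo(std::size_t xy, float xy_ratio)
      : xy(xy), xy_ratio(xy_ratio) {}  // no trailing ';' after the body
  std::size_t xy = 0;
  float xy_ratio = 0.0f;
};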
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -212,7 +212,7 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel { const T* out_grad_data = out_grad->data(); const int64_t* argmax_data = argmax->data(); T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(ctx.template device_context(), in_grad, static_cast(0)); diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index 586cf3239b5..24f8ba4f213 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -11,9 +11,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/row_conv_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -395,7 +395,7 @@ class RowConvGradKernel size_t *idx = batch_indices.CUDAMutableData(context.GetPlace()); auto &device_ctx = context.cuda_device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; if (dFilter) { T *dfilter = dFilter->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu index 4bcd27036a5..3caa79a0bff 100644 --- a/paddle/fluid/operators/sample_logits_op.cu +++ b/paddle/fluid/operators/sample_logits_op.cu @@ -19,10 +19,10 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sample_prob.h" #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/operators/sample_logits_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -138,7 +138,7 @@ class SampleLogitsCUDAKernel : public framework::OpKernel { // UNDERSTAND: allocate memories for temporaries sampled_logits->mutable_data(samples_dim, context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, sampled_logits, static_cast(0)); auto sampled_labels_data = @@ -224,7 +224,7 @@ class SampleLogitsGradCUDAKernel : public framework::OpKernel { logits_grad->mutable_data(context.GetPlace()); auto& dev_ctx = context.cuda_device_context(); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, logits_grad, static_cast(0)); // UNDERSTAND: scatter it back to logit_grad diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h index 872eb341d49..f7560991a6a 100644 --- a/paddle/fluid/operators/sample_logits_op.h +++ b/paddle/fluid/operators/sample_logits_op.h @@ -19,9 +19,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sample_prob.h" #include "paddle/fluid/operators/math/softmax.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -286,7 +286,7 @@ class SampleLogitsGradKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, logits_grad, static_cast(0)); // UNDERSTAND: scatter it back to logit_grad diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h index 13c08aea688..a98d98e72ad 100644 --- a/paddle/fluid/operators/scatter.cu.h +++ b/paddle/fluid/operators/scatter.cu.h @@ -15,11 +15,11 @@ limitations under the License. */ #pragma once #include #include -#include "math/math_function.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/place.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/search_compute.h b/paddle/fluid/operators/search_compute.h index d0618bf2c30..3e8d270ca4f 100644 --- a/paddle/fluid/operators/search_compute.h +++ b/paddle/fluid/operators/search_compute.h @@ -23,7 +23,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/seed_op.cu b/paddle/fluid/operators/seed_op.cu index 5a8d1c067c3..5257e7709f9 100644 --- a/paddle/fluid/operators/seed_op.cu +++ b/paddle/fluid/operators/seed_op.cu @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/seed_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -32,7 +32,7 @@ class GPUSeedKernel : public framework::OpKernel { platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(platform::CPUPlace()); out->mutable_data(platform::CPUPlace()); - math::SetConstant functor; + pten::funcs::SetConstant functor; functor(reinterpret_cast(dev_ctx), out, static_cast(seed)); } else { diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h index 4f180a31ce5..47b18e04e4d 100644 --- a/paddle/fluid/operators/segment_pool_op.h +++ b/paddle/fluid/operators/segment_pool_op.h @@ -16,10 +16,10 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/segment_pooling.h" #include "paddle/fluid/platform/macros.h" #include "paddle/pten/common/place.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -60,7 +60,7 @@ void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) { "Segment ids must be >= 0, but got last id %d", dims[0])); output->Resize({dims}); output->mutable_data(context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = context.template device_context(); set_zero(dev_ctx, output, static_cast(0)); } @@ -98,7 +98,7 @@ void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) { } else if (pooltype == "MIN") { init_value = static_cast(FLT_MAX); } - math::SetConstant setconst; + pten::funcs::SetConstant setconst; auto& dev_ctx = context.template device_context(); setconst(dev_ctx, output, static_cast(init_value)); // the gpu kernel of mean pool record the counts of segment_ids @@ -152,7 +152,7 @@ class SegmentPoolGradKernel : public framework::OpKernel { } in_g->mutable_data(context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = context.template device_context(); set_zero(dev_ctx, in_g, static_cast(0)); diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h index f73b1804199..b43254f91fd 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/context_project.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -64,7 +64,7 @@ class SequenceConvKernel : public framework::OpKernel { Tensor col; col.mutable_data(col_shape, context.GetPlace()); // Because if padding_trainable is false, padding data should be zeros. - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = context.template device_context(); auto blas = math::GetBlas(dev_ctx); set_zero(dev_ctx, &col, static_cast(0)); @@ -107,7 +107,7 @@ class SequenceConvGradKernel : public framework::OpKernel { int down_pad = std::max(0, context_start + context_length - 1); auto sequence_width = static_cast(in->dims()[1]); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = context.template device_context(); auto blas = math::GetBlas(dev_ctx); // use col_shape in the im2col calculation diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h index 1186ed891e8..74baf67f7fe 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -172,7 +172,7 @@ struct SequenceExpandGradFunctor { int dout_end = dout_offset + repeat_num * x_seq_len; auto dout_sub = dout.Slice(dout_offset, dout_end); dout_sub.Resize({repeat_num, dx_sub.dims()[0]}); - math::ColwiseSum col_sum; + pten::funcs::ColwiseSum col_sum; col_sum(context, dout_sub, &dx_sub); dout_offset += repeat_num * x_seq_len; } @@ -194,7 +194,7 @@ class SequenceExpandGradKernel : public framework::OpKernel { g_x->set_lod(x->lod()); auto& dev_ctx = context.template device_context(); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, g_x, static_cast(0)); auto& y_lod = y->lod(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.h b/paddle/fluid/operators/sequence_ops/sequence_pad_op.h index a9660f05c3c..2b50995a6ab 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.h @@ -17,8 +17,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_padding.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h index dca65512e32..bc279f1eb31 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h @@ -16,8 +16,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_pooling.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.h b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.h index b5d21242113..2cf81197f92 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.h @@ -14,7 +14,7 @@ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h index 65e021b507a..d5689091bec 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h @@ -14,8 +14,8 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/strided_memcpy.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -168,7 +168,7 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { if (x_grad) { x_grad->mutable_data(ctx.GetPlace()); x_grad->set_lod(in->lod()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(ctx.template device_context(), x_grad, static_cast(0)); diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc index 46e4196585b..869bc613c4a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/softmax.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h index e8e0241e46a..5190108acde 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h @@ -21,7 +21,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -196,7 +196,7 @@ class SequenceTopkAvgPoolingGradKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(dev_ctx, d_in, static_cast(0.0)); auto din_data = d_in->data(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h index 60ba4797db1..b85b9384282 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h @@ -17,8 +17,8 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_padding.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -87,7 +87,7 @@ class SequenceUnpadGradOpKernel : public framework::OpKernel { LoDTensor zero_pads; zero_pads.Resize({1, 1}); zero_pads.mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); set_zero(dev_ctx, &zero_pads, static_cast(0)); diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 1580ef140ad..633bc468dc4 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -437,7 +437,7 @@ class SetValueGradKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); auto& place = *context.template device_context().eigen_device(); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; if (grad_input) { // Set gradient of `Input` diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc index 493073fadc2..38721e5e3e5 100644 --- a/paddle/fluid/operators/shrink_rnn_memory_op.cc +++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/array_operator.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/pten/core/lod_utils.h" @@ -156,7 +156,7 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { auto &dev_ctx = *pool.Get(place); if (dout_var == nullptr) { // dx_tensor fill zero - math::set_constant(dev_ctx, &dx_tensor, 0.0f); + pten::funcs::set_constant(dev_ctx, &dx_tensor, 0.0f); } else { auto &dout_tensor = dout_var->Get(); auto height = dout_tensor.dims()[0]; @@ -165,7 +165,7 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { if (dx_tensor.dims()[0] > height) { auto rest_tensor = dx_tensor.Slice( static_cast(height), static_cast(dx_tensor.dims()[0])); - math::set_constant(dev_ctx, &rest_tensor, 0.0f); + pten::funcs::set_constant(dev_ctx, &rest_tensor, 0.0f); } } dx_tensor.set_lod(x_tensor.lod()); diff --git a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h index 3ce1e0c770b..2bf96fad269 100644 --- a/paddle/fluid/operators/shuffle_channel_op.h +++ b/paddle/fluid/operators/shuffle_channel_op.h @@ -13,7 +13,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h index d9ef45343d8..bf05bbadcbc 100644 --- a/paddle/fluid/operators/slice_op.h +++ b/paddle/fluid/operators/slice_op.h @@ -18,9 +18,9 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/slice_utils.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -299,7 +299,7 @@ class SliceGradKernel : public framework::OpKernel { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& dev_ctx = *pool.Get(ctx.GetPlace()); - math::SetConstant functor; + pten::funcs::SetConstant functor; for (int i = 0; i < d_in_size; ++i) { auto dim = input_array->at(i).dims(); d_in_arr->at(i).Resize(dim); diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc index 8e9e077b845..98a67bc7487 100644 --- a/paddle/fluid/operators/softmax_op_npu_test.cc +++ b/paddle/fluid/operators/softmax_op_npu_test.cc @@ -23,12 +23,11 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(softmax); USE_OP_DEVICE_KERNEL(softmax, NPU); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index fe025641330..33bbed0f697 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -17,12 +17,12 @@ namespace cub = hipcub; #endif #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/math/cross_entropy.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/softmax_cudnn_op.cu.h" #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -960,7 +960,7 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { softmax_out->template mutable_data(context.GetPlace()); auto* loss_data = loss->template mutable_data(context.GetPlace()); - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; set_constant(context.cuda_device_context(), loss, static_cast(0)); if (axis_dim == 1) { set_constant(context.cuda_device_context(), softmax_out, @@ -1045,7 +1045,7 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { auto* loss_data = loss->template mutable_data(context.GetPlace()); if (axis_dim == 1) { - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; set_constant(context.cuda_device_context(), softmax, static_cast(1)); set_constant(context.cuda_device_context(), loss, static_cast(0)); return; diff --git a/paddle/fluid/operators/solve_op.h b/paddle/fluid/operators/solve_op.h index 7893b5da12c..c023d33a444 100644 --- a/paddle/fluid/operators/solve_op.h +++ b/paddle/fluid/operators/solve_op.h @@ -21,10 +21,10 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" #include "paddle/fluid/operators/squeeze_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #endif @@ -509,7 +509,7 @@ class SolveGradKernel : public framework::OpKernel { const auto& new_dims_vec = getNewDimsVec(input->dims()); tmp_input.Resize(framework::make_ddim(new_dims_vec)); tmp_input.mutable_data(ctx.GetPlace()); - math::TransposeNormal trans; + pten::funcs::TransposeNormal trans; std::vector new_axis = getNewAxis(input->dims().size()); auto& dev_ctx = ctx.template device_context(); trans(dev_ctx, *input, &tmp_input, new_axis); diff --git a/paddle/fluid/operators/spectral_norm_op.h b/paddle/fluid/operators/spectral_norm_op.h index b8a15579e53..d0edcc16925 100644 --- a/paddle/fluid/operators/spectral_norm_op.h +++ b/paddle/fluid/operators/spectral_norm_op.h @@ -14,7 +14,7 @@ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -40,19 +40,19 @@ static inline void TransCompute(const int rank, const Tensor& in, Tensor* out, switch (rank) { case 2: - math::Transpose trans2; + pten::funcs::Transpose trans2; trans2(dev_ctx, in, out, perm); break; case 3: - math::Transpose trans3; + pten::funcs::Transpose trans3; trans3(dev_ctx, in, out, perm); break; case 4: - math::Transpose trans4; + pten::funcs::Transpose trans4; trans4(dev_ctx, in, out, perm); break; case 5: - math::Transpose trans5; + pten::funcs::Transpose trans5; trans5(dev_ctx, in, out, perm); break; default: diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h index 6f78b885734..755cca99dad 100644 --- a/paddle/fluid/operators/spp_op.h +++ b/paddle/fluid/operators/spp_op.h @@ -16,9 +16,9 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/operators/strided_memcpy.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -95,7 +95,7 @@ class SppGradKernel : public framework::OpKernel { std::string pooling_type = context.template Attr("pooling_type"); auto& device_ctx = context.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); auto out_stride = framework::stride(out->dims()); diff --git a/paddle/fluid/operators/squeeze_op.h b/paddle/fluid/operators/squeeze_op.h old mode 100755 new mode 100644 index 2f621c11e58..d86037fa032 --- a/paddle/fluid/operators/squeeze_op.h +++ b/paddle/fluid/operators/squeeze_op.h @@ -18,9 +18,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc index 3f6c43d7af2..ecedc0ba1c2 100644 --- a/paddle/fluid/operators/squeeze_op_npu_test.cc +++ b/paddle/fluid/operators/squeeze_op_npu_test.cc @@ -25,12 +25,11 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(squeeze); USE_OP_DEVICE_KERNEL(squeeze, NPU); diff --git a/paddle/fluid/operators/strided_slice_op.h b/paddle/fluid/operators/strided_slice_op.h index 47714ebb806..d1efd3b6751 100644 --- a/paddle/fluid/operators/strided_slice_op.h +++ b/paddle/fluid/operators/strided_slice_op.h @@ -18,8 +18,8 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/slice_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -615,7 +615,7 @@ class StridedSliceGradKernel : public framework::OpKernel { d_out_tensor->mutable_data(context.GetPlace()); } - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, d_out_tensor, static_cast(0)); } } @@ -628,7 +628,7 @@ class StridedSliceGradKernel : public framework::OpKernel { d_out->mutable_data(context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, d_out, static_cast(0)); auto in_dims = d_input->dims(); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 9de9b0b6338..ce152f44508 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -134,7 +134,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { int start = in_place ? 1 : 0; if (!in_place) { - math::SetConstant constant_functor; + pten::funcs::SetConstant constant_functor; constant_functor( context.template device_context(), out, static_cast(0)); diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 4e108b56a40..d8d57b1f7f0 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -167,7 +167,7 @@ class SumKernel : public framework::OpKernel { } if (start != 2) { VLOG(10) << "Fill with constant = 0 in sum kernel."; - math::SetConstant constant_functor; + pten::funcs::SetConstant constant_functor; constant_functor(context.template device_context(), out, static_cast(0)); } diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index 48315980e31..3a57a7b3e54 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -26,9 +26,9 @@ #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/complex_functors.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -232,11 +232,11 @@ static std::vector get_broadcast_batch_portion( return batchPortion; } -#define DITO_TRANSPOSE_RANK_CASE(N) \ - case N: { \ - math::Transpose trans; \ - trans(dev_ctx, x, &ret, axis); \ - break; \ +#define DITO_TRANSPOSE_RANK_CASE(N) \ + case N: { \ + pten::funcs::Transpose trans; \ + trans(dev_ctx, x, &ret, axis); \ + break; \ } #define DITO_SLICE_RANK_CASE(N) \ @@ -526,7 +526,7 @@ struct DeviceIndependenceTensorOperations { ret.Resize(framework::make_ddim(shape)); ret.mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - SetConstant()(dev_ctx, &ret, T(fill_value)); + pten::funcs::SetConstant()(dev_ctx, &ret, T(fill_value)); return ret; } framework::Tensor Infinits(std::vector shape) { diff --git a/paddle/fluid/operators/take_along_axis_op.cu b/paddle/fluid/operators/take_along_axis_op.cu index e9f9b187187..2d0ebbc20f2 100644 --- a/paddle/fluid/operators/take_along_axis_op.cu +++ b/paddle/fluid/operators/take_along_axis_op.cu @@ -63,7 +63,7 @@ class TakeAlongAxisGradOpCUDAKernel : public framework::OpKernel { // Set to zero tensor. auto &dev_ctx = ctx.template device_context(); - math::SetConstant functor; + pten::funcs::SetConstant functor; functor(reinterpret_cast(dev_ctx), input_grad, static_cast(0)); const auto &index_type = index->type(); diff --git a/paddle/fluid/operators/take_along_axis_op.h b/paddle/fluid/operators/take_along_axis_op.h index 580ca528ceb..e7f804621b3 100644 --- a/paddle/fluid/operators/take_along_axis_op.h +++ b/paddle/fluid/operators/take_along_axis_op.h @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -72,7 +72,7 @@ class TakeAlongAxisGradOpKernel : public framework::OpKernel { // Set to zero tensor. 
auto &dev_ctx = ctx.template device_context(); - math::SetConstant functor; + pten::funcs::SetConstant functor; functor(reinterpret_cast(dev_ctx), input_grad, static_cast(0)); diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc index 0e0a594846f..62c07d0654f 100644 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/temporal_shift_op.h b/paddle/fluid/operators/temporal_shift_op.h index 05364b94c92..4b2aa098d0d 100644 --- a/paddle/fluid/operators/temporal_shift_op.h +++ b/paddle/fluid/operators/temporal_shift_op.h @@ -11,7 +11,7 @@ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index e4e5dfdba9f..c873f845117 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -29,32 +29,32 @@ inline void TransCompute(const int dim, const DeviceContext& dev_ctx, const std::vector& axis) { switch (dim) { case 1: - math::Transpose trans1; + pten::funcs::Transpose trans1; trans1(dev_ctx, in, out, axis); break; case 2: - math::Transpose trans2; + pten::funcs::Transpose trans2; trans2(dev_ctx, in, out, axis); break; case 3: - math::Transpose trans3; + pten::funcs::Transpose trans3; trans3(dev_ctx, in, out, axis); break; case 4: - math::Transpose trans4; + pten::funcs::Transpose trans4; trans4(dev_ctx, in, out, axis); break; case 5: - math::Transpose trans5; + pten::funcs::Transpose trans5; trans5(dev_ctx, in, out, axis); break; case 6: - math::Transpose trans6; + pten::funcs::Transpose trans6; trans6(dev_ctx, in, out, axis); break; default: // for dim >= 7 situation - math::TransposeNormal trans_normal; + pten::funcs::TransposeNormal trans_normal; trans_normal(dev_ctx, in, out, axis); } } diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc index 91923da819d..49aa265656e 100644 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ b/paddle/fluid/operators/transpose_op_npu_test.cc @@ -25,12 +25,11 @@ limitations under the License. 
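// transpose_op.h's TransCompute above is the canonical rank dispatch for
// these functors: compile-time Transpose instantiations for ranks 1-6 with a
// runtime TransposeNormal fallback for dim >= 7, and svd_helper.h generates
// the same case bodies via its DITO_TRANSPOSE_RANK_CASE macro. A sketch with
// the template arguments this rendering drops (restored by assumption,
// following the trans1..trans6 naming):
#include <vector>
#include "paddle/fluid/framework/tensor.h"
#include "paddle/pten/kernels/funcs/math_function.h"

template <typename DeviceContext, typename T>
void TransComputeSketch(const int dim, const DeviceContext& dev_ctx,
                        const paddle::framework::Tensor& in,
                        paddle::framework::Tensor* out,
                        const std::vector<int>& axis) {
  switch (dim) {
    case 2: {
      pten::funcs::Transpose<DeviceContext, T, 2> trans2;
      trans2(dev_ctx, in, out, axis);
      break;
    }
    case 3: {
      pten::funcs::Transpose<DeviceContext, T, 3> trans3;
      trans3(dev_ctx, in, out, axis);
      break;
    }
    // ...cases 1 and 4 through 6 follow the same pattern...
    default: {
      // dim >= 7: the permutation is validated at run time, not compile time.
      pten::funcs::TransposeNormal<DeviceContext, T> trans_normal;
      trans_normal(dev_ctx, in, out, axis);
    }
  }
}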
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(transpose2); USE_OP_DEVICE_KERNEL(transpose2, NPU); diff --git a/paddle/fluid/operators/tree_conv_op.h b/paddle/fluid/operators/tree_conv_op.h index a84589b32fd..c2a6cfdd0d3 100644 --- a/paddle/fluid/operators/tree_conv_op.h +++ b/paddle/fluid/operators/tree_conv_op.h @@ -28,7 +28,7 @@ class TreeConvKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { math::Tree2ColFunctor tree2col; - math::SetConstant constant; + pten::funcs::SetConstant constant; auto *Edges = ctx.Input("EdgeSet"); auto *Embeddings = ctx.Input("NodesVector"); @@ -86,7 +86,7 @@ class TreeConvGradKernel : public framework::OpKernel { auto *Filter = ctx.Input("Filter"); math::Tree2ColFunctor tree2col; math::Col2TreeFunctor col2tree; - math::SetConstant constant; + pten::funcs::SetConstant constant; auto &dev_ctx = ctx.template device_context(); auto blas = math::GetBlas(dev_ctx); diff --git a/paddle/fluid/operators/unfold_op.h b/paddle/fluid/operators/unfold_op.h index 006e4822fea..5107b5cc492 100644 --- a/paddle/fluid/operators/unfold_op.h +++ b/paddle/fluid/operators/unfold_op.h @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -106,7 +106,7 @@ class UnfoldGradOpKernel : public framework::OpKernel { math::Col2ImFunctor col2im; auto& dev_ctx = ctx.template device_context(); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, input_grad, static_cast(0)); for (int i = 0; i < batch_size; i++) { Tensor out_grad_batch = diff --git a/paddle/fluid/operators/unique_consecutive_op.h b/paddle/fluid/operators/unique_consecutive_op.h index e6cb5dafe34..9b933dfd92f 100644 --- a/paddle/fluid/operators/unique_consecutive_op.h +++ b/paddle/fluid/operators/unique_consecutive_op.h @@ -22,9 +22,9 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/operators/unique_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h index 66b0543771f..c3d291d1201 100644 --- a/paddle/fluid/operators/unique_op.h +++ b/paddle/fluid/operators/unique_op.h @@ -22,8 +22,8 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/transpose_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/unique_with_counts_op.h b/paddle/fluid/operators/unique_with_counts_op.h index f61bac7cda0..fc3568ff181 100644 --- a/paddle/fluid/operators/unique_with_counts_op.h +++ b/paddle/fluid/operators/unique_with_counts_op.h @@ -18,8 +18,8 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/unique_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/unpool_op.h b/paddle/fluid/operators/unpool_op.h index 52849cb3e0f..95aa1a4688b 100644 --- a/paddle/fluid/operators/unpool_op.h +++ b/paddle/fluid/operators/unpool_op.h @@ -17,8 +17,8 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/unpooling.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -36,7 +36,7 @@ class UnpoolKernel : public framework::OpKernel { T* output_data = out->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); if (output_data) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, out, static_cast(0)); } math::Unpool2dMaxFunctor unpool2d_max_forward; @@ -60,7 +60,7 @@ class UnpoolGradKernel : public framework::OpKernel { std::vector paddings = context.Attr>("paddings"); auto& device_ctx = context.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); @@ -84,7 +84,7 @@ class Unpool3dKernel : public framework::OpKernel { T* output_data = out->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); if (output_data) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, out, static_cast(0)); } math::Unpool3dMaxFunctor unpool3d_max_forward; @@ -109,7 +109,7 @@ class Unpool3dGradKernel : public framework::OpKernel { std::vector paddings = context.Attr>("paddings"); auto& device_ctx = context.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); diff --git a/paddle/fluid/operators/unsqueeze_op.h b/paddle/fluid/operators/unsqueeze_op.h index d7a1e0ed3b8..649cc9de50e 100644 --- a/paddle/fluid/operators/unsqueeze_op.h +++ b/paddle/fluid/operators/unsqueeze_op.h @@ -17,10 +17,10 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc index cf96ef57a4d..c34cdbc2e79 100644 --- a/paddle/fluid/operators/unsqueeze_op_npu_test.cc +++ b/paddle/fluid/operators/unsqueeze_op_npu_test.cc @@ -25,12 +25,11 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(unsqueeze); USE_OP_DEVICE_KERNEL(unsqueeze, NPU); diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc index db8b2c30501..f67b969d459 100644 --- a/paddle/fluid/operators/var_conv_2d_op.cc +++ b/paddle/fluid/operators/var_conv_2d_op.cc @@ -16,8 +16,8 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/dynload/mklml.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/viterbi_decode_op.h b/paddle/fluid/operators/viterbi_decode_op.h index ab95dbc763a..77e38f4fa85 100644 --- a/paddle/fluid/operators/viterbi_decode_op.h +++ b/paddle/fluid/operators/viterbi_decode_op.h @@ -250,8 +250,8 @@ class ViterbiDecodeKernel : public framework::OpKernel { auto batch_size = static_cast(input->dims()[0]); auto seq_len = static_cast(input->dims()[1]); auto n_labels = static_cast(input->dims()[2]); - math::SetConstant float_functor; - math::SetConstant int_functor; + pten::funcs::SetConstant float_functor; + pten::funcs::SetConstant int_functor; std::vector historys; // We create tensor buffer in order to avoid allocating memory frequently // 10 means allocate 10*batch_size bytes memory, such as int_mask, zero... diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h index 56f1d8d97ba..3f8c38aa601 100644 --- a/paddle/fluid/operators/warpctc_op.h +++ b/paddle/fluid/operators/warpctc_op.h @@ -17,10 +17,10 @@ limitations under the License. 
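// NOTE(sketch, not part of the patch): viterbi_decode_op.h above shows that
// SetConstant is instantiated per element type, not per tensor; extraction
// dropped the angle brackets, and the two declarations plausibly read:
//
//   pten::funcs::SetConstant<DeviceContext, float> float_functor;
//   pten::funcs::SetConstant<DeviceContext, int64_t> int_functor;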
*/ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_padding.h" #include "paddle/fluid/operators/math/sequence_scale.h" #include "paddle/fluid/platform/dynload/warpctc.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -138,7 +138,7 @@ class WarpCTCFunctor { framework::make_ddim({static_cast(workspace_elements)}), dev_ctx); T* workspace_data = workspace.data(); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), &workspace, static_cast(0)); @@ -334,7 +334,7 @@ class WarpCTCKernel : public framework::OpKernel { T* warpctc_grad_data = warpctc_grad->mutable_data(warpctc_logits.dims(), ctx.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), warpctc_grad, static_cast(0)); diff --git a/paddle/fluid/operators/where_index_op.h b/paddle/fluid/operators/where_index_op.h index 97a7bb939b9..c6828a78768 100644 --- a/paddle/fluid/operators/where_index_op.h +++ b/paddle/fluid/operators/where_index_op.h @@ -17,8 +17,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/where_op.h b/paddle/fluid/operators/where_op.h index fdb65858eff..415632f3d7e 100644 --- a/paddle/fluid/operators/where_op.h +++ b/paddle/fluid/operators/where_op.h @@ -14,7 +14,7 @@ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/pten/kernels/cpu/norm_grad_kernel.cc b/paddle/pten/kernels/cpu/norm_grad_kernel.cc index 3357e6f76fa..7b2a07c37bc 100644 --- a/paddle/pten/kernels/cpu/norm_grad_kernel.cc +++ b/paddle/pten/kernels/cpu/norm_grad_kernel.cc @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/pten/kernels/norm_grad_kernel.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/pten/kernels/funcs/eigen/eigen_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/pten/kernels/funcs/eigen/common.h" diff --git a/paddle/pten/kernels/cpu/norm_kernel.cc b/paddle/pten/kernels/cpu/norm_kernel.cc index ef2cf405c13..f2996faccb1 100644 --- a/paddle/pten/kernels/cpu/norm_kernel.cc +++ b/paddle/pten/kernels/cpu/norm_kernel.cc @@ -13,11 +13,11 @@ // limitations under the License. 
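// NOTE(sketch, not part of the patch): warpctc_op.h above uses the functor as
// an unnamed temporary rather than a named local; with the extraction-stripped
// template arguments restored, that call plausibly reads:
//
//   pten::funcs::SetConstant<DeviceContext, T>()(
//       ctx.template device_context<DeviceContext>(),
//       &workspace, static_cast<T>(0));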
#include "paddle/pten/kernels/norm_kernel.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/kernels/funcs/common_shape.h" #include "paddle/pten/kernels/funcs/eigen/eigen_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace pten { diff --git a/paddle/pten/kernels/funcs/CMakeLists.txt b/paddle/pten/kernels/funcs/CMakeLists.txt index 32bdc94b95d..e4dd437629a 100644 --- a/paddle/pten/kernels/funcs/CMakeLists.txt +++ b/paddle/pten/kernels/funcs/CMakeLists.txt @@ -6,3 +6,51 @@ if(WITH_GPU) elseif(WITH_ROCM) hip_library(pten_transpose_gpu SRCS transpose.cu DEPS dense_tensor malloc pten_context) endif() + +function(math_library TARGET) + # math_library is a function to create math library. + # The interface is the same as cc_library. + # But it handle split GPU/CPU code and link some common library. + set(cc_srcs) + set(cu_srcs) + set(hip_srcs) + set(math_common_deps device_context framework_proto enforce) + if (WITH_GPU) + if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + list(APPEND math_common_deps cub) + else() + list(APPEND math_common_deps) + endif() + endif() + set(multiValueArgs DEPS) + cmake_parse_arguments(math_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) + list(APPEND cc_srcs ${TARGET}.cc) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) + list(APPEND cu_srcs ${TARGET}.cu) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc) + list(APPEND cu_srcs ${TARGET}.cu.cc) + endif() + + list(LENGTH cc_srcs cc_srcs_len) + if (WITH_GPU) + nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) + elseif (WITH_ROCM) + hip_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) + elseif(${cc_srcs_len} GREATER 0) + cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) + endif() +endfunction() + +math_library(math_function DEPS blas dense_tensor tensor) +cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) +if(WITH_GPU) + nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function) +endif() +if(WITH_ROCM) + hip_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor) +endif() diff --git a/paddle/pten/kernels/funcs/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h index 110b405bbcb..8e977f3e733 100644 --- a/paddle/pten/kernels/funcs/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -14,12 +14,12 @@ limitations under the License. 
*/ #pragma once -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/transform.h" #include "paddle/pten/backends/all_context.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/kernels/empty_kernel.h" +#include "paddle/pten/kernels/funcs/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) #include "paddle/fluid/platform/aligned_vector.h" @@ -394,7 +394,7 @@ static inline void GetDoubleGradSafeTensor(const DeviceContext &dev_ctx, auto meta = pten::DenseTensorMeta(x.dtype(), x.dims(), x.layout()); *ddx_safe = pten::Empty(dev_ctx, std::move(meta)); ddx_safe->mutable_data(dev_ctx.GetPlace()); - paddle::operators::math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, ddx_safe, static_cast(0)); } } diff --git a/paddle/pten/kernels/funcs/math_function.cc b/paddle/pten/kernels/funcs/math_function.cc new file mode 100644 index 00000000000..550ec23c18f --- /dev/null +++ b/paddle/pten/kernels/funcs/math_function.cc @@ -0,0 +1,342 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/kernels/funcs/math_function.h" + +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef PADDLE_USE_OPENBLAS +#include +#endif + +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" +#include "paddle/pten/kernels/funcs/math_function_impl.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace pten { +namespace funcs { + +using float16 = paddle::platform::float16; + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + +#ifdef PADDLE_WITH_XPU +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; +#endif + +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + 
template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose, \ + RANK>; \ + template struct Transpose, \ + RANK>; + +DEFINE_CPU_TRANS(1); +DEFINE_CPU_TRANS(2); +DEFINE_CPU_TRANS(3); +DEFINE_CPU_TRANS(4); +DEFINE_CPU_TRANS(5); +DEFINE_CPU_TRANS(6); + +template +struct TransposeNormal { + void operator()(const paddle::platform::CPUDeviceContext& context, + const paddle::framework::Tensor& in, + paddle::framework::Tensor* out, + const std::vector& axis) { + const int rank = axis.size(); + auto in_stride = paddle::framework::stride(in.dims()); + auto out_stride = paddle::framework::stride(out->dims()); + const T* in_ptr = in.data(); + T* out_ptr = out->data(); + + auto transpose_helper = [&](int64_t beg, int64_t end) { + for (int64_t out_idx = beg; out_idx < end; ++out_idx) { + int64_t in_idx = 0; + int64_t tmp_idx = out_idx; + // calculate the input index + for (int i = 0; i < rank; ++i) { + const int64_t coordinate = tmp_idx / out_stride[i]; + tmp_idx -= coordinate * out_stride[i]; + in_idx += coordinate * in_stride[axis[i]]; + } + out_ptr[out_idx] = in_ptr[in_idx]; + } + }; + transpose_helper(0, out->numel()); + } +}; + +// define transpose normal +#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ + template struct TransposeNormal + +DEFINE_CPU_TRANS_NORMAL(paddle::platform::float16); +DEFINE_CPU_TRANS_NORMAL(paddle::platform::bfloat16); +DEFINE_CPU_TRANS_NORMAL(float); +DEFINE_CPU_TRANS_NORMAL(double); +DEFINE_CPU_TRANS_NORMAL(int); +DEFINE_CPU_TRANS_NORMAL(int64_t); +DEFINE_CPU_TRANS_NORMAL(bool); +DEFINE_CPU_TRANS_NORMAL(int16_t); +DEFINE_CPU_TRANS_NORMAL(uint8_t); +DEFINE_CPU_TRANS_NORMAL(int8_t); +DEFINE_CPU_TRANS_NORMAL(paddle::platform::complex); +DEFINE_CPU_TRANS_NORMAL(paddle::platform::complex); + +struct TensorSetConstantCPU { + TensorSetConstantCPU(paddle::framework::Tensor* tensor, float value) + : tensor_(tensor), value_(value) {} + template + void apply() const { + auto cpu = paddle::platform::CPUPlace(); + auto* begin = tensor_->mutable_data(cpu); + std::fill(begin, begin + tensor_->numel(), static_cast(value_)); + } + paddle::framework::Tensor* tensor_; + float value_; +}; + +template <> +void set_constant_with_place( + const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) { + PADDLE_THROW( + paddle::platform::errors::Unimplemented("XPUPlace is not supported")); +} + +template <> +void set_constant_with_place( + const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) { + PADDLE_THROW( + paddle::platform::errors::Unimplemented("NPUPlace is not supported")); +} + +template <> +void set_constant_with_place( + const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) { + PADDLE_THROW(paddle::platform::errors::Unimplemented( + "NPUPinnedPlace is not supported")); +} + +template <> +void set_constant_with_place( + const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) { + PADDLE_THROW( + paddle::platform::errors::Unimplemented("IPUPlace is not supported")); +} + +template <> +void set_constant_with_place( + const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) { + paddle::framework::VisitDataType(tensor->type(), + TensorSetConstantCPU(tensor, value)); +} + +template <> +void set_constant_with_place( + const paddle::platform::DeviceContext& context, + 
paddle::framework::Tensor* tensor, + float value) { + PADDLE_THROW( + paddle::platform::errors::Unimplemented("MLUPlace is not supported")); +} + +template <> +void set_constant_with_place( + const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) { + paddle::framework::VisitDataType(tensor->type(), + TensorSetConstantCPU(tensor, value)); +} + +struct TensorSetConstantWithPlace : public boost::static_visitor { + TensorSetConstantWithPlace(const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) + : context_(context), tensor_(tensor), value_(value) {} + + template + void operator()(Place place) const { + set_constant_with_place(context_, tensor_, value_); + } + + const paddle::platform::DeviceContext& context_; + paddle::framework::Tensor* tensor_; + float value_; +}; + +void set_constant(const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) { + TensorSetConstantWithPlace func(context, tensor, value); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // tensor->place().apply_visitor(func); + paddle::platform::VisitPlace(tensor->place(), func); +#else + func(paddle::platform::CPUPlace()); +#endif +} + +template +struct RowwiseAdd { + void operator()(const paddle::platform::CPUDeviceContext& context, + const paddle::framework::Tensor& input, + const paddle::framework::Tensor& vector, + paddle::framework::Tensor* output) { + auto in_dims = input.dims(); + auto out_dims = output->dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ( + vector.numel(), + size, + paddle::platform::errors::InvalidArgument( + "The input vector size" + " should be equal to the size of each row of input tensor." + " Expected vector size=%d, but received %d", + size, + vector.numel())); + const char* in_dims_cstr = in_dims.to_str().c_str(); + const char* out_dims_cstr = out_dims.to_str().c_str(); + PADDLE_ENFORCE_EQ(out_dims, + in_dims, + paddle::platform::errors::InvalidArgument( + "The output tensor shape should be same as the input" + " tensor shape. Expected output tensor shape: %s," + " but received %s", + in_dims_cstr, + out_dims_cstr)); + + auto in = paddle::framework::EigenMatrix::From(input); + auto vec = paddle::framework::EigenVector::Flatten(vector); + auto out = paddle::framework::EigenMatrix::From(*output); + + for (int64_t i = 0; i < in_dims[0]; ++i) { + out.chip(i, 0) = in.chip(i, 0) + vec; + } + } +}; + +template struct RowwiseAdd; +template struct RowwiseAdd; + +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; + +template struct RowwiseSum; +template struct RowwiseSum; + +template struct RowwiseMean; +template struct RowwiseMean; + +template +struct ElementwiseAddTo { + void operator()(paddle::platform::CPUDeviceContext* ctx, + const paddle::framework::Tensor& src, + paddle::framework::Tensor* dst) { + auto in = paddle::framework::EigenVector::Flatten(src); + auto out = paddle::framework::EigenVector::Flatten(*dst); + auto& place = *(ctx->eigen_device()); + out.device(place) = out + in; + } +}; + +template struct ElementwiseAddTo; + +} // namespace funcs +} // namespace pten diff --git a/paddle/pten/kernels/funcs/math_function.cu b/paddle/pten/kernels/funcs/math_function.cu new file mode 100644 index 00000000000..76bc5f806d3 --- /dev/null +++ b/paddle/pten/kernels/funcs/math_function.cu @@ -0,0 +1,380 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
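// NOTE(sketch, not part of the patch): set_constant() defined above is the
// type-erased entry point: it visits the tensor's place
// (TensorSetConstantWithPlace) and forwards to the matching
// set_constant_with_place specialization, which in turn visits the dtype and
// runs SetConstant. A hedged usage sketch, assuming a device context and a
// tensor of any registered dtype:
//
//   pten::funcs::set_constant(dev_ctx, &tensor, 10.0f);
//
// On CPU-only builds the place visitor is bypassed and the CPUPlace overload
// is called directly (see the PADDLE_WITH_CUDA/PADDLE_WITH_HIP branch above).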
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" +#include "paddle/pten/kernels/funcs/math_function.h" +#include "paddle/pten/kernels/funcs/math_function_impl.h" + +namespace pten { +namespace funcs { + +using float16 = paddle::platform::float16; +using bfloat16 = paddle::platform::bfloat16; + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose, \ + RANK>; \ + template struct Transpose, \ + RANK>; + +DEFINE_GPU_TRANS(1); +DEFINE_GPU_TRANS(2); +DEFINE_GPU_TRANS(3); +DEFINE_GPU_TRANS(4); +DEFINE_GPU_TRANS(5); +DEFINE_GPU_TRANS(6); + +#define REINTERPRET(T, DST_PTR, SRC_PTR) \ + T* DST_PTR = reinterpret_cast(SRC_PTR) + +template +__global__ void TransposeNormalKernel(const T* in_ptr, + T* out_ptr, + int64_t element, + const int64_t* in_stride_ptr, + const int64_t* out_stride_ptr, + const int64_t* axis_ptr, + int rank) { + CUDA_KERNEL_LOOP(out_idx, element) { + int64_t in_idx = 0; + int64_t tmp_idx = out_idx; + for (int i = 0; i < rank; ++i) { + const int64_t coordinate = tmp_idx / out_stride_ptr[i]; + tmp_idx -= coordinate * out_stride_ptr[i]; + in_idx += coordinate * in_stride_ptr[axis_ptr[i]]; + } + out_ptr[out_idx] = in_ptr[in_idx]; + } +} + +template +struct TransposeNormal { + void operator()(const paddle::platform::CUDADeviceContext& context, + const paddle::framework::Tensor& in, + paddle::framework::Tensor* out, + const std::vector& axis) { + const int rank = axis.size(); + auto in_stride 
= paddle::framework::stride(in.dims()); + auto out_stride = paddle::framework::stride(out->dims()); + auto* in_ptr = in.data(); + auto* out_ptr = out->data(); + + // copy in_stride, out_stride, axis to gpu device + const paddle::platform::CUDAPlace& cuda_place = context.GetPlace(); + paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace(); + size_t size = 3 * rank * sizeof(int64_t); + auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size); + auto cuda_buf_holder = paddle::memory::Alloc(cuda_place, size); + REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr()); + REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr()); + for (int i = 0; i < rank; ++i) { + cpu_buf[i] = in_stride[i]; + cpu_buf[rank + i] = out_stride[i]; + cpu_buf[2 * rank + i] = axis[i]; + } + paddle::memory::Copy( + cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream()); + REINTERPRET(const int64_t, in_stride_ptr, cuda_buf); + REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank); + REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank); + + const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock(); + const int MAX_GRID_DIM = + context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; + int64_t elements = in.numel(); + int block_size = (elements >= MAX_BLOCK_DIM) + ? MAX_BLOCK_DIM + : (1 << static_cast(std::log2(elements))); + int grid_size = elements / block_size; + grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size; + TransposeNormalKernel<<>>( + in_ptr, + out_ptr, + elements, + in_stride_ptr, + out_stride_ptr, + axis_ptr, + rank); + } +}; + +// define transpose normal +#define DEFINE_GPU_TRANS_NORMAL(TYPE) \ + template struct TransposeNormal + +DEFINE_GPU_TRANS_NORMAL(float16); +DEFINE_GPU_TRANS_NORMAL(bfloat16); +DEFINE_GPU_TRANS_NORMAL(float); +DEFINE_GPU_TRANS_NORMAL(double); +DEFINE_GPU_TRANS_NORMAL(int); +DEFINE_GPU_TRANS_NORMAL(int64_t); +DEFINE_GPU_TRANS_NORMAL(bool); +DEFINE_GPU_TRANS_NORMAL(int16_t); +DEFINE_GPU_TRANS_NORMAL(uint8_t); +DEFINE_GPU_TRANS_NORMAL(int8_t); +DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex); +DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex); + +struct TensorSetConstantGPU { + TensorSetConstantGPU(const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) + : context_(context), tensor_(tensor), value_(value) {} + + template + void apply() const { + SetConstant functor; + functor( + reinterpret_cast(context_), + tensor_, + static_cast(value_)); + } + + const paddle::platform::DeviceContext& context_; + paddle::framework::Tensor* tensor_; + float value_; +}; + +template <> +void set_constant_with_place( + const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) { + paddle::framework::VisitDataType( + tensor->type(), TensorSetConstantGPU(context, tensor, value)); +} + +template +__global__ void RowwiseAddKernel( + const T* a, const T* b, T* c, int width, int num) { + T tmp = 1.0 / width; + CUDA_KERNEL_LOOP(i, num) { + int h = i * tmp; + int w = i - h * width; + c[i] = a[i] + b[w]; + } +} + +template +struct RowwiseAdd { + void operator()(const paddle::platform::CUDADeviceContext& context, + const paddle::framework::Tensor& input, + const paddle::framework::Tensor& vector, + paddle::framework::Tensor* output) { + auto in_dims = input.dims(); + auto out_dims = output->dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ( + vector.numel(), + size, + paddle::platform::errors::InvalidArgument( + "The input vector size" + " should be equal to the 
size of each row of input tensor." + " Expected vector size=%d, but received %d", + size, + vector.numel())); + const char* in_dims_cstr = in_dims.to_str().c_str(); + const char* out_dims_cstr = out_dims.to_str().c_str(); + PADDLE_ENFORCE_EQ( + out_dims, + in_dims, + paddle::platform::errors::InvalidArgument( + "The output tensor shape should be same as the input tensor" + " shape. Expected output tensor shape: %s," + " but received %s", + in_dims_cstr, + out_dims_cstr)); + int blocks = 512; + int grids = (input.numel() + blocks - 1) / blocks; + RowwiseAddKernel<<>>( + input.data(), + vector.data(), + output->data(), + static_cast(in_dims[1]), + static_cast(input.numel())); + } +}; + +template struct RowwiseAdd; +template struct RowwiseAdd; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +// template struct ColwiseSum; +// The ColwiseSum failed in debug +// mode, +// and only failed for this case. So reimplemented it. +template <> +void ColwiseSum::operator()( + const paddle::platform::CUDADeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* vector) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector->numel(), + size, + paddle::platform::errors::InvalidArgument( + "The size of input vector" + " should be equal to the size of input tensor column" + " dimension. Expected vector size=%d, but received %d", + size, + vector->numel())); + paddle::framework::Tensor one; + one.mutable_data({in_dims[0]}, context.GetPlace()); + SetConstant set; + set(context, &one, static_cast(1.0)); + paddle::operators::math::GetBlas( + context) + .GEMV(true, + static_cast(in_dims[0]), + static_cast(in_dims[1]), + 1.0, + input.data(), + one.data(), + 0.0, + vector->data()); +} + +template struct RowwiseSum; +// template struct RowwiseSum; +// TODO(zcd): Following ColwiseSum format, need to confirm. +// The RowwiseSum failed in debug +// mode, +// and only failed for this case. So reimplemented it. +template <> +void RowwiseSum::operator()( + const paddle::platform::CUDADeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* vector) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector->numel(), + in_dims[0], + paddle::platform::errors::InvalidArgument( + "The size of input vector" + " should be equal to the size of input tensor row" + " dimension. 
Expected vector size=%d, but received %d", + in_dims[0], + vector->numel())); + paddle::framework::Tensor one; + one.mutable_data({size}, context.GetPlace()); + SetConstant set; + set(context, &one, static_cast(1.0)); + paddle::operators::math::GetBlas( + context) + .GEMV(true, + static_cast(in_dims[1]), + static_cast(in_dims[0]), + 1.0, + one.data(), + input.data(), + 0.0, + vector->data()); +} + +template struct RowwiseMean; +template struct RowwiseMean; + +template +struct ElementwiseAddTo { + void operator()(paddle::platform::CUDADeviceContext* ctx, + const paddle::framework::Tensor& src, + paddle::framework::Tensor* dst) { + auto in = paddle::framework::EigenVector::Flatten(src); + auto out = paddle::framework::EigenVector::Flatten(*dst); + auto& place = *(ctx->eigen_device()); + out.device(place) = out + in; + } +}; + +template struct ElementwiseAddTo; + +} // namespace funcs +} // namespace pten diff --git a/paddle/pten/kernels/funcs/math_function.h b/paddle/pten/kernels/funcs/math_function.h new file mode 100644 index 00000000000..8208c0afb06 --- /dev/null +++ b/paddle/pten/kernels/funcs/math_function.h @@ -0,0 +1,127 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/core/dense_tensor.h" + +namespace pten { +namespace funcs { + +template +struct TransposeNormal { + // for dims >= 7 situation + void operator()(const DeviceContext& context, + const paddle::framework::Tensor& in, + paddle::framework::Tensor* out, + const std::vector& axis); +}; + +template +struct Transpose { + void operator()(const DeviceContext& context, + const paddle::framework::Tensor& in, + paddle::framework::Tensor* out, + const std::vector& axis); +}; + +template +struct SetConstant { + void operator()(const DeviceContext& context, + paddle::framework::Tensor* tensor, + T num); +}; + +template +void set_constant_with_place(const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value); + +void set_constant(const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value); + +template +struct RowwiseAdd { + void operator()(const DeviceContext& context, + const paddle::framework::Tensor& input, + const paddle::framework::Tensor& vec, + paddle::framework::Tensor* output); +}; + +template +struct ElementwiseAddTo { + // dst = dst + src + void operator()(DeviceContext* ctx, + const paddle::framework::Tensor& src, + paddle::framework::Tensor* dst); +}; + +template +struct ColwiseSum { + void operator()(const DeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* vec); +}; + +template +struct RowwiseSum { + void 
operator()(const DeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* vec); +}; + +template +struct RowwiseMean { + void operator()(const DeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* vec); +}; + +#ifdef PADDLE_WITH_XPU +template +struct TensorSetConstantXPU { + TensorSetConstantXPU(paddle::framework::Tensor* tensor, + U value, + paddle::platform::Place place) + : tensor_(tensor), value_(value), place_(place) {} + template + void apply() const { + auto* begin = tensor_->mutable_data(place_); + int numel = tensor_->numel(); + std::unique_ptr data_cpu(new T[numel]); + std::fill(data_cpu.get(), data_cpu.get() + numel, static_cast(value_)); + paddle::memory::Copy(place_, + begin, + paddle::platform::CPUPlace(), + static_cast(data_cpu.get()), + numel * sizeof(T)); + } + paddle::framework::Tensor* tensor_; + U value_; + paddle::platform::Place place_; +}; +#endif + +} // namespace funcs +} // namespace pten diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/pten/kernels/funcs/math_function_impl.h similarity index 54% rename from paddle/fluid/operators/math/math_function_impl.h rename to paddle/pten/kernels/funcs/math_function_impl.h index 0e44f903043..286f694ce51 100644 --- a/paddle/fluid/operators/math/math_function_impl.h +++ b/paddle/pten/kernels/funcs/math_function_impl.h @@ -16,47 +16,47 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" -namespace paddle { -namespace operators { -namespace math { +namespace pten { +namespace funcs { -using framework::To32BitIndex; +using paddle::framework::To32BitIndex; template -void SetConstant::operator()(const DeviceContext& context, - framework::Tensor* tensor, - T num) { +void SetConstant::operator()( + const DeviceContext& context, paddle::framework::Tensor* tensor, T num) { bool xpu_place = false; #ifdef PADDLE_WITH_XPU - if (platform::is_xpu_place(context.GetPlace())) { + if (paddle::platform::is_xpu_place(context.GetPlace())) { xpu_place = true; - framework::VisitDataType( + paddle::framework::VisitDataType( tensor->type(), TensorSetConstantXPU(tensor, num, context.GetPlace())); } #endif if (!xpu_place) { - auto t = framework::EigenVector::Flatten(*tensor); + auto t = paddle::framework::EigenVector::Flatten(*tensor); t.device(*context.eigen_device()) = t.constant(static_cast(num)); } } template void Transpose::operator()( - const DeviceContext& context, const framework::Tensor& in, - framework::Tensor* out, const std::vector& axis) { + const DeviceContext& context, + const paddle::framework::Tensor& in, + paddle::framework::Tensor* out, + const std::vector& axis) { Eigen::array permute; for (int i = 0; i < Rank; i++) { permute[i] = axis[i]; } - auto eigen_in = framework::EigenTensor::From(in); - auto eigen_out = framework::EigenTensor::From(*out); + auto eigen_in = paddle::framework::EigenTensor::From(in); + auto eigen_out = paddle::framework::EigenTensor::From(*out); auto* dev = context.eigen_device(); // use 32bit index to speed up computation bool use_32bit_index = eigen_out.size() < Eigen::NumTraits::highest(); - bool is_gpu_place = platform::is_gpu_place(context.GetPlace()); + bool is_gpu_place = paddle::platform::is_gpu_place(context.GetPlace()); if (use_32bit_index && is_gpu_place) { To32BitIndex(eigen_out).device(*dev) = To32BitIndex(eigen_in).shuffle(permute); @@ -66,20 
+66,23 @@ void Transpose::operator()( } template -void ColwiseSum::operator()(const DeviceContext& context, - const framework::Tensor& input, - framework::Tensor* out) { +void ColwiseSum::operator()( + const DeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* out) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(out->numel(), size, - platform::errors::InvalidArgument( + PADDLE_ENFORCE_EQ(out->numel(), + size, + paddle::platform::errors::InvalidArgument( "The size of output tensor " "should be equal to the size of input tensor column" " dimension. Expected output size=%d, but received %d", - size, out->numel())); + size, + out->numel())); - auto in = framework::EigenMatrix::From(input); - auto vec = framework::EigenVector::Flatten(*out); + auto in = paddle::framework::EigenMatrix::From(input); + auto vec = paddle::framework::EigenVector::Flatten(*out); vec.device(*context.eigen_device()) = in.sum(Eigen::array({{0}})); } @@ -88,20 +91,23 @@ void ColwiseSum::operator()(const DeviceContext& context, // colwise-sum can be easily implemented. General reduce has a huge overhead in // CPU template -class ColwiseSum { +class ColwiseSum { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { + void operator()(const paddle::platform::CPUDeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* out) { auto& in_dims = input.dims(); auto height = in_dims[0]; auto size = in_dims[1]; PADDLE_ENFORCE_EQ( - out->numel(), size, - platform::errors::InvalidArgument( + out->numel(), + size, + paddle::platform::errors::InvalidArgument( "The size of output tensor " "should be equal to the size of input tensor column" " dimension. Expected output size=%d, but received %d", - size, out->numel())); + size, + out->numel())); T* out_buf = out->mutable_data(out->place()); const T* in_buf = input.data(); @@ -119,23 +125,28 @@ class ColwiseSum { }; template -void RowwiseMean::operator()(const DeviceContext& context, - const framework::Tensor& input, - framework::Tensor* out) { +void RowwiseMean::operator()( + const DeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* out) { auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument( - "The rank of input tensor " - "should be 2, but received %d", - in_dims.size())); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0], - platform::errors::InvalidArgument( + PADDLE_ENFORCE_EQ( + in_dims.size(), + 2U, + paddle::platform::errors::InvalidArgument("The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); + PADDLE_ENFORCE_EQ(out->numel(), + in_dims[0], + paddle::platform::errors::InvalidArgument( "The size of output tensor " "should be equal to the size of input tensor row" " dimension. Expected output size=%d, but received %d", - in_dims[0], out->numel())); + in_dims[0], + out->numel())); - auto in = framework::EigenMatrix::From(input); - auto vec = framework::EigenVector::Flatten(*out); + auto in = paddle::framework::EigenMatrix::From(input); + auto vec = paddle::framework::EigenVector::Flatten(*out); vec.device(*context.eigen_device()) = in.mean(Eigen::array({{1}})); } @@ -144,24 +155,29 @@ void RowwiseMean::operator()(const DeviceContext& context, // rowwise-sum can be easily implemented. 
General reduce has a huge overhead in // CPU template -class RowwiseMean { +class RowwiseMean { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { + void operator()(const paddle::platform::CPUDeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* out) { auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument( - "The rank of input tensor " - "should be 2, but received %d", - in_dims.size())); + PADDLE_ENFORCE_EQ(in_dims.size(), + 2U, + paddle::platform::errors::InvalidArgument( + "The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); auto height = in_dims[0]; auto size = in_dims[1]; PADDLE_ENFORCE_EQ( - out->numel(), height, - platform::errors::InvalidArgument( + out->numel(), + height, + paddle::platform::errors::InvalidArgument( "The size of output tensor " "should be equal to the size of input tensor row" " dimension. Expected output size=%d, but received %d", - height, out->numel())); + height, + out->numel())); auto inv_size = 1.0 / size; T* out_buf = out->mutable_data(out->place()); const T* in_buf = input.data(); @@ -177,23 +193,28 @@ class RowwiseMean { }; template -void RowwiseSum::operator()(const DeviceContext& context, - const framework::Tensor& input, - framework::Tensor* out) { +void RowwiseSum::operator()( + const DeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* out) { auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument( - "The rank of input tensor " - "should be 2, but received %d", - in_dims.size())); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0], - platform::errors::InvalidArgument( + PADDLE_ENFORCE_EQ( + in_dims.size(), + 2U, + paddle::platform::errors::InvalidArgument("The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); + PADDLE_ENFORCE_EQ(out->numel(), + in_dims[0], + paddle::platform::errors::InvalidArgument( "The size of output tensor " "should be equal to the size of input tensor row" " dimension. Expected output size=%d, but received %d", - in_dims[0], out->numel())); + in_dims[0], + out->numel())); - auto in = framework::EigenMatrix::From(input); - auto vec = framework::EigenVector::Flatten(*out); + auto in = paddle::framework::EigenMatrix::From(input); + auto vec = paddle::framework::EigenVector::Flatten(*out); vec.device(*context.eigen_device()) = in.sum(Eigen::array({{1}})); } @@ -202,24 +223,29 @@ void RowwiseSum::operator()(const DeviceContext& context, // rowwise-sum can be easily implemented. 
General reduce has a huge overhead in // CPU template -class RowwiseSum { +class RowwiseSum { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { + void operator()(const paddle::platform::CPUDeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* out) { auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument( - "The rank of input tensor " - "should be 2, but received %d", - in_dims.size())); + PADDLE_ENFORCE_EQ(in_dims.size(), + 2U, + paddle::platform::errors::InvalidArgument( + "The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); auto height = in_dims[0]; auto size = in_dims[1]; PADDLE_ENFORCE_EQ( - out->numel(), height, - platform::errors::InvalidArgument( + out->numel(), + height, + paddle::platform::errors::InvalidArgument( "The size of output tensor " "should be equal to the size of input tensor row" " dimension. Expected output size=%d, but received %d", - height, out->numel())); + height, + out->numel())); T* out_buf = out->mutable_data(out->place()); const T* in_buf = input.data(); @@ -234,6 +260,5 @@ class RowwiseSum { } }; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace pten diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/pten/kernels/funcs/math_function_test.cc similarity index 69% rename from paddle/fluid/operators/math/math_function_test.cc rename to paddle/pten/kernels/funcs/math_function_test.cc index 91a4f2746ea..6ef8c6b689d 100644 --- a/paddle/fluid/operators/math/math_function_test.cc +++ b/paddle/pten/kernels/funcs/math_function_test.cc @@ -11,7 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
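// NOTE(sketch, not part of the patch): the CUDA ColwiseSum<float> and
// RowwiseSum<float> specializations earlier in this patch replace the Eigen
// reduction (which, per the retained comments, failed only in debug mode)
// with a BLAS GEMV against a vector of ones: for an m x n matrix A,
// colwise_sum(A) = A^T * 1_m and rowwise_sum(A) = A * 1_n. Sketch with
// assumed local names:
//
//   pten::funcs::SetConstant<CUDADeviceContext, float> set;
//   set(context, &one, 1.0f);               // one = [1, 1, ..., 1]
//   blas.GEMV(/*trans_a=*/true, m, n, 1.0f,
//             A_data, one_data, 0.0f,
//             col_sums);                    // col_sums = A^T * one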
-#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" #include "gtest/gtest.h" #include "paddle/fluid/operators/math/blas.h" @@ -42,8 +42,19 @@ TEST(math_function, gemm_notrans_cblas) { memcpy(input3_ptr, arr3, 8 * sizeof(float)); paddle::platform::CPUDeviceContext context(*cpu_place); - GetBlas(context).GEMM(false, false, m, n, k, 1, input1_ptr, 3, - input2_ptr + 1, 4, 1, input3_ptr + 1, 4); + GetBlas(context).GEMM(false, + false, + m, + n, + k, + 1, + input1_ptr, + 3, + input2_ptr + 1, + 4, + 1, + input3_ptr + 1, + 4); EXPECT_EQ(input3_ptr[0], 0); EXPECT_EQ(input3_ptr[1], 24); @@ -83,15 +94,36 @@ void MklSmmCompare(int m, int n, int k) { auto smm = [&, m, n, k, lda, ldb, ldc, alpha, beta]() { const char transa = 'N'; const char transb = 'N'; - paddle::operators::math::CBlas::SMM_GEMM(&transa, &transb, &n, &m, &k, - &alpha, B, &ldb, A, &lda, &beta, - CSMM, &ldc); + paddle::operators::math::CBlas::SMM_GEMM(&transa, + &transb, + &n, + &m, + &k, + &alpha, + B, + &ldb, + A, + &lda, + &beta, + CSMM, + &ldc); }; auto mkl = [&, m, n, k, lda, ldb, ldc, alpha, beta]() { - paddle::operators::math::CBlas::GEMM(CblasRowMajor, CblasNoTrans, - CblasNoTrans, m, n, k, alpha, A, - lda, B, ldb, beta, CMKL, ldc); + paddle::operators::math::CBlas::GEMM(CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + m, + n, + k, + alpha, + A, + lda, + B, + ldb, + beta, + CMKL, + ldc); }; smm(); @@ -131,8 +163,19 @@ TEST(math_function, gemm_trans_cblas) { memcpy(input3_ptr, arr3, 8 * sizeof(float)); paddle::platform::CPUDeviceContext context(*cpu_place); - GetBlas(context).GEMM(false, true, m, n, k, 1, input1_ptr, 3, - input2_ptr + 3, 3, 1, input3_ptr + 1, 4); + GetBlas(context).GEMM(false, + true, + m, + n, + k, + 1, + input1_ptr, + 3, + input2_ptr + 3, + 3, + 1, + input3_ptr + 1, + 4); delete cpu_place; cpu_place = NULL; @@ -151,9 +194,7 @@ TEST(math_function, zero) { auto* cpu_place = new paddle::platform::CPUPlace(); float* t = tensor.mutable_data({2, 2}, *cpu_place); paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::SetConstant - functor; + pten::funcs::SetConstant functor; functor(context, &tensor, 0); EXPECT_EQ(t[0], 0); EXPECT_EQ(t[1], 0); @@ -188,8 +229,14 @@ void GemvTest(int m, int n, bool trans) { } paddle::platform::CPUDeviceContext context(*cpu_place); - GetBlas(context).GEMV(trans, static_cast(m), static_cast(n), 1., - data_a, data_b, 0., data_c); + GetBlas(context).GEMV(trans, + static_cast(m), + static_cast(n), + 1., + data_a, + data_b, + 0., + data_c); if (!trans) { for (int i = 0; i < m; ++i) { @@ -224,9 +271,10 @@ TEST(math_funciton, set_constant) { t.mutable_data(paddle::platform::CPUPlace()); auto* ctx = new paddle::platform::CPUDeviceContext(); ctx->Init(); - paddle::operators::math::set_constant(*ctx, &t, 10); + pten::funcs::set_constant(*ctx, &t, 10); for (int64_t i = 0; i < t.numel(); ++i) { - PADDLE_ENFORCE_EQ(10, t.data()[i], + PADDLE_ENFORCE_EQ(10, + t.data()[i], paddle::platform::errors::InvalidArgument( "Each value of input tensor should be 10, " "but received %d.", @@ -262,16 +310,27 @@ void GemmWarpTest(int m, int n, int k, T alpha, T beta) { // this would call gemm_warp paddle::platform::CPUDeviceContext context(*cpu_place); - GetBlas(context).GEMM(CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, B, - beta, CREF); + GetBlas(context).GEMM( + CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, B, beta, CREF); // lda,ldb,ldc follow RowMajor int lda = k; int ldb = n; int ldc = n; - 
paddle::operators::math::CBlas::GEMM(CblasRowMajor, CblasNoTrans, - CblasNoTrans, m, n, k, alpha, A, lda, - B, ldb, beta, CMKL, ldc); + paddle::operators::math::CBlas::GEMM(CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + m, + n, + k, + alpha, + A, + lda, + B, + ldb, + beta, + CMKL, + ldc); for (int i = 0; i < mat_c_mkl.numel(); ++i) { EXPECT_FLOAT_EQ(CREF[i], CMKL[i]); diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/pten/kernels/funcs/math_function_test.cu similarity index 90% rename from paddle/fluid/operators/math/math_function_test.cu rename to paddle/pten/kernels/funcs/math_function_test.cu index 39c91e96a70..87f11c47a44 100644 --- a/paddle/fluid/operators/math/math_function_test.cu +++ b/paddle/pten/kernels/funcs/math_function_test.cu @@ -13,17 +13,20 @@ // limitations under the License. #include "gtest/gtest.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/pten/kernels/funcs/math_function.h" -void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size, +void fill_fp16_data(paddle::platform::float16* in_ptr, + size_t size, const std::vector& data) { PADDLE_ENFORCE_EQ( - size, data.size(), + size, + data.size(), paddle::platform::errors::InvalidArgument( "The size of argument data should" " be equal to the argument size. Expected %d, but received %d.", - size, data.size())); + size, + data.size())); for (size_t i = 0; i < data.size(); ++i) { in_ptr[i] = paddle::platform::float16(data[i]); } @@ -59,8 +62,8 @@ TEST(math_function, notrans_mul_trans_fp32) { paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu); out_gpu.mutable_data({2, 2}, gpu_place); - GetBlas(context).MatMul(input1_gpu, false, input2_gpu, true, 1, - &out_gpu, 0); + GetBlas(context).MatMul( + input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); paddle::framework::TensorCopySync(out_gpu, cpu_place, &out); @@ -102,8 +105,13 @@ TEST(math_function, notrans_mul_trans_fp16) { out_gpu.mutable_data({2, 2}, gpu_place); GetBlas(context).MatMul( - input1_gpu, false, input2_gpu, true, paddle::platform::float16(1), - &out_gpu, paddle::platform::float16(0)); + input1_gpu, + false, + input2_gpu, + true, + paddle::platform::float16(1), + &out_gpu, + paddle::platform::float16(0)); paddle::framework::TensorCopySync(out_gpu, cpu_place, &out); @@ -139,8 +147,8 @@ TEST(math_function, trans_mul_notrans_fp32) { out_gpu.mutable_data({3, 3}, gpu_place); - GetBlas(context).MatMul(input1_gpu, true, input2_gpu, false, 1, - &out_gpu, 0); + GetBlas(context).MatMul( + input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); paddle::framework::TensorCopySync(out_gpu, cpu_place, &out); @@ -187,8 +195,13 @@ TEST(math_function, trans_mul_notrans_fp16) { out_gpu.mutable_data({3, 3}, gpu_place); GetBlas(context).MatMul( - input1_gpu, true, input2_gpu, false, paddle::platform::float16(1), - &out_gpu, paddle::platform::float16(0)); + input1_gpu, + true, + input2_gpu, + false, + paddle::platform::float16(1), + &out_gpu, + paddle::platform::float16(0)); paddle::framework::TensorCopySync(out_gpu, cpu_place, &out); @@ -241,8 +254,8 @@ TEST(math_function, gemm_notrans_cublas_fp32) { float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(gpu_place); - GetBlas(context).GEMM(false, false, m, n, k, 1, a, 3, b + 1, 4, 1, - c + 1, 4); + GetBlas(context).GEMM( + false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3); @@ -292,8 +305,8 
@@ TEST(math_function, gemm_notrans_cublas_fp16) { fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); paddle::platform::float16* input2_ptr = input2.mutable_data({3, 4}, cpu_place); - fill_fp16_data(input2_ptr, input2.numel(), - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + fill_fp16_data( + input2_ptr, input2.numel(), {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); paddle::platform::float16* input3_ptr = input3.mutable_data({2, 4}, cpu_place); fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); @@ -307,8 +320,19 @@ TEST(math_function, gemm_notrans_cublas_fp16) { input3_gpu.mutable_data(gpu_place); GetBlas(context).GEMM( - false, false, m, n, k, static_cast(1), a, 3, - b + 1, 4, static_cast(1), c + 1, 4); + false, + false, + m, + n, + k, + static_cast(1), + a, + 3, + b + 1, + 4, + static_cast(1), + c + 1, + 4); paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3); @@ -365,8 +389,8 @@ TEST(math_function, gemm_trans_cublas_fp32) { float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(gpu_place); - GetBlas(context).GEMM(false, true, m, n, k, 1, a, 3, b + 3, 3, 1, - c + 1, 4); + GetBlas(context).GEMM( + false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3); @@ -410,8 +434,8 @@ TEST(math_function, gemm_trans_cublas_fp16) { fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); paddle::platform::float16* input2_ptr = input2.mutable_data({4, 3}, cpu_place); - fill_fp16_data(input2_ptr, input2.numel(), - {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}); + fill_fp16_data( + input2_ptr, input2.numel(), {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}); paddle::platform::float16* input3_ptr = input3.mutable_data({2, 4}, cpu_place); fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); @@ -425,8 +449,19 @@ TEST(math_function, gemm_trans_cublas_fp16) { input3_gpu.mutable_data(gpu_place); GetBlas(context).GEMM( - false, true, m, n, k, static_cast(1), a, 3, - b + 3, 3, static_cast(1), c + 1, 4); + false, + true, + m, + n, + k, + static_cast(1), + a, + 3, + b + 3, + 3, + static_cast(1), + c + 1, + 4); paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3); @@ -476,8 +511,14 @@ void GemvTest(int m, int n, bool trans) { paddle::framework::TensorCopySync(mat_a, gpu_place, &g_mat_a); paddle::framework::TensorCopySync(vec_b, gpu_place, &g_vec_b); - GetBlas(context).GEMV(trans, static_cast(m), static_cast(n), 1., - g_data_a, g_data_b, 0., g_data_c); + GetBlas(context).GEMV(trans, + static_cast(m), + static_cast(n), + 1., + g_data_a, + g_data_b, + 0., + g_data_c); paddle::framework::TensorCopySync(g_vec_c, cpu_place, &vec_c); diff --git a/paddle/pten/kernels/gpu/trace_kernel.cu b/paddle/pten/kernels/gpu/trace_kernel.cu index 155bfbd02af..f552386fafd 100644 --- a/paddle/pten/kernels/gpu/trace_kernel.cu +++ b/paddle/pten/kernels/gpu/trace_kernel.cu @@ -36,7 +36,7 @@ void TraceKernel(const Context& ctx, kernels::TensorReduceImpl>( ctx, diag, out, kps::IdentityFunctor(), reduce_dims, stream); } else { - paddle::operators::math::SetConstant functor; + pten::funcs::SetConstant functor; functor(ctx, out, static_cast(0)); } } diff --git a/paddle/pten/kernels/impl/trace_kernel_impl.h b/paddle/pten/kernels/impl/trace_kernel_impl.h index 4dbba9bc69e..1b499681bbb 100644 --- a/paddle/pten/kernels/impl/trace_kernel_impl.h +++ b/paddle/pten/kernels/impl/trace_kernel_impl.h @@ -22,8 +22,9 @@ #include #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/math/math_function.h" 
+#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace pten { template @@ -196,7 +197,7 @@ void TraceGradKernel(const Context& ctx, auto* out_data = out_grad.data(); T* x_data = in_grad->mutable_data(ctx.GetPlace()); - paddle::operators::math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(ctx, in_grad, static_cast(0.0)); auto dim1 = axis1; -- GitLab