From d25a7f9ea7a171caa7b37fd9f624b28d25aa293a Mon Sep 17 00:00:00 2001
From: Feiyu Chan
Date: Fri, 11 Feb 2022 16:48:51 +0800
Subject: [PATCH] [Pten] move operators/math/math_function_* to
 pten/kernels/funcs (#39300)

* move operators/math/math_function_* to pten/kernels/funcs
* rename the namespace from `paddle::operators::math` to `pten::funcs`

---
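Call sites migrate mechanically from the old namespace to the new one. A
minimal before/after sketch (illustrative only; the device context and the
tensor being zeroed are assumed to be set up by the caller):

    // before: fluid helper
    #include "paddle/fluid/operators/math/math_function.h"
    paddle::operators::math::set_constant(dev_ctx, &tensor, 0.0);

    // after: pten helper
    #include "paddle/pten/kernels/funcs/math_function.h"
    pten::funcs::set_constant(dev_ctx, &tensor, 0.0);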
 .../ps/service/communicator/communicator.h | 4 +-
 .../ps/service/ps_service/graph_py_service.h | 3 +-
 .../test/brpc_service_dense_sgd_test.cc | 3 +-
 .../test/brpc_service_sparse_sgd_test.cc | 3 +-
 .../fluid/distributed/test/brpc_utils_test.cc | 9 +-
 .../distributed/test/graph_node_split_test.cc | 3 +-
 .../fluid/distributed/test/graph_node_test.cc | 3 +-
 paddle/fluid/eager/grad_tensor_holder.cc | 2 +-
 .../framework/data_device_transform_test.cu | 2 +-
 .../fluid/framework/data_layout_transform.cc | 4 +-
 paddle/fluid/framework/data_transform.h | 2 +-
 .../multi_devices_graph_pass.cc | 2 +-
 paddle/fluid/imperative/basic_engine.cc | 6 +-
 .../fluid/imperative/gradient_accumulator.cc | 12 +-
 paddle/fluid/imperative/layer.cc | 4 +-
 .../fluid/imperative/partial_grad_engine.cc | 4 +-
 paddle/fluid/imperative/reducer.cc | 2 +-
 paddle/fluid/imperative/reducer.h | 2 +-
 .../tests/test_gradient_accmulator.cc | 2 +-
 paddle/fluid/operators/addmm_op.h | 2 +-
 paddle/fluid/operators/affine_grid_op.cu | 2 +-
 paddle/fluid/operators/affine_grid_op.h | 8 +-
 .../check_finite_and_unscale_op_npu_test.cc | 3 +-
 paddle/fluid/operators/assign_op_npu_test.cc | 3 +-
 .../fluid/operators/average_accumulates_op.h | 4 +-
 paddle/fluid/operators/batch_norm_op.cc | 2 +-
 paddle/fluid/operators/batch_norm_op.cu | 5 +-
 paddle/fluid/operators/batch_norm_op.h | 2 +-
 paddle/fluid/operators/batch_size_like.h | 2 +-
 .../operators/bilinear_tensor_product_op.h | 2 +-
 paddle/fluid/operators/bincount_op.cu | 6 +-
 paddle/fluid/operators/bincount_op.h | 8 +-
 paddle/fluid/operators/bmm_op.h | 2 +-
 paddle/fluid/operators/bpr_loss_op.h | 2 +-
 paddle/fluid/operators/broadcast_tensors_op.h | 2 +-
 paddle/fluid/operators/coalesce_tensor_op.cc | 6 +-
 .../collective/c_allgather_op_npu_test.cc | 3 +-
 .../collective/c_allreduce_max_op_npu_test.cc | 3 +-
 .../collective/c_allreduce_sum_op_npu_test.cc | 3 +-
 .../collective/c_broadcast_op_npu_test.cc | 3 +-
 .../collective/c_reduce_sum_op_npu_test.cc | 3 +-
 .../collective/c_reducescatter_op_npu_test.cc | 3 +-
 .../c_sync_calc_stream_op_npu_test.cc | 3 +-
 .../c_sync_comm_stream_op_npu_test.cc | 3 +-
 .../collective/checknumeric_npu_test.cc | 3 +-
 .../collective/recv_v2_op_npu_test.cc | 3 +-
 .../collective/send_v2_op_npu_test.cc | 3 +-
 .../controlflow/conditional_block_op.cc | 4 +-
 paddle/fluid/operators/conv_cudnn_op.cu | 2 +-
 paddle/fluid/operators/conv_op.h | 6 +-
 paddle/fluid/operators/conv_shift_op.cu | 4 +-
 .../operators/conv_transpose_cudnn_op.cu | 6 +-
 paddle/fluid/operators/conv_transpose_op.h | 8 +-
 paddle/fluid/operators/cos_sim_op.h | 4 +-
 paddle/fluid/operators/crf_decoding_op.h | 4 +-
 paddle/fluid/operators/cross_entropy_op.h | 2 +-
 paddle/fluid/operators/ctc_align_op.cu | 2 +-
 paddle/fluid/operators/ctc_align_op.h | 2 +-
 paddle/fluid/operators/cudnn_lstm_op.cu.cc | 4 +-
 paddle/fluid/operators/cvm_op.cc | 2 +-
 .../operators/deformable_conv_filter.cu.h | 2 +-
 paddle/fluid/operators/deformable_conv_func.h | 2 +-
 paddle/fluid/operators/deformable_conv_op.cu | 4 +-
 paddle/fluid/operators/deformable_conv_op.h | 4 +-
 .../fluid/operators/deformable_conv_v1_op.cu | 4 +-
 .../fluid/operators/deformable_conv_v1_op.h | 4 +-
 .../operators/deformable_psroi_pooling_op.cu | 4 +-
 .../operators/deformable_psroi_pooling_op.h | 6 +-
 .../operators/detection/anchor_generator_op.h | 2 +-
 .../fluid/operators/detection/bbox_util.cu.h | 2 +-
 .../operators/detection/bipartite_match_op.cc | 6 +-
 .../fluid/operators/detection/box_clip_op.cu | 2 +-
 .../fluid/operators/detection/box_clip_op.h | 2 +-
 .../fluid/operators/detection/box_coder_op.h | 2 +-
 .../detection/box_decoder_and_assign_op.h | 2 +-
 .../detection/collect_fpn_proposals_op.cu | 2 +-
 .../detection/collect_fpn_proposals_op.h | 2 +-
 .../detection/distribute_fpn_proposals_op.cu | 4 +-
 .../detection/distribute_fpn_proposals_op.h | 2 +-
 .../detection/generate_mask_labels_op.cc | 6 +-
 .../detection/generate_proposal_labels_op.cc | 12 +-
 .../detection/generate_proposals_op.cc | 6 +-
 .../detection/generate_proposals_op.cu | 6 +-
 .../detection/generate_proposals_v2_op.cc | 6 +-
 .../detection/generate_proposals_v2_op.cu | 6 +-
 .../fluid/operators/detection/prior_box_op.h | 2 +-
 .../detection/roi_perspective_transform_op.cc | 2 +-
 .../detection/roi_perspective_transform_op.cu | 6 +-
 .../detection/rpn_target_assign_op.cc | 2 +-
 .../fluid/operators/detection/yolo_box_op.cu | 4 +-
 .../fluid/operators/detection/yolo_box_op.h | 2 +-
 .../operators/detection/yolov3_loss_op.h | 6 +-
 paddle/fluid/operators/determinant_op.h | 6 +-
 paddle/fluid/operators/dgc_op.h | 2 +-
 paddle/fluid/operators/diag_embed_op.h | 4 +-
 paddle/fluid/operators/diag_op.h | 4 +-
 paddle/fluid/operators/diag_v2_op.cc | 4 +-
 paddle/fluid/operators/diag_v2_op.cu | 2 +-
 paddle/fluid/operators/diag_v2_op.h | 2 +-
 paddle/fluid/operators/dist_op.h | 4 +-
 paddle/fluid/operators/dropout_op_test.cc | 3 +-
 paddle/fluid/operators/edit_distance_op.cu | 4 +-
 paddle/fluid/operators/eig_op.h | 2 +-
 .../elementwise/elementwise_op_function.h | 2 +-
 .../elementwise/elementwise_op_npu_test.cc | 3 +-
 paddle/fluid/operators/expand_op_npu_test.cc | 3 +-
 paddle/fluid/operators/exponential_op.h | 4 +-
 paddle/fluid/operators/eye_op.h | 4 +-
 paddle/fluid/operators/feed_forward_test.cu | 2 +-
 paddle/fluid/operators/fill_any_op.h | 6 +-
 .../fill_constant_batch_size_like_op.h | 6 +-
 .../fill_constant_batch_size_like_op_npu.cc | 2 +-
 paddle/fluid/operators/fill_constant_op.h | 10 +-
 paddle/fluid/operators/fill_zeros_like_op.h | 4 +-
 paddle/fluid/operators/flatten_op.h | 2 +-
 paddle/fluid/operators/fold_op.h | 4 +-
 paddle/fluid/operators/frame_op.h | 2 +-
 paddle/fluid/operators/fsp_op.h | 4 +-
 .../operators/fused/cudnn_bn_add_relu_test.cc | 2 +-
 .../operators/fused/cudnn_norm_conv_test.cc | 2 +-
 .../operators/fused/fused_attention_op.cu | 2 +-
 .../operators/fused/fused_bn_activation_op.cu | 5 +-
 .../fused/fused_bn_add_activation_op.cu | 2 +-
 .../operators/fused/fused_dropout_test.h | 2 +-
 paddle/fluid/operators/gather.cu.h | 4 +-
 paddle/fluid/operators/gather.h | 4 +-
 paddle/fluid/operators/gather_op_npu_test.cc | 3 +-
 paddle/fluid/operators/gelu_op_npu_test.cc | 3 +-
 paddle/fluid/operators/grid_sampler_op.cu | 6 +-
 paddle/fluid/operators/grid_sampler_op.h | 8 +-
 paddle/fluid/operators/group_norm_op.cu | 4 +-
 paddle/fluid/operators/group_norm_op.h | 4 +-
 paddle/fluid/operators/gru_op.cc | 2 +-
 paddle/fluid/operators/gru_op.cu.cc | 2 +-
 paddle/fluid/operators/gru_op.h | 6 +-
 paddle/fluid/operators/gumbel_softmax_op.cu | 2 +-
 paddle/fluid/operators/gumbel_softmax_op.h | 4 +-
 .../fluid/operators/hierarchical_sigmoid_op.h | 8 +-
 paddle/fluid/operators/histogram_op.cu | 2 +-
 paddle/fluid/operators/histogram_op.h | 4 +-
 paddle/fluid/operators/im2sequence_op.h | 2 +-
 .../fluid/operators/increment_op_npu_test.cc | 3 +-
 paddle/fluid/operators/index_sample_op.cu | 4 +-
 paddle/fluid/operators/index_select_op.h | 4 +-
 paddle/fluid/operators/inplace_abn_op.h | 2 +-
 paddle/fluid/operators/instance_norm_op.cc | 8 +-
 paddle/fluid/operators/instance_norm_op.cu | 13 +-
 paddle/fluid/operators/interpolate_op.cu | 6 +-
 paddle/fluid/operators/interpolate_op.h | 8 +-
 paddle/fluid/operators/interpolate_v2_op.cu | 6 +-
 paddle/fluid/operators/interpolate_v2_op.h | 8 +-
 paddle/fluid/operators/layer_norm_op.h | 10 +-
 paddle/fluid/operators/layout_utils.h | 14 +-
 paddle/fluid/operators/linear_chain_crf_op.h | 8 +-
 paddle/fluid/operators/linspace_op.h | 2 +-
 paddle/fluid/operators/lrn_op.cc | 4 +-
 paddle/fluid/operators/lrn_op.h | 2 +-
 paddle/fluid/operators/lstm_op.h | 6 +-
 paddle/fluid/operators/lstmp_op.h | 6 +-
 paddle/fluid/operators/lstsq_op.h | 2 +-
 paddle/fluid/operators/lu_op.h | 4 +-
 paddle/fluid/operators/lu_unpack_op.h | 2 +-
 .../operators/margin_cross_entropy_op.cu | 6 +-
 paddle/fluid/operators/math/CMakeLists.txt | 5 +-
 paddle/fluid/operators/math/blas_impl.cu.h | 2 +-
 paddle/fluid/operators/math/blas_impl.h | 2 +-
 paddle/fluid/operators/math/blas_impl.hip.h | 2 +-
 paddle/fluid/operators/math/depthwise_conv.cu | 10 +-
 paddle/fluid/operators/math/math_function.cc | 306 --------
 paddle/fluid/operators/math/math_function.cu | 322 ---------------
 paddle/fluid/operators/math/math_function.h | 112 ------
 .../fluid/operators/math/matrix_solve.cu.cc | 6 +-
 paddle/fluid/operators/math/prelu.h | 2 +-
 paddle/fluid/operators/math/sample_prob.cu | 2 +-
 .../fluid/operators/math/segment_pooling.cu | 2 +-
 .../operators/math/selected_rows_functor.cc | 6 +-
 .../operators/math/selected_rows_functor.cu | 8 +-
 .../operators/math/selected_rows_functor.h | 2 +-
 .../math/selected_rows_functor_test.cc | 33 +-
 .../math/selected_rows_functor_test.cu.cc | 13 +-
 .../fluid/operators/math/sequence_pooling.cc | 6 +-
 .../fluid/operators/math/sequence_pooling.cu | 2 +-
 paddle/fluid/operators/math/softmax.cu | 2 +-
 paddle/fluid/operators/math/sparse_impl.cu.h | 2 +-
 paddle/fluid/operators/math/tree2col.cc | 4 +-
 paddle/fluid/operators/math/tree2col.cu | 6 +-
 paddle/fluid/operators/math/tree2col.h | 2 +-
 paddle/fluid/operators/matmul_op.cc | 2 +-
 paddle/fluid/operators/matrix_power_op.h | 2 +-
 paddle/fluid/operators/matrix_rank_op.cu | 2 +-
 paddle/fluid/operators/maxout_op.h | 4 +-
 paddle/fluid/operators/mean_iou_op.cu | 2 +-
 .../operators/mlu/activation_op_mlu_test.cc | 2 +-
 paddle/fluid/operators/mul_op.h | 2 +-
 paddle/fluid/operators/norm_utils.cu.h | 4 +-
 paddle/fluid/operators/one_hot_op.cu | 2 +-
 paddle/fluid/operators/one_hot_op.h | 4 +-
 paddle/fluid/operators/one_hot_v2_op.cu | 2 +-
 paddle/fluid/operators/one_hot_v2_op.h | 4 +-
 .../fluid/operators/optimizers/adagrad_op.cc | 2 +-
 .../fluid/operators/optimizers/adagrad_op.cu | 2 +-
 paddle/fluid/operators/overlap_add_op.h | 2 +-
 paddle/fluid/operators/p_norm_op.cu | 2 +-
 paddle/fluid/operators/p_norm_op.h | 4 +-
 paddle/fluid/operators/pad2d_op.cc | 4 +-
 paddle/fluid/operators/pad2d_op.cu | 4 +-
 paddle/fluid/operators/pad3d_op.cc | 4 +-
 paddle/fluid/operators/pad3d_op.cu | 4 +-
 paddle/fluid/operators/pixel_shuffle_op.h | 6 +-
 paddle/fluid/operators/poisson_op.h | 4 +-
 paddle/fluid/operators/pool_cudnn_op.cu.cc | 33 +-
 paddle/fluid/operators/pool_op.h | 4 +-
 paddle/fluid/operators/pool_with_index_op.h | 4 +-
 paddle/fluid/operators/prroi_pool_op.cu | 2 +-
 paddle/fluid/operators/prroi_pool_op.h | 4 +-
 .../pscore/distributed_lookup_table_op.cc | 2 +-
 .../pscore/distributed_lookup_table_op.h | 2 +-
 .../pscore/distributed_push_sparse_op.cc | 2 +-
 .../pscore/distributed_push_sparse_op.h | 2 +-
 paddle/fluid/operators/pscore/fake_init_op.cc | 2 +-
 paddle/fluid/operators/psroi_pool_op.cu | 2 +-
 paddle/fluid/operators/psroi_pool_op.h | 4 +-
 paddle/fluid/operators/put_along_axis_op.cu | 2 +-
 paddle/fluid/operators/put_along_axis_op.h | 2 +-
 paddle/fluid/operators/qr_op.h | 2 +-
 paddle/fluid/operators/range_op.h | 2 +-
 paddle/fluid/operators/range_op_npu_test.cc | 3 +-
 paddle/fluid/operators/rank_attention.cu.h | 2 +-
 .../reduce_ops/reduce_any_op_npu_test.cc | 3 +-
 paddle/fluid/operators/reduce_ops/reduce_op.h | 6 +-
 paddle/fluid/operators/repeat_interleave_op.h | 2 +-
 paddle/fluid/operators/rnn_op.cu.cc | 4 +-
 paddle/fluid/operators/rnn_op.h | 20 +-
 paddle/fluid/operators/roi_align_op.cu | 2 +-
 paddle/fluid/operators/roi_align_op.h | 14 +-
 paddle/fluid/operators/roi_align_op_npu.cc | 2 +-
 paddle/fluid/operators/roi_pool_op.cu | 2 +-
 paddle/fluid/operators/roi_pool_op.h | 4 +-
 paddle/fluid/operators/row_conv_op.cu | 4 +-
 paddle/fluid/operators/sample_logits_op.cu | 6 +-
 paddle/fluid/operators/sample_logits_op.h | 4 +-
 paddle/fluid/operators/scatter.cu.h | 2 +-
 paddle/fluid/operators/search_compute.h | 2 +-
 paddle/fluid/operators/seed_op.cu | 4 +-
 paddle/fluid/operators/segment_pool_op.h | 8 +-
 .../operators/sequence_ops/sequence_conv_op.h | 6 +-
 .../sequence_ops/sequence_expand_op.h | 6 +-
 .../operators/sequence_ops/sequence_pad_op.h | 2 +-
 .../operators/sequence_ops/sequence_pool_op.h | 2 +-
 .../sequence_ops/sequence_reshape_op.h | 2 +-
 .../sequence_ops/sequence_slice_op.h | 4 +-
 .../sequence_softmax_cudnn_op.cu.cc | 2 +-
 .../sequence_topk_avg_pooling_op.h | 4 +-
 .../sequence_ops/sequence_unpad_op.h | 4 +-
 paddle/fluid/operators/set_value_op.h | 2 +-
 .../fluid/operators/shrink_rnn_memory_op.cc | 6 +-
 paddle/fluid/operators/shuffle_channel_op.h | 2 +-
 paddle/fluid/operators/slice_op.h | 4 +-
 paddle/fluid/operators/softmax_op_npu_test.cc | 3 +-
 .../softmax_with_cross_entropy_op.cu | 6 +-
 paddle/fluid/operators/solve_op.h | 4 +-
 paddle/fluid/operators/spectral_norm_op.h | 10 +-
 paddle/fluid/operators/spp_op.h | 4 +-
 paddle/fluid/operators/squeeze_op.h | 2 +-
 paddle/fluid/operators/squeeze_op_npu_test.cc | 3 +-
 paddle/fluid/operators/strided_slice_op.h | 6 +-
 paddle/fluid/operators/sum_op.cu | 2 +-
 paddle/fluid/operators/sum_op.h | 4 +-
 paddle/fluid/operators/svd_helper.h | 14 +-
 paddle/fluid/operators/take_along_axis_op.cu | 2 +-
 paddle/fluid/operators/take_along_axis_op.h | 4 +-
 .../teacher_student_sigmoid_loss_op.cc | 2 +-
 paddle/fluid/operators/temporal_shift_op.h | 2 +-
 paddle/fluid/operators/transpose_op.h | 16 +-
 .../fluid/operators/transpose_op_npu_test.cc | 3 +-
 paddle/fluid/operators/tree_conv_op.h | 4 +-
 paddle/fluid/operators/unfold_op.h | 4 +-
 .../fluid/operators/unique_consecutive_op.h | 2 +-
 paddle/fluid/operators/unique_op.h | 2 +-
 .../fluid/operators/unique_with_counts_op.h | 2 +-
 paddle/fluid/operators/unpool_op.h | 10 +-
 paddle/fluid/operators/unsqueeze_op.h | 2 +-
 .../fluid/operators/unsqueeze_op_npu_test.cc | 3 +-
 paddle/fluid/operators/var_conv_2d_op.cc | 2 +-
 paddle/fluid/operators/viterbi_decode_op.h | 4 +-
 paddle/fluid/operators/warpctc_op.h | 6 +-
 paddle/fluid/operators/where_index_op.h | 2 +-
 paddle/fluid/operators/where_op.h | 2 +-
 paddle/pten/kernels/cpu/norm_grad_kernel.cc | 2 +-
 paddle/pten/kernels/cpu/norm_kernel.cc | 2 +-
 paddle/pten/kernels/funcs/CMakeLists.txt | 48 +++
 paddle/pten/kernels/funcs/elementwise_base.h | 4 +-
 paddle/pten/kernels/funcs/math_function.cc | 342 ++++++++++++++++
 paddle/pten/kernels/funcs/math_function.cu | 380 ++++++++++++++++++
 paddle/pten/kernels/funcs/math_function.h | 127 ++++++
 .../kernels/funcs}/math_function_impl.h | 179 +++++----
 .../kernels/funcs}/math_function_test.cc | 105 +++--
 .../kernels/funcs}/math_function_test.cu | 93 +++--
 paddle/pten/kernels/gpu/trace_kernel.cu | 2 +-
 paddle/pten/kernels/impl/trace_kernel_impl.h | 5 +-
 300 files changed, 1724 insertions(+), 1483 deletions(-)
 delete mode 100644 paddle/fluid/operators/math/math_function.cc
 delete mode 100644 paddle/fluid/operators/math/math_function.cu
 delete mode 100644 paddle/fluid/operators/math/math_function.h
 mode change 100755 => 100644 paddle/fluid/operators/squeeze_op.h
 create mode 100644 paddle/pten/kernels/funcs/math_function.cc
 create mode 100644 paddle/pten/kernels/funcs/math_function.cu
 create mode 100644 paddle/pten/kernels/funcs/math_function.h
 rename paddle/{fluid/operators/math => pten/kernels/funcs}/math_function_impl.h (54%)
 rename paddle/{fluid/operators/math => pten/kernels/funcs}/math_function_test.cc (69%)
 rename paddle/{fluid/operators/math => pten/kernels/funcs}/math_function_test.cu (90%)

diff --git a/paddle/fluid/distributed/ps/service/communicator/communicator.h b/paddle/fluid/distributed/ps/service/communicator/communicator.h
index da4e2f1a12..9f8c998d3a 100644
--- a/paddle/fluid/distributed/ps/service/communicator/communicator.h
+++ b/paddle/fluid/distributed/ps/service/communicator/communicator.h
@@ -35,12 +35,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/framework/variable_helper.h"
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/split.h"
+#include "paddle/pten/kernels/funcs/math_function.h"

 #include "paddle/fluid/distributed/ps/service/ps_client.h"

@@ -180,7 +180,7 @@ inline void MergeVars(const std::string &var_name,
       // set output tensor to 0.
       paddle::platform::CPUDeviceContext cpu_ctx;
-      paddle::operators::math::SetConstant
+      pten::funcs::SetConstant
           constant_functor;
       constant_functor(cpu_ctx, out_t, static_cast(0));
       // sum all vars to out
diff --git a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h
index 71b44f36d0..5bbcdca88a 100644
--- a/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h
+++ b/paddle/fluid/distributed/ps/service/ps_service/graph_py_service.h
@@ -38,9 +38,10 @@
 #include "paddle/fluid/distributed/ps/service/ps_service/service.h"
 #include "paddle/fluid/distributed/ps/service/sendrecv.pb.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
+
 namespace paddle {
 namespace distributed {
 class GraphPyService {
diff --git a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
index d7d9d1ed1b..dd79d67be7 100644
--- a/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
+++ b/paddle/fluid/distributed/test/brpc_service_dense_sgd_test.cc
@@ -21,8 +21,8 @@ limitations under the License. */
 #include "paddle/fluid/distributed/ps/service/brpc_ps_server.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/pten/kernels/funcs/math_function.h"

 namespace paddle {
 namespace distributed {
@@ -42,7 +42,6 @@ class DenseTensor;
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace operators = paddle::operators;
-namespace math = paddle::operators::math;
 namespace memory = paddle::memory;
 namespace distributed = paddle::distributed;

diff --git a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
index 4f7b608c8b..0dfaafb258 100644
--- a/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
+++ b/paddle/fluid/distributed/test/brpc_service_sparse_sgd_test.cc
@@ -22,8 +22,8 @@ limitations under the License. */
 #include "paddle/fluid/distributed/ps/service/brpc_ps_server.h"
 #include "paddle/fluid/distributed/ps/service/env.h"
 #include "paddle/fluid/framework/program_desc.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/pten/kernels/funcs/math_function.h"

 namespace paddle {
 namespace distributed {
@@ -43,7 +43,6 @@ class DenseTensor;
 namespace framework = paddle::framework;
 namespace platform = paddle::platform;
 namespace operators = paddle::operators;
-namespace math = paddle::operators::math;
 namespace memory = paddle::memory;
 namespace distributed = paddle::distributed;

diff --git a/paddle/fluid/distributed/test/brpc_utils_test.cc b/paddle/fluid/distributed/test/brpc_utils_test.cc
index 608f647d14..7f18c86ac7 100644
--- a/paddle/fluid/distributed/test/brpc_utils_test.cc
+++ b/paddle/fluid/distributed/test/brpc_utils_test.cc
*/ #include "gtest/gtest.h" #include "paddle/fluid/distributed/ps/service/brpc_utils.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace framework { @@ -28,7 +28,6 @@ class Variable; namespace framework = paddle::framework; namespace platform = paddle::platform; namespace operators = paddle::operators; -namespace math = paddle::operators::math; namespace memory = paddle::memory; namespace distributed = paddle::distributed; @@ -42,7 +41,7 @@ void CreateVarsOnScope(framework::Scope* scope, platform::Place* place, lod1.push_back(framework::Vector({1, 3, 8})); tensor1->set_lod(lod1); tensor1->mutable_data(*place); - math::set_constant(ctx, tensor1, 31.9); + pten::funcs::set_constant(ctx, tensor1, 31.9); // var 2 framework::Variable* var2 = scope->Var("x2"); @@ -52,7 +51,7 @@ void CreateVarsOnScope(framework::Scope* scope, platform::Place* place, lod2.push_back(framework::Vector({1, 1})); tensor2->set_lod(lod2); tensor2->mutable_data(*place); - math::set_constant(ctx, tensor2, 100); + pten::funcs::set_constant(ctx, tensor2, 100); // var 3 framework::Variable* var3 = scope->Var("x3"); @@ -62,7 +61,7 @@ void CreateVarsOnScope(framework::Scope* scope, platform::Place* place, auto* rows = slr->mutable_rows(); tensor3->Resize(framework::make_ddim({564, 128})); tensor3->mutable_data(*place); - math::set_constant(ctx, tensor3, 32.7); + pten::funcs::set_constant(ctx, tensor3, 32.7); for (int i = 0; i < 564; ++i) rows->push_back(i); } diff --git a/paddle/fluid/distributed/test/graph_node_split_test.cc b/paddle/fluid/distributed/test/graph_node_split_test.cc index e808d2a815..6bbcb1d399 100644 --- a/paddle/fluid/distributed/test/graph_node_split_test.cc +++ b/paddle/fluid/distributed/test/graph_node_split_test.cc @@ -36,14 +36,13 @@ limitations under the License. */ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace framework = paddle::framework; namespace platform = paddle::platform; namespace operators = paddle::operators; -namespace math = paddle::operators::math; namespace memory = paddle::memory; namespace distributed = paddle::distributed; diff --git a/paddle/fluid/distributed/test/graph_node_test.cc b/paddle/fluid/distributed/test/graph_node_test.cc index 3243ebc389..4aa2839c18 100644 --- a/paddle/fluid/distributed/test/graph_node_test.cc +++ b/paddle/fluid/distributed/test/graph_node_test.cc @@ -36,14 +36,13 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/place.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace framework = paddle::framework; namespace platform = paddle::platform; namespace operators = paddle::operators; -namespace math = paddle::operators::math; namespace memory = paddle::memory; namespace distributed = paddle::distributed; diff --git a/paddle/fluid/eager/grad_tensor_holder.cc b/paddle/fluid/eager/grad_tensor_holder.cc index 90ae91db5f..8bfeaf47b2 100644 --- a/paddle/fluid/eager/grad_tensor_holder.cc +++ b/paddle/fluid/eager/grad_tensor_holder.cc @@ -16,7 +16,7 @@ #include "paddle/fluid/imperative/gradient_accumulator.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace egr { diff --git a/paddle/fluid/framework/data_device_transform_test.cu b/paddle/fluid/framework/data_device_transform_test.cu index b364cf9b31..316f8c4d90 100644 --- a/paddle/fluid/framework/data_device_transform_test.cu +++ b/paddle/fluid/framework/data_device_transform_test.cu @@ -19,9 +19,9 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/init.h" +#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/fluid/framework/pten_utils.h" diff --git a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index 688835cc3c..a014d34bcf 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -14,7 +14,7 @@ #include "paddle/fluid/framework/data_layout_transform.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_reuse.h" #endif @@ -42,7 +42,7 @@ void CastDataLayout::apply() { auto place = ctx_->GetPlace(); if (platform::is_cpu_place(place)) { - operators::math::Transpose trans4; + pten::funcs::Transpose trans4; auto* context = static_cast(ctx_); trans4(*context, in_, out_, axis_); } else { diff --git a/paddle/fluid/framework/data_transform.h b/paddle/fluid/framework/data_transform.h index 385a5ff704..5c5d49f8fe 100644 --- a/paddle/fluid/framework/data_transform.h +++ b/paddle/fluid/framework/data_transform.h @@ -22,10 +22,10 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/variable.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/macros.h" #include "paddle/fluid/platform/transform.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace framework { diff --git a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc index 5dbc3e38ea..cab7d5ddb8 100644 --- a/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc @@ -33,7 +33,7 @@ #include "paddle/fluid/framework/ir/node.h" #include "paddle/fluid/framework/op_info.h" #include "paddle/fluid/framework/scope.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_DGC) #include "paddle/fluid/framework/details/sparse_all_reduce_op_handle.h" diff --git a/paddle/fluid/imperative/basic_engine.cc b/paddle/fluid/imperative/basic_engine.cc index 9d37792653..4c91ece049 100644 --- a/paddle/fluid/imperative/basic_engine.cc +++ b/paddle/fluid/imperative/basic_engine.cc @@ -28,8 +28,8 @@ #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/imperative/op_base.h" #include "paddle/fluid/imperative/tracer.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/pten/kernels/funcs/math_function.h" DECLARE_bool(sort_sum_gradient); @@ -103,7 +103,7 @@ void BasicEngine::Init( if (grad_tensor == nullptr) { grad_var->Resize(fwd_var.dims()); grad_var->mutable_data(fwd_var.place(), fwd_var.type()); - operators::math::set_constant(*dev_ctx, grad_var, 1.0); + pten::funcs::set_constant(*dev_ctx, grad_var, 1.0); } else { paddle::framework::TensorCopy( grad_tensor->Var().Get(), fwd_var.place(), @@ -156,7 +156,7 @@ void BasicEngine::CheckBackwardInputs(const OpBase& op) { VLOG(6) << "Set ungenerated Grad: " << var->Name() << " as zero with dtype " << framework::DataTypeToString(var->ForwardDataType()); - operators::math::set_constant(*dev_ctx, tensor, 0.0); + pten::funcs::set_constant(*dev_ctx, tensor, 0.0); } } } diff --git a/paddle/fluid/imperative/gradient_accumulator.cc b/paddle/fluid/imperative/gradient_accumulator.cc index 75d4d8246e..5eed7eca7a 100644 --- a/paddle/fluid/imperative/gradient_accumulator.cc +++ b/paddle/fluid/imperative/gradient_accumulator.cc @@ -22,12 +22,12 @@ #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/complex.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/profiler.h" +#include "paddle/pten/kernels/funcs/math_function.h" #ifdef PADDLE_WITH_XPU #include "xpu/refactor/math.h" #endif @@ -210,7 +210,7 @@ void TensorAddImpl(const framework::Tensor& src, framework::Tensor* dst, platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); paddle::platform::DeviceContext* ctx = pool.Get(place); auto dev_ctx = dynamic_cast(ctx); - operators::math::ElementwiseAddTo func; 
+  pten::funcs::ElementwiseAddTo func;
   func(dev_ctx, src, dst);
 }

@@ -703,12 +703,12 @@ void EagerGradientAccumulator::SumGrad(std::shared_ptr var,
                 << var->Var().Get().dims();
         tensor->Resize(var->Var().Get().dims());
         tensor->mutable_data(place, var->DataType());
-        operators::math::set_constant(*dev_ctx, tensor, 0.0);
+        pten::funcs::set_constant(*dev_ctx, tensor, 0.0);
       } else {
         auto* tensor = dst_var->MutableVar()->GetMutable();
         tensor->mutable_data(place, var->DataType());
-        operators::math::set_constant(*dev_ctx, tensor, 0.0);
+        pten::funcs::set_constant(*dev_ctx, tensor, 0.0);
       }
     }
   }
@@ -835,12 +835,12 @@ void SortedGradientAccumulator::SumGrad(std::shared_ptr var,
                 << var->Var().Get().dims();
         tensor->Resize(var->Var().Get().dims());
         tensor->mutable_data(place, var->DataType());
-        operators::math::set_constant(*dev_ctx, tensor, 0.0);
+        pten::funcs::set_constant(*dev_ctx, tensor, 0.0);
       } else {
         auto* tensor = dst_var->MutableVar()->GetMutable();
         tensor->mutable_data(place, var->DataType());
-        operators::math::set_constant(*dev_ctx, tensor, 0.0);
+        pten::funcs::set_constant(*dev_ctx, tensor, 0.0);
       }
     }
   // looks like tmp_grad_vars will not have any member but just in case
diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc
index 5b8974b334..60e1291a08 100644
--- a/paddle/fluid/imperative/layer.cc
+++ b/paddle/fluid/imperative/layer.cc
@@ -20,10 +20,10 @@
 #include "paddle/fluid/imperative/op_base.h"
 #include "paddle/fluid/imperative/prepared_operator.h"
 #include "paddle/fluid/imperative/var_helper.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
@@ -229,7 +229,7 @@ void VarBase::ClearGradient(bool set_to_zero) {
       if (set_to_zero) {
         auto* dev_ctx =
             platform::DeviceContextPool::Instance().Get(grad_t->place());
-        operators::math::set_constant(*dev_ctx, grad_t, 0.0);
+        pten::funcs::set_constant(*dev_ctx, grad_t, 0.0);
       } else {
         grad_t->clear();
       }
diff --git a/paddle/fluid/imperative/partial_grad_engine.cc b/paddle/fluid/imperative/partial_grad_engine.cc
index 45756083c9..ed60a4dc08 100644
--- a/paddle/fluid/imperative/partial_grad_engine.cc
+++ b/paddle/fluid/imperative/partial_grad_engine.cc
@@ -28,10 +28,10 @@
 #include "paddle/fluid/imperative/layer.h"
 #include "paddle/fluid/imperative/op_base.h"
 #include "paddle/fluid/imperative/tracer.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/string_helper.h"
+#include "paddle/pten/kernels/funcs/math_function.h"

 DECLARE_bool(sort_sum_gradient);

@@ -316,7 +316,7 @@ static void FillConstantLike(const VariableWrapper &ref_var,
   } else {
     dst_tensor->mutable_data(place, ref_var.DataType());
   }
-  operators::math::set_constant(*dev_ctx, dst_tensor, value);
+  pten::funcs::set_constant(*dev_ctx, dst_tensor, value);
 }

 /**
diff --git a/paddle/fluid/imperative/reducer.cc b/paddle/fluid/imperative/reducer.cc
index 54e27b2bd8..361b9eb0fe 100644
--- a/paddle/fluid/imperative/reducer.cc
+++ b/paddle/fluid/imperative/reducer.cc
@@ -755,7 +755,7 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) {
           {static_cast(length)});
     } else {
       group_tensor.Resize({static_cast(length)});
-      operators::math::set_constant(*dev_ctx, &group_tensor, 0.0);
+      pten::funcs::set_constant(*dev_ctx, &group_tensor, 0.0);
     }
 #endif
   }

diff --git a/paddle/fluid/imperative/reducer.h b/paddle/fluid/imperative/reducer.h
index b99d7adc0c..b0317fe33e 100644
--- a/paddle/fluid/imperative/reducer.h
+++ b/paddle/fluid/imperative/reducer.h
@@ -29,8 +29,8 @@
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/framework/variable.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/pten/kernels/funcs/math_function.h"

 namespace paddle {
 namespace imperative {

diff --git a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
index 6210cb108b..e91b0b0a77 100644
--- a/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
+++ b/paddle/fluid/imperative/tests/test_gradient_accmulator.cc
@@ -20,7 +20,7 @@
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/imperative/gradient_accumulator.h"
 #include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"

 namespace imperative = paddle::imperative;
 namespace platform = paddle::platform;

diff --git a/paddle/fluid/operators/addmm_op.h b/paddle/fluid/operators/addmm_op.h
index ecfd10d2fa..8fe73d81b0 100644
--- a/paddle/fluid/operators/addmm_op.h
+++ b/paddle/fluid/operators/addmm_op.h
@@ -20,7 +20,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"

 namespace ops = paddle::operators;
 namespace plat = paddle::platform;

diff --git a/paddle/fluid/operators/affine_grid_op.cu b/paddle/fluid/operators/affine_grid_op.cu
index bcf7deefc9..d203dcb7b9 100644
--- a/paddle/fluid/operators/affine_grid_op.cu
+++ b/paddle/fluid/operators/affine_grid_op.cu
@@ -170,7 +170,7 @@ class AffineGridGradOpCUDAKernel : public framework::OpKernel {
       w = size_attr[3];
     }
     T* theta_grad_data = theta_grad->mutable_data({n, 2, 3}, ctx.GetPlace());
-    math::SetConstant()(
+    pten::funcs::SetConstant()(
         ctx.cuda_device_context(), theta_grad, static_cast(0));

     T h_step;
diff --git a/paddle/fluid/operators/affine_grid_op.h b/paddle/fluid/operators/affine_grid_op.h
index 50c9ebcd9c..129c7a61a7 100644
--- a/paddle/fluid/operators/affine_grid_op.h
+++ b/paddle/fluid/operators/affine_grid_op.h
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -61,7 +61,7 @@ inline void GetIdxMap(int n, int h, int w, bool align_corners, Tensor* grid, Tensor ones; ones.mutable_data({h, w, 1}, ctx.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), &ones, static_cast(1)); auto ones_t = EigenTensor::From(ones); // Get grid tensor with shape [n, h, w, 3] by concatenating h_idx, w_idx and @@ -115,7 +115,7 @@ class AffineGridOpKernel : public framework::OpKernel { } auto* output = ctx.Output("Output"); output->mutable_data({n, h, w, 2}, ctx.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), output, static_cast(0)); Tensor grid; @@ -158,7 +158,7 @@ class AffineGridGradOpKernel : public framework::OpKernel { w = size_attr[3]; } theta_grad->mutable_data({n, 2, 3}, ctx.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), theta_grad, static_cast(0)); Tensor grid; diff --git a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc index a80b83f0cb..6390a1f473 100644 --- a/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc +++ b/paddle/fluid/operators/amp/check_finite_and_unscale_op_npu_test.cc @@ -24,12 +24,11 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; using Tensor = paddle::framework::Tensor; diff --git a/paddle/fluid/operators/assign_op_npu_test.cc b/paddle/fluid/operators/assign_op_npu_test.cc index 049cfb8046..4761ec6155 100644 --- a/paddle/fluid/operators/assign_op_npu_test.cc +++ b/paddle/fluid/operators/assign_op_npu_test.cc @@ -24,12 +24,11 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(assign); USE_OP_DEVICE_KERNEL(assign, NPU); diff --git a/paddle/fluid/operators/average_accumulates_op.h b/paddle/fluid/operators/average_accumulates_op.h index 6813f56675..3cd235d89a 100644 --- a/paddle/fluid/operators/average_accumulates_op.h +++ b/paddle/fluid/operators/average_accumulates_op.h @@ -16,7 +16,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -78,7 +78,7 @@ class AverageAccumulatesKernel : public framework::OpKernel { // Compute auto& place = *ctx.template device_context().eigen_device(); - math::SetConstant constant_functor; + pten::funcs::SetConstant constant_functor; ++num_updates; ++num_accumulates; out_sum_1_tensor.device(place) = in_sum_1_tensor + param_tensor; diff --git a/paddle/fluid/operators/batch_norm_op.cc b/paddle/fluid/operators/batch_norm_op.cc index 0a8e753c01..8e960ff89b 100644 --- a/paddle/fluid/operators/batch_norm_op.cc +++ b/paddle/fluid/operators/batch_norm_op.cc @@ -989,7 +989,7 @@ class BatchNormDoubleGradKernel (data_layout == DataLayout::kNCHW ? x_dims[1] : x_dims[x_dims.size() - 1]); const int sample_size = X->numel() / C; - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; const T *mean_data = Saved_mean->data(); const T *inv_var_data = Saved_variance->data(); diff --git a/paddle/fluid/operators/batch_norm_op.cu b/paddle/fluid/operators/batch_norm_op.cu index 5f32d697ba..85bd8451b8 100644 --- a/paddle/fluid/operators/batch_norm_op.cu +++ b/paddle/fluid/operators/batch_norm_op.cu @@ -25,9 +25,9 @@ namespace cub = hipcub; #endif #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/batch_norm_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/norm_utils.cu.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/kernels/funcs/math_function.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); @@ -967,7 +967,8 @@ class BatchNormGradKernel if (d_x) { framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); } - math::SetConstant> + pten::funcs::SetConstant> functor; functor(dev_ctx, d_scale, static_cast>(0)); functor(dev_ctx, d_bias, static_cast>(0)); diff --git a/paddle/fluid/operators/batch_norm_op.h b/paddle/fluid/operators/batch_norm_op.h index 32e956e152..55f1964cf5 100644 --- a/paddle/fluid/operators/batch_norm_op.h +++ b/paddle/fluid/operators/batch_norm_op.h @@ -20,8 +20,8 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/layout_utils.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/norm_utils.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/batch_size_like.h b/paddle/fluid/operators/batch_size_like.h index f24a3c316a..1ee0e7002a 100644 --- a/paddle/fluid/operators/batch_size_like.h +++ b/paddle/fluid/operators/batch_size_like.h @@ -16,7 +16,7 @@ limitations under the License. 
@@ -16,7 +16,7 @@ limitations under the License. */
 #include
 #include
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"

 namespace paddle {
 namespace operators {

diff --git a/paddle/fluid/operators/bilinear_tensor_product_op.h b/paddle/fluid/operators/bilinear_tensor_product_op.h
index 8f6c9b60dc..c7eb70c290 100644
--- a/paddle/fluid/operators/bilinear_tensor_product_op.h
+++ b/paddle/fluid/operators/bilinear_tensor_product_op.h
@@ -111,7 +111,7 @@ class BilinearTensorProductGradKernel : public framework::OpKernel {
                        ctx.GetPlace());
     auto y_scale_mat = EigenMatrix::From(y_scale);

-    math::SetConstant set_zero;
+    pten::funcs::SetConstant set_zero;

     if (d_x) {
       d_x->mutable_data(ctx.GetPlace());
diff --git a/paddle/fluid/operators/bincount_op.cu b/paddle/fluid/operators/bincount_op.cu
index 5964b9e345..dd7804625a 100644
--- a/paddle/fluid/operators/bincount_op.cu
+++ b/paddle/fluid/operators/bincount_op.cu
@@ -105,7 +105,7 @@ void BincountCUDAInner(const framework::ExecutionContext& context) {
   if (!has_weights) {
     int64_t* output_data = output->mutable_data(context.GetPlace());
-    math::SetConstant()(
+    pten::funcs::SetConstant()(
         context.template device_context(), output, 0L);

     KernelBincount<<mutable_data(context.GetPlace());
-      math::SetConstant()(
+      pten::funcs::SetConstant()(
          context.template device_context(), output,
          static_cast(0));
@@ -125,7 +125,7 @@ void BincountCUDAInner(const framework::ExecutionContext& context) {
           input_data, input_numel, has_weights, weights_data, output_data);
     } else {
       double* output_data = output->mutable_data(context.GetPlace());
-      math::SetConstant()(
+      pten::funcs::SetConstant()(
          context.template device_context(), output,
          static_cast(0));
diff --git a/paddle/fluid/operators/bincount_op.h b/paddle/fluid/operators/bincount_op.h
index a142332bce..3f4334099e 100644
--- a/paddle/fluid/operators/bincount_op.h
+++ b/paddle/fluid/operators/bincount_op.h
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -64,7 +64,7 @@ void BincountInner(const framework::ExecutionContext& context) { const auto& weights_type = weights->type(); if (weights_type == framework::proto::VarType::FP32) { float* output_data = output->mutable_data(context.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( context.template device_context(), output, static_cast(0)); for (int64_t i = 0; i < input_numel; i++) { @@ -72,7 +72,7 @@ void BincountInner(const framework::ExecutionContext& context) { } } else { double* output_data = output->mutable_data(context.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( context.template device_context(), output, static_cast(0)); for (int64_t i = 0; i < input_numel; i++) { @@ -82,7 +82,7 @@ void BincountInner(const framework::ExecutionContext& context) { } else { int64_t* output_data = output->mutable_data(context.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( context.template device_context(), output, 0L); for (int64_t i = 0; i < input_numel; i++) { output_data[input_data[i]] += 1L; diff --git a/paddle/fluid/operators/bmm_op.h b/paddle/fluid/operators/bmm_op.h index 15cd6de913..7a0ddd4582 100644 --- a/paddle/fluid/operators/bmm_op.h +++ b/paddle/fluid/operators/bmm_op.h @@ -21,7 +21,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h index bebaf6e336..559d3e14ed 100644 --- a/paddle/fluid/operators/bpr_loss_op.h +++ b/paddle/fluid/operators/bpr_loss_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/broadcast_tensors_op.h b/paddle/fluid/operators/broadcast_tensors_op.h index 0eeb9234df..4161b5879f 100644 --- a/paddle/fluid/operators/broadcast_tensors_op.h +++ b/paddle/fluid/operators/broadcast_tensors_op.h @@ -19,7 +19,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" #define SWITCH_OUT_RANK_CASE(n) \ case n: { \ diff --git a/paddle/fluid/operators/coalesce_tensor_op.cc b/paddle/fluid/operators/coalesce_tensor_op.cc index 5655fd25ec..d71d6fc39b 100644 --- a/paddle/fluid/operators/coalesce_tensor_op.cc +++ b/paddle/fluid/operators/coalesce_tensor_op.cc @@ -18,8 +18,8 @@ #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/var_type.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_memory_aligment.h" +#include "paddle/pten/kernels/funcs/math_function.h" #ifdef PADDLE_WITH_ASCEND_CL #include "paddle/fluid/platform/device/npu/npu_op_runner.h" #endif @@ -65,11 +65,11 @@ struct FillConstantVisitor { .stream(); runner.Run(stream); } else { - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; set_constant(dev_ctx_, tensor_, static_cast(value_)); } #else - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; set_constant(dev_ctx_, tensor_, static_cast(value_)); #endif } diff --git a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc index ecf682aa52..a51e81a427 100644 --- a/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allgather_op_npu_test.cc @@ -27,8 +27,8 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/fluid/operators/collective/c_allgather_op.h" #include "paddle/fluid/operators/collective/c_allreduce_op.h" @@ -43,7 +43,6 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(c_allgather); USE_NO_KERNEL_OP(c_gen_hccl_id); diff --git a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc index fa134b60e2..f273e31f6b 100644 --- a/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc @@ -27,8 +27,8 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/fluid/operators/collective/c_allgather_op.h" #include "paddle/fluid/operators/collective/c_allreduce_op.h" @@ -43,7 +43,6 @@ limitations under the License. 
@@ -43,7 +43,6 @@ limitations under the License. */

 namespace f = paddle::framework;
 namespace p = paddle::platform;
-namespace m = paddle::operators::math;

 USE_OP(c_allreduce_max);
 USE_NO_KERNEL_OP(c_gen_hccl_id);

diff --git a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
index 3e91220423..66efcd2a49 100644
--- a/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
@@ -27,8 +27,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"

 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
@@ -45,7 +45,6 @@ limitations under the License. */

 namespace f = paddle::framework;
 namespace p = paddle::platform;
-namespace m = paddle::operators::math;

 USE_OP(c_allreduce_sum);
 USE_NO_KERNEL_OP(c_gen_hccl_id);

diff --git a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
index 1ea34c8200..acfdd42a41 100644
--- a/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
@@ -27,8 +27,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"

 #include "paddle/fluid/operators/collective/c_broadcast_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
@@ -40,7 +40,6 @@ limitations under the License. */

 namespace f = paddle::framework;
 namespace p = paddle::platform;
-namespace m = paddle::operators::math;

 USE_OP(c_broadcast);
 USE_NO_KERNEL_OP(c_gen_hccl_id);

diff --git a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
index d589d0a25e..ee0463f84b 100644
--- a/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
@@ -27,8 +27,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"

 #include "paddle/fluid/operators/collective/c_reduce_op.h"
 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
@@ -40,7 +40,6 @@ limitations under the License. */

 namespace f = paddle::framework;
 namespace p = paddle::platform;
-namespace m = paddle::operators::math;

 USE_OP(c_reduce_sum);
 USE_NO_KERNEL_OP(c_gen_hccl_id);

diff --git a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
index db78652f87..652bf0c1f2 100644
--- a/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/fluid/operators/collective/c_allgather_op.h" #include "paddle/fluid/operators/collective/c_allreduce_op.h" @@ -43,7 +43,6 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(c_reducescatter); USE_NO_KERNEL_OP(c_gen_hccl_id); diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc index 5778a270f1..9d27d99b3a 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc @@ -26,12 +26,11 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, NPU); diff --git a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc index e701783568..9d88378647 100644 --- a/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc +++ b/paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc @@ -27,8 +27,8 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/fluid/operators/collective/c_broadcast_op.h" #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" @@ -40,7 +40,6 @@ limitations under the License. */ namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(c_broadcast); USE_OP_DEVICE_KERNEL(c_sync_comm_stream, NPU); diff --git a/paddle/fluid/operators/collective/checknumeric_npu_test.cc b/paddle/fluid/operators/collective/checknumeric_npu_test.cc index 2be37cc456..18b75d8e68 100644 --- a/paddle/fluid/operators/collective/checknumeric_npu_test.cc +++ b/paddle/fluid/operators/collective/checknumeric_npu_test.cc @@ -28,8 +28,8 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/fluid/operators/collective/c_allreduce_op.h" #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h" @@ -41,7 +41,6 @@ limitations under the License. 
@@ -41,7 +41,6 @@ limitations under the License. */

 namespace f = paddle::framework;
 namespace p = paddle::platform;
-namespace m = paddle::operators::math;

 USE_OP(c_allreduce_sum);
 USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);

diff --git a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
index edd4b18b35..bf96f48bc8 100644
--- a/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
@@ -27,8 +27,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"

 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
 #include "paddle/fluid/operators/collective/recv_v2_op.h"
@@ -40,7 +40,6 @@ limitations under the License. */

 namespace f = paddle::framework;
 namespace p = paddle::platform;
-namespace m = paddle::operators::math;

 USE_OP(recv_v2);
 USE_NO_KERNEL_OP(c_gen_hccl_id);

diff --git a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
index b2470ab4c0..748a4fb99b 100644
--- a/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
+++ b/paddle/fluid/operators/collective/send_v2_op_npu_test.cc
@@ -26,8 +26,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/operators/dropout_op.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/string/printf.h"
+#include "paddle/pten/kernels/funcs/math_function.h"

 #include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
 #include "paddle/fluid/operators/collective/send_v2_op.h"
@@ -39,7 +39,6 @@ limitations under the License. */

 namespace f = paddle::framework;
 namespace p = paddle::platform;
-namespace m = paddle::operators::math;

 USE_OP(send_v2);
 USE_NO_KERNEL_OP(c_gen_hccl_id);

diff --git a/paddle/fluid/operators/controlflow/conditional_block_op.cc b/paddle/fluid/operators/controlflow/conditional_block_op.cc
index eeb410eba2..f961e479ce 100644
--- a/paddle/fluid/operators/controlflow/conditional_block_op.cc
+++ b/paddle/fluid/operators/controlflow/conditional_block_op.cc
*/ #include "paddle/fluid/operators/controlflow/conditional_block_op.h" #include "paddle/fluid/operators/assign_op.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -245,7 +245,7 @@ class ConditionalBlockGradOp : public ConditionalOp { outside_tensor->mutable_data(place, input_tensor.type()); const platform::DeviceContext *dev_ctx = platform::DeviceContextPool::Instance().Get(place); - math::set_constant(*dev_ctx, outside_tensor, 0.0f); + pten::funcs::set_constant(*dev_ctx, outside_tensor, 0.0f); outside_tensor->set_lod(input_tensor.lod()); } }; diff --git a/paddle/fluid/operators/conv_cudnn_op.cu b/paddle/fluid/operators/conv_cudnn_op.cu index 20b1afb42f..3e85194908 100644 --- a/paddle/fluid/operators/conv_cudnn_op.cu +++ b/paddle/fluid/operators/conv_cudnn_op.cu @@ -861,7 +861,7 @@ class CUDNNConvDoubleGradOpKernel : public framework::OpKernel { auto dX = ctx.Output("DInput"); if (ddO) { ddO->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, ddO, static_cast(0)); } if (dW) { diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 94d1f707b7..fb22765d76 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -485,7 +485,7 @@ class GemmConvGradKernel : public framework::OpKernel { col_matrix.Resize(col_matrix_shape); } - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto blas = math::GetBlas(dev_ctx); if (input_grad) { @@ -692,7 +692,7 @@ class GemmConvDoubleGradKernel : public framework::OpKernel { col_matrix.Resize(col_matrix_shape); } - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto blas = math::GetBlas(dev_ctx); // dx convolution double grad: gemm + col2im(col2vol) @@ -991,7 +991,7 @@ class DepthwiseConvGradKernel : public framework::OpKernel { paddings.erase(paddings.begin() + i + 1); } } - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = context.template device_context(); if (input_grad) { diff --git a/paddle/fluid/operators/conv_shift_op.cu b/paddle/fluid/operators/conv_shift_op.cu index 2289104d2d..aca3bf9ae2 100644 --- a/paddle/fluid/operators/conv_shift_op.cu +++ b/paddle/fluid/operators/conv_shift_op.cu @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/conv_shift_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -163,7 +163,7 @@ class ConvShiftGradKernel auto &device_ctx = context.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; const int x_per_block = 256; int num_x_blocks = DivUp(x_width, x_per_block); diff --git a/paddle/fluid/operators/conv_transpose_cudnn_op.cu b/paddle/fluid/operators/conv_transpose_cudnn_op.cu index 19c0be44a1..32792d6d47 100644 --- a/paddle/fluid/operators/conv_transpose_cudnn_op.cu +++ b/paddle/fluid/operators/conv_transpose_cudnn_op.cu @@ -21,8 +21,8 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/conv_cudnn_helper.h" #endif #include "paddle/fluid/operators/conv_transpose_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/padding.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -34,7 +34,7 @@ static void DataTranspose(const framework::ExecutionContext& ctx, const Tensor* input, Tensor* output, const std::vector& axis, int flag = 0) { auto& dev_ctx = ctx.template device_context(); - math::Transpose transpose; + pten::funcs::Transpose transpose; auto in_dims = input->dims(); std::vector input_transpose_vec; for (size_t i = 0; i < axis.size(); ++i) { @@ -650,7 +650,7 @@ class CUDNNConvTransposeDoubleGradOpKernel : public framework::OpKernel { if (ddO) { ddO->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, ddO, static_cast(0)); } if (dW) { diff --git a/paddle/fluid/operators/conv_transpose_op.h b/paddle/fluid/operators/conv_transpose_op.h index b8335c7506..7b1fb6901e 100644 --- a/paddle/fluid/operators/conv_transpose_op.h +++ b/paddle/fluid/operators/conv_transpose_op.h @@ -226,7 +226,7 @@ class GemmConvTransposeKernel : public framework::OpKernel { filter.Resize(filter_matrix_shape); output->mutable_data(context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = context.template device_context(); auto blas = math::GetBlas(dev_ctx); set_zero(dev_ctx, output, static_cast(0)); @@ -437,7 +437,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel { col_matrix.Resize(col_matrix_shape); Tensor filter_grad_; - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; math::Im2ColFunctor im2col; math::Vol2ColFunctor vol2col; @@ -628,7 +628,7 @@ class DepthwiseConvTransposeKernel : public framework::OpKernel { output->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, output, static_cast(0)); math::DepthwiseConvInputGradFunctor @@ -690,7 +690,7 @@ class DepthwiseConvTransposeGradKernel : public framework::OpKernel { } if (filter_grad) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; filter_grad->mutable_data(context.GetPlace()); set_zero(dev_ctx, filter_grad, static_cast(0)); diff --git a/paddle/fluid/operators/cos_sim_op.h b/paddle/fluid/operators/cos_sim_op.h index 0b4e3f7746..f8b984e115 100644 --- a/paddle/fluid/operators/cos_sim_op.h +++ b/paddle/fluid/operators/cos_sim_op.h @@ -15,8 +15,8 @@ limitations under the License. 
diff --git a/paddle/fluid/operators/cos_sim_op.h b/paddle/fluid/operators/cos_sim_op.h
index 0b4e3f7746..f8b984e115 100644
--- a/paddle/fluid/operators/cos_sim_op.h
+++ b/paddle/fluid/operators/cos_sim_op.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/cos_sim_functor.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -121,7 +121,7 @@ class CosSimGradKernel : public framework::OpKernel {
       if (out_grad_y) {
         out_grad_y->Resize(in_y->dims());
         out_grad_y->mutable_data(context.GetPlace());
-        math::SetConstant set_zero;
+        pten::funcs::SetConstant set_zero;
         auto& dev_ctx = context.template device_context();
         set_zero(dev_ctx, out_grad_y, static_cast(0));
diff --git a/paddle/fluid/operators/crf_decoding_op.h b/paddle/fluid/operators/crf_decoding_op.h
index 33108251b3..8ca819de06 100644
--- a/paddle/fluid/operators/crf_decoding_op.h
+++ b/paddle/fluid/operators/crf_decoding_op.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -36,7 +36,7 @@ class CRFDecodingOpKernel : public framework::OpKernel {
     auto* decoded_path = ctx.Output("ViterbiPath");
     int64_t* path = decoded_path->mutable_data(platform::CPUPlace());
-    math::SetConstant()(
+    pten::funcs::SetConstant()(
         ctx.template device_context(), decoded_path, 0);
     bool has_length = ctx.HasInput("Length");
diff --git a/paddle/fluid/operators/cross_entropy_op.h b/paddle/fluid/operators/cross_entropy_op.h
index 8424fc4376..19ab6afd7f 100644
--- a/paddle/fluid/operators/cross_entropy_op.h
+++ b/paddle/fluid/operators/cross_entropy_op.h
@@ -17,8 +17,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math.h"
 #include "paddle/fluid/operators/math/cross_entropy.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/ctc_align_op.cu b/paddle/fluid/operators/ctc_align_op.cu
index 67bd71d4a1..bd0b0ac0bc 100644
--- a/paddle/fluid/operators/ctc_align_op.cu
+++ b/paddle/fluid/operators/ctc_align_op.cu
@@ -128,7 +128,7 @@ class CTCAlignOpCUDAKernel : public framework::OpKernel {
     if (host_out_lod0.back() == 0) {
       output->Resize({1, 1});
       output->mutable_data(ctx.GetPlace());
-      math::SetConstant set_constant;
+      pten::funcs::SetConstant set_constant;
       set_constant(ctx.template device_context(), output, -1);
     }
diff --git a/paddle/fluid/operators/ctc_align_op.h b/paddle/fluid/operators/ctc_align_op.h
index 662f899c0a..b79c3aeac4 100644
--- a/paddle/fluid/operators/ctc_align_op.h
+++ b/paddle/fluid/operators/ctc_align_op.h
@@ -17,7 +17,7 @@ limitations under the License. */
 #include
 #include
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
index 8adf556b4c..5c899ac557 100644
--- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc
+++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc
@@ -14,8 +14,8 @@ limitations under the License. */
*/ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/pten/kernels/funcs/math_function.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/operators/cudnn_lstm_cache.h" #endif @@ -366,7 +366,7 @@ class CudnnLSTMGPUGradKernel : public framework::OpKernel { } Tensor weight_grad; - math::SetConstant zero; + pten::funcs::SetConstant zero; weight_grad.mutable_data({weight_numel}, ctx.GetPlace()); zero(dev_ctx, &weight_grad, static_cast(0.0)); T *weight_grad_data = weight_grad.data(); diff --git a/paddle/fluid/operators/cvm_op.cc b/paddle/fluid/operators/cvm_op.cc index be7d4780f8..a84357b6e4 100644 --- a/paddle/fluid/operators/cvm_op.cc +++ b/paddle/fluid/operators/cvm_op.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/cvm_op.h" #include -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/deformable_conv_filter.cu.h b/paddle/fluid/operators/deformable_conv_filter.cu.h index f466d1803f..75d16ae0d4 100644 --- a/paddle/fluid/operators/deformable_conv_filter.cu.h +++ b/paddle/fluid/operators/deformable_conv_filter.cu.h @@ -23,7 +23,7 @@ #pragma once #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" template __global__ void FilterGradAddupCUDAKernel(const int nthreads, const int n, diff --git a/paddle/fluid/operators/deformable_conv_func.h b/paddle/fluid/operators/deformable_conv_func.h index 99d1d7c477..134a1ea06d 100644 --- a/paddle/fluid/operators/deformable_conv_func.h +++ b/paddle/fluid/operators/deformable_conv_func.h @@ -23,8 +23,8 @@ #pragma once #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/pten/core/hostdevice.h" +#include "paddle/pten/kernels/funcs/math_function.h" template HOSTDEVICE T DmcnGetGradientWeight(T argmax_h, T argmax_w, const int h, diff --git a/paddle/fluid/operators/deformable_conv_op.cu b/paddle/fluid/operators/deformable_conv_op.cu index 924adafa4b..97d2f71758 100644 --- a/paddle/fluid/operators/deformable_conv_op.cu +++ b/paddle/fluid/operators/deformable_conv_op.cu @@ -26,8 +26,8 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/deformable_conv_op.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -623,7 +623,7 @@ class DeformableConvGradCUDAKernel : public framework::OpKernel { Tensor col_buffer_3d; col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto blas = math::GetBlas(dev_ctx); col_buffer.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/deformable_conv_op.h b/paddle/fluid/operators/deformable_conv_op.h index 4be98f3e6c..a5c0404ed3 100644 --- a/paddle/fluid/operators/deformable_conv_op.h +++ b/paddle/fluid/operators/deformable_conv_op.h @@ -27,7 +27,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/deformable_conv_func.h" #include "paddle/fluid/operators/math/blas.h" -#include 
"paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -489,7 +489,7 @@ class DeformableConvGradCPUKernel : public framework::OpKernel { Tensor col_buffer_3d; col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto blas = math::GetBlas(dev_ctx); col_buffer.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/deformable_conv_v1_op.cu b/paddle/fluid/operators/deformable_conv_v1_op.cu index c252700528..8f6c5a226b 100644 --- a/paddle/fluid/operators/deformable_conv_v1_op.cu +++ b/paddle/fluid/operators/deformable_conv_v1_op.cu @@ -29,8 +29,8 @@ #include "paddle/fluid/operators/deformable_conv_func.h" #include "paddle/fluid/operators/deformable_conv_v1_op.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -489,7 +489,7 @@ class DeformableConvV1GradCUDAKernel : public framework::OpKernel { Tensor col_buffer_3d; col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto blas = math::GetBlas(dev_ctx); col_buffer.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/deformable_conv_v1_op.h b/paddle/fluid/operators/deformable_conv_v1_op.h index 92b19e3904..1ddc31c93e 100644 --- a/paddle/fluid/operators/deformable_conv_v1_op.h +++ b/paddle/fluid/operators/deformable_conv_v1_op.h @@ -28,7 +28,7 @@ #include "paddle/fluid/operators/deformable_conv_func.h" #include "paddle/fluid/operators/deformable_conv_op.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -451,7 +451,7 @@ class DeformableConvV1GradCPUKernel : public framework::OpKernel { Tensor col_buffer_3d; col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto blas = math::GetBlas(dev_ctx); col_buffer.mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.cu b/paddle/fluid/operators/deformable_psroi_pooling_op.cu index eeb2c7692b..95f05963cd 100644 --- a/paddle/fluid/operators/deformable_psroi_pooling_op.cu +++ b/paddle/fluid/operators/deformable_psroi_pooling_op.cu @@ -31,8 +31,8 @@ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/deformable_psroi_pooling_op.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -436,7 +436,7 @@ class DeformablePSROIPoolGradCUDAKernel : public framework::OpKernel { Tensor* input_grad = ctx.Output(framework::GradVarName("Input")); Tensor* trans_grad = ctx.Output(framework::GradVarName("Trans")); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = ctx.cuda_device_context(); if (input_grad) { input_grad->mutable_data(ctx.GetPlace()); diff --git a/paddle/fluid/operators/deformable_psroi_pooling_op.h b/paddle/fluid/operators/deformable_psroi_pooling_op.h index a986f915e2..08b8342a1f 100644 --- 
--- a/paddle/fluid/operators/deformable_psroi_pooling_op.h
+++ b/paddle/fluid/operators/deformable_psroi_pooling_op.h
@@ -27,7 +27,7 @@
 #include
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -165,7 +165,7 @@ class DeformablePSROIPoolCPUKernel : public framework::OpKernel {
     auto* top_count = ctx.Output("TopCount");
     top_count->mutable_data(ctx.GetPlace());
-    math::SetConstant set_zero;
+    pten::funcs::SetConstant set_zero;
     auto& dev_ctx = ctx.template device_context();
     set_zero(dev_ctx, out, static_cast(0));
     set_zero(dev_ctx, top_count, static_cast(0));
@@ -421,7 +421,7 @@ class DeformablePSROIPoolGradCPUKernel : public framework::OpKernel {
     auto* top_count = ctx.Input("TopCount");
     auto* output_grad = ctx.Input(framework::GradVarName("Output"));
     auto* input_grad = ctx.Output(framework::GradVarName("Input"));
-    math::SetConstant set_zero;
+    pten::funcs::SetConstant set_zero;
     auto& dev_ctx = ctx.template device_context();
     if (input_grad) {
       input_grad->mutable_data(ctx.GetPlace());
diff --git a/paddle/fluid/operators/detection/anchor_generator_op.h b/paddle/fluid/operators/detection/anchor_generator_op.h
index 599f693573..f888787cf5 100644
--- a/paddle/fluid/operators/detection/anchor_generator_op.h
+++ b/paddle/fluid/operators/detection/anchor_generator_op.h
@@ -16,8 +16,8 @@ limitations under the License. */
 #include
 #include
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/transform.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/detection/bbox_util.cu.h b/paddle/fluid/operators/detection/bbox_util.cu.h
index c6754f62cc..c4ae795a50 100644
--- a/paddle/fluid/operators/detection/bbox_util.cu.h
+++ b/paddle/fluid/operators/detection/bbox_util.cu.h
@@ -24,9 +24,9 @@ limitations under the License. */
 namespace cub = hipcub;
 #endif
 #include "paddle/fluid/operators/gather.cu.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/detection/bipartite_match_op.cc b/paddle/fluid/operators/detection/bipartite_match_op.cc
index 5cd8537589..582f81d71a 100644
--- a/paddle/fluid/operators/detection/bipartite_match_op.cc
+++ b/paddle/fluid/operators/detection/bipartite_match_op.cc
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
*/ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -205,9 +205,9 @@ class BipartiteMatchKernel : public framework::OpKernel { match_indices->mutable_data({n, col}, context.GetPlace()); match_dist->mutable_data({n, col}, context.GetPlace()); - math::SetConstant iset; + pten::funcs::SetConstant iset; iset(dev_ctx, match_indices, static_cast(-1)); - math::SetConstant tset; + pten::funcs::SetConstant tset; tset(dev_ctx, match_dist, static_cast(0)); int* indices = match_indices->data(); diff --git a/paddle/fluid/operators/detection/box_clip_op.cu b/paddle/fluid/operators/detection/box_clip_op.cu index 53727d9d08..24f5f00b07 100644 --- a/paddle/fluid/operators/detection/box_clip_op.cu +++ b/paddle/fluid/operators/detection/box_clip_op.cu @@ -14,9 +14,9 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/box_clip_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/pten/core/hostdevice.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/box_clip_op.h b/paddle/fluid/operators/detection/box_clip_op.h index e24cefdcd7..5c1870e902 100644 --- a/paddle/fluid/operators/detection/box_clip_op.h +++ b/paddle/fluid/operators/detection/box_clip_op.h @@ -13,7 +13,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/detection/bbox_util.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/box_coder_op.h b/paddle/fluid/operators/detection/box_coder_op.h index d120ebbeb4..b4fe27401d 100644 --- a/paddle/fluid/operators/detection/box_coder_op.h +++ b/paddle/fluid/operators/detection/box_coder_op.h @@ -13,7 +13,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/box_decoder_and_assign_op.h b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h index e66a8351f4..1fe05e6ebb 100644 --- a/paddle/fluid/operators/detection/box_decoder_and_assign_op.h +++ b/paddle/fluid/operators/detection/box_decoder_and_assign_op.h @@ -14,7 +14,7 @@ limitations under the License. 
 #include
 #include
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
index eddb25d57b..70cbd7a9de 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.cu
@@ -195,7 +195,7 @@ class GPUCollectFpnProposalsOpKernel : public framework::OpKernel {
     Tensor length_lod;
     int* length_lod_data = length_lod.mutable_data({lod_size}, dev_ctx.GetPlace());
-    math::SetConstant set_zero;
+    pten::funcs::SetConstant set_zero;
     set_zero(dev_ctx, &length_lod, static_cast(0));
     int blocks = NumBlocks(real_post_num);
diff --git a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h
index 950b8b7893..984b633291 100644
--- a/paddle/fluid/operators/detection/collect_fpn_proposals_op.h
+++ b/paddle/fluid/operators/detection/collect_fpn_proposals_op.h
@@ -22,7 +22,7 @@ limitations under the License.*/
 #include
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
index 355a35d4dd..84d564ac4e 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.cu
@@ -25,9 +25,9 @@ namespace cub = hipcub;
 #endif
 #include "paddle/fluid/operators/detection/bbox_util.h"
 #include "paddle/fluid/operators/detection/distribute_fpn_proposals_op.h"
 #include "paddle/fluid/operators/gather.cu.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -121,7 +121,7 @@ class GPUDistributeFpnProposalsOpKernel : public framework::OpKernel {
     Tensor sub_lod_list;
     sub_lod_list.Resize({num_level, lod_size});
     int* sub_lod_list_data = sub_lod_list.mutable_data(dev_ctx.GetPlace());
-    math::SetConstant set_zero;
+    pten::funcs::SetConstant set_zero;
     set_zero(dev_ctx, &sub_lod_list, static_cast(0));
     Tensor target_lvls;
diff --git a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
index f1b454913f..e96804ab6f 100644
--- a/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
+++ b/paddle/fluid/operators/detection/distribute_fpn_proposals_op.h
@@ -21,7 +21,7 @@ limitations under the License. */
 #include
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/gather.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/detection/generate_mask_labels_op.cc b/paddle/fluid/operators/detection/generate_mask_labels_op.cc
index afa4ccf25d..92dba742f4 100644
--- a/paddle/fluid/operators/detection/generate_mask_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_mask_labels_op.cc
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/detection/mask_util.h"
 #include "paddle/fluid/operators/gather.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -122,7 +122,7 @@ static inline void ExpandMaskTarget(const platform::CPUDeviceContext& ctx,
   int* mask_targets_data = mask_targets->mutable_data({num_mask, mask_dim}, ctx.GetPlace());
-  math::set_constant(ctx, mask_targets, -1);
+  pten::funcs::set_constant(ctx, mask_targets, -1);
   for (int64_t mask_id = 0; mask_id < num_mask; ++mask_id) {
     int cls = mask_class_labels_data[mask_id];
     int start = M * cls;
@@ -271,7 +271,7 @@ std::vector SampleMaskForOneImage(
   }
   masks.mutable_data({bg_num, resolution * resolution}, ctx.GetPlace());
-  math::set_constant(ctx, &masks, -1);
+  pten::funcs::set_constant(ctx, &masks, -1);
   int* mask_class_labels_data = mask_class_labels.mutable_data({bg_num, 1}, ctx.GetPlace());
   mask_class_labels_data[0] = 0;
diff --git a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
index 1b1fa7b064..67a1d2c5ac 100644
--- a/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
+++ b/paddle/fluid/operators/detection/generate_proposal_labels_op.cc
@@ -18,7 +18,7 @@ limitations under the License. */
*/ #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/gather.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -289,7 +289,7 @@ void GatherBoxesLabels(const platform::CPUDeviceContext& context, fg_labels.mutable_data({fg_num}, context.GetPlace()); CPUGather(context, gt_classes, gt_label_inds_t, &fg_labels); bg_labels.mutable_data({bg_num}, context.GetPlace()); - math::set_constant(context, &bg_labels, 0); + pten::funcs::set_constant(context, &bg_labels, 0); Concat(context, fg_labels, bg_labels, sampled_labels); Tensor fg_max_overlap, bg_max_overlap; @@ -328,7 +328,7 @@ std::vector SampleRoisForOneImage( Tensor roi_filter; // Tensor box_filter; if (keep.numel() == 0) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; roi_filter.mutable_data({proposals_num, kBoxDim}, context.GetPlace()); set_zero(context, &roi_filter, static_cast(0)); } else { @@ -403,9 +403,9 @@ std::vector SampleRoisForOneImage( bbox_targets.mutable_data(bbox_expand_dim, context.GetPlace()); bbox_inside_weights.mutable_data(bbox_expand_dim, context.GetPlace()); bbox_outside_weights.mutable_data(bbox_expand_dim, context.GetPlace()); - math::set_constant(context, &bbox_targets, 0.0); - math::set_constant(context, &bbox_inside_weights, 0.0); - math::set_constant(context, &bbox_outside_weights, 0.0); + pten::funcs::set_constant(context, &bbox_targets, 0.0); + pten::funcs::set_constant(context, &bbox_inside_weights, 0.0); + pten::funcs::set_constant(context, &bbox_outside_weights, 0.0); auto* bbox_targets_single_data = bbox_targets_single.data(); auto* sampled_labels_data = sampled_labels.data(); diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cc b/paddle/fluid/operators/detection/generate_proposals_op.cc index bc48c3b5ba..570720550b 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_op.cc @@ -21,7 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/nms_util.h" #include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -112,7 +112,7 @@ class GenerateProposalsKernel : public framework::OpKernel { scores_swap.mutable_data({num, h_score, w_score, c_score}, dev_ctx.GetPlace()); - math::Transpose trans; + pten::funcs::Transpose trans; std::vector axis = {0, 2, 3, 1}; trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); trans(dev_ctx, *scores, &scores_swap, axis); @@ -211,7 +211,7 @@ class GenerateProposalsKernel : public framework::OpKernel { FilterBoxes(ctx, &proposals, min_size, im_info_slice, true, &keep); // Handle the case when there is no keep index left if (keep.numel() == 0) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; bbox_sel.mutable_data({1, 4}, ctx.GetPlace()); set_zero(ctx, &bbox_sel, static_cast(0)); Tensor scores_filter; diff --git a/paddle/fluid/operators/detection/generate_proposals_op.cu b/paddle/fluid/operators/detection/generate_proposals_op.cu index 2de06e06d9..f34b8e26c0 100644 --- a/paddle/fluid/operators/detection/generate_proposals_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_op.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/bbox_util.cu.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -76,7 +76,7 @@ static std::pair ProposalForOneImage( Tensor scores_filter, proposals_filter; // Handle the case when there is no keep index left if (keep_num == 0) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; proposals_filter.mutable_data({1, 4}, ctx.GetPlace()); scores_filter.mutable_data({1, 1}, ctx.GetPlace()); set_zero(ctx, &proposals_filter, static_cast(0)); @@ -154,7 +154,7 @@ class CUDAGenerateProposalsKernel : public framework::OpKernel { scores_swap.mutable_data({num, h_score, w_score, c_score}, dev_ctx.GetPlace()); - math::Transpose trans; + pten::funcs::Transpose trans; std::vector axis = {0, 2, 3, 1}; trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); trans(dev_ctx, *scores, &scores_swap, axis); diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc index 44554a941d..671a27429f 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cc +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cc @@ -21,7 +21,7 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/detection/bbox_util.h" #include "paddle/fluid/operators/detection/nms_util.h" #include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -113,7 +113,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { scores_swap.mutable_data({num, h_score, w_score, c_score}, dev_ctx.GetPlace()); - math::Transpose trans; + pten::funcs::Transpose trans; std::vector axis = {0, 2, 3, 1}; trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); trans(dev_ctx, *scores, &scores_swap, axis); @@ -215,7 +215,7 @@ class GenerateProposalsV2Kernel : public framework::OpKernel { pixel_offset); // Handle the case when there is no keep index left if (keep.numel() == 0) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; bbox_sel.mutable_data({1, 4}, ctx.GetPlace()); set_zero(ctx, &bbox_sel, static_cast(0)); Tensor scores_filter; diff --git a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu index cc2d4578e3..98108a25da 100644 --- a/paddle/fluid/operators/detection/generate_proposals_v2_op.cu +++ b/paddle/fluid/operators/detection/generate_proposals_v2_op.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/detection/bbox_util.cu.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -77,7 +77,7 @@ static std::pair ProposalForOneImage( Tensor scores_filter, proposals_filter; // Handle the case when there is no keep index left if (keep_num == 0) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; proposals_filter.mutable_data({1, 4}, ctx.GetPlace()); scores_filter.mutable_data({1, 1}, ctx.GetPlace()); set_zero(ctx, &proposals_filter, static_cast(0)); @@ -157,7 +157,7 @@ class CUDAGenerateProposalsV2Kernel : public framework::OpKernel { scores_swap.mutable_data({num, h_score, w_score, c_score}, dev_ctx.GetPlace()); - math::Transpose trans; + pten::funcs::Transpose trans; std::vector axis = {0, 2, 3, 1}; trans(dev_ctx, *bbox_deltas, &bbox_deltas_swap, axis); trans(dev_ctx, *scores, &scores_swap, axis); diff --git a/paddle/fluid/operators/detection/prior_box_op.h b/paddle/fluid/operators/detection/prior_box_op.h index 21ac74f25c..94413c9c83 100644 --- a/paddle/fluid/operators/detection/prior_box_op.h +++ b/paddle/fluid/operators/detection/prior_box_op.h @@ -16,8 +16,8 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/transform.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc index 4d0c9da2ee..777e69ab7b 100644 --- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cc +++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cc @@ -16,7 +16,7 @@ limitations under the License. 
 #include
 #include
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
index fbf631f75b..ff8da478a0 100644
--- a/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
+++ b/paddle/fluid/operators/detection/roi_perspective_transform_op.cu
@@ -14,9 +14,9 @@ limitations under the License. */
 #include
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 using paddle::platform::PADDLE_CUDA_NUM_THREADS;
 using paddle::platform::float16;
@@ -356,7 +356,7 @@ class CUDAROIPerspectiveTransformOpKernel : public framework::OpKernel {
     T* out2in_w_data = out2in_w->mutable_data({out->numel(), 4}, ctx.GetPlace());
-    math::SetConstant init;
+    pten::funcs::SetConstant init;
     init(ctx.cuda_device_context(), out2in_idx, static_cast(-1));
     auto transformed_height = ctx.Attr("transformed_height");
@@ -482,7 +482,7 @@ class CUDAROIPerspectiveTransformGradOpKernel : public framework::OpKernel {
     T* in_grad_data = in_grad->mutable_data(ctx.GetPlace());
-    math::SetConstant set_zero;
+    pten::funcs::SetConstant set_zero;
     set_zero(ctx.cuda_device_context(), in_grad, static_cast(0));
     const T* out_grad_data = out_grad->data();
diff --git a/paddle/fluid/operators/detection/rpn_target_assign_op.cc b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
index 2a16e20c2a..cf7afc3853 100644
--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -15,7 +15,7 @@ limitations under the License. */
 #include
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/detection/bbox_util.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
diff --git a/paddle/fluid/operators/detection/yolo_box_op.cu b/paddle/fluid/operators/detection/yolo_box_op.cu
index bfe4742c4b..7cc66f2074 100644
--- a/paddle/fluid/operators/detection/yolo_box_op.cu
+++ b/paddle/fluid/operators/detection/yolo_box_op.cu
@@ -14,8 +14,8 @@ limitations under the License. */
*/ #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/detection/yolo_box_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -114,7 +114,7 @@ class YoloBoxOpCUDAKernel : public framework::OpKernel { T* boxes_data = boxes->mutable_data({n, box_num, 4}, ctx.GetPlace()); T* scores_data = scores->mutable_data({n, box_num, class_num}, ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, boxes, static_cast(0)); set_zero(dev_ctx, scores, static_cast(0)); platform::GpuLaunchConfig config = diff --git a/paddle/fluid/operators/detection/yolo_box_op.h b/paddle/fluid/operators/detection/yolo_box_op.h index 31a67ecc26..27fe31587e 100644 --- a/paddle/fluid/operators/detection/yolo_box_op.h +++ b/paddle/fluid/operators/detection/yolo_box_op.h @@ -13,8 +13,8 @@ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/pten/core/hostdevice.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/detection/yolov3_loss_op.h b/paddle/fluid/operators/detection/yolov3_loss_op.h index 1acfb2cf4e..1ab3039b2e 100644 --- a/paddle/fluid/operators/detection/yolov3_loss_op.h +++ b/paddle/fluid/operators/detection/yolov3_loss_op.h @@ -13,7 +13,7 @@ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -305,7 +305,7 @@ class Yolov3LossKernel : public framework::OpKernel { Tensor gtscore; if (!gt_score) { gtscore.mutable_data({n, b}, ctx.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), >score, static_cast(1.0)); gt_score = >score; @@ -461,7 +461,7 @@ class Yolov3LossGradKernel : public framework::OpKernel { Tensor gtscore; if (!gt_score) { gtscore.mutable_data({n, b}, ctx.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), >score, static_cast(1.0)); gt_score = >score; diff --git a/paddle/fluid/operators/determinant_op.h b/paddle/fluid/operators/determinant_op.h index 4c17869fb5..90443e0928 100644 --- a/paddle/fluid/operators/determinant_op.h +++ b/paddle/fluid/operators/determinant_op.h @@ -150,7 +150,7 @@ inline bool CheckMatrixInvertible(const framework::ExecutionContext& ctx, auto* data = dev_tensor.mutable_data({1}, ctx.GetPlace()); // set false - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(dev_ctx, &dev_tensor, false); // find whether zero @@ -208,7 +208,7 @@ class DeterminantGradKernel : public framework::OpKernel { VLOG(3) << "The input matrix not invertible!"; ddet->Resize(input->dims()); ddet->mutable_data(context.GetPlace()); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(dev_ctx, ddet, static_cast(0.0f)); return; } @@ -363,7 +363,7 @@ class SlogDeterminantGradKernel : public framework::OpKernel { VLOG(3) << "The input matrix not invertible!"; dslogdet->Resize(input->dims()); dslogdet->mutable_data(context.GetPlace()); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(dev_ctx, dslogdet, std::numeric_limits::quiet_NaN()); return; } diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h index 
diff --git a/paddle/fluid/operators/dgc_op.h b/paddle/fluid/operators/dgc_op.h
index 12ffc94833..4a81537b8c 100644
--- a/paddle/fluid/operators/dgc_op.h
+++ b/paddle/fluid/operators/dgc_op.h
@@ -187,7 +187,7 @@ class DGCOpKernel : public framework::OpKernel {
                             "V_out numel error, V_out numel is %d.", v_out->numel()));
     }
-    math::SetConstant tset;
+    pten::funcs::SetConstant tset;
     tset(dev_ctx, grad_out, static_cast(0));
   }
 };
diff --git a/paddle/fluid/operators/diag_embed_op.h b/paddle/fluid/operators/diag_embed_op.h
index aff7d7e48a..922140b5b8 100644
--- a/paddle/fluid/operators/diag_embed_op.h
+++ b/paddle/fluid/operators/diag_embed_op.h
@@ -17,8 +17,8 @@
 #include
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -70,7 +70,7 @@ class DiagEmbedKernel : public framework::OpKernel {
     auto* input_data = input->data();
     T* out_data = out->mutable_data(context.GetPlace());
-    math::SetConstant set_zero;
+    pten::funcs::SetConstant set_zero;
     auto& dev_ctx = context.template device_context();
     set_zero(dev_ctx, out, static_cast(0.0));
diff --git a/paddle/fluid/operators/diag_op.h b/paddle/fluid/operators/diag_op.h
index f89415ae08..09723e6df6 100644
--- a/paddle/fluid/operators/diag_op.h
+++ b/paddle/fluid/operators/diag_op.h
@@ -15,8 +15,8 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -45,7 +45,7 @@ class DiagKernel : public framework::OpKernel {
     auto* out = context.Output("Out");
     T* out_data = out->mutable_data(context.GetPlace());
-    math::SetConstant set_zero;
+    pten::funcs::SetConstant set_zero;
     auto& dev_ctx = context.template device_context();
     set_zero(dev_ctx, out, static_cast(0));
diff --git a/paddle/fluid/operators/diag_v2_op.cc b/paddle/fluid/operators/diag_v2_op.cc
index dd5ad73950..3e74c7aa81 100644
--- a/paddle/fluid/operators/diag_v2_op.cc
+++ b/paddle/fluid/operators/diag_v2_op.cc
@@ -15,7 +15,7 @@ limitations under the License. */
*/ #include "paddle/fluid/operators/diag_v2_op.h" #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -109,7 +109,7 @@ class DiagV2Kernel : public framework::OpKernel { int64_t i; if (x_dims.size() == 1) { float padding_value = context.Attr("padding_value"); - math::SetConstant set_padding_value; + pten::funcs::SetConstant set_padding_value; auto& dev_ctx = context.template device_context(); set_padding_value(dev_ctx, out, static_cast(padding_value)); diff --git a/paddle/fluid/operators/diag_v2_op.cu b/paddle/fluid/operators/diag_v2_op.cu index 12ea31945f..02e531765c 100644 --- a/paddle/fluid/operators/diag_v2_op.cu +++ b/paddle/fluid/operators/diag_v2_op.cu @@ -72,7 +72,7 @@ class DiagV2CUDAKernel : public framework::OpKernel { if (x_dims.size() == 1) { float padding_value = context.Attr("padding_value"); - math::SetConstant set_padding_value; + pten::funcs::SetConstant set_padding_value; set_padding_value(dev_ctx, out, static_cast(padding_value)); auto x_length = x_dims[0]; diff --git a/paddle/fluid/operators/diag_v2_op.h b/paddle/fluid/operators/diag_v2_op.h index 7850def061..0d1d6cd86e 100644 --- a/paddle/fluid/operators/diag_v2_op.h +++ b/paddle/fluid/operators/diag_v2_op.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/dist_op.h b/paddle/fluid/operators/dist_op.h index 6a34ef48a1..2d4620eca7 100644 --- a/paddle/fluid/operators/dist_op.h +++ b/paddle/fluid/operators/dist_op.h @@ -19,7 +19,7 @@ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -171,7 +171,7 @@ static void DistGradFunction(const framework::ExecutionContext& context) { // 1: Lp-norm(z), z = x-y, compute dz if (p == 0) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = context.template device_context(); set_zero(dev_ctx, &grad, static_cast(0)); } else if (p == INFINITY || p == -INFINITY) { diff --git a/paddle/fluid/operators/dropout_op_test.cc b/paddle/fluid/operators/dropout_op_test.cc index 5c9be58841..a268ef95e3 100644 --- a/paddle/fluid/operators/dropout_op_test.cc +++ b/paddle/fluid/operators/dropout_op_test.cc @@ -25,12 +25,11 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(dropout); diff --git a/paddle/fluid/operators/edit_distance_op.cu b/paddle/fluid/operators/edit_distance_op.cu index 3096795f3e..be6534365e 100644 --- a/paddle/fluid/operators/edit_distance_op.cu +++ b/paddle/fluid/operators/edit_distance_op.cu @@ -16,9 +16,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/edit_distance_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -118,7 +118,7 @@ class EditDistanceGPUKernel : public framework::OpKernel { } const size_t num_strs = hyp_lod.size() - 1; - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; set_constant(ctx.template device_context(), sequence_num, static_cast(num_strs)); diff --git a/paddle/fluid/operators/eig_op.h b/paddle/fluid/operators/eig_op.h index b9a3cb300b..4dd5b7cfd8 100644 --- a/paddle/fluid/operators/eig_op.h +++ b/paddle/fluid/operators/eig_op.h @@ -19,11 +19,11 @@ #include #include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/math/lapack_function.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" #define EPSILON 1e-6 namespace paddle { diff --git a/paddle/fluid/operators/elementwise/elementwise_op_function.h b/paddle/fluid/operators/elementwise/elementwise_op_function.h index 34d40c741f..57b47d436d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_function.h +++ b/paddle/fluid/operators/elementwise/elementwise_op_function.h @@ -47,8 +47,8 @@ limitations under the License. */ #endif -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" #define DIVUP(x, y) (((x) + (y)-1) / (y)) diff --git a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc index 3cd9729d34..63ec5bd4a2 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc +++ b/paddle/fluid/operators/elementwise/elementwise_op_npu_test.cc @@ -25,12 +25,11 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(elementwise_add); USE_OP_DEVICE_KERNEL(elementwise_add, NPU); diff --git a/paddle/fluid/operators/expand_op_npu_test.cc b/paddle/fluid/operators/expand_op_npu_test.cc index 7de2bf2e69..4e18cc73d2 100644 --- a/paddle/fluid/operators/expand_op_npu_test.cc +++ b/paddle/fluid/operators/expand_op_npu_test.cc @@ -25,12 +25,11 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(expand); USE_OP_DEVICE_KERNEL(expand, NPU); diff --git a/paddle/fluid/operators/exponential_op.h b/paddle/fluid/operators/exponential_op.h index d8cafb8ef7..88c891d8bf 100644 --- a/paddle/fluid/operators/exponential_op.h +++ b/paddle/fluid/operators/exponential_op.h @@ -18,7 +18,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/distribution_helper.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -32,7 +32,7 @@ class ExponentialGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* dx = ctx.Output(framework::GradVarName("X")); dx->mutable_data(ctx.GetPlace()); - math::SetConstant functor; + pten::funcs::SetConstant functor; auto& dev_ctx = ctx.template device_context(); functor(dev_ctx, dx, static_cast(0)); } diff --git a/paddle/fluid/operators/eye_op.h b/paddle/fluid/operators/eye_op.h index d5ad27596d..1aa22e74f7 100644 --- a/paddle/fluid/operators/eye_op.h +++ b/paddle/fluid/operators/eye_op.h @@ -17,8 +17,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -47,7 +47,7 @@ class EyeKernel : public framework::OpKernel { auto* out_tensor = ctx.Output("Out"); T* out_data = out_tensor->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); set_zero(dev_ctx, out_tensor, static_cast(0)); diff --git a/paddle/fluid/operators/feed_forward_test.cu b/paddle/fluid/operators/feed_forward_test.cu index dea427393b..551d8ee659 100644 --- a/paddle/fluid/operators/feed_forward_test.cu +++ b/paddle/fluid/operators/feed_forward_test.cu @@ -20,8 +20,8 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/fused/attn_feed_forward.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace framework = paddle::framework; namespace platform = paddle::platform; diff --git a/paddle/fluid/operators/fill_any_op.h b/paddle/fluid/operators/fill_any_op.h index f483e05a08..a476b7a0a6 100644 --- a/paddle/fluid/operators/fill_any_op.h +++ b/paddle/fluid/operators/fill_any_op.h @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
 #pragma once
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -41,7 +41,7 @@ class FillAnyKernel : public framework::OpKernel {
     out->mutable_data(ctx.GetPlace());
     auto &dev_ctx = ctx.template device_context();
-    math::SetConstant functor;
+    pten::funcs::SetConstant functor;
     functor(reinterpret_cast(dev_ctx), out, static_cast(fill_var));
   }
@@ -55,7 +55,7 @@ class FillAnyGradKernel : public framework::OpKernel {
     if (dx) {
       dx->mutable_data(ctx.GetPlace());
       auto &dev_ctx = ctx.template device_context();
-      math::SetConstant functor;
+      pten::funcs::SetConstant functor;
       functor(reinterpret_cast(dev_ctx), dx, T(0));
     }
   }
diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op.h b/paddle/fluid/operators/fill_constant_batch_size_like_op.h
index 4c90daa39f..ed3a661897 100644
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op.h
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op.h
@@ -15,7 +15,7 @@ limitations under the License. */
 #pragma once
 #include
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -60,7 +60,7 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel {
     bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace();
     if (cpu_place) {
       auto &dev_ctx = *pool.Get(platform::CPUPlace());
-      math::SetConstant functor;
+      pten::funcs::SetConstant functor;
       out->mutable_data(platform::CPUPlace(), data_type);
       functor(reinterpret_cast(dev_ctx), out, static_cast(value));
@@ -68,7 +68,7 @@ class FillConstantBatchSizeLikeOpKernel : public framework::OpKernel {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
     if (!cpu_place) {
       auto &dev_ctx = *pool.Get(ctx.GetPlace());
-      math::SetConstant functor;
+      pten::funcs::SetConstant functor;
       out->mutable_data(ctx.GetPlace(), data_type);
       functor(reinterpret_cast(dev_ctx), out, static_cast(value));
diff --git a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc
index 6b07b021d1..98e03ea66d 100644
--- a/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc
+++ b/paddle/fluid/operators/fill_constant_batch_size_like_op_npu.cc
@@ -70,7 +70,7 @@ class FillConstantBatchSizeLikeOpNPUKernel : public framework::OpKernel {
     bool cpu_place = force_cpu || ctx.GetPlace() == platform::CPUPlace();
     if (cpu_place) {
       auto &dev_ctx = *pool.Get(platform::CPUPlace());
-      math::SetConstant functor;
+      pten::funcs::SetConstant functor;
       out->mutable_data(platform::CPUPlace(), data_type);
       functor(reinterpret_cast(dev_ctx), out, static_cast(value));
diff --git a/paddle/fluid/operators/fill_constant_op.h b/paddle/fluid/operators/fill_constant_op.h
index c74cf2a824..15c9241275 100644
--- a/paddle/fluid/operators/fill_constant_op.h
+++ b/paddle/fluid/operators/fill_constant_op.h
@@ -21,8 +21,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/math_function.h"
 #include "paddle/fluid/operators/utils.h"
+#include "paddle/pten/kernels/funcs/math_function.h"
 
 namespace paddle {
 namespace operators {
@@ -121,14 +121,14 @@ class FillConstantKernel : public framework::OpKernel {
               << ((data_type == framework::proto::VarType::BF16) ? "" : "");
       tensor->mutable_data(platform::CPUPlace(), data_type);
-      math::SetConstant functor;
+      pten::funcs::SetConstant functor;
       auto &dev_ctx = *pool.Get(platform::CPUPlace());
       functor(reinterpret_cast(dev_ctx), tensor, static_cast(value));
     } else if (actual_place == 1) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       tensor->mutable_data(ctx.GetPlace(), data_type);
-      math::SetConstant functor;
+      pten::funcs::SetConstant functor;
       auto &dev_ctx = *pool.Get(ctx.GetPlace());
       functor(reinterpret_cast(dev_ctx), tensor, static_cast(value));
@@ -139,7 +139,7 @@ class FillConstantKernel : public framework::OpKernel {
     } else if (actual_place == 2) {
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
       tensor->mutable_data(platform::CUDAPinnedPlace(), data_type);
-      math::SetConstant functor;
+      pten::funcs::SetConstant functor;
       auto &dev_ctx = *pool.Get(platform::CUDAPinnedPlace());
       functor(reinterpret_cast(dev_ctx),
@@ -151,7 +151,7 @@ class FillConstantKernel : public framework::OpKernel {
     } else if (actual_place == 3) {
 #ifdef PADDLE_WITH_XPU
       tensor->mutable_data(ctx.GetPlace(), data_type);
-      math::SetConstant functor;
+      pten::funcs::SetConstant functor;
       auto &dev_ctx = *pool.Get(ctx.GetPlace());
       functor(reinterpret_cast(dev_ctx), tensor, static_cast(value));
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -64,7 +64,7 @@ class FoldOpKernel : public framework::OpKernel { framework::DDim input_matrix_shape({input_dims[0], kernel_sizes[0], kernel_sizes[1], output_height, output_width}); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, output, static_cast(0)); for (int i = 0; i < batch_size; i++) { diff --git a/paddle/fluid/operators/frame_op.h b/paddle/fluid/operators/frame_op.h index 482c641181..0f34e2f7fc 100644 --- a/paddle/fluid/operators/frame_op.h +++ b/paddle/fluid/operators/frame_op.h @@ -18,11 +18,11 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/seq2col.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/fsp_op.h b/paddle/fluid/operators/fsp_op.h index 55bd23784d..999c3ae374 100644 --- a/paddle/fluid/operators/fsp_op.h +++ b/paddle/fluid/operators/fsp_op.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -79,7 +79,7 @@ class FSPGradOpKernel : public framework::OpKernel { int64_t w = 0; auto blas = math::GetBlas(context); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; if (d_x != nullptr) { d_x->mutable_data(context.GetPlace()); set_zero(context.template device_context(), d_x, diff --git a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc index 74307c3ba7..cd88b67a56 100644 --- a/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc +++ b/paddle/fluid/operators/fused/cudnn_bn_add_relu_test.cc @@ -22,8 +22,8 @@ limitations under the License. */ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/fused/cudnn_bn_stats_finalize.cu.h" #include "paddle/fluid/operators/fused/cudnn_scale_bias_add_relu.cu.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/kernels/funcs/math_function.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); diff --git a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc index 425782d790..bec44662a2 100644 --- a/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc +++ b/paddle/fluid/operators/fused/cudnn_norm_conv_test.cc @@ -21,8 +21,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/fused/cudnn_norm_conv.cu.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace framework = paddle::framework; namespace platform = paddle::platform; diff --git a/paddle/fluid/operators/fused/fused_attention_op.cu b/paddle/fluid/operators/fused/fused_attention_op.cu index 581fc45e26..79569bb3a7 100644 --- a/paddle/fluid/operators/fused/fused_attention_op.cu +++ b/paddle/fluid/operators/fused/fused_attention_op.cu @@ -20,7 +20,7 @@ limitations under the License. */ #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/fluid/operators/fused/attention_layer_norm.h" #include "paddle/fluid/operators/fused/attn_gemm.h" diff --git a/paddle/fluid/operators/fused/fused_bn_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_activation_op.cu index 83328caf38..e825ad3078 100644 --- a/paddle/fluid/operators/fused/fused_bn_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_activation_op.cu @@ -20,10 +20,10 @@ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/fused/fused_bn_activation_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/norm_utils.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/kernels/funcs/math_function.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); @@ -256,7 +256,8 @@ class FusedBatchNormActGradKernel PADDLE_THROW( platform::errors::Unimplemented("Unsupported activation type")); } - math::SetConstant> + pten::funcs::SetConstant> functor; functor(dev_ctx, d_scale, static_cast>(0)); functor(dev_ctx, d_bias, static_cast>(0)); diff --git a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu index 7c124a0d6b..c5bc5b1725 100644 --- a/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu +++ b/paddle/fluid/operators/fused/fused_bn_add_activation_op.cu @@ -19,10 +19,10 @@ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/fused/fused_bn_add_activation_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/norm_utils.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/kernels/funcs/math_function.h" DECLARE_bool(cudnn_batchnorm_spatial_persistent); diff --git a/paddle/fluid/operators/fused/fused_dropout_test.h b/paddle/fluid/operators/fused/fused_dropout_test.h index a0d1cd4340..59b997bb51 100644 --- a/paddle/fluid/operators/fused/fused_dropout_test.h +++ b/paddle/fluid/operators/fused/fused_dropout_test.h @@ -24,8 +24,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/layer_norm_kernel.cu.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace framework = paddle::framework; namespace platform = paddle::platform; diff --git a/paddle/fluid/operators/gather.cu.h b/paddle/fluid/operators/gather.cu.h index 8386896027..739fcc9b18 100644 --- a/paddle/fluid/operators/gather.cu.h +++ b/paddle/fluid/operators/gather.cu.h @@ -18,10 +18,10 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/place.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -283,7 +283,7 @@ void GatherV2GradCUDAFunction(const Tensor* input, const Tensor* index, auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto out_dim = out->dims(); int64_t out_index_dim_size = out_dim[axis_index]; - operators::math::set_constant(*dev_ctx, out, 0.0); + pten::funcs::set_constant(*dev_ctx, out, 0.0); platform::GpuLaunchConfig config = platform::GetGpuLaunchConfig1D(ctx.cuda_device_context(), input_size); diff --git a/paddle/fluid/operators/gather.h b/paddle/fluid/operators/gather.h index 84ec587bed..bd339c4a08 100644 --- a/paddle/fluid/operators/gather.h +++ b/paddle/fluid/operators/gather.h @@ -20,8 +20,8 @@ limitations under the License. */ #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/place.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -226,7 +226,7 @@ void GatherV2GradFunction(const Tensor* input, const Tensor* index, auto* dev_ctx = platform::DeviceContextPool::Instance().Get(place); auto out_dim = out->dims(); int64_t out_index_dim_size = out_dim[axis_index]; - operators::math::set_constant(*dev_ctx, out, 0.0); + pten::funcs::set_constant(*dev_ctx, out, 0.0); for (int64_t i = 0; i < inner_dim_size; i++) { for (int64_t j = 0; j < input_index_dim_size; j++) { diff --git a/paddle/fluid/operators/gather_op_npu_test.cc b/paddle/fluid/operators/gather_op_npu_test.cc index f50c4f5528..247ce8529c 100644 --- a/paddle/fluid/operators/gather_op_npu_test.cc +++ b/paddle/fluid/operators/gather_op_npu_test.cc @@ -25,12 +25,11 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/gather_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(gather); USE_OP_DEVICE_KERNEL(gather, NPU); diff --git a/paddle/fluid/operators/gelu_op_npu_test.cc b/paddle/fluid/operators/gelu_op_npu_test.cc index f47250c968..bcaf7b11fe 100644 --- a/paddle/fluid/operators/gelu_op_npu_test.cc +++ b/paddle/fluid/operators/gelu_op_npu_test.cc @@ -25,12 +25,11 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(gelu); USE_OP_DEVICE_KERNEL(gelu, NPU); diff --git a/paddle/fluid/operators/grid_sampler_op.cu b/paddle/fluid/operators/grid_sampler_op.cu index 8e9f445f3b..df70efcc6f 100644 --- a/paddle/fluid/operators/grid_sampler_op.cu +++ b/paddle/fluid/operators/grid_sampler_op.cu @@ -292,7 +292,7 @@ class GridSampleOpCUDAKernel : public framework::OpKernel { auto* output_data = output->mutable_data(ctx.GetPlace()); VLOG(3) << "out dims: " << output->dims()[0] << "; " << output->dims()[1] << "; " << output->dims()[2] << "; " << output->dims()[3]; - math::SetConstant()( + pten::funcs::SetConstant()( dev_ctx, output, static_cast(0)); int count = static_cast(n * out_h * out_w); auto cu_stream = dev_ctx.stream(); @@ -459,7 +459,7 @@ class GridSampleGradOpCUDAKernel : public framework::OpKernel { auto* input_grad = ctx.Output(framework::GradVarName("X")); input_grad->mutable_data(ctx.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), input_grad, static_cast(0)); @@ -467,7 +467,7 @@ class GridSampleGradOpCUDAKernel : public framework::OpKernel { if (ctx.HasOutput(framework::GradVarName("Grid"))) { auto* grid_grad = ctx.Output(framework::GradVarName("Grid")); grid_grad_data = grid_grad->mutable_data(ctx.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), grid_grad, static_cast(0)); } diff --git a/paddle/fluid/operators/grid_sampler_op.h b/paddle/fluid/operators/grid_sampler_op.h index a595e5078b..874a8d8c2a 100644 --- a/paddle/fluid/operators/grid_sampler_op.h +++ b/paddle/fluid/operators/grid_sampler_op.h @@ -19,8 +19,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/gather.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/pten/core/hostdevice.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -520,7 +520,7 @@ class GridSampleOpKernel : public framework::OpKernel { auto* output = ctx.Output("Output"); output->mutable_data({n, c, out_h, out_w}, ctx.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), output, static_cast(0)); @@ -563,7 +563,7 @@ class GridSampleGradOpKernel : public framework::OpKernel { auto* input_grad = ctx.Output(framework::GradVarName("X")); input_grad->mutable_data({n, c, in_h, in_w}, ctx.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), input_grad, static_cast(0)); @@ -571,7 +571,7 @@ class GridSampleGradOpKernel : public framework::OpKernel { if (ctx.HasOutput(framework::GradVarName("Grid"))) { grid_grad = ctx.Output(framework::GradVarName("Grid")); grid_grad->mutable_data({n, out_h, out_w, 2}, ctx.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), grid_grad, static_cast(0)); } diff --git a/paddle/fluid/operators/group_norm_op.cu b/paddle/fluid/operators/group_norm_op.cu index 055fd791af..584be96c65 100644 --- a/paddle/fluid/operators/group_norm_op.cu +++ b/paddle/fluid/operators/group_norm_op.cu @@ -153,7 +153,7 @@ class GroupNormKernel y->mutable_data(ctx.GetPlace()); mean->mutable_data(ctx.GetPlace()); var->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); Tensor temp_var; temp_var.mutable_data(var->dims(), ctx.GetPlace()); @@ -321,7 +321,7 @@ class GroupNormGradKernel : x_dims[x_dims.size() - 2]); d_x->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); Tensor temp_var; diff --git a/paddle/fluid/operators/group_norm_op.h b/paddle/fluid/operators/group_norm_op.h index 9cb451235f..3fc2d413b6 100644 --- a/paddle/fluid/operators/group_norm_op.h +++ b/paddle/fluid/operators/group_norm_op.h @@ -22,7 +22,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -239,7 +239,7 @@ class GroupNormGradKernel : public framework::OpKernel { const int group_size = C / groups; d_x->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); auto* x_data = x->data(); diff --git a/paddle/fluid/operators/gru_op.cc b/paddle/fluid/operators/gru_op.cc index a2d6169564..20956e3cdb 100644 --- a/paddle/fluid/operators/gru_op.cc +++ b/paddle/fluid/operators/gru_op.cc @@ -321,7 +321,7 @@ class GRUCPUKernel : public framework::OpKernel { to_batch(dev_ctx, *input, batch_gate, true, is_reverse); if (bias) { - math::RowwiseAdd add_bias; + pten::funcs::RowwiseAdd add_bias; add_bias(dev_ctx, *batch_gate, *bias, batch_gate); } diff --git a/paddle/fluid/operators/gru_op.cu.cc b/paddle/fluid/operators/gru_op.cu.cc index ce3c8ac51c..0f1db8de5a 100644 --- a/paddle/fluid/operators/gru_op.cu.cc +++ b/paddle/fluid/operators/gru_op.cu.cc @@ -70,7 +70,7 @@ class GRUKernel : public framework::OpKernel { to_batch(dev_ctx, *input, batch_gate, true, is_reverse); if (bias) { - math::RowwiseAdd add_bias; + pten::funcs::RowwiseAdd add_bias; add_bias(dev_ctx, *batch_gate, *bias, batch_gate); } diff --git a/paddle/fluid/operators/gru_op.h b/paddle/fluid/operators/gru_op.h index bcca992e2b..e9d520dd9f 100644 --- a/paddle/fluid/operators/gru_op.h +++ b/paddle/fluid/operators/gru_op.h @@ -18,8 +18,8 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/gru_compute.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence2batch.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -69,7 +69,7 @@ class GRUGradKernel : public framework::OpKernel { batch_gate_grad.mutable_data(gate_dims, context.GetPlace()); batch_reset_hidden_prev_grad.mutable_data(hidden_dims, context.GetPlace()); - math::SetConstant zero; + pten::funcs::SetConstant zero; auto& dev_ctx = context.template device_context(); zero(dev_ctx, &batch_hidden_grad, static_cast(0.0)); zero(dev_ctx, &batch_gate_grad, static_cast(0.0)); @@ -157,7 +157,7 @@ class GRUGradKernel : public framework::OpKernel { } if (bias_grad) { bias_grad->mutable_data(context.GetPlace()); - math::ColwiseSum col_sum; + pten::funcs::ColwiseSum col_sum; col_sum(dev_ctx, batch_gate_grad, bias_grad); } if (h0 && h0_grad) { diff --git a/paddle/fluid/operators/gumbel_softmax_op.cu b/paddle/fluid/operators/gumbel_softmax_op.cu index 63577ed1e0..ba6ce141e8 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.cu +++ b/paddle/fluid/operators/gumbel_softmax_op.cu @@ -99,7 +99,7 @@ struct OneHotGenerator { Tensor input_tensor; input_tensor.mutable_data(Out->dims(), platform::CUDAPlace()); paddle::framework::TensorCopy(*Out, context.GetPlace(), &input_tensor); - math::set_constant(context, Out, 0.0); + pten::funcs::set_constant(context, Out, 0.0); OneHotCUDAKernel< T, thread_size><<>>( height, size_from_axis / size_out_axis, size_out_axis, diff --git a/paddle/fluid/operators/gumbel_softmax_op.h b/paddle/fluid/operators/gumbel_softmax_op.h index 
f95a4810f4..3cd211ccc3 100644 --- a/paddle/fluid/operators/gumbel_softmax_op.h +++ b/paddle/fluid/operators/gumbel_softmax_op.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/softmax.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -155,7 +155,7 @@ struct OneHotGenerator { #undef CALL_ARG_MINMAX_FUNCTOR } - math::set_constant(context, Out, 0.0); + pten::funcs::set_constant(context, Out, 0.0); for (int i = 0; i < size_to_axis; i++) { for (int j = 0; j < size_out_axis; j++) { *(Out->data() + i * size_from_axis + j + diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 17734b9c54..5734e247f4 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -24,9 +24,9 @@ limitations under the License. */ #include "paddle/fluid/framework/mixed_vector.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/clip_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/matrix_bit_code.h" #include "paddle/fluid/platform/transform.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -81,10 +81,10 @@ class HierarchicalSigmoidOpKernel : public framework::OpKernel { auto pre_out_mat = EigenMatrix::From(*pre_out); // Not all class(leaf) nodes' path lengths equal code_length, thus init as // 0s can avoid out of path's loss. - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(dev_ctx, pre_out, static_cast(0.0)); auto& place = *ctx.template device_context().eigen_device(); - math::RowwiseSum row_sum; + pten::funcs::RowwiseSum row_sum; std::unique_ptr> bit_code; if (!is_custom) { @@ -134,7 +134,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { auto* in_grad = ctx.Output(framework::GradVarName("X")); bool is_sparse = ctx.Attr("is_sparse"); auto& dev_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; auto& label = GET_DATA_SAFELY(ctx.Input("Label"), "Input", "Label", "HierarchicalSigmoidGrad"); auto& pre_out = GET_DATA_SAFELY(ctx.Input("PreOut"), "Input", diff --git a/paddle/fluid/operators/histogram_op.cu b/paddle/fluid/operators/histogram_op.cu index a34f4b8a22..48a637e6c3 100644 --- a/paddle/fluid/operators/histogram_op.cu +++ b/paddle/fluid/operators/histogram_op.cu @@ -82,7 +82,7 @@ class HistogramCUDAKernel : public framework::OpKernel { const int input_numel = input->numel(); int64_t* out_data = output->mutable_data(context.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( context.template device_context(), output, static_cast(0)); diff --git a/paddle/fluid/operators/histogram_op.h b/paddle/fluid/operators/histogram_op.h index a6f4448cbc..9e280336e4 100644 --- a/paddle/fluid/operators/histogram_op.h +++ b/paddle/fluid/operators/histogram_op.h @@ -18,7 +18,7 @@ limitations under the License. 
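(Illustrative note: two spellings migrate together in this patch. The templated `SetConstant<DeviceContext, T>` functor dispatches at compile time, while the lowercase `set_constant` free function, used above in gather.cu.h and gumbel_softmax_op.h and defined in the deleted math_function.cc further below, picks the place at runtime and visits the tensor's dtype, so its fill value is always passed as `float`. A short sketch, with `cpu_ctx`, `dev_ctx`, and `tensor` assumed from context:

    // Compile-time dispatch: device and element type are template arguments.
    pten::funcs::SetConstant<platform::CPUDeviceContext, float> set_zero;
    set_zero(cpu_ctx, &tensor, 0.0f);

    // Runtime dispatch: inspects the tensor's place and visits its dtype,
    // casting the float value internally.
    pten::funcs::set_constant(dev_ctx, &tensor, 0.0f);
)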
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -39,7 +39,7 @@ class HistogramKernel : public framework::OpKernel { auto input_numel = input->numel(); int64_t* out_data = output->mutable_data(context.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( context.template device_context(), output, static_cast(0)); diff --git a/paddle/fluid/operators/im2sequence_op.h b/paddle/fluid/operators/im2sequence_op.h index 39ff7ea40a..6eac1cc4e4 100644 --- a/paddle/fluid/operators/im2sequence_op.h +++ b/paddle/fluid/operators/im2sequence_op.h @@ -20,7 +20,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/increment_op_npu_test.cc b/paddle/fluid/operators/increment_op_npu_test.cc index ca9420c04a..47e2f2c3cf 100644 --- a/paddle/fluid/operators/increment_op_npu_test.cc +++ b/paddle/fluid/operators/increment_op_npu_test.cc @@ -25,12 +25,11 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(increment); USE_OP_DEVICE_KERNEL(increment, NPU); diff --git a/paddle/fluid/operators/index_sample_op.cu b/paddle/fluid/operators/index_sample_op.cu index 4c9dec1400..e145c555dc 100644 --- a/paddle/fluid/operators/index_sample_op.cu +++ b/paddle/fluid/operators/index_sample_op.cu @@ -14,9 +14,9 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/index_sample_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" #define PREDEFINED_BLOCK_SIZE_X 512 #define PREDEFINED_BLOCK_SIZE 1024 @@ -177,7 +177,7 @@ class IndexSampleGradKernel (batch_size + block_dim.y - 1) / block_dim.y); LimitGridDim(ctx, &grid_dim); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); set_zero(dev_ctx, input_grad, static_cast(0)); diff --git a/paddle/fluid/operators/index_select_op.h b/paddle/fluid/operators/index_select_op.h index be76a66ef7..b157f775d5 100644 --- a/paddle/fluid/operators/index_select_op.h +++ b/paddle/fluid/operators/index_select_op.h @@ -16,7 +16,7 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -159,7 +159,7 @@ void IndexSelectGradInner(const framework::ExecutionContext& context, auto output_dim = x_grad->dims(); auto& dev_ctx = context.template device_context(); - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; set_constant(dev_ctx, x_grad, 
static_cast(0.0)); auto slice_size = 1; diff --git a/paddle/fluid/operators/inplace_abn_op.h b/paddle/fluid/operators/inplace_abn_op.h index 9c3727ab90..142096eb34 100644 --- a/paddle/fluid/operators/inplace_abn_op.h +++ b/paddle/fluid/operators/inplace_abn_op.h @@ -16,7 +16,7 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index cfdaacf8cb..8c650c6437 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -18,7 +18,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -208,7 +208,7 @@ class InstanceNormKernel Eigen::IndexList> rdims; #endif - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; saved_mean->mutable_data(ctx.GetPlace()); saved_variance->mutable_data(ctx.GetPlace()); @@ -356,7 +356,7 @@ class InstanceNormGradKernel NxC_shape.set(0, NxC); #endif - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; Tensor scale_data; if (!scale) { @@ -492,7 +492,7 @@ class InstanceNormDoubleGradKernel auto *ddY = ctx.Output("DDY"); auto &dev_ctx = ctx.template device_context(); - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; const auto &x_dims = X->dims(); int N, C, H, W, D; diff --git a/paddle/fluid/operators/instance_norm_op.cu b/paddle/fluid/operators/instance_norm_op.cu index e040136669..a6c935074f 100644 --- a/paddle/fluid/operators/instance_norm_op.cu +++ b/paddle/fluid/operators/instance_norm_op.cu @@ -25,8 +25,8 @@ namespace cub = hipcub; #endif #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/operators/instance_norm_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -169,7 +169,7 @@ class InstanceNormKernel const int max_blocks = std::max(max_threads / block, 1); const int grid = std::min((NxC + block - 1) / block, max_blocks); - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; if (scale) { repeat_param<<>>( scale->data(), scale_tmp.data(), N, C); @@ -185,7 +185,7 @@ class InstanceNormKernel auto handle = dev_ctx.cudnn_handle(); - math::SetConstant> + pten::funcs::SetConstant> functor; auto *saved_mean = ctx.Output("SavedMean"); @@ -349,7 +349,7 @@ class InstanceNormGradKernel } auto &dev_ctx = ctx.template device_context(); - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; const int n = x->numel(); const int block = 512; @@ -379,7 +379,8 @@ class InstanceNormGradKernel if ((H * W * D) == 1) { framework::TensorCopy(*d_y, ctx.GetPlace(), d_x); - math::SetConstant> + pten::funcs::SetConstant> functor; functor(dev_ctx, d_scale, static_cast>(0)); functor(dev_ctx, d_bias, static_cast>(0)); @@ -732,7 +733,7 @@ class InstanceNormDoubleGradKernel const T *variance_data = Saved_variance->data(); auto &dev_ctx = ctx.template device_context(); - math::SetConstant set_zero; + 
pten::funcs::SetConstant set_zero; auto &x_dims = X->dims(); int N, C, H, W, D; diff --git a/paddle/fluid/operators/interpolate_op.cu b/paddle/fluid/operators/interpolate_op.cu index 3c857eb326..eaf8a2f7d9 100644 --- a/paddle/fluid/operators/interpolate_op.cu +++ b/paddle/fluid/operators/interpolate_op.cu @@ -1159,7 +1159,7 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_w == out_w) { @@ -1241,7 +1241,7 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_h == out_h && in_w == out_w) { @@ -1348,7 +1348,7 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, } auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_d == out_d && in_h == out_h && in_w == out_w) { diff --git a/paddle/fluid/operators/interpolate_op.h b/paddle/fluid/operators/interpolate_op.h index 0c0dde6bd4..46353cfb2f 100644 --- a/paddle/fluid/operators/interpolate_op.h +++ b/paddle/fluid/operators/interpolate_op.h @@ -14,8 +14,8 @@ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/pten/core/hostdevice.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -1057,7 +1057,7 @@ static void Interpolate1DCPUBwd(const framework::ExecutionContext& ctx, input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_w == out_w) { @@ -1126,7 +1126,7 @@ static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx, input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_h == out_h && in_w == out_w) { @@ -1213,7 +1213,7 @@ static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx, } input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_d == out_d && in_h == out_h && in_w == out_w) { diff --git a/paddle/fluid/operators/interpolate_v2_op.cu b/paddle/fluid/operators/interpolate_v2_op.cu index 8555cd14f4..8c15762958 100644 --- a/paddle/fluid/operators/interpolate_v2_op.cu +++ b/paddle/fluid/operators/interpolate_v2_op.cu @@ -1686,7 +1686,7 @@ static void Interpolate1DCUDABwd(const framework::ExecutionContext& ctx, input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - 
math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_w == out_w) { @@ -1808,7 +1808,7 @@ static void Interpolate2DCUDABwd(const framework::ExecutionContext& ctx, input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_h == out_h && in_w == out_w) { @@ -1993,7 +1993,7 @@ static void Interpolate3DCUDABwd(const framework::ExecutionContext& ctx, } auto* input_grad_data = input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_d == out_d && in_h == out_h && in_w == out_w) { diff --git a/paddle/fluid/operators/interpolate_v2_op.h b/paddle/fluid/operators/interpolate_v2_op.h index 4d6189b57b..400c94f48a 100644 --- a/paddle/fluid/operators/interpolate_v2_op.h +++ b/paddle/fluid/operators/interpolate_v2_op.h @@ -14,8 +14,8 @@ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/pten/core/hostdevice.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -1276,7 +1276,7 @@ static void Interpolate1DCPUBwd(const framework::ExecutionContext& ctx, input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_w == out_w) { @@ -1383,7 +1383,7 @@ static void Interpolate2DCPUBwd(const framework::ExecutionContext& ctx, input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_h == out_h && in_w == out_w) { @@ -1527,7 +1527,7 @@ static void Interpolate3DCPUBwd(const framework::ExecutionContext& ctx, } input_grad->mutable_data(dim_grad, ctx.GetPlace()); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); if (in_d == out_d && in_h == out_h && in_w == out_w) { diff --git a/paddle/fluid/operators/layer_norm_op.h b/paddle/fluid/operators/layer_norm_op.h index ad7c0cc218..b7916f44d3 100644 --- a/paddle/fluid/operators/layer_norm_op.h +++ b/paddle/fluid/operators/layer_norm_op.h @@ -25,7 +25,7 @@ limitations under the License. 
*/ !defined(__OSX__) #include "paddle/fluid/operators/jit/kernels.h" #endif -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace platform { @@ -57,7 +57,7 @@ class RowwiseMean2D { : left_(left), right_(right) { framework::DDim ones_dim({right_}); divisor_.mutable_data(ones_dim, dev_ctx.GetPlace()); - math::set_constant(dev_ctx, &divisor_, 1.0 / right); + pten::funcs::set_constant(dev_ctx, &divisor_, 1.0 / right); } void operator()(const platform::CUDADeviceContext& context, const framework::Tensor& input, framework::Tensor* out) { @@ -84,7 +84,7 @@ class RowwiseMean2D { } private: - math::RowwiseMean row_mean_; + pten::funcs::RowwiseMean row_mean_; }; template @@ -103,7 +103,7 @@ class ColwiseSum2D { : left_(left), right_(right) { framework::DDim ones_dim({left_}); divisor_.mutable_data(ones_dim, dev_ctx.GetPlace()); - math::set_constant(dev_ctx, &divisor_, 1.0); + pten::funcs::set_constant(dev_ctx, &divisor_, 1.0); } void operator()(const platform::CUDADeviceContext& context, @@ -131,7 +131,7 @@ class ColwiseSum2D { } private: - math::ColwiseSum col_wise_; + pten::funcs::ColwiseSum col_wise_; }; template diff --git a/paddle/fluid/operators/layout_utils.h b/paddle/fluid/operators/layout_utils.h index 52fa7fd107..57c95afc10 100644 --- a/paddle/fluid/operators/layout_utils.h +++ b/paddle/fluid/operators/layout_utils.h @@ -20,7 +20,7 @@ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -112,18 +112,18 @@ inline void TransToChannelFirst(const framework::ExecutionContext& context, if (dim == 3) { auto& dev_ctx = context.template device_context(); std::vector axis{0, 4, 1, 2, 3}; - math::Transpose trans5; + pten::funcs::Transpose trans5; trans5(dev_ctx, *input, transformed_input, axis); } else if (dim == 2) { auto& dev_ctx = context.template device_context(); std::vector axis{0, 3, 1, 2}; - math::Transpose trans4; + pten::funcs::Transpose trans4; trans4(dev_ctx, *input, transformed_input, axis); } else if (dim == 1) { auto& dev_ctx = context.template device_context(); std::vector axis{0, 2, 1}; - math::Transpose trans3; + pten::funcs::Transpose trans3; trans3(dev_ctx, *input, transformed_input, axis); } } @@ -135,18 +135,18 @@ inline void TransToChannelLast(const framework::ExecutionContext& context, if (dim == 3) { auto& dev_ctx = context.template device_context(); std::vector axis{0, 2, 3, 4, 1}; - math::Transpose trans5; + pten::funcs::Transpose trans5; trans5(dev_ctx, *input, transformed_input, axis); } else if (dim == 2) { auto& dev_ctx = context.template device_context(); std::vector axis{0, 2, 3, 1}; - math::Transpose trans4; + pten::funcs::Transpose trans4; trans4(dev_ctx, *input, transformed_input, axis); } else if (dim == 1) { auto& dev_ctx = context.template device_context(); std::vector axis{0, 2, 1}; - math::Transpose trans3; + pten::funcs::Transpose trans3; trans3(dev_ctx, *input, transformed_input, axis); } } diff --git a/paddle/fluid/operators/linear_chain_crf_op.h b/paddle/fluid/operators/linear_chain_crf_op.h index eacc5f467d..c9a82dec72 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.h +++ b/paddle/fluid/operators/linear_chain_crf_op.h @@ -15,7 +15,7 @@ limitations under the License. 
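(Illustrative note: the layout helpers in layout_utils.h above instantiate the `Transpose` functor once per rank, as trans3/trans4/trans5, because the rank is a compile-time template argument. A minimal sketch of one rank-5 call after the move, with `DeviceContext`, `T`, `dev_ctx`, `input`, and `transformed_input` assumed from the surrounding kernel:

    std::vector<int> axis{0, 4, 1, 2, 3};  // NDHWC -> NCDHW
    pten::funcs::Transpose<DeviceContext, T, 5> trans5;
    trans5(dev_ctx, *input, transformed_input, axis);
)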
*/ #pragma once #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -102,8 +102,8 @@ class LinearChainCRFOpKernel : public framework::OpKernel { label_tmp.Resize({batch_size, 1}); alpha_tmp.Resize({batch_size, tag_num}); emission_exps_tmp.Resize({batch_size, tag_num}); - math::set_constant(ctx.device_context(), emission_exps, 0.0); - math::set_constant(ctx.device_context(), alpha, 0.0); + pten::funcs::set_constant(ctx.device_context(), emission_exps, 0.0); + pten::funcs::set_constant(ctx.device_context(), alpha, 0.0); } else { in_lod = ctx.Input("Label")->lod(); PADDLE_ENFORCE_NE(in_lod.size(), 0, @@ -274,7 +274,7 @@ class LinearChainCRFGradOpKernel : public framework::OpKernel { // data reader operator, it can have no gradients. if (transition_grad) { transition_grad->mutable_data(platform::CPUPlace()); - math::set_constant(ctx.device_context(), transition_grad, 0.); + pten::funcs::set_constant(ctx.device_context(), transition_grad, 0.); } // Now, all the inputs and outputs should be on the CPU memory. auto emission_dims = emission_exps->dims(); diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h index d8e0fefe17..7e384f4b64 100644 --- a/paddle/fluid/operators/linspace_op.h +++ b/paddle/fluid/operators/linspace_op.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lrn_op.cc b/paddle/fluid/operators/lrn_op.cc index b7c28a0908..bee8b5396a 100644 --- a/paddle/fluid/operators/lrn_op.cc +++ b/paddle/fluid/operators/lrn_op.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" #ifdef PADDLE_WITH_MKLDNN #include "paddle/fluid/platform/mkldnn_helper.h" #endif @@ -36,7 +36,7 @@ struct LRNFunctor { T k, T alpha, T beta, const DataLayout data_layout) { auto place = ctx.GetPlace(); auto blas = math::GetBlas(ctx); - math::Transpose transpose; + pten::funcs::Transpose transpose; auto& dev_ctx = ctx.template device_context(); Tensor in_transpose, mid_transpose, out_transpose; // if channel_last, transpose to channel_first diff --git a/paddle/fluid/operators/lrn_op.h b/paddle/fluid/operators/lrn_op.h index bdf3ad81dd..a619d6c723 100644 --- a/paddle/fluid/operators/lrn_op.h +++ b/paddle/fluid/operators/lrn_op.h @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_layout.h" #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/lstm_op.h b/paddle/fluid/operators/lstm_op.h index c6f43b949a..df94952a9a 100644 --- a/paddle/fluid/operators/lstm_op.h +++ b/paddle/fluid/operators/lstm_op.h @@ -76,7 +76,7 @@ class LSTMKernel : public framework::OpKernel { Tensor b = *bias; b.Resize({bias->numel(), 1}); Tensor gate_bias = b.Slice(0, 4 * frame_size); - math::RowwiseAdd add_bias; + pten::funcs::RowwiseAdd add_bias; add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } @@ -210,7 +210,7 @@ class LSTMGradKernel : public framework::OpKernel { auto* c0_g = ctx.Output(framework::GradVarName("C0")); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; if (weight_g) { weight_g->mutable_data(ctx.GetPlace()); zero(device_ctx, weight_g, static_cast(0.0)); @@ -380,7 +380,7 @@ class LSTMGradKernel : public framework::OpKernel { Tensor b_g = *bias_g; b_g.Resize({bias_g->numel(), 1}); Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); - math::ColwiseSum col_sum; + pten::funcs::ColwiseSum col_sum; col_sum(device_ctx, batch_gate_g, &gate_bias_g); } diff --git a/paddle/fluid/operators/lstmp_op.h b/paddle/fluid/operators/lstmp_op.h index 5a6ac42f45..c63184f76e 100644 --- a/paddle/fluid/operators/lstmp_op.h +++ b/paddle/fluid/operators/lstmp_op.h @@ -133,7 +133,7 @@ class LSTMPKernel : public framework::OpKernel { Tensor b = *bias; b.Resize({bias->numel(), 1}); Tensor gate_bias = b.Slice(0, 4 * frame_size); - math::RowwiseAdd add_bias; + pten::funcs::RowwiseAdd add_bias; add_bias(device_ctx, *batch_gate, gate_bias, batch_gate); } @@ -304,7 +304,7 @@ class LSTMPGradKernel : public framework::OpKernel { auto* c0_g = ctx.Output(framework::GradVarName("C0")); auto& device_ctx = ctx.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; if (weight_g) { weight_g->mutable_data(ctx.GetPlace()); zero(device_ctx, weight_g, static_cast(0.0)); @@ -514,7 +514,7 @@ class LSTMPGradKernel : public framework::OpKernel { Tensor b_g = *bias_g; b_g.Resize({bias_g->numel(), 1}); Tensor gate_bias_g = b_g.Slice(0, 4 * frame_size); - math::ColwiseSum col_sum; + pten::funcs::ColwiseSum col_sum; col_sum(device_ctx, batch_gate_g, &gate_bias_g); } diff --git a/paddle/fluid/operators/lstsq_op.h b/paddle/fluid/operators/lstsq_op.h index be41123270..dd0cff5cc5 100644 --- a/paddle/fluid/operators/lstsq_op.h +++ b/paddle/fluid/operators/lstsq_op.h @@ -21,12 +21,12 @@ #include "paddle/fluid/operators/math/complex_functors.h" #include "paddle/fluid/operators/math/eigen_values_vectors.h" #include "paddle/fluid/operators/math/lapack_function.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/operators/triangular_solve_op.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" #define EPSILON 1e-6 diff --git a/paddle/fluid/operators/lu_op.h b/paddle/fluid/operators/lu_op.h index c3b3552ba1..b3d79122bc 100644 --- a/paddle/fluid/operators/lu_op.h +++ b/paddle/fluid/operators/lu_op.h @@ -455,7 +455,7 @@ void Unpack_Pivot(const DeviceContext& 
dev_ctx, const framework::Tensor& Pivot, auto Pdim = framework::make_ddim(Pdimvec); P->Resize(Pdim); auto pdata = P->mutable_data(dev_ctx.GetPlace()); - math::SetConstant setter; + pten::funcs::SetConstant setter; setter(dev_ctx, P, static_cast(0)); auto batchsize = product(framework::slice_ddim(dims, 0, prank - 1)); @@ -543,7 +543,7 @@ class LUGradKernel : public framework::OpKernel { Tensor_Add(dev_ctx, phi_L, phi_U, &phi); psi.Resize(xdims); psi.mutable_data(ctx.GetPlace()); - math::SetConstant setter; + pten::funcs::SetConstant setter; setter(dev_ctx, &psi, static_cast(0)); std::vector axes = {xrank - 2, xrank - 1}; diff --git a/paddle/fluid/operators/lu_unpack_op.h b/paddle/fluid/operators/lu_unpack_op.h index 115ab116fd..c245c7eb65 100644 --- a/paddle/fluid/operators/lu_unpack_op.h +++ b/paddle/fluid/operators/lu_unpack_op.h @@ -110,7 +110,7 @@ class LU_UnpackGradKernel : public framework::OpKernel { std::vector slice_ends(2, 0); auto valuedims = vectorize(xdims); - math::SetConstant setter; + pten::funcs::SetConstant setter; setter(dev_ctx, dx, static_cast(0)); if (m <= n) { slice_starts[0] = 0; diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu index 51776f2166..a59909644a 100644 --- a/paddle/fluid/operators/margin_cross_entropy_op.cu +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -22,11 +22,11 @@ namespace cub = hipcub; #include #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/margin_cross_entropy_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/softmax_impl.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #include "paddle/fluid/operators/reduce_ops/reduce_op.h" #include "paddle/fluid/string/string_helper.h" +#include "paddle/pten/kernels/funcs/math_function.h" #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) #include "paddle/fluid/platform/collective_helper.h" @@ -341,8 +341,8 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { // step 6, prob = exp((logit - logit_max) - log(sum(exp(logit - // logit_max)))) // loss = -((logit_i - logit_max) - log(sum(exp(logit - logit_max)))) - math::SetConstant()(dev_ctx, loss, - static_cast(0.0)); + pten::funcs::SetConstant()( + dev_ctx, loss, static_cast(0.0)); if (label_type == framework::proto::VarType::INT32) { typedef int32_t LabelT; HardLabelSoftmaxWithCrossEntropyKernel< diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 65bf595bce..a97e2ecfce 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -61,7 +61,7 @@ math_library(gru_compute DEPS activation_functions math_function) math_library(lstm_compute DEPS activation_functions) cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context) -math_library(math_function DEPS blas dense_tensor tensor) +# math_library(math_function DEPS blas dense_tensor tensor) math_library(maxouting) math_library(pooling) @@ -95,7 +95,6 @@ math_library(matrix_inverse) math_library(segment_pooling) math_library(matrix_solve) -cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) cc_test(im2col_test SRCS im2col_test.cc DEPS im2col) cc_test(vol2col_test SRCS vol2col_test.cc DEPS vol2col) @@ -103,11 +102,9 @@ cc_test(sequence_padding_test SRCS sequence_padding_test.cc 
DEPS sequence_paddin cc_test(sequence_pooling_test SRCS sequence_pooling_test.cc DEPS sequence_pooling) cc_test(beam_search_test SRCS beam_search_test.cc DEPS beam_search) if(WITH_GPU) - nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function) nv_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function) endif() if(WITH_ROCM) - hip_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor) hip_test(selected_rows_functor_gpu_test SRCS selected_rows_functor_test.cu.cc DEPS selected_rows_functor math_function) endif() cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split) diff --git a/paddle/fluid/operators/math/blas_impl.cu.h b/paddle/fluid/operators/math/blas_impl.cu.h index 7ffd2a7ab2..f9a4e963c0 100644 --- a/paddle/fluid/operators/math/blas_impl.cu.h +++ b/paddle/fluid/operators/math/blas_impl.cu.h @@ -14,8 +14,8 @@ #pragma once -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/dynload/cublas.h" +#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/pten/backends/gpu/gpu_context.h" diff --git a/paddle/fluid/operators/math/blas_impl.h b/paddle/fluid/operators/math/blas_impl.h index 80b7acc610..8e0075c42e 100644 --- a/paddle/fluid/operators/math/blas_impl.h +++ b/paddle/fluid/operators/math/blas_impl.h @@ -22,9 +22,9 @@ #include #include -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/bfloat16.h" #include "paddle/fluid/platform/complex.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/blas_impl.hip.h b/paddle/fluid/operators/math/blas_impl.hip.h index bf7d66f485..980caa9cfe 100644 --- a/paddle/fluid/operators/math/blas_impl.hip.h +++ b/paddle/fluid/operators/math/blas_impl.hip.h @@ -14,10 +14,10 @@ #pragma once -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/dynload/rocblas.h" #include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/kernels/funcs/math_function.h" DECLARE_bool(enable_cublas_tensor_op_math); diff --git a/paddle/fluid/operators/math/depthwise_conv.cu b/paddle/fluid/operators/math/depthwise_conv.cu index 6ff2ddaa33..117e6c4708 100644 --- a/paddle/fluid/operators/math/depthwise_conv.cu +++ b/paddle/fluid/operators/math/depthwise_conv.cu @@ -22,9 +22,9 @@ limitations under the License. 
*/ namespace cub = hipcub; #endif #include "paddle/fluid/operators/math/depthwise_conv.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -910,7 +910,7 @@ class DepthwiseConvFunctor(context.GetPlace()); std::vector perm_axis({2, 3, 0, 1}); - math::TransposeNormal trans; + pten::funcs::TransposeNormal trans; trans(context, filter, &filter_hwc, perm_axis); filter_data = filter_hwc.data(); } @@ -1053,7 +1053,7 @@ class DepthwiseConvInputGradFunctor(context.GetPlace()); std::vector perm_axis({2, 3, 0, 1}); - math::TransposeNormal trans; + pten::funcs::TransposeNormal trans; trans(context, filter, &filter_hwc, perm_axis); filter_data = filter_hwc.data(); } @@ -1215,7 +1215,7 @@ class DepthwiseConvFilterGradFunctordims()[0], filter_grad->dims()[1]}); \ filter_grad_hwc.Resize(filter_grad_hwc_dims); \ filter_grad_hwc.mutable_data(context.GetPlace()); \ - math::SetConstant set_zero; \ + pten::funcs::SetConstant set_zero; \ set_zero(context, &filter_grad_hwc, static_cast(0)); \ filter_grad_data = filter_grad_hwc.data(); \ } else { \ @@ -1240,7 +1240,7 @@ class DepthwiseConvFilterGradFunctor perm_axis({2, 3, 0, 1}); \ - math::TransposeNormal trans; \ + pten::funcs::TransposeNormal trans; \ trans(context, filter_grad_hwc, filter_grad, perm_axis); \ } \ } \ diff --git a/paddle/fluid/operators/math/math_function.cc b/paddle/fluid/operators/math/math_function.cc deleted file mode 100644 index 2672d02db0..0000000000 --- a/paddle/fluid/operators/math/math_function.cc +++ /dev/null @@ -1,306 +0,0 @@ -/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
-
-#include "paddle/fluid/operators/math/math_function.h"
-
-#ifdef PADDLE_WITH_MKLML
-#include "paddle/fluid/platform/dynload/mklml.h"
-#endif
-
-#ifdef PADDLE_USE_OPENBLAS
-#include <cblas.h>
-#endif
-
-#include <memory>
-#include <utility>
-#include <vector>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/operators/math/math_function_impl.h"
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/pten/backends/cpu/cpu_context.h"
-#include "paddle/pten/kernels/funcs/eigen/common.h"
-#include "unsupported/Eigen/CXX11/Tensor"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-using float16 = paddle::platform::float16;
-
-template struct SetConstant<platform::CPUDeviceContext, platform::float16>;
-template struct SetConstant<platform::CPUDeviceContext, platform::bfloat16>;
-template struct SetConstant<platform::CPUDeviceContext, float>;
-template struct SetConstant<platform::CPUDeviceContext, double>;
-template struct SetConstant<platform::CPUDeviceContext, int16_t>;
-template struct SetConstant<platform::CPUDeviceContext, int>;
-template struct SetConstant<platform::CPUDeviceContext, int64_t>;
-template struct SetConstant<platform::CPUDeviceContext, bool>;
-template struct SetConstant<platform::CPUDeviceContext, uint8_t>;
-template struct SetConstant<platform::CPUDeviceContext, platform::complex<float>>;
-template struct SetConstant<platform::CPUDeviceContext, platform::complex<double>>;
-
-template struct SetConstant<pten::CPUContext, platform::float16>;
-template struct SetConstant<pten::CPUContext, platform::bfloat16>;
-template struct SetConstant<pten::CPUContext, float>;
-template struct SetConstant<pten::CPUContext, double>;
-template struct SetConstant<pten::CPUContext, int16_t>;
-template struct SetConstant<pten::CPUContext, int>;
-template struct SetConstant<pten::CPUContext, int64_t>;
-template struct SetConstant<pten::CPUContext, bool>;
-template struct SetConstant<pten::CPUContext, uint8_t>;
-template struct SetConstant<pten::CPUContext, platform::complex<float>>;
-template struct SetConstant<pten::CPUContext, platform::complex<double>>;
-
-#ifdef PADDLE_WITH_XPU
-template struct SetConstant<platform::XPUDeviceContext, platform::float16>;
-template struct SetConstant<platform::XPUDeviceContext, platform::bfloat16>;
-template struct SetConstant<platform::XPUDeviceContext, float>;
-template struct SetConstant<platform::XPUDeviceContext, double>;
-template struct SetConstant<platform::XPUDeviceContext, int16_t>;
-template struct SetConstant<platform::XPUDeviceContext, int>;
-template struct SetConstant<platform::XPUDeviceContext, int64_t>;
-template struct SetConstant<platform::XPUDeviceContext, bool>;
-template struct SetConstant<platform::XPUDeviceContext, uint8_t>;
-template struct SetConstant<platform::XPUDeviceContext, platform::complex<float>>;
-template struct SetConstant<platform::XPUDeviceContext, platform::complex<double>>;
-#endif
-
-#define DEFINE_CPU_TRANS(RANK)                                                     \
-  template struct Transpose<platform::CPUDeviceContext, platform::float16, RANK>;  \
-  template struct Transpose<platform::CPUDeviceContext, platform::bfloat16, RANK>; \
-  template struct Transpose<platform::CPUDeviceContext, float, RANK>;              \
-  template struct Transpose<platform::CPUDeviceContext, double, RANK>;             \
-  template struct Transpose<platform::CPUDeviceContext, int, RANK>;                \
-  template struct Transpose<platform::CPUDeviceContext, int64_t, RANK>;            \
-  template struct Transpose<platform::CPUDeviceContext, bool, RANK>;               \
-  template struct Transpose<platform::CPUDeviceContext, int16_t, RANK>;            \
-  template struct Transpose<platform::CPUDeviceContext, uint8_t, RANK>;            \
-  template struct Transpose<platform::CPUDeviceContext, int8_t, RANK>;             \
-  template struct Transpose<platform::CPUDeviceContext, platform::complex<float>, RANK>; \
-  template struct Transpose<platform::CPUDeviceContext, platform::complex<double>, RANK>;
-
-DEFINE_CPU_TRANS(1);
-DEFINE_CPU_TRANS(2);
-DEFINE_CPU_TRANS(3);
-DEFINE_CPU_TRANS(4);
-DEFINE_CPU_TRANS(5);
-DEFINE_CPU_TRANS(6);
-
-template <typename T>
-struct TransposeNormal<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& in, framework::Tensor* out,
-                  const std::vector<int>& axis) {
-    const int rank = axis.size();
-    auto in_stride = framework::stride(in.dims());
-    auto out_stride = framework::stride(out->dims());
-    const T* in_ptr = in.data<T>();
-    T* out_ptr = out->data<T>();
-
-    auto transpose_helper = [&](int64_t beg, int64_t end) {
-      for (int64_t out_idx = beg; out_idx < end; ++out_idx) {
-        int64_t in_idx = 0;
-        int64_t tmp_idx = out_idx;
-        // calculate the input index
-        for (int i = 0; i < rank; ++i) {
-          const int64_t coordinate = tmp_idx / out_stride[i];
-          tmp_idx -= coordinate * out_stride[i];
-          in_idx += coordinate * in_stride[axis[i]];
-        }
-        out_ptr[out_idx] = in_ptr[in_idx];
-      }
-    };
-    transpose_helper(0, out->numel());
-  }
-};
-
-// define transpose normal
-#define DEFINE_CPU_TRANS_NORMAL(TYPE) \
-  template struct TransposeNormal<platform::CPUDeviceContext, TYPE>
-
-DEFINE_CPU_TRANS_NORMAL(platform::float16);
-DEFINE_CPU_TRANS_NORMAL(platform::bfloat16);
-DEFINE_CPU_TRANS_NORMAL(float);
-DEFINE_CPU_TRANS_NORMAL(double);
-DEFINE_CPU_TRANS_NORMAL(int);
-DEFINE_CPU_TRANS_NORMAL(int64_t);
-DEFINE_CPU_TRANS_NORMAL(bool);
-DEFINE_CPU_TRANS_NORMAL(int16_t);
-DEFINE_CPU_TRANS_NORMAL(uint8_t);
-DEFINE_CPU_TRANS_NORMAL(int8_t);
-DEFINE_CPU_TRANS_NORMAL(platform::complex<float>);
-DEFINE_CPU_TRANS_NORMAL(platform::complex<double>);
-
-struct TensorSetConstantCPU {
-  TensorSetConstantCPU(framework::Tensor* tensor, float value)
-      : tensor_(tensor), value_(value) {}
-  template <typename T>
-  void apply() const {
-    auto cpu = platform::CPUPlace();
-    auto* begin = tensor_->mutable_data<T>(cpu);
-    std::fill(begin, begin + tensor_->numel(), static_cast<T>(value_));
-  }
-  framework::Tensor* tensor_;
-  float value_;
-};
-
-template <>
-void set_constant_with_place<platform::XPUPlace>(
-    const platform::DeviceContext& context, framework::Tensor* tensor,
-    float value) {
-  PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
-}
-
-template <>
-void set_constant_with_place<platform::NPUPlace>(
-    const platform::DeviceContext& context, framework::Tensor* tensor,
-    float value) {
-  PADDLE_THROW(platform::errors::Unimplemented("NPUPlace is not supported"));
-}
-
-template <>
-void set_constant_with_place<platform::NPUPinnedPlace>(
-    const platform::DeviceContext& context, framework::Tensor* tensor,
-    float value) {
-  PADDLE_THROW(
-      platform::errors::Unimplemented("NPUPinnedPlace is not supported"));
-}
-
-template <>
-void set_constant_with_place<platform::IPUPlace>(
-    const platform::DeviceContext& context, framework::Tensor* tensor,
-    float value) {
-  PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
-}
-
-template <>
-void set_constant_with_place<platform::CPUPlace>(
-    const platform::DeviceContext& context, framework::Tensor* tensor,
-    float value) {
-  framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value));
-}
-
-template <>
-void set_constant_with_place<platform::MLUPlace>(
-    const platform::DeviceContext& context, framework::Tensor* tensor,
-    float value) {
-  PADDLE_THROW(platform::errors::Unimplemented("MLUPlace is not supported"));
-}
-
-template <>
-void set_constant_with_place<platform::CUDAPinnedPlace>(
-    const platform::DeviceContext& context, framework::Tensor* tensor,
-    float value) {
-  framework::VisitDataType(tensor->type(), TensorSetConstantCPU(tensor, value));
-}
-
-struct TensorSetConstantWithPlace : public boost::static_visitor<void> {
-  TensorSetConstantWithPlace(const platform::DeviceContext& context,
-                             framework::Tensor* tensor, float value)
-      : context_(context), tensor_(tensor), value_(value) {}
-
-  template <typename Place>
-  void operator()(Place place) const {
-    set_constant_with_place<Place>(context_, tensor_, value_);
-  }
-
-  const platform::DeviceContext& context_;
-  framework::Tensor* tensor_;
-  float value_;
-};
-
-void set_constant(const platform::DeviceContext& context,
-                  framework::Tensor* tensor, float value) {
-  TensorSetConstantWithPlace func(context, tensor, value);
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
-  // tensor->place().apply_visitor(func);
-  paddle::platform::VisitPlace(tensor->place(), func);
-#else
-  func(platform::CPUPlace());
-#endif
-}
-
-template <typename T>
-struct RowwiseAdd<platform::CPUDeviceContext, T> {
-  void operator()(const platform::CPUDeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& vector, framework::Tensor* output) {
-    auto in_dims = input.dims();
-    auto out_dims = output->dims();
-    auto size = input.numel() / in_dims[0];
-    PADDLE_ENFORCE_EQ(
-        vector.numel(), size,
-        platform::errors::InvalidArgument(
-            "The input vector size"
-            " should be equal to the size of each row of input tensor."
-            " Expected vector size=%d, but received %d",
-            size, vector.numel()));
-    const char* in_dims_cstr = in_dims.to_str().c_str();
-    const char* out_dims_cstr = out_dims.to_str().c_str();
-    PADDLE_ENFORCE_EQ(out_dims, in_dims,
-                      platform::errors::InvalidArgument(
-                          "The output tensor shape should be same as the input"
-                          " tensor shape. Expected output tensor shape: %s,"
-                          " but received %s",
-                          in_dims_cstr, out_dims_cstr));
-
-    auto in = framework::EigenMatrix<T>::From(input);
-    auto vec = framework::EigenVector<T>::Flatten(vector);
-    auto out = framework::EigenMatrix<T>::From(*output);
-
-    for (int64_t i = 0; i < in_dims[0]; ++i) {
-      out.chip(i, 0) = in.chip(i, 0) + vec;
-    }
-  }
-};
-
-template struct RowwiseAdd<platform::CPUDeviceContext, float>;
-template struct RowwiseAdd<platform::CPUDeviceContext, double>;
-
-template struct ColwiseSum<platform::CPUDeviceContext, float>;
-template struct ColwiseSum<platform::CPUDeviceContext, double>;
-template struct ColwiseSum<platform::CPUDeviceContext, int>;
-template struct ColwiseSum<platform::CPUDeviceContext, int64_t>;
-
-template struct RowwiseSum<platform::CPUDeviceContext, float>;
-template struct RowwiseSum<platform::CPUDeviceContext, double>;
-
-template struct RowwiseMean<platform::CPUDeviceContext, float>;
-template struct RowwiseMean<platform::CPUDeviceContext, double>;
-
-template <typename T>
-struct ElementwiseAddTo<platform::CPUDeviceContext, T> {
-  void operator()(platform::CPUDeviceContext* ctx, const framework::Tensor& src,
-                  framework::Tensor* dst) {
-    auto in = framework::EigenVector<T>::Flatten(src);
-    auto out = framework::EigenVector<T>::Flatten(*dst);
-    auto& place = *(ctx->eigen_device());
-    out.device(place) = out + in;
-  }
-};
-
-template struct ElementwiseAddTo<platform::CPUDeviceContext, platform::float16>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/math_function.cu b/paddle/fluid/operators/math/math_function.cu
deleted file mode 100644
index f0ef692b99..0000000000
--- a/paddle/fluid/operators/math/math_function.cu
+++ /dev/null
@@ -1,322 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-#include <algorithm>
-#include <vector>
-#include "paddle/fluid/framework/data_type.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/operators/math/blas.h"
-#include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/operators/math/math_function_impl.h"
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/float16.h"
-#include "paddle/pten/backends/gpu/gpu_context.h"
-#include "paddle/pten/kernels/funcs/eigen/common.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-using float16 = paddle::platform::float16;
-using bfloat16 = paddle::platform::bfloat16;
-
-template struct SetConstant<platform::CUDADeviceContext, platform::float16>;
-template struct SetConstant<platform::CUDADeviceContext, platform::bfloat16>;
-template struct SetConstant<platform::CUDADeviceContext, float>;
-template struct SetConstant<platform::CUDADeviceContext, double>;
-template struct SetConstant<platform::CUDADeviceContext, uint8_t>;
-template struct SetConstant<platform::CUDADeviceContext, int>;
-template struct SetConstant<platform::CUDADeviceContext, int16_t>;
-template struct SetConstant<platform::CUDADeviceContext, int64_t>;
-template struct SetConstant<platform::CUDADeviceContext, bool>;
-template struct SetConstant<platform::CUDADeviceContext, platform::complex<float>>;
-template struct SetConstant<platform::CUDADeviceContext, platform::complex<double>>;
-
-template struct SetConstant<pten::GPUContext, platform::float16>;
-template struct SetConstant<pten::GPUContext, platform::bfloat16>;
-template struct SetConstant<pten::GPUContext, float>;
-template struct SetConstant<pten::GPUContext, double>;
-template struct SetConstant<pten::GPUContext, uint8_t>;
-template struct SetConstant<pten::GPUContext, int>;
-template struct SetConstant<pten::GPUContext, int16_t>;
-template struct SetConstant<pten::GPUContext, int64_t>;
-template struct SetConstant<pten::GPUContext, bool>;
-template struct SetConstant<pten::GPUContext, platform::complex<float>>;
-template struct SetConstant<pten::GPUContext, platform::complex<double>>;
-
-template struct SetConstant<platform::CUDAPinnedDeviceContext, platform::float16>;
-template struct SetConstant<platform::CUDAPinnedDeviceContext, platform::bfloat16>;
-template struct SetConstant<platform::CUDAPinnedDeviceContext, float>;
-template struct SetConstant<platform::CUDAPinnedDeviceContext, double>;
-template struct SetConstant<platform::CUDAPinnedDeviceContext, uint8_t>;
-template struct SetConstant<platform::CUDAPinnedDeviceContext, int>;
-template struct SetConstant<platform::CUDAPinnedDeviceContext, int16_t>;
-template struct SetConstant<platform::CUDAPinnedDeviceContext, int64_t>;
-template struct SetConstant<platform::CUDAPinnedDeviceContext, bool>;
-template struct SetConstant<platform::CUDAPinnedDeviceContext, platform::complex<float>>;
-template struct SetConstant<platform::CUDAPinnedDeviceContext, platform::complex<double>>;
-
-#define DEFINE_GPU_TRANS(RANK)                                                      \
-  template struct Transpose<platform::CUDADeviceContext, platform::float16, RANK>;  \
-  template struct Transpose<platform::CUDADeviceContext, platform::bfloat16, RANK>; \
-  template struct Transpose<platform::CUDADeviceContext, float, RANK>;              \
-  template struct Transpose<platform::CUDADeviceContext, double, RANK>;             \
-  template struct Transpose<platform::CUDADeviceContext, int, RANK>;                \
-  template struct Transpose<platform::CUDADeviceContext, int64_t, RANK>;            \
-  template struct Transpose<platform::CUDADeviceContext, bool, RANK>;               \
-  template struct Transpose<platform::CUDADeviceContext, int16_t, RANK>;            \
-  template struct Transpose<platform::CUDADeviceContext, platform::complex<float>, RANK>; \
-  template struct Transpose<platform::CUDADeviceContext, platform::complex<double>, RANK>;
-
-DEFINE_GPU_TRANS(1);
-DEFINE_GPU_TRANS(2);
-DEFINE_GPU_TRANS(3);
-DEFINE_GPU_TRANS(4);
-DEFINE_GPU_TRANS(5);
-DEFINE_GPU_TRANS(6);
-
-#define REINTERPRET(T, DST_PTR, SRC_PTR) \
-  T* DST_PTR = reinterpret_cast<T*>(SRC_PTR)
-
-template <typename T>
-__global__ void TransposeNormalKernel(const T* in_ptr, T* out_ptr,
-                                      int64_t element,
-                                      const int64_t* in_stride_ptr,
-                                      const int64_t* out_stride_ptr,
-                                      const int64_t* axis_ptr, int rank) {
-  CUDA_KERNEL_LOOP(out_idx, element) {
-    int64_t in_idx = 0;
-    int64_t tmp_idx = out_idx;
-    for (int i = 0; i < rank; ++i) {
-      const int64_t coordinate = tmp_idx / out_stride_ptr[i];
-      tmp_idx -= coordinate * out_stride_ptr[i];
-      in_idx += coordinate * in_stride_ptr[axis_ptr[i]];
-    }
-    out_ptr[out_idx] = in_ptr[in_idx];
-  }
-}
-
-template <typename T>
-struct TransposeNormal<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& in, framework::Tensor* out,
-                  const std::vector<int>& axis) {
-    const int rank = axis.size();
-    auto in_stride = framework::stride(in.dims());
-    auto out_stride = framework::stride(out->dims());
-    auto* in_ptr = in.data<T>();
-    auto* out_ptr = out->data<T>();
-
-    // copy in_stride, out_stride, axis to gpu device
-    const platform::CUDAPlace& cuda_place = context.GetPlace();
-    platform::CPUPlace cpu_place = platform::CPUPlace();
-    size_t size = 3 * rank * sizeof(int64_t);
-    auto cpu_buf_holder = memory::Alloc(cpu_place, size);
-    auto cuda_buf_holder = memory::Alloc(cuda_place, size);
-    REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr());
-    REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr());
-    for (int i = 0; i < rank; ++i) {
-      cpu_buf[i] = in_stride[i];
-      cpu_buf[rank + i] = out_stride[i];
-      cpu_buf[2 * rank + i] = axis[i];
-    }
-    memory::Copy(cuda_place, cuda_buf, cpu_place, cpu_buf, size,
-                 context.stream());
-    REINTERPRET(const int64_t, in_stride_ptr, cuda_buf);
-    REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank);
-    REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank);
-
-    const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock();
-    const int MAX_GRID_DIM =
-        context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
-    int64_t elements = in.numel();
-    int block_size = (elements >= MAX_BLOCK_DIM)
-                         ? MAX_BLOCK_DIM
-                         : (1 << static_cast<int>(std::log2(elements)));
-    int grid_size = elements / block_size;
-    grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size;
-    TransposeNormalKernel<T><<<grid_size, block_size, 0, context.stream()>>>(
-        in_ptr, out_ptr, elements, in_stride_ptr, out_stride_ptr, axis_ptr,
-        rank);
-  }
-};
-
-// define transpose normal
-#define DEFINE_GPU_TRANS_NORMAL(TYPE) \
-  template struct TransposeNormal<platform::CUDADeviceContext, TYPE>
-
-DEFINE_GPU_TRANS_NORMAL(float16);
-DEFINE_GPU_TRANS_NORMAL(bfloat16);
-DEFINE_GPU_TRANS_NORMAL(float);
-DEFINE_GPU_TRANS_NORMAL(double);
-DEFINE_GPU_TRANS_NORMAL(int);
-DEFINE_GPU_TRANS_NORMAL(int64_t);
-DEFINE_GPU_TRANS_NORMAL(bool);
-DEFINE_GPU_TRANS_NORMAL(int16_t);
-DEFINE_GPU_TRANS_NORMAL(uint8_t);
-DEFINE_GPU_TRANS_NORMAL(int8_t);
-DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex<float>);
-DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex<double>);
-
-struct TensorSetConstantGPU {
-  TensorSetConstantGPU(const platform::DeviceContext& context,
-                       framework::Tensor* tensor, float value)
-      : context_(context), tensor_(tensor), value_(value) {}
-
-  template <typename T>
-  void apply() const {
-    SetConstant<platform::CUDADeviceContext, T> functor;
-    functor(reinterpret_cast<const platform::CUDADeviceContext&>(context_),
-            tensor_, static_cast<T>(value_));
-  }
-
-  const platform::DeviceContext& context_;
-  framework::Tensor* tensor_;
-  float value_;
-};
-
-template <>
-void set_constant_with_place<platform::CUDAPlace>(
-    const platform::DeviceContext& context, framework::Tensor* tensor,
-    float value) {
-  framework::VisitDataType(tensor->type(),
-                           TensorSetConstantGPU(context, tensor, value));
-}
-
-template <typename T>
-__global__ void RowwiseAddKernel(const T* a, const T* b, T* c, int width,
-                                 int num) {
-  T tmp = 1.0 / width;
-  CUDA_KERNEL_LOOP(i, num) {
-    int h = i * tmp;
-    int w = i - h * width;
-    c[i] = a[i] + b[w];
-  }
-}
-
-template <typename T>
-struct RowwiseAdd<platform::CUDADeviceContext, T> {
-  void operator()(const platform::CUDADeviceContext& context,
-                  const framework::Tensor& input,
-                  const framework::Tensor& vector, framework::Tensor* output) {
-    auto in_dims = input.dims();
-    auto out_dims = output->dims();
-    auto size = input.numel() / in_dims[0];
-    PADDLE_ENFORCE_EQ(
-        vector.numel(), size,
-        platform::errors::InvalidArgument(
-            "The input vector size"
-            " should be equal to the size of each row of input tensor."
-            " Expected vector size=%d, but received %d",
-            size, vector.numel()));
-    const char* in_dims_cstr = in_dims.to_str().c_str();
-    const char* out_dims_cstr = out_dims.to_str().c_str();
-    PADDLE_ENFORCE_EQ(
-        out_dims, in_dims,
-        platform::errors::InvalidArgument(
-            "The output tensor shape should be same as the input tensor"
-            " shape. Expected output tensor shape: %s,"
-            " but received %s",
-            in_dims_cstr, out_dims_cstr));
-    int blocks = 512;
-    int grids = (input.numel() + blocks - 1) / blocks;
-    RowwiseAddKernel<T><<<grids, blocks, 0, context.stream()>>>(
-        input.data<T>(), vector.data<T>(), output->data<T>(),
-        static_cast<int>(in_dims[1]), static_cast<int>(input.numel()));
-  }
-};
-
-template struct RowwiseAdd<platform::CUDADeviceContext, float>;
-template struct RowwiseAdd<platform::CUDADeviceContext, double>;
-template struct ColwiseSum<platform::CUDADeviceContext, float>;
-template struct ColwiseSum<platform::CUDADeviceContext, int>;
-template struct ColwiseSum<platform::CUDADeviceContext, int64_t>;
-// template struct ColwiseSum<platform::CUDADeviceContext, double>;
-// The ColwiseSum<platform::CUDADeviceContext, double> failed in debug mode,
-// and only failed for this case. So reimplemented it.
-template <>
-void ColwiseSum<platform::CUDADeviceContext, double>::operator()(
-    const platform::CUDADeviceContext& context, const framework::Tensor& input,
-    framework::Tensor* vector) {
-  auto in_dims = input.dims();
-  auto size = input.numel() / in_dims[0];
-  PADDLE_ENFORCE_EQ(vector->numel(), size,
-                    platform::errors::InvalidArgument(
-                        "The size of input vector"
-                        " should be equal to the size of input tensor column"
-                        " dimension. Expected vector size=%d, but received %d",
-                        size, vector->numel()));
-  framework::Tensor one;
-  one.mutable_data<double>({in_dims[0]}, context.GetPlace());
-  SetConstant<platform::CUDADeviceContext, double> set;
-  set(context, &one, static_cast<double>(1.0));
-  GetBlas<platform::CUDADeviceContext, double>(context).GEMV(
-      true, static_cast<int>(in_dims[0]), static_cast<int>(in_dims[1]), 1.0,
-      input.data<double>(), one.data<double>(), 0.0, vector->data<double>());
-}
-
-template struct RowwiseSum<platform::CUDADeviceContext, float>;
-// template struct RowwiseSum<platform::CUDADeviceContext, double>;
-// TODO(zcd): Following ColwiseSum format, need to confirm.
-// The RowwiseSum<platform::CUDADeviceContext, double> failed in debug mode,
-// and only failed for this case. So reimplemented it.
-template <>
-void RowwiseSum<platform::CUDADeviceContext, double>::operator()(
-    const platform::CUDADeviceContext& context, const framework::Tensor& input,
-    framework::Tensor* vector) {
-  auto in_dims = input.dims();
-  auto size = input.numel() / in_dims[0];
-  PADDLE_ENFORCE_EQ(vector->numel(), in_dims[0],
-                    platform::errors::InvalidArgument(
-                        "The size of input vector"
-                        " should be equal to the size of input tensor row"
-                        " dimension. Expected vector size=%d, but received %d",
-                        in_dims[0], vector->numel()));
-  framework::Tensor one;
-  one.mutable_data<double>({size}, context.GetPlace());
-  SetConstant<platform::CUDADeviceContext, double> set;
-  set(context, &one, static_cast<double>(1.0));
-  GetBlas<platform::CUDADeviceContext, double>(context).GEMV(
-      true, static_cast<int>(in_dims[1]), static_cast<int>(in_dims[0]), 1.0,
-      one.data<double>(), input.data<double>(), 0.0, vector->data<double>());
-}
-
-template struct RowwiseMean<platform::CUDADeviceContext, float>;
-template struct RowwiseMean<platform::CUDADeviceContext, double>;
-
-template <typename T>
-struct ElementwiseAddTo<platform::CUDADeviceContext, T> {
-  void operator()(platform::CUDADeviceContext* ctx,
-                  const framework::Tensor& src, framework::Tensor* dst) {
-    auto in = framework::EigenVector<T>::Flatten(src);
-    auto out = framework::EigenVector<T>::Flatten(*dst);
-    auto& place = *(ctx->eigen_device());
-    out.device(place) = out + in;
-  }
-};
-
-template struct ElementwiseAddTo<platform::CUDADeviceContext, platform::float16>;
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
diff --git a/paddle/fluid/operators/math/math_function.h b/paddle/fluid/operators/math/math_function.h
deleted file mode 100644
index 9dbbf455f1..0000000000
--- a/paddle/fluid/operators/math/math_function.h
+++ /dev/null
@@ -1,112 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#pragma once
-#include <cmath>
-#include <memory>
-#include <vector>
-
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/platform/device_context.h"
-#include "paddle/fluid/platform/enforce.h"
-#include "paddle/pten/core/dense_tensor.h"
-
-namespace paddle {
-namespace operators {
-namespace math {
-
-template <typename DeviceContext, typename T>
-struct TransposeNormal {
-  // for dims >= 7 situation
-  void operator()(const DeviceContext& context, const framework::Tensor& in,
-                  framework::Tensor* out, const std::vector<int>& axis);
-};
-
-template <typename DeviceContext, typename T, int Rank>
-struct Transpose {
-  void operator()(const DeviceContext& context, const framework::Tensor& in,
-                  framework::Tensor* out, const std::vector<int>& axis);
-};
-
-template <typename DeviceContext, typename T>
-struct SetConstant {
-  void operator()(const DeviceContext& context, framework::Tensor* tensor,
-                  T num);
-};
-
-template <typename Place>
-void set_constant_with_place(const platform::DeviceContext& context,
-                             framework::Tensor* tensor, float value);
-
-void set_constant(const platform::DeviceContext& context,
-                  framework::Tensor* tensor, float value);
-
-template <typename DeviceContext, typename T>
-struct RowwiseAdd {
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  const framework::Tensor& vec, framework::Tensor* output);
-};
-
-template <typename DeviceContext, typename T>
-struct ElementwiseAddTo {
-  // dst = dst + src
-  void operator()(DeviceContext* ctx, const framework::Tensor& src,
-                  framework::Tensor* dst);
-};
-
-template <typename DeviceContext, typename T>
-struct ColwiseSum {
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  framework::Tensor* vec);
-};
-
-template <typename DeviceContext, typename T>
-struct RowwiseSum {
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  framework::Tensor* vec);
-};
-
-template <typename DeviceContext, typename T>
-struct RowwiseMean {
-  void operator()(const DeviceContext& context, const framework::Tensor& input,
-                  framework::Tensor* vec);
-};
-
-#ifdef PADDLE_WITH_XPU
-template <typename U>
-struct TensorSetConstantXPU {
-  TensorSetConstantXPU(framework::Tensor* tensor, U value,
-                       platform::Place place)
-      : tensor_(tensor), value_(value), place_(place) {}
-  template <typename T>
-  void apply() const {
-    auto* begin = tensor_->mutable_data<T>(place_);
-    int numel = tensor_->numel();
-    std::unique_ptr<T[]> data_cpu(new T[numel]);
-    std::fill(data_cpu.get(), data_cpu.get() + numel, static_cast<T>(value_));
-    memory::Copy(place_, begin, platform::CPUPlace(),
-                 static_cast<void*>(data_cpu.get()), numel * sizeof(T));
-  }
-  framework::Tensor* tensor_;
-  U value_;
-  platform::Place place_;
-};
-#endif
-
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
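The header deleted above is the functor API that this patch relocates verbatim to paddle/pten/kernels/funcs/math_function.h under the pten::funcs namespace; every remaining hunk is the corresponding include-path and namespace rename at a call site. A minimal sketch of what such a call site looks like after the move (the ZeroFill helper and its setup are illustrative, not part of the patch; the functor signature is taken from the deleted header):

    // Zero-fill a tensor the way the rewritten call sites in this patch do.
    #include "paddle/fluid/framework/tensor.h"
    #include "paddle/fluid/platform/device_context.h"
    #include "paddle/pten/kernels/funcs/math_function.h"  // was operators/math/math_function.h

    void ZeroFill(const paddle::platform::CPUDeviceContext& ctx,
                  paddle::framework::Tensor* tensor) {
      // Before: paddle::operators::math::SetConstant<DeviceContext, T>
      // After:  pten::funcs::SetConstant<DeviceContext, T> -- same signature.
      pten::funcs::SetConstant<paddle::platform::CPUDeviceContext, float> zero;
      zero(ctx, tensor, 0.0f);
    }
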
*/ #include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/solve_op.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace platform { @@ -76,7 +76,7 @@ class MatrixSolveFunctor { const auto& new_dims_vec = getNewDimsVec(b_dims); tmp_b.Resize(framework::make_ddim(new_dims_vec)); tmp_b.mutable_data(context.GetPlace()); - math::TransposeNormal trans; + pten::funcs::TransposeNormal trans; std::vector new_axis = getNewAxis(b_rank); trans(context, b, &tmp_b, new_axis); @@ -149,7 +149,7 @@ class MatrixSolveFunctor { -host_info)); // transpose tmp_b to get the final result in row-major form. - math::TransposeNormal trans2; + pten::funcs::TransposeNormal trans2; trans2(context, tmp_b, out, new_axis); #else diff --git a/paddle/fluid/operators/math/prelu.h b/paddle/fluid/operators/math/prelu.h index 70aae2ba59..24c8721656 100644 --- a/paddle/fluid/operators/math/prelu.h +++ b/paddle/fluid/operators/math/prelu.h @@ -15,8 +15,8 @@ limitations under the License. */ #pragma once #include -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/sample_prob.cu b/paddle/fluid/operators/math/sample_prob.cu index f596c1bc3d..edc61bc667 100644 --- a/paddle/fluid/operators/math/sample_prob.cu +++ b/paddle/fluid/operators/math/sample_prob.cu @@ -22,9 +22,9 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sample_prob.h" #include "paddle/fluid/operators/math/sampler.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/segment_pooling.cu b/paddle/fluid/operators/math/segment_pooling.cu index 0cbfaa4c5d..eaed2dc7d7 100644 --- a/paddle/fluid/operators/math/segment_pooling.cu +++ b/paddle/fluid/operators/math/segment_pooling.cu @@ -14,10 +14,10 @@ limitations under the License. 
*/ #include #include "paddle/fluid/operators/gather.cu.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/segment_pooling.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/selected_rows_functor.cc b/paddle/fluid/operators/math/selected_rows_functor.cc index 8cd3e1367d..b921e844c9 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cc +++ b/paddle/fluid/operators/math/selected_rows_functor.cc @@ -129,7 +129,7 @@ struct SelectedRowsAddTensor { "But recieved input width = [%d], output width = [%d]", in1_row_numel, output->numel() / in1_height)); - SetConstant functor; + pten::funcs::SetConstant functor; functor(context, output, 0.0); auto* in1_data = in1_value.data(); @@ -461,7 +461,7 @@ struct MergeAdd { out.set_rows(merge_rows); - math::SetConstant constant_functor; + pten::funcs::SetConstant constant_functor; constant_functor(context, out.mutable_value(), static_cast(0.f)); std::unordered_map rows_to_id; @@ -689,7 +689,7 @@ struct MergeAverage { out.set_rows(merge_rows); - math::SetConstant constant_functor; + pten::funcs::SetConstant constant_functor; constant_functor(context, out.mutable_value(), 0.0); std::unordered_map rows_to_id; diff --git a/paddle/fluid/operators/math/selected_rows_functor.cu b/paddle/fluid/operators/math/selected_rows_functor.cu index 2ae2aaebb6..d2caf82c93 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.cu +++ b/paddle/fluid/operators/math/selected_rows_functor.cu @@ -15,10 +15,10 @@ limitations under the License. */ #include #include -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/float16.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -156,7 +156,7 @@ struct SelectedRowsAddTensor { auto* in2_data = input2.data(); auto* out_data = output->data(); - SetConstant functor; + pten::funcs::SetConstant functor; functor(context, output, static_cast(0)); const int block_size = 256; @@ -348,7 +348,7 @@ struct MergeAdd { {static_cast(merge_rows.size()), input_width}), context.GetPlace()); - math::SetConstant constant_functor; + pten::funcs::SetConstant constant_functor; constant_functor(context, out.mutable_value(), static_cast(0)); auto* out_data = out.mutable_value()->data(); @@ -411,7 +411,7 @@ struct MergeAdd { {static_cast(merge_rows.size()), input_width}), context.GetPlace()); - math::SetConstant constant_functor; + pten::funcs::SetConstant constant_functor; constant_functor(context, out.mutable_value(), static_cast(0)); auto* out_data = out.mutable_value()->data(); diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h index 690082036c..e0ac583f15 100644 --- a/paddle/fluid/operators/math/selected_rows_functor.h +++ b/paddle/fluid/operators/math/selected_rows_functor.h @@ -19,8 +19,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/pten/kernels/funcs/math_function.h" #define INLINE_FOR2(sizei, sizej) \ for (int64_t i = 0; i < sizei; i++) \ diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cc index 19e70f924f..9cb815e161 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cc @@ -15,14 +15,12 @@ limitations under the License. */ #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "gtest/gtest.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" TEST(selected_rows_functor, cpu_add) { paddle::platform::CPUPlace cpu_place; paddle::platform::CPUDeviceContext ctx(cpu_place); - paddle::operators::math::SetConstant - functor; + pten::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -122,9 +120,7 @@ TEST(selected_rows_functor, cpu_add) { TEST(selected_rows_functor, cpu_add_to) { paddle::platform::CPUPlace cpu_place; paddle::platform::CPUDeviceContext ctx(cpu_place); - paddle::operators::math::SetConstant - functor; + pten::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -221,9 +217,7 @@ TEST(selected_rows_functor, cpu_add_to) { TEST(selected_rows_functor, cpu_merge_average_float) { paddle::platform::CPUPlace cpu_place; paddle::platform::CPUDeviceContext ctx(cpu_place); - paddle::operators::math::SetConstant - functor; + pten::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -260,9 +254,7 @@ TEST(selected_rows_functor, cpu_merge_average_float) { TEST(selected_rows_functor, cpu_merge_add_float) { paddle::platform::CPUPlace cpu_place; paddle::platform::CPUDeviceContext ctx(cpu_place); - paddle::operators::math::SetConstant - functor; + pten::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -301,8 +293,7 @@ TEST(selected_rows_functor, cpu_merge_add_float) { TEST(selected_rows_functor, cpu_merge_add_int) { paddle::platform::CPUPlace cpu_place; paddle::platform::CPUDeviceContext ctx(cpu_place); - paddle::operators::math::SetConstant - functor; + pten::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -341,9 +332,7 @@ TEST(selected_rows_functor, cpu_merge_add_int) { TEST(selected_rows_functor, cpu_merge_add_multi) { paddle::platform::CPUPlace cpu_place; paddle::platform::CPUDeviceContext ctx(cpu_place); - paddle::operators::math::SetConstant - set_const; + pten::funcs::SetConstant set_const; int64_t height = 10; int64_t row_numel = 8; @@ -397,9 +386,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi) { TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { paddle::platform::CPUPlace cpu_place; paddle::platform::CPUDeviceContext ctx(cpu_place); - paddle::operators::math::SetConstant - set_const; + pten::funcs::SetConstant set_const; int64_t height = 10; int64_t row_numel = 8; @@ -459,9 +446,7 @@ TEST(selected_rows_functor, cpu_merge_add_multi_noduplicated) { TEST(selected_rows_functor, cpu_sum_to) { paddle::platform::CPUPlace cpu_place; paddle::platform::CPUDeviceContext ctx(cpu_place); - paddle::operators::math::SetConstant - functor; + pten::funcs::SetConstant functor; int64_t height = 10; 
int64_t row_numel = 10; std::vector rows1{0, 4, 7}; diff --git a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc index e826c2a724..1bae95e158 100644 --- a/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc +++ b/paddle/fluid/operators/math/selected_rows_functor_test.cu.cc @@ -14,7 +14,7 @@ limitations under the License. */ #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "gtest/gtest.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" TEST(selected_rows_functor, gpu_add) { paddle::platform::CUDAPlace gpu_place(0); @@ -22,9 +22,7 @@ TEST(selected_rows_functor, gpu_add) { paddle::platform::CUDADeviceContext& ctx = *reinterpret_cast( paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); - paddle::operators::math::SetConstant - functor; + pten::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -144,9 +142,7 @@ TEST(selected_rows_functor, gpu_add_to) { paddle::platform::CUDADeviceContext& ctx = *reinterpret_cast( paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); - paddle::operators::math::SetConstant - functor; + pten::funcs::SetConstant functor; int64_t height = 10; int64_t row_numel = 10; @@ -254,8 +250,7 @@ TEST(selected_rows_functor, gpu_merge_add) { paddle::platform::CUDADeviceContext& ctx = *reinterpret_cast( paddle::platform::DeviceContextPool::Instance().Get(gpu_place)); - paddle::operators::math::SetConstant + pten::funcs::SetConstant set_const; int64_t height = 10; diff --git a/paddle/fluid/operators/math/sequence_pooling.cc b/paddle/fluid/operators/math/sequence_pooling.cc index 2eee4d0a6c..22cd435297 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cc +++ b/paddle/fluid/operators/math/sequence_pooling.cc @@ -16,8 +16,8 @@ limitations under the License. */ #include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_pooling.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -191,7 +191,7 @@ class MaxSeqPoolGradFunctor { const int* max_index = index.data(); T* ig_data = in_grad->data(); - SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(context, in_grad, static_cast(0.0)); int64_t num_seq = og_dims[0]; int64_t dim = out_grad.numel() / num_seq; @@ -409,7 +409,7 @@ class SequencePoolGradFunctor { if (pooltype == "LAST" || pooltype == "FIRST") { // set X@Grad be zero at first when pooltype is LAST/FIRST - math::SetConstant functor; + pten::funcs::SetConstant functor; functor(context, in_grad, 0); } diff --git a/paddle/fluid/operators/math/sequence_pooling.cu b/paddle/fluid/operators/math/sequence_pooling.cu index b3e1922e10..3bf3b483e8 100644 --- a/paddle/fluid/operators/math/sequence_pooling.cu +++ b/paddle/fluid/operators/math/sequence_pooling.cu @@ -14,10 +14,10 @@ limitations under the License. 
*/ #include #include -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_pooling.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/macros.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/softmax.cu b/paddle/fluid/operators/math/softmax.cu index bc32e068f5..632fc1d4b2 100644 --- a/paddle/fluid/operators/math/softmax.cu +++ b/paddle/fluid/operators/math/softmax.cu @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/operators/math/softmax_impl.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/math/sparse_impl.cu.h b/paddle/fluid/operators/math/sparse_impl.cu.h index 8ff2f4b27d..728cf0fcd0 100644 --- a/paddle/fluid/operators/math/sparse_impl.cu.h +++ b/paddle/fluid/operators/math/sparse_impl.cu.h @@ -14,8 +14,8 @@ #pragma once -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/dynload/cusparse.h" +#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" diff --git a/paddle/fluid/operators/math/tree2col.cc b/paddle/fluid/operators/math/tree2col.cc index af5df27207..85d71b369a 100644 --- a/paddle/fluid/operators/math/tree2col.cc +++ b/paddle/fluid/operators/math/tree2col.cc @@ -91,7 +91,7 @@ class Tree2ColFunctor { std::vector> tr; auto feature_dims = node_features.dims(); auto cpu_place = context.GetPlace(); - math::SetConstant constant; + pten::funcs::SetConstant constant; int64_t feature_size = feature_dims[1]; size_t patch_elem_size = 3 * static_cast(feature_size); size_t node_count = 0, patch_count = 0, patch_size; @@ -144,7 +144,7 @@ class Col2TreeFunctor { std::vector> tr; auto output_dims = out_grad.dims(); auto cpu_place = context.GetPlace(); - math::SetConstant constant; + pten::funcs::SetConstant constant; int64_t output_size = output_dims[1]; size_t grad_elem_size = 3 * static_cast(output_size); size_t node_count = 0, grad_count = 0; diff --git a/paddle/fluid/operators/math/tree2col.cu b/paddle/fluid/operators/math/tree2col.cu index 4f3ab31916..4fcd1a1cf6 100644 --- a/paddle/fluid/operators/math/tree2col.cu +++ b/paddle/fluid/operators/math/tree2col.cu @@ -13,8 +13,8 @@ // limitations under the License. 
#include -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/tree2col.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -56,7 +56,7 @@ class Tree2ColFunctor { auto cpu_place = platform::CPUPlace(); auto stream = context.stream(); auto feature_dims = node_features.dims(); - math::SetConstant constant; + pten::funcs::SetConstant constant; Tensor EdgeSet_cpu; framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu); @@ -128,7 +128,7 @@ class Col2TreeFunctor { auto cpu_place = platform::CPUPlace(); auto stream = context.stream(); auto output_dims = patch_grad.dims(); - math::SetConstant constant; + pten::funcs::SetConstant constant; Tensor EdgeSet_cpu; framework::TensorCopy(EdgeSet, cpu_place, &EdgeSet_cpu); diff --git a/paddle/fluid/operators/math/tree2col.h b/paddle/fluid/operators/math/tree2col.h index 632777c9cd..5cf7a93f4d 100644 --- a/paddle/fluid/operators/math/tree2col.h +++ b/paddle/fluid/operators/math/tree2col.h @@ -18,7 +18,7 @@ #include #include #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/matmul_op.cc b/paddle/fluid/operators/matmul_op.cc index 717c1b5c0e..6b24f47784 100644 --- a/paddle/fluid/operators/matmul_op.cc +++ b/paddle/fluid/operators/matmul_op.cc @@ -128,7 +128,7 @@ static framework::Tensor FoldHeadAndLastDims(const DeviceContext &context, output.Resize({in_dims[1], in_dims[0], in_dims[2]}); output.mutable_data(context.GetPlace()); std::vector axis = {1, 0, 2}; - math::Transpose trans; + pten::funcs::Transpose trans; trans(context, input, &output, axis); output.Resize({in_dims[1], in_dims[0] * in_dims[2]}); diff --git a/paddle/fluid/operators/matrix_power_op.h b/paddle/fluid/operators/matrix_power_op.h index 6c4b8860bf..93755b22bf 100644 --- a/paddle/fluid/operators/matrix_power_op.h +++ b/paddle/fluid/operators/matrix_power_op.h @@ -170,7 +170,7 @@ void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out, if (n == 0) { // \nabla X = O - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(dev_ctx, dX, static_cast(0)); return; } else if (n == 1) { diff --git a/paddle/fluid/operators/matrix_rank_op.cu b/paddle/fluid/operators/matrix_rank_op.cu index 7362d00afb..d974d7c1b7 100644 --- a/paddle/fluid/operators/matrix_rank_op.cu +++ b/paddle/fluid/operators/matrix_rank_op.cu @@ -19,11 +19,11 @@ limitations under the License. */ #include "paddle/fluid/memory/memory.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/complex_functors.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/matrix_rank_op.h" #include "paddle/fluid/operators/svd_helper.h" #include "paddle/fluid/platform/dynload/cusolver.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/maxout_op.h b/paddle/fluid/operators/maxout_op.h index 64b538fc5d..d1c229342b 100644 --- a/paddle/fluid/operators/maxout_op.h +++ b/paddle/fluid/operators/maxout_op.h @@ -15,8 +15,8 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/maxouting.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -57,7 +57,7 @@ class MaxOutGradKernel : public framework::OpKernel { } auto& device_ctx = context.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0.0)); diff --git a/paddle/fluid/operators/mean_iou_op.cu b/paddle/fluid/operators/mean_iou_op.cu index 79aff52a16..48b34e18b8 100644 --- a/paddle/fluid/operators/mean_iou_op.cu +++ b/paddle/fluid/operators/mean_iou_op.cu @@ -13,10 +13,10 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/memory/malloc.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/mean_iou_op.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc index 9da3a4c487..555179e7cd 100644 --- a/paddle/fluid/operators/mlu/activation_op_mlu_test.cc +++ b/paddle/fluid/operators/mlu/activation_op_mlu_test.cc @@ -15,9 +15,9 @@ limitations under the License. */ #include #include "paddle/fluid/operators/activation_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/mlu/device_context.h" #include "paddle/fluid/platform/place.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace fw = paddle::framework; namespace plat = paddle::platform; diff --git a/paddle/fluid/operators/mul_op.h b/paddle/fluid/operators/mul_op.h index 0fb32cf4be..6ea154c25d 100644 --- a/paddle/fluid/operators/mul_op.h +++ b/paddle/fluid/operators/mul_op.h @@ -17,7 +17,7 @@ limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/norm_utils.cu.h b/paddle/fluid/operators/norm_utils.cu.h index 241c634e3f..562fe8a1bc 100644 --- a/paddle/fluid/operators/norm_utils.cu.h +++ b/paddle/fluid/operators/norm_utils.cu.h @@ -25,8 +25,8 @@ limitations under the License. */ namespace cub = hipcub; #endif #include "paddle/fluid/framework/data_layout.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/pten/kernels/funcs/math_function.h" #ifdef __HIPCC__ #define LAUNCH_BOUNDS(BlockDim) __launch_bounds__(BlockDim) @@ -405,7 +405,7 @@ void NormDoubleGradFunctor(const framework::ExecutionContext &ctx, const T *ddbias_data = (ddBias == nullptr ? nullptr : ddBias->data()); auto &dev_ctx = ctx.template device_context(); - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; auto &x_dims = X->dims(); const int C = (data_layout == DataLayout::kNCHW ? 
x_dims[1] diff --git a/paddle/fluid/operators/one_hot_op.cu b/paddle/fluid/operators/one_hot_op.cu index 2b02174804..092ffe78f5 100644 --- a/paddle/fluid/operators/one_hot_op.cu +++ b/paddle/fluid/operators/one_hot_op.cu @@ -46,7 +46,7 @@ struct OneHotOpCUDAFunctor { auto numel = in_->numel(); auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); auto stream = ctx_.stream(); - math::set_constant(ctx_, out_, 0.0); + pten::funcs::set_constant(ctx_, out_, 0.0); FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, diff --git a/paddle/fluid/operators/one_hot_op.h b/paddle/fluid/operators/one_hot_op.h index e671a1e99e..a5b3ff78e1 100644 --- a/paddle/fluid/operators/one_hot_op.h +++ b/paddle/fluid/operators/one_hot_op.h @@ -14,7 +14,7 @@ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -41,7 +41,7 @@ struct OneHotOpFunctor { auto* p_in_data = in_->data(); auto numel = in_->numel(); auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); - math::set_constant(ctx_, out_, 0.0); + pten::funcs::set_constant(ctx_, out_, 0.0); if (allow_out_of_range_) { for (int i = 0; i < numel; ++i) { diff --git a/paddle/fluid/operators/one_hot_v2_op.cu b/paddle/fluid/operators/one_hot_v2_op.cu index 115c946084..d145455a1f 100644 --- a/paddle/fluid/operators/one_hot_v2_op.cu +++ b/paddle/fluid/operators/one_hot_v2_op.cu @@ -47,7 +47,7 @@ struct OneHotV2OpCUDAFunctor { auto numel = in_->numel(); auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); auto stream = ctx_.stream(); - math::set_constant(ctx_, out_, 0.0); + pten::funcs::set_constant(ctx_, out_, 0.0); FillOutputKernel<<<(numel + PADDLE_CUDA_NUM_THREADS - 1) / PADDLE_CUDA_NUM_THREADS, diff --git a/paddle/fluid/operators/one_hot_v2_op.h b/paddle/fluid/operators/one_hot_v2_op.h index 221b8cf0e2..c95909e375 100644 --- a/paddle/fluid/operators/one_hot_v2_op.h +++ b/paddle/fluid/operators/one_hot_v2_op.h @@ -14,7 +14,7 @@ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -41,7 +41,7 @@ struct OneHotV2OpFunctor { auto* p_in_data = in_->data(); auto numel = in_->numel(); auto* p_out_data = out_->mutable_data(ctx_.GetPlace()); - math::set_constant(ctx_, out_, 0.0); + pten::funcs::set_constant(ctx_, out_, 0.0); if (allow_out_of_range_) { for (int i = 0; i < numel; ++i) { diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cc b/paddle/fluid/operators/optimizers/adagrad_op.cc index 31d3e1208d..d865f7cff2 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cc +++ b/paddle/fluid/operators/optimizers/adagrad_op.cc @@ -17,8 +17,8 @@ limitations under the License. */ #include -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/optimizers/adagrad_op.cu b/paddle/fluid/operators/optimizers/adagrad_op.cu index a7c32255bd..5c970ceffb 100644 --- a/paddle/fluid/operators/optimizers/adagrad_op.cu +++ b/paddle/fluid/operators/optimizers/adagrad_op.cu @@ -11,10 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/operators/optimizers/adagrad_op.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/overlap_add_op.h b/paddle/fluid/operators/overlap_add_op.h index 865659ee94..b69f99bc98 100644 --- a/paddle/fluid/operators/overlap_add_op.h +++ b/paddle/fluid/operators/overlap_add_op.h @@ -18,11 +18,11 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/tensor.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/seq2col.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/p_norm_op.cu b/paddle/fluid/operators/p_norm_op.cu index 7c8dfc7f64..ef885e3ae7 100644 --- a/paddle/fluid/operators/p_norm_op.cu +++ b/paddle/fluid/operators/p_norm_op.cu @@ -180,7 +180,7 @@ class PnormGradCUDAKernel : public framework::OpKernel { auto& cuda_ctx = ctx.template device_context(); if (porder == 0) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(cuda_ctx, out_dx, static_cast(0)); } else if (porder == INFINITY || porder == -INFINITY) { AbsMaxAndMinGradFunctor functor; diff --git a/paddle/fluid/operators/p_norm_op.h b/paddle/fluid/operators/p_norm_op.h index 8fca6924a2..17d1240636 100644 --- a/paddle/fluid/operators/p_norm_op.h +++ b/paddle/fluid/operators/p_norm_op.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -119,7 +119,7 @@ class PnormGradKernel : public framework::OpKernel { Eigen::DSizes bcast(1, n, 1); if (porder == 0) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); set_zero(dev_ctx, out_dx, static_cast(0)); } else if (porder == INFINITY || porder == -INFINITY) { diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index e50af02dcc..3663cb9540 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -415,7 +415,7 @@ class Pad2dGradCPUKernel : public framework::OpKernel { auto d_out_dims = d_out->dims(); const T* d_out_data = d_out->data(); T* d_in_data = d_in->mutable_data(context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(context.template device_context(), d_in, static_cast(0)); const int pad_top = pads[0]; diff --git a/paddle/fluid/operators/pad2d_op.cu b/paddle/fluid/operators/pad2d_op.cu index a854fa6091..0c9e6ed2b7 100644 --- a/paddle/fluid/operators/pad2d_op.cu +++ b/paddle/fluid/operators/pad2d_op.cu @@ -14,9 +14,9 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -395,7 +395,7 @@ class Pad2dGradCUDAKernel : public framework::OpKernel { const T* d_out_data = d_out->data(); T* d_in_data = d_in->mutable_data(context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(context.template device_context(), d_in, static_cast(0)); diff --git a/paddle/fluid/operators/pad3d_op.cc b/paddle/fluid/operators/pad3d_op.cc index e84b5a9d9b..e29718af89 100644 --- a/paddle/fluid/operators/pad3d_op.cc +++ b/paddle/fluid/operators/pad3d_op.cc @@ -17,7 +17,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -621,7 +621,7 @@ class Pad3dGradCPUKernel : public framework::OpKernel { auto d_out_dims = d_out->dims(); const T* d_out_data = d_out->data(); T* d_in_data = d_in->mutable_data(context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(context.template device_context(), d_in, static_cast(0)); const int pad_left = pads[0]; diff --git a/paddle/fluid/operators/pad3d_op.cu b/paddle/fluid/operators/pad3d_op.cu index 1567251236..b7cf1be99f 100644 --- a/paddle/fluid/operators/pad3d_op.cu +++ b/paddle/fluid/operators/pad3d_op.cu @@ -14,9 +14,9 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -697,7 +697,7 @@ class Pad3dGradCUDAKernel : public framework::OpKernel { const T* d_out_data = d_out->data(); T* d_in_data = d_in->mutable_data(context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(context.template device_context(), d_in, static_cast(0)); diff --git a/paddle/fluid/operators/pixel_shuffle_op.h b/paddle/fluid/operators/pixel_shuffle_op.h index b2a0db0f83..4ae138ac7a 100644 --- a/paddle/fluid/operators/pixel_shuffle_op.h +++ b/paddle/fluid/operators/pixel_shuffle_op.h @@ -14,7 +14,7 @@ limitations under the License. 
*/ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -52,7 +52,7 @@ class PixelShuffleOpKernel : public framework::OpKernel { } else { o.Resize({in_dims[0], in_dims[1], factor, in_dims[2], factor, o_dims[3]}); } - math::Transpose trans; + pten::funcs::Transpose trans; auto& dev_ctx = ctx.template device_context(); trans(dev_ctx, t, &o, axis); out->Resize(o_dims); @@ -95,7 +95,7 @@ class PixelShuffleGradOpKernel : public framework::OpKernel { o.Resize( {do_dims[0], dx_dims[1], dx_dims[2], do_dims[3], factor, factor}); } - math::Transpose trans; + pten::funcs::Transpose trans; auto& dev_ctx = ctx.template device_context(); trans(dev_ctx, t, &o, axis); dx->Resize(dx_dims); diff --git a/paddle/fluid/operators/poisson_op.h b/paddle/fluid/operators/poisson_op.h index 2159637b29..d2deb21567 100644 --- a/paddle/fluid/operators/poisson_op.h +++ b/paddle/fluid/operators/poisson_op.h @@ -17,7 +17,7 @@ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/operator.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -31,7 +31,7 @@ class PoissonGradKernel : public framework::OpKernel { void Compute(const framework::ExecutionContext& ctx) const override { auto* dx = ctx.Output(framework::GradVarName("X")); dx->mutable_data(ctx.GetPlace()); - math::SetConstant functor; + pten::funcs::SetConstant functor; auto& dev_ctx = ctx.template device_context(); functor(dev_ctx, dx, static_cast(0)); } diff --git a/paddle/fluid/operators/pool_cudnn_op.cu.cc b/paddle/fluid/operators/pool_cudnn_op.cu.cc index bbe3174012..2b0300b87c 100644 --- a/paddle/fluid/operators/pool_cudnn_op.cu.cc +++ b/paddle/fluid/operators/pool_cudnn_op.cu.cc @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/pool_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" #ifdef PADDLE_WITH_HIP #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/operator.h" @@ -114,7 +114,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel { transformed_input.Resize(framework::make_ddim(in_dims_vec)); transformed_input.mutable_data(ctx.GetPlace(), input->type()); - math::Transpose trans5; + pten::funcs::Transpose trans5; trans5(dev_ctx, *input, &transformed_input, axis); // output @@ -142,7 +142,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel { transformed_input.Resize(framework::make_ddim(in_dims_vec)); transformed_input.mutable_data(ctx.GetPlace(), input->type()); - math::Transpose trans; + pten::funcs::Transpose trans; trans(dev_ctx, *input, &transformed_input, axis); transformed_output.Resize(output->dims()); @@ -221,7 +221,8 @@ class PoolCUDNNOpKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); std::vector axis{0, 2, 3, 4, 1}; - math::Transpose trans5_v2; + pten::funcs::Transpose + trans5_v2; trans5_v2(dev_ctx, transformed_output, output, axis); } #ifdef PADDLE_WITH_HIP @@ -230,7 +231,7 @@ class PoolCUDNNOpKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); std::vector axis{0, 2, 3, 1}; - math::Transpose trans; + pten::funcs::Transpose trans; trans(dev_ctx, transformed_output, output, axis); } #endif @@ -337,7 +338,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { transformed_input.Resize(framework::make_ddim(in_dims_vec)); transformed_input.mutable_data(ctx.GetPlace(), input->type()); - math::Transpose trans5; + pten::funcs::Transpose trans5; trans5(dev_ctx, *input, &transformed_input, axis); // output @@ -351,14 +352,16 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { transformed_output.mutable_data(ctx.GetPlace(), output->type()); - math::Transpose trans5_v2; + pten::funcs::Transpose + trans5_v2; trans5_v2(dev_ctx, *output, &transformed_output, axis); // output grad transformed_output_grad.Resize(framework::make_ddim(out_dims_vec)); transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type()); - math::Transpose trans5_v3; + pten::funcs::Transpose + trans5_v3; trans5_v3(dev_ctx, *output_grad, &transformed_output_grad, axis); // input grad @@ -381,7 +384,7 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { transformed_input.Resize(framework::make_ddim(in_dims_vec)); transformed_input.mutable_data(ctx.GetPlace(), input->type()); - math::Transpose trans4; + pten::funcs::Transpose trans4; trans4(dev_ctx, *input, &transformed_input, axis); // output @@ -394,14 +397,16 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { transformed_output.mutable_data(ctx.GetPlace(), output->type()); - math::Transpose trans4_v2; + pten::funcs::Transpose + trans4_v2; trans4_v2(dev_ctx, *output, &transformed_output, axis); // output grad transformed_output_grad.Resize(framework::make_ddim(out_dims_vec)); transformed_output_grad.mutable_data(ctx.GetPlace(), output_grad->type()); - math::Transpose trans4_v3; + pten::funcs::Transpose + trans4_v3; trans4_v3(dev_ctx, *output_grad, &transformed_output_grad, axis); // input grad @@ -485,7 +490,8 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); std::vector axis{0, 2, 3, 4, 1}; - math::Transpose trans5_v4; + 
pten::funcs::Transpose + trans5_v4; trans5_v4(dev_ctx, transformed_input_grad, input_grad, axis); } #ifdef PADDLE_WITH_HIP @@ -494,7 +500,8 @@ class PoolCUDNNGradOpKernel : public framework::OpKernel { auto &dev_ctx = ctx.template device_context(); std::vector axis{0, 2, 3, 1}; - math::Transpose trans4_v4; + pten::funcs::Transpose + trans4_v4; trans4_v4(dev_ctx, transformed_input_grad, input_grad, axis); } #endif diff --git a/paddle/fluid/operators/pool_op.h b/paddle/fluid/operators/pool_op.h index 9e2f6cf223..d220b13d18 100644 --- a/paddle/fluid/operators/pool_op.h +++ b/paddle/fluid/operators/pool_op.h @@ -20,8 +20,8 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/pooling.h" +#include "paddle/pten/kernels/funcs/math_function.h" #if defined(__HIPCC__) || defined(__NVCC__) #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #endif @@ -299,7 +299,7 @@ class PoolGradKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); - paddle::operators::math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; set_constant(dev_ctx, in_x_grad, static_cast(0.0)); switch (ksize.size()) { diff --git a/paddle/fluid/operators/pool_with_index_op.h b/paddle/fluid/operators/pool_with_index_op.h index 065d90704c..d039598a8a 100644 --- a/paddle/fluid/operators/pool_with_index_op.h +++ b/paddle/fluid/operators/pool_with_index_op.h @@ -17,8 +17,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/pooling.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -92,7 +92,7 @@ class MaxPoolWithIndexGradKernel : public framework::OpKernel { if (in_x_grad) { in_x_grad->mutable_data(context.GetPlace()); auto& device_ctx = context.template device_context(); - math::set_constant(device_ctx, in_x_grad, 0); + pten::funcs::set_constant(device_ctx, in_x_grad, 0); switch (ksize.size()) { case 2: { diff --git a/paddle/fluid/operators/prroi_pool_op.cu b/paddle/fluid/operators/prroi_pool_op.cu index 71aaf08c52..256bc0473b 100644 --- a/paddle/fluid/operators/prroi_pool_op.cu +++ b/paddle/fluid/operators/prroi_pool_op.cu @@ -327,7 +327,7 @@ class GPUPRROIPoolGradOpKernel : public framework::OpKernel { dev_ctx.stream()); input_grad->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(ctx.cuda_device_context(), input_grad, static_cast(0)); input_roi_grad->mutable_data(ctx.GetPlace()); set_zero(ctx.cuda_device_context(), input_roi_grad, static_cast(0)); diff --git a/paddle/fluid/operators/prroi_pool_op.h b/paddle/fluid/operators/prroi_pool_op.h index 38f8d6542a..63f0047aa9 100644 --- a/paddle/fluid/operators/prroi_pool_op.h +++ b/paddle/fluid/operators/prroi_pool_op.h @@ -15,7 +15,7 @@ limitations under the License. 
*/ #pragma once #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #endif @@ -500,7 +500,7 @@ class CPUPRROIPoolGradOpKernel : public framework::OpKernel { input_grad->mutable_data(ctx.GetPlace()); input_roi_grad->mutable_data(ctx.GetPlace()); // set gradient of X to be 0. before backpropagate. - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(ctx.template device_context(), input_grad, static_cast(0)); set_zero(ctx.template device_context(), input_roi_grad, diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc index 277c93fad6..15b1aab855 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.cc @@ -13,8 +13,8 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/pscore/distributed_lookup_table_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h index d715bf34a4..af423f71b0 100644 --- a/paddle/fluid/operators/pscore/distributed_lookup_table_op.h +++ b/paddle/fluid/operators/pscore/distributed_lookup_table_op.h @@ -18,7 +18,7 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc index 3a1e2ea786..b481235956 100644 --- a/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc +++ b/paddle/fluid/operators/pscore/distributed_push_sparse_op.cc @@ -13,8 +13,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/pscore/distributed_push_sparse_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pscore/distributed_push_sparse_op.h b/paddle/fluid/operators/pscore/distributed_push_sparse_op.h index f19ba5f2e4..c07ffa4bd0 100644 --- a/paddle/fluid/operators/pscore/distributed_push_sparse_op.h +++ b/paddle/fluid/operators/pscore/distributed_push_sparse_op.h @@ -18,7 +18,7 @@ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/pscore/fake_init_op.cc b/paddle/fluid/operators/pscore/fake_init_op.cc index b3a745fc99..d337aa8b01 100644 --- a/paddle/fluid/operators/pscore/fake_init_op.cc +++ b/paddle/fluid/operators/pscore/fake_init_op.cc @@ -11,7 +11,7 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/psroi_pool_op.cu b/paddle/fluid/operators/psroi_pool_op.cu index efdcc59a5c..9bca5d86d4 100644 --- a/paddle/fluid/operators/psroi_pool_op.cu +++ b/paddle/fluid/operators/psroi_pool_op.cu @@ -317,7 +317,7 @@ class GPUPSROIPoolGradOpKernel : public framework::OpKernel { ctx.device_context(), &rois_batch_id_list_gpu); input_grad->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(ctx.cuda_device_context(), input_grad, static_cast(0)); int output_grad_size = output_grad->numel(); diff --git a/paddle/fluid/operators/psroi_pool_op.h b/paddle/fluid/operators/psroi_pool_op.h index 4d7e9ce295..ed5221648f 100644 --- a/paddle/fluid/operators/psroi_pool_op.h +++ b/paddle/fluid/operators/psroi_pool_op.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -225,7 +225,7 @@ class CPUPSROIPoolGradOpKernel : public framework::OpKernel { T* input_grad_data = input_grad->mutable_data(ctx.GetPlace()); // set gradient of X to be 0. before backpropagate. - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(ctx.template device_context(), input_grad, static_cast(0)); diff --git a/paddle/fluid/operators/put_along_axis_op.cu b/paddle/fluid/operators/put_along_axis_op.cu index da36b56433..800da8a275 100644 --- a/paddle/fluid/operators/put_along_axis_op.cu +++ b/paddle/fluid/operators/put_along_axis_op.cu @@ -17,8 +17,8 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/ddim.h" #include "paddle/fluid/framework/op_version_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/put_along_axis_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/put_along_axis_op.h b/paddle/fluid/operators/put_along_axis_op.h index f23ca177db..0b4481ceac 100644 --- a/paddle/fluid/operators/put_along_axis_op.h +++ b/paddle/fluid/operators/put_along_axis_op.h @@ -16,7 +16,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/qr_op.h b/paddle/fluid/operators/qr_op.h index 1731aa9e07..c55619a4f7 100644 --- a/paddle/fluid/operators/qr_op.h +++ b/paddle/fluid/operators/qr_op.h @@ -142,7 +142,7 @@ class QrGradKernel : public framework::OpKernel { *ctx.Output(framework::GradVarName("X")); dA.mutable_data>(ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); - math::SetConstant()(dev_ctx, &dA, T(0)); + pten::funcs::SetConstant()(dev_ctx, &dA, T(0)); auto dito = math::DeviceIndependenceTensorOperations(ctx); diff --git a/paddle/fluid/operators/range_op.h b/paddle/fluid/operators/range_op.h index 5344147a90..aca9d50c32 100644 --- a/paddle/fluid/operators/range_op.h +++ b/paddle/fluid/operators/range_op.h @@ -15,7 +15,7 @@ limitations under the License. */ #pragma once #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/range_op_npu_test.cc b/paddle/fluid/operators/range_op_npu_test.cc index 081cafdf67..00486dbed8 100644 --- a/paddle/fluid/operators/range_op_npu_test.cc +++ b/paddle/fluid/operators/range_op_npu_test.cc @@ -25,12 +25,11 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(range); USE_OP_DEVICE_KERNEL(range, NPU); diff --git a/paddle/fluid/operators/rank_attention.cu.h b/paddle/fluid/operators/rank_attention.cu.h index 8ec138c882..3eb4d8401a 100644 --- a/paddle/fluid/operators/rank_attention.cu.h +++ b/paddle/fluid/operators/rank_attention.cu.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once #include "paddle/fluid/framework/dim.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc index 1eeeb5e1f8..f8ed44267e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op_npu_test.cc @@ -27,12 +27,11 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; using Tensor = paddle::framework::Tensor; diff --git a/paddle/fluid/operators/reduce_ops/reduce_op.h b/paddle/fluid/operators/reduce_ops/reduce_op.h index 667ffabbf4..4101c8b73e 100644 --- a/paddle/fluid/operators/reduce_ops/reduce_op.h +++ b/paddle/fluid/operators/reduce_ops/reduce_op.h @@ -21,8 +21,8 @@ limitations under the License. */ #include "paddle/fluid/framework/data_type_transform.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/cast_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/reduce_ops/reduce_op_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" // only can include the headers in paddle/pten/api dirs #include "paddle/pten/api/lib/utils/tensor_utils.h" @@ -102,7 +102,7 @@ void GetShuffledInput(const framework::ExecutionContext& context, shuffled_input->Resize(shuffled_dims); shuffled_input->mutable_data(context.GetPlace()); - math::TransposeNormal trans; + pten::funcs::TransposeNormal trans; trans(context.template device_context(), *input, shuffled_input, perm_axis); } @@ -166,7 +166,7 @@ void HandleLargeDimGrad(const framework::ExecutionContext& context, framework::TensorCopy(*dx, context.GetPlace(), &dx_tmp); dx_tmp.Resize(shuffled_dim); dx->Resize(x_dim); - math::TransposeNormal trans; + pten::funcs::TransposeNormal trans; trans(context.template device_context(), dx_tmp, dx, origin_axis); } diff --git a/paddle/fluid/operators/repeat_interleave_op.h b/paddle/fluid/operators/repeat_interleave_op.h index 1a38b0271d..ca861696d7 100644 --- a/paddle/fluid/operators/repeat_interleave_op.h +++ b/paddle/fluid/operators/repeat_interleave_op.h @@ -16,7 +16,7 @@ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/fluid/operators/index_select_op.h" namespace paddle { diff --git a/paddle/fluid/operators/rnn_op.cu.cc b/paddle/fluid/operators/rnn_op.cu.cc index 80a0ef10fa..94becaa43f 100644 --- a/paddle/fluid/operators/rnn_op.cu.cc +++ b/paddle/fluid/operators/rnn_op.cu.cc @@ -14,9 +14,9 @@ limitations under the License. */ #include "paddle/fluid/framework/generator.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -719,7 +719,7 @@ class RNNGradCudnnKernel : public framework::OpKernel { } Tensor weight_grad; - math::SetConstant zero; + pten::funcs::SetConstant zero; weight_grad.mutable_data({weight_numel}, ctx.GetPlace()); zero(dev_ctx, &weight_grad, static_cast(0.0)); T *weight_grad_data = weight_grad.data(); diff --git a/paddle/fluid/operators/rnn_op.h b/paddle/fluid/operators/rnn_op.h index 5e19be5e4c..b2c1b8b989 100644 --- a/paddle/fluid/operators/rnn_op.h +++ b/paddle/fluid/operators/rnn_op.h @@ -25,9 +25,9 @@ limitations under the License. 
*/ #include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/math/gru_compute.h" #include "paddle/fluid/operators/math/lstm_compute.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/unique_op.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -305,7 +305,7 @@ struct Layer { framework::TensorCopy(bias_hh, context.GetPlace(), dev_ctx, &bias_hh_tmp); bias_hh_tmp.Resize({3, bias_hh_tmp.numel() / 3}); auto bias_hh_tmp_unbind = Unbind(bias_hh_tmp); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(dev_ctx, &bias_hh_tmp_unbind[2], static_cast(0.0)); auto bias_hh_after_mask = framework::EigenMatrix::From( @@ -439,7 +439,7 @@ struct Layer { &weight_hh_tmp); weight_hh_tmp.Resize({3, weight_hh_tmp.numel() / 3}); auto weight_hh_tmp_unbind = Unbind(weight_hh_tmp); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(dev_ctx, &weight_hh_tmp_unbind[2], static_cast(0.0)); weight_hh_tmp.Resize(vec[1 + offset * 4].dims()); } @@ -585,7 +585,7 @@ struct Layer { &weight_hh_tmp); weight_hh_tmp.Resize({3, weight_hh_tmp.numel() / 3}); auto weight_hh_tmp_unbind = Unbind(weight_hh_tmp); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(dev_ctx, &weight_hh_tmp_unbind[2], static_cast(0.0)); weight_hh_tmp.Resize(vec[1 + offset * 4].dims()); } @@ -966,7 +966,7 @@ class RNNCPUKernel : public framework::OpKernel { dropout_mask->mutable_data(output->dims(), ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); - math::SetConstant ones; + pten::funcs::SetConstant ones; ones(dev_ctx, dropout_mask, static_cast(1)); // init the output and allocate the memory output->mutable_data(ctx.GetPlace()); @@ -1095,7 +1095,7 @@ struct GradLayer { Tensor c, d; Tensor* dynamic_grad_pre_h = &c; Tensor* dynamic_grad_pre_c = &d; - math::SetConstant zero; + pten::funcs::SetConstant zero; if (init_h_grad_unbind->size() > 0) { dynamic_grad_pre_h->ShareDataWith( (*init_h_grad_unbind)[current_layer_idx]); @@ -1293,7 +1293,7 @@ struct GradLayer { mat_dim_parameter, static_cast(1.0), input_grad, T(1)); // calc the gradient of Bias_hi, Bias_hh - math::ColwiseSum col_sum; + pten::funcs::ColwiseSum col_sum; Tensor tmp_grad_gate; tmp_grad_gate.ShareDataWith(grad_gate); tmp_grad_gate.Resize( @@ -1328,7 +1328,7 @@ struct SingleGradLayer : GradLayer { const int& gate_num) { auto& device_ctx = context.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); const bool& is_bidirec = context.Attr("is_bidirec"); @@ -1425,7 +1425,7 @@ struct BidirGradLayer : GradLayer { // split the output two tensor to output_forward, output_backward auto& device_ctx = context.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, input_grad, static_cast(0.0)); std::vector output_vec; @@ -1675,7 +1675,7 @@ struct GRUGradCell : GradCell { backup_tensor(context, &grad_pre_hidden_bak, grad_pre_hidden); } // zero pre_hidden - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(device_ctx, grad_pre_hidden, static_cast(0.0)); math::GRUMetaValue gru_value; math::GRUMetaGrad gru_grad; diff --git a/paddle/fluid/operators/roi_align_op.cu b/paddle/fluid/operators/roi_align_op.cu index 520023229f..5c9c8b78a4 100644 --- a/paddle/fluid/operators/roi_align_op.cu +++ b/paddle/fluid/operators/roi_align_op.cu @@ -395,7 +395,7 @@ class GPUROIAlignGradOpKernel : 
public framework::OpKernel { memory::Copy(gplace, roi_id_data, cplace, roi_batch_id_data, bytes, dev_ctx.stream()); in_grad->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, in_grad, static_cast(0)); int output_grad_size = out_grad->numel(); diff --git a/paddle/fluid/operators/roi_align_op.h b/paddle/fluid/operators/roi_align_op.h index 1ab5ddc83f..acae86bd1b 100644 --- a/paddle/fluid/operators/roi_align_op.h +++ b/paddle/fluid/operators/roi_align_op.h @@ -15,7 +15,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -23,7 +23,7 @@ namespace operators { using Tensor = framework::Tensor; using LoDTensor = framework::LoDTensor; -namespace { +namespace { // NOLINT constexpr size_t get_offset(size_t x, size_t y, size_t width) { return y * width + x; } @@ -41,7 +41,7 @@ struct offsets_and_ratios { xy_ratio(xy_ratio), xY_ratio(xY_ratio), Xy_ratio(Xy_ratio), - XY_ratio(XY_ratio){}; + XY_ratio(XY_ratio) {} std::size_t xy = 0; std::size_t xY = 0; @@ -128,10 +128,10 @@ std::vector> get_indexes_and_ratios( } } return interpolation_cords; -} +} // namespace template -void interpolate(std::vector& interpolated_values, +void interpolate(std::vector& interpolated_values, // NOLINT const std::vector>& interpolation_cords, const T* data) { for (auto& ic : interpolation_cords) { @@ -167,7 +167,7 @@ void avg_pool(const std::vector& interpolated_values, T* output_data, output_data[i] = sum * count; } } -} +} // NOLINT template void bilinear_interpolate_gradient(const int height, const int width, T y, T x, @@ -389,7 +389,7 @@ class CPUROIAlignGradOpKernel : public framework::OpKernel { } in_grad->mutable_data(ctx.GetPlace()); auto& dev_ctx = ctx.template device_context(); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, in_grad, static_cast(0)); int output_grad_size = out_grad->numel(); diff --git a/paddle/fluid/operators/roi_align_op_npu.cc b/paddle/fluid/operators/roi_align_op_npu.cc index d6ccf84bbf..7e19287d42 100644 --- a/paddle/fluid/operators/roi_align_op_npu.cc +++ b/paddle/fluid/operators/roi_align_op_npu.cc @@ -10,8 +10,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/roi_align_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device/npu/npu_op_runner.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/roi_pool_op.cu b/paddle/fluid/operators/roi_pool_op.cu index 16a8e2bf58..eafb790285 100644 --- a/paddle/fluid/operators/roi_pool_op.cu +++ b/paddle/fluid/operators/roi_pool_op.cu @@ -274,7 +274,7 @@ class GPUROIPoolGradOpKernel : public framework::OpKernel { dev_ctx.stream()); x_grad->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, x_grad, static_cast(0)); int output_grad_size = out_grad->numel(); diff --git a/paddle/fluid/operators/roi_pool_op.h b/paddle/fluid/operators/roi_pool_op.h index 40de6d0cf6..531fe241c4 100644 --- a/paddle/fluid/operators/roi_pool_op.h +++ b/paddle/fluid/operators/roi_pool_op.h @@ -18,7 +18,7 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -212,7 +212,7 @@ class CPUROIPoolGradOpKernel : public framework::OpKernel { const T* out_grad_data = out_grad->data(); const int64_t* argmax_data = argmax->data(); T* in_grad_data = in_grad->mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(ctx.template device_context(), in_grad, static_cast(0)); diff --git a/paddle/fluid/operators/row_conv_op.cu b/paddle/fluid/operators/row_conv_op.cu index 586cf3239b..24f8ba4f21 100644 --- a/paddle/fluid/operators/row_conv_op.cu +++ b/paddle/fluid/operators/row_conv_op.cu @@ -11,9 +11,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/row_conv_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -395,7 +395,7 @@ class RowConvGradKernel size_t *idx = batch_indices.CUDAMutableData(context.GetPlace()); auto &device_ctx = context.cuda_device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; if (dFilter) { T *dfilter = dFilter->mutable_data(context.GetPlace()); diff --git a/paddle/fluid/operators/sample_logits_op.cu b/paddle/fluid/operators/sample_logits_op.cu index 4bcd27036a..3caa79a0bf 100644 --- a/paddle/fluid/operators/sample_logits_op.cu +++ b/paddle/fluid/operators/sample_logits_op.cu @@ -19,10 +19,10 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sample_prob.h" #include "paddle/fluid/operators/math/softmax.h" #include "paddle/fluid/operators/sample_logits_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -138,7 +138,7 @@ class SampleLogitsCUDAKernel : public framework::OpKernel { // UNDERSTAND: allocate memories for temporaries sampled_logits->mutable_data(samples_dim, context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, sampled_logits, static_cast(0)); auto sampled_labels_data = @@ -224,7 +224,7 @@ class SampleLogitsGradCUDAKernel : public framework::OpKernel { logits_grad->mutable_data(context.GetPlace()); auto& dev_ctx = context.cuda_device_context(); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, logits_grad, static_cast(0)); // UNDERSTAND: scatter it back to logit_grad diff --git a/paddle/fluid/operators/sample_logits_op.h b/paddle/fluid/operators/sample_logits_op.h index 872eb341d4..f7560991a6 100644 --- a/paddle/fluid/operators/sample_logits_op.h +++ b/paddle/fluid/operators/sample_logits_op.h @@ -19,9 +19,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/tensor_util.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sample_prob.h" #include "paddle/fluid/operators/math/softmax.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -286,7 +286,7 @@ class SampleLogitsGradKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, logits_grad, static_cast(0)); // UNDERSTAND: scatter it back to logit_grad diff --git a/paddle/fluid/operators/scatter.cu.h b/paddle/fluid/operators/scatter.cu.h index 13c08aea68..a98d98e72a 100644 --- a/paddle/fluid/operators/scatter.cu.h +++ b/paddle/fluid/operators/scatter.cu.h @@ -15,11 +15,11 @@ limitations under the License. */ #pragma once #include #include -#include "math/math_function.h" #include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/place.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/search_compute.h b/paddle/fluid/operators/search_compute.h index d0618bf2c3..3e8d270ca4 100644 --- a/paddle/fluid/operators/search_compute.h +++ b/paddle/fluid/operators/search_compute.h @@ -23,7 +23,7 @@ limitations under the License. */ #include #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/seed_op.cu b/paddle/fluid/operators/seed_op.cu index 5a8d1c067c..5257e7709f 100644 --- a/paddle/fluid/operators/seed_op.cu +++ b/paddle/fluid/operators/seed_op.cu @@ -12,8 +12,8 @@ // See the License for the specific language governing permissions and // limitations under the License. -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/seed_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -32,7 +32,7 @@ class GPUSeedKernel : public framework::OpKernel { platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(platform::CPUPlace()); out->mutable_data(platform::CPUPlace()); - math::SetConstant functor; + pten::funcs::SetConstant functor; functor(reinterpret_cast(dev_ctx), out, static_cast(seed)); } else { diff --git a/paddle/fluid/operators/segment_pool_op.h b/paddle/fluid/operators/segment_pool_op.h index 4f180a31ce..47b18e04e4 100644 --- a/paddle/fluid/operators/segment_pool_op.h +++ b/paddle/fluid/operators/segment_pool_op.h @@ -16,10 +16,10 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/segment_pooling.h" #include "paddle/fluid/platform/macros.h" #include "paddle/pten/common/place.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -60,7 +60,7 @@ void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) { "Segment ids must be >= 0, but got last id %d", dims[0])); output->Resize({dims}); output->mutable_data(context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = context.template device_context(); set_zero(dev_ctx, output, static_cast(0)); } @@ -98,7 +98,7 @@ void SegmentKernelLaunchHelper(const framework::ExecutionContext& context) { } else if (pooltype == "MIN") { init_value = static_cast(FLT_MAX); } - math::SetConstant setconst; + pten::funcs::SetConstant setconst; auto& dev_ctx = context.template device_context(); setconst(dev_ctx, output, static_cast(init_value)); // the gpu kernel of mean pool record the counts of segment_ids @@ -152,7 +152,7 @@ class SegmentPoolGradKernel : public framework::OpKernel { } in_g->mutable_data(context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = context.template device_context(); set_zero(dev_ctx, in_g, static_cast(0)); diff --git a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h index f73b180419..b43254f91f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_conv_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_conv_op.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/context_project.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -64,7 +64,7 @@ class SequenceConvKernel : public framework::OpKernel { Tensor col; col.mutable_data(col_shape, context.GetPlace()); // Because if padding_trainable is false, padding data should be zeros. - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = context.template device_context(); auto blas = math::GetBlas(dev_ctx); set_zero(dev_ctx, &col, static_cast(0)); @@ -107,7 +107,7 @@ class SequenceConvGradKernel : public framework::OpKernel { int down_pad = std::max(0, context_start + context_length - 1); auto sequence_width = static_cast(in->dims()[1]); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = context.template device_context(); auto blas = math::GetBlas(dev_ctx); // use col_shape in the im2col calculation diff --git a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h index 1186ed891e..74baf67f7f 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_expand_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_expand_op.h @@ -17,7 +17,7 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -172,7 +172,7 @@ struct SequenceExpandGradFunctor { int dout_end = dout_offset + repeat_num * x_seq_len; auto dout_sub = dout.Slice(dout_offset, dout_end); dout_sub.Resize({repeat_num, dx_sub.dims()[0]}); - math::ColwiseSum col_sum; + pten::funcs::ColwiseSum col_sum; col_sum(context, dout_sub, &dx_sub); dout_offset += repeat_num * x_seq_len; } @@ -194,7 +194,7 @@ class SequenceExpandGradKernel : public framework::OpKernel { g_x->set_lod(x->lod()); auto& dev_ctx = context.template device_context(); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, g_x, static_cast(0)); auto& y_lod = y->lod(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_pad_op.h b/paddle/fluid/operators/sequence_ops/sequence_pad_op.h index a9660f05c3..2b50995a6a 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pad_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_pad_op.h @@ -17,8 +17,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_padding.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h index dca65512e3..bc279f1eb3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_pool_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_pool_op.h @@ -16,8 +16,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_pooling.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.h b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.h index b5d2124211..2cf81197f9 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_reshape_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_reshape_op.h @@ -14,7 +14,7 @@ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h index 65e021b507..d5689091be 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_slice_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_slice_op.h @@ -14,8 +14,8 @@ limitations under the License. 
*/ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/strided_memcpy.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -168,7 +168,7 @@ class SequenceSliceGradOpKernel : public framework::OpKernel { if (x_grad) { x_grad->mutable_data(ctx.GetPlace()); x_grad->set_lod(in->lod()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(ctx.template device_context(), x_grad, static_cast(0)); diff --git a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc index 46e4196585..869bc613c4 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc +++ b/paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/softmax.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h index e8e0241e46..5190108acd 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_topk_avg_pooling_op.h @@ -21,7 +21,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -196,7 +196,7 @@ class SequenceTopkAvgPoolingGradKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; zero(dev_ctx, d_in, static_cast(0.0)); auto din_data = d_in->data(); diff --git a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h index 60ba4797db..b85b938428 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_unpad_op.h @@ -17,8 +17,8 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/memory/memcpy.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_padding.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -87,7 +87,7 @@ class SequenceUnpadGradOpKernel : public framework::OpKernel { LoDTensor zero_pads; zero_pads.Resize({1, 1}); zero_pads.mutable_data(ctx.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; auto& dev_ctx = ctx.template device_context(); set_zero(dev_ctx, &zero_pads, static_cast(0)); diff --git a/paddle/fluid/operators/set_value_op.h b/paddle/fluid/operators/set_value_op.h index 1580ef140a..633bc468dc 100644 --- a/paddle/fluid/operators/set_value_op.h +++ b/paddle/fluid/operators/set_value_op.h @@ -437,7 +437,7 @@ class SetValueGradKernel : public framework::OpKernel { auto& dev_ctx = context.template device_context(); auto& place = *context.template device_context().eigen_device(); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; if (grad_input) { // Set gradient of `Input` diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc index 493073fadc..38721e5e3e 100644 --- a/paddle/fluid/operators/shrink_rnn_memory_op.cc +++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/operators/array_operator.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/pten/core/lod_utils.h" @@ -156,7 +156,7 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { auto &dev_ctx = *pool.Get(place); if (dout_var == nullptr) { // dx_tensor fill zero - math::set_constant(dev_ctx, &dx_tensor, 0.0f); + pten::funcs::set_constant(dev_ctx, &dx_tensor, 0.0f); } else { auto &dout_tensor = dout_var->Get(); auto height = dout_tensor.dims()[0]; @@ -165,7 +165,7 @@ class ShrinkRNNMemoryGradOp : public ArrayOp { if (dx_tensor.dims()[0] > height) { auto rest_tensor = dx_tensor.Slice( static_cast(height), static_cast(dx_tensor.dims()[0])); - math::set_constant(dev_ctx, &rest_tensor, 0.0f); + pten::funcs::set_constant(dev_ctx, &rest_tensor, 0.0f); } } dx_tensor.set_lod(x_tensor.lod()); diff --git a/paddle/fluid/operators/shuffle_channel_op.h b/paddle/fluid/operators/shuffle_channel_op.h index 3ce1e0c770..2bf96fad26 100644 --- a/paddle/fluid/operators/shuffle_channel_op.h +++ b/paddle/fluid/operators/shuffle_channel_op.h @@ -13,7 +13,7 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/slice_op.h b/paddle/fluid/operators/slice_op.h index d9ef45343d..bf05bbadcb 100644 --- a/paddle/fluid/operators/slice_op.h +++ b/paddle/fluid/operators/slice_op.h @@ -18,9 +18,9 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/eigen/eigen_function.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/slice_utils.h" #include "paddle/fluid/operators/utils.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -299,7 +299,7 @@ class SliceGradKernel : public framework::OpKernel { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto& dev_ctx = *pool.Get(ctx.GetPlace()); - math::SetConstant functor; + pten::funcs::SetConstant functor; for (int i = 0; i < d_in_size; ++i) { auto dim = input_array->at(i).dims(); d_in_arr->at(i).Resize(dim); diff --git a/paddle/fluid/operators/softmax_op_npu_test.cc b/paddle/fluid/operators/softmax_op_npu_test.cc index 8e9e077b84..98a67bc748 100644 --- a/paddle/fluid/operators/softmax_op_npu_test.cc +++ b/paddle/fluid/operators/softmax_op_npu_test.cc @@ -23,12 +23,11 @@ limitations under the License. */ #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(softmax); USE_OP_DEVICE_KERNEL(softmax, NPU); diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu index fe02564133..33bbed0f69 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cu +++ b/paddle/fluid/operators/softmax_with_cross_entropy_op.cu @@ -17,12 +17,12 @@ namespace cub = hipcub; #endif #include "paddle/fluid/operators/amp/fp16_type_traits.h" #include "paddle/fluid/operators/math/cross_entropy.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/softmax_cudnn_op.cu.h" #include "paddle/fluid/operators/softmax_with_cross_entropy_op.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_dnn.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -960,7 +960,7 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { softmax_out->template mutable_data(context.GetPlace()); auto* loss_data = loss->template mutable_data(context.GetPlace()); - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; set_constant(context.cuda_device_context(), loss, static_cast(0)); if (axis_dim == 1) { set_constant(context.cuda_device_context(), softmax_out, @@ -1045,7 +1045,7 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel { auto* loss_data = loss->template mutable_data(context.GetPlace()); if (axis_dim == 1) { - math::SetConstant set_constant; + pten::funcs::SetConstant set_constant; set_constant(context.cuda_device_context(), softmax, static_cast(1)); set_constant(context.cuda_device_context(), loss, static_cast(0)); return; diff --git a/paddle/fluid/operators/solve_op.h b/paddle/fluid/operators/solve_op.h index 7893b5da12..c023d33a44 100644 --- a/paddle/fluid/operators/solve_op.h +++ b/paddle/fluid/operators/solve_op.h @@ -21,10 +21,10 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/operators/eigen/eigen_function.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/matrix_solve.h" #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" #include "paddle/fluid/operators/squeeze_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) #include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h" #endif @@ -509,7 +509,7 @@ class SolveGradKernel : public framework::OpKernel { const auto& new_dims_vec = getNewDimsVec(input->dims()); tmp_input.Resize(framework::make_ddim(new_dims_vec)); tmp_input.mutable_data(ctx.GetPlace()); - math::TransposeNormal trans; + pten::funcs::TransposeNormal trans; std::vector new_axis = getNewAxis(input->dims().size()); auto& dev_ctx = ctx.template device_context(); trans(dev_ctx, *input, &tmp_input, new_axis); diff --git a/paddle/fluid/operators/spectral_norm_op.h b/paddle/fluid/operators/spectral_norm_op.h index b8a15579e5..d0edcc1692 100644 --- a/paddle/fluid/operators/spectral_norm_op.h +++ b/paddle/fluid/operators/spectral_norm_op.h @@ -14,7 +14,7 @@ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -40,19 +40,19 @@ static inline void TransCompute(const int rank, const Tensor& in, Tensor* out, switch (rank) { case 2: - math::Transpose trans2; + pten::funcs::Transpose trans2; trans2(dev_ctx, in, out, perm); break; case 3: - math::Transpose trans3; + pten::funcs::Transpose trans3; trans3(dev_ctx, in, out, perm); break; case 4: - math::Transpose trans4; + pten::funcs::Transpose trans4; trans4(dev_ctx, in, out, perm); break; case 5: - math::Transpose trans5; + pten::funcs::Transpose trans5; trans5(dev_ctx, in, out, perm); break; default: diff --git a/paddle/fluid/operators/spp_op.h b/paddle/fluid/operators/spp_op.h index 6f78b88573..755cca99da 100644 --- a/paddle/fluid/operators/spp_op.h +++ b/paddle/fluid/operators/spp_op.h @@ -16,9 +16,9 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/operators/strided_memcpy.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -95,7 +95,7 @@ class SppGradKernel : public framework::OpKernel { std::string pooling_type = context.template Attr("pooling_type"); auto& device_ctx = context.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); auto out_stride = framework::stride(out->dims()); diff --git a/paddle/fluid/operators/squeeze_op.h b/paddle/fluid/operators/squeeze_op.h old mode 100755 new mode 100644 index 2f621c11e5..d86037fa03 --- a/paddle/fluid/operators/squeeze_op.h +++ b/paddle/fluid/operators/squeeze_op.h @@ -18,9 +18,9 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/squeeze_op_npu_test.cc b/paddle/fluid/operators/squeeze_op_npu_test.cc index 3f6c43d7af..ecedc0ba1c 100644 --- a/paddle/fluid/operators/squeeze_op_npu_test.cc +++ b/paddle/fluid/operators/squeeze_op_npu_test.cc @@ -25,12 +25,11 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(squeeze); USE_OP_DEVICE_KERNEL(squeeze, NPU); diff --git a/paddle/fluid/operators/strided_slice_op.h b/paddle/fluid/operators/strided_slice_op.h index 47714ebb80..d1efd3b675 100644 --- a/paddle/fluid/operators/strided_slice_op.h +++ b/paddle/fluid/operators/strided_slice_op.h @@ -18,8 +18,8 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/slice_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -615,7 +615,7 @@ class StridedSliceGradKernel : public framework::OpKernel { d_out_tensor->mutable_data(context.GetPlace()); } - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, d_out_tensor, static_cast(0)); } } @@ -628,7 +628,7 @@ class StridedSliceGradKernel : public framework::OpKernel { d_out->mutable_data(context.GetPlace()); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, d_out, static_cast(0)); auto in_dims = d_input->dims(); diff --git a/paddle/fluid/operators/sum_op.cu b/paddle/fluid/operators/sum_op.cu index 9de9b0b633..ce152f4450 100644 --- a/paddle/fluid/operators/sum_op.cu +++ b/paddle/fluid/operators/sum_op.cu @@ -134,7 +134,7 @@ void SumToLoDTensor(const framework::ExecutionContext &context) { int start = in_place ? 1 : 0; if (!in_place) { - math::SetConstant constant_functor; + pten::funcs::SetConstant constant_functor; constant_functor( context.template device_context(), out, static_cast(0)); diff --git a/paddle/fluid/operators/sum_op.h b/paddle/fluid/operators/sum_op.h index 4e108b56a4..d8d57b1f7f 100644 --- a/paddle/fluid/operators/sum_op.h +++ b/paddle/fluid/operators/sum_op.h @@ -14,8 +14,8 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor_array.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/selected_rows_functor.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -167,7 +167,7 @@ class SumKernel : public framework::OpKernel { } if (start != 2) { VLOG(10) << "Fill with constant = 0 in sum kernel."; - math::SetConstant constant_functor; + pten::funcs::SetConstant constant_functor; constant_functor(context.template device_context(), out, static_cast(0)); } diff --git a/paddle/fluid/operators/svd_helper.h b/paddle/fluid/operators/svd_helper.h index 48315980e3..3a57a7b3e5 100644 --- a/paddle/fluid/operators/svd_helper.h +++ b/paddle/fluid/operators/svd_helper.h @@ -26,9 +26,9 @@ #include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/complex_functors.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -232,11 +232,11 @@ static std::vector get_broadcast_batch_portion( return batchPortion; } -#define DITO_TRANSPOSE_RANK_CASE(N) \ - case N: { \ - math::Transpose trans; \ - trans(dev_ctx, x, &ret, axis); \ - break; \ +#define DITO_TRANSPOSE_RANK_CASE(N) \ + case N: { \ + pten::funcs::Transpose trans; \ + trans(dev_ctx, x, &ret, axis); \ + break; \ } #define DITO_SLICE_RANK_CASE(N) \ @@ -526,7 +526,7 @@ struct DeviceIndependenceTensorOperations { ret.Resize(framework::make_ddim(shape)); ret.mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); - SetConstant()(dev_ctx, &ret, T(fill_value)); + pten::funcs::SetConstant()(dev_ctx, &ret, T(fill_value)); return ret; } framework::Tensor Infinits(std::vector shape) { diff --git a/paddle/fluid/operators/take_along_axis_op.cu b/paddle/fluid/operators/take_along_axis_op.cu index e9f9b18718..2d0ebbc20f 100644 --- a/paddle/fluid/operators/take_along_axis_op.cu +++ b/paddle/fluid/operators/take_along_axis_op.cu @@ -63,7 +63,7 @@ class TakeAlongAxisGradOpCUDAKernel : public framework::OpKernel { // Set to zero tensor. auto &dev_ctx = ctx.template device_context(); - math::SetConstant functor; + pten::funcs::SetConstant functor; functor(reinterpret_cast(dev_ctx), input_grad, static_cast(0)); const auto &index_type = index->type(); diff --git a/paddle/fluid/operators/take_along_axis_op.h b/paddle/fluid/operators/take_along_axis_op.h index 580ca528ce..e7f804621b 100644 --- a/paddle/fluid/operators/take_along_axis_op.h +++ b/paddle/fluid/operators/take_along_axis_op.h @@ -19,7 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/gather_scatter_kernel.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -72,7 +72,7 @@ class TakeAlongAxisGradOpKernel : public framework::OpKernel { // Set to zero tensor. 
auto &dev_ctx = ctx.template device_context(); - math::SetConstant functor; + pten::funcs::SetConstant functor; functor(reinterpret_cast(dev_ctx), input_grad, static_cast(0)); diff --git a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc index 0e0a594846..62c07d0654 100644 --- a/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc +++ b/paddle/fluid/operators/teacher_student_sigmoid_loss_op.cc @@ -16,7 +16,7 @@ limitations under the License. */ #include -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/temporal_shift_op.h b/paddle/fluid/operators/temporal_shift_op.h index 05364b94c9..4b2aa098d0 100644 --- a/paddle/fluid/operators/temporal_shift_op.h +++ b/paddle/fluid/operators/temporal_shift_op.h @@ -11,7 +11,7 @@ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/transpose_op.h b/paddle/fluid/operators/transpose_op.h index e4e5dfdba9..c873f84511 100644 --- a/paddle/fluid/operators/transpose_op.h +++ b/paddle/fluid/operators/transpose_op.h @@ -16,7 +16,7 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -29,32 +29,32 @@ inline void TransCompute(const int dim, const DeviceContext& dev_ctx, const std::vector& axis) { switch (dim) { case 1: - math::Transpose trans1; + pten::funcs::Transpose trans1; trans1(dev_ctx, in, out, axis); break; case 2: - math::Transpose trans2; + pten::funcs::Transpose trans2; trans2(dev_ctx, in, out, axis); break; case 3: - math::Transpose trans3; + pten::funcs::Transpose trans3; trans3(dev_ctx, in, out, axis); break; case 4: - math::Transpose trans4; + pten::funcs::Transpose trans4; trans4(dev_ctx, in, out, axis); break; case 5: - math::Transpose trans5; + pten::funcs::Transpose trans5; trans5(dev_ctx, in, out, axis); break; case 6: - math::Transpose trans6; + pten::funcs::Transpose trans6; trans6(dev_ctx, in, out, axis); break; default: // for dim >= 7 situation - math::TransposeNormal trans_normal; + pten::funcs::TransposeNormal trans_normal; trans_normal(dev_ctx, in, out, axis); } } diff --git a/paddle/fluid/operators/transpose_op_npu_test.cc b/paddle/fluid/operators/transpose_op_npu_test.cc index 91923da819..49aa265656 100644 --- a/paddle/fluid/operators/transpose_op_npu_test.cc +++ b/paddle/fluid/operators/transpose_op_npu_test.cc @@ -25,12 +25,11 @@ limitations under the License. 
*/ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(transpose2); USE_OP_DEVICE_KERNEL(transpose2, NPU); diff --git a/paddle/fluid/operators/tree_conv_op.h b/paddle/fluid/operators/tree_conv_op.h index a84589b32f..c2a6cfdd0d 100644 --- a/paddle/fluid/operators/tree_conv_op.h +++ b/paddle/fluid/operators/tree_conv_op.h @@ -28,7 +28,7 @@ class TreeConvKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext &ctx) const override { math::Tree2ColFunctor tree2col; - math::SetConstant constant; + pten::funcs::SetConstant constant; auto *Edges = ctx.Input("EdgeSet"); auto *Embeddings = ctx.Input("NodesVector"); @@ -86,7 +86,7 @@ class TreeConvGradKernel : public framework::OpKernel { auto *Filter = ctx.Input("Filter"); math::Tree2ColFunctor tree2col; math::Col2TreeFunctor col2tree; - math::SetConstant constant; + pten::funcs::SetConstant constant; auto &dev_ctx = ctx.template device_context(); auto blas = math::GetBlas(dev_ctx); diff --git a/paddle/fluid/operators/unfold_op.h b/paddle/fluid/operators/unfold_op.h index 006e4822fe..5107b5cc49 100644 --- a/paddle/fluid/operators/unfold_op.h +++ b/paddle/fluid/operators/unfold_op.h @@ -19,7 +19,7 @@ #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/im2col.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -106,7 +106,7 @@ class UnfoldGradOpKernel : public framework::OpKernel { math::Col2ImFunctor col2im; auto& dev_ctx = ctx.template device_context(); - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, input_grad, static_cast(0)); for (int i = 0; i < batch_size; i++) { Tensor out_grad_batch = diff --git a/paddle/fluid/operators/unique_consecutive_op.h b/paddle/fluid/operators/unique_consecutive_op.h index e6cb5dafe3..9b933dfd92 100644 --- a/paddle/fluid/operators/unique_consecutive_op.h +++ b/paddle/fluid/operators/unique_consecutive_op.h @@ -22,9 +22,9 @@ limitations under the License. */ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/transpose_op.h" #include "paddle/fluid/operators/unique_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/unique_op.h b/paddle/fluid/operators/unique_op.h index 66b0543771..c3d291d120 100644 --- a/paddle/fluid/operators/unique_op.h +++ b/paddle/fluid/operators/unique_op.h @@ -22,8 +22,8 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/concat_and_split.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/transpose_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/unique_with_counts_op.h b/paddle/fluid/operators/unique_with_counts_op.h index f61bac7cda..fc3568ff18 100644 --- a/paddle/fluid/operators/unique_with_counts_op.h +++ b/paddle/fluid/operators/unique_with_counts_op.h @@ -18,8 +18,8 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/unique_op.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/unpool_op.h b/paddle/fluid/operators/unpool_op.h index 52849cb3e0..95aa1a4688 100644 --- a/paddle/fluid/operators/unpool_op.h +++ b/paddle/fluid/operators/unpool_op.h @@ -17,8 +17,8 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/unpooling.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -36,7 +36,7 @@ class UnpoolKernel : public framework::OpKernel { T* output_data = out->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); if (output_data) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, out, static_cast(0)); } math::Unpool2dMaxFunctor unpool2d_max_forward; @@ -60,7 +60,7 @@ class UnpoolGradKernel : public framework::OpKernel { std::vector paddings = context.Attr>("paddings"); auto& device_ctx = context.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); @@ -84,7 +84,7 @@ class Unpool3dKernel : public framework::OpKernel { T* output_data = out->mutable_data(context.GetPlace()); auto& dev_ctx = context.template device_context(); if (output_data) { - math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, out, static_cast(0)); } math::Unpool3dMaxFunctor unpool3d_max_forward; @@ -109,7 +109,7 @@ class Unpool3dGradKernel : public framework::OpKernel { std::vector paddings = context.Attr>("paddings"); auto& device_ctx = context.template device_context(); - math::SetConstant zero; + pten::funcs::SetConstant zero; in_x_grad->mutable_data(context.GetPlace()); zero(device_ctx, in_x_grad, static_cast(0)); diff --git a/paddle/fluid/operators/unsqueeze_op.h b/paddle/fluid/operators/unsqueeze_op.h index d7a1e0ed3b..649cc9de50 100644 --- a/paddle/fluid/operators/unsqueeze_op.h +++ b/paddle/fluid/operators/unsqueeze_op.h @@ -17,10 +17,10 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/pooling.h" #include "paddle/fluid/operators/utils.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/unsqueeze_op_npu_test.cc b/paddle/fluid/operators/unsqueeze_op_npu_test.cc index cf96ef57a4..c34cdbc2e7 100644 --- a/paddle/fluid/operators/unsqueeze_op_npu_test.cc +++ b/paddle/fluid/operators/unsqueeze_op_npu_test.cc @@ -25,12 +25,11 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/program_desc.h" #include "paddle/fluid/operators/dropout_op.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/string/printf.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace f = paddle::framework; namespace p = paddle::platform; -namespace m = paddle::operators::math; USE_OP(unsqueeze); USE_OP_DEVICE_KERNEL(unsqueeze, NPU); diff --git a/paddle/fluid/operators/var_conv_2d_op.cc b/paddle/fluid/operators/var_conv_2d_op.cc index db8b2c3050..f67b969d45 100644 --- a/paddle/fluid/operators/var_conv_2d_op.cc +++ b/paddle/fluid/operators/var_conv_2d_op.cc @@ -16,8 +16,8 @@ limitations under the License. */ #include #include #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/dynload/mklml.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/viterbi_decode_op.h b/paddle/fluid/operators/viterbi_decode_op.h index ab95dbc763..77e38f4fa8 100644 --- a/paddle/fluid/operators/viterbi_decode_op.h +++ b/paddle/fluid/operators/viterbi_decode_op.h @@ -250,8 +250,8 @@ class ViterbiDecodeKernel : public framework::OpKernel { auto batch_size = static_cast(input->dims()[0]); auto seq_len = static_cast(input->dims()[1]); auto n_labels = static_cast(input->dims()[2]); - math::SetConstant float_functor; - math::SetConstant int_functor; + pten::funcs::SetConstant float_functor; + pten::funcs::SetConstant int_functor; std::vector historys; // We create tensor buffer in order to avoid allocating memory frequently // 10 means allocate 10*batch_size bytes memory, such as int_mask, zero... diff --git a/paddle/fluid/operators/warpctc_op.h b/paddle/fluid/operators/warpctc_op.h index 56f1d8d97b..3f8c38aa60 100644 --- a/paddle/fluid/operators/warpctc_op.h +++ b/paddle/fluid/operators/warpctc_op.h @@ -17,10 +17,10 @@ limitations under the License. 
*/ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/operators/math/sequence_padding.h" #include "paddle/fluid/operators/math/sequence_scale.h" #include "paddle/fluid/platform/dynload/warpctc.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { @@ -138,7 +138,7 @@ class WarpCTCFunctor { framework::make_ddim({static_cast(workspace_elements)}), dev_ctx); T* workspace_data = workspace.data(); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), &workspace, static_cast(0)); @@ -334,7 +334,7 @@ class WarpCTCKernel : public framework::OpKernel { T* warpctc_grad_data = warpctc_grad->mutable_data(warpctc_logits.dims(), ctx.GetPlace()); - math::SetConstant()( + pten::funcs::SetConstant()( ctx.template device_context(), warpctc_grad, static_cast(0)); diff --git a/paddle/fluid/operators/where_index_op.h b/paddle/fluid/operators/where_index_op.h index 97a7bb939b..c6828a7876 100644 --- a/paddle/fluid/operators/where_index_op.h +++ b/paddle/fluid/operators/where_index_op.h @@ -17,8 +17,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/fluid/operators/where_op.h b/paddle/fluid/operators/where_op.h index fdb65858ef..415632f3d7 100644 --- a/paddle/fluid/operators/where_op.h +++ b/paddle/fluid/operators/where_op.h @@ -14,7 +14,7 @@ #pragma once #include "paddle/fluid/framework/op_registry.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace paddle { namespace operators { diff --git a/paddle/pten/kernels/cpu/norm_grad_kernel.cc b/paddle/pten/kernels/cpu/norm_grad_kernel.cc index 3357e6f76f..7b2a07c37b 100644 --- a/paddle/pten/kernels/cpu/norm_grad_kernel.cc +++ b/paddle/pten/kernels/cpu/norm_grad_kernel.cc @@ -13,8 +13,8 @@ // limitations under the License. #include "paddle/pten/kernels/norm_grad_kernel.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/pten/kernels/funcs/eigen/eigen_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/pten/kernels/funcs/eigen/common.h" diff --git a/paddle/pten/kernels/cpu/norm_kernel.cc b/paddle/pten/kernels/cpu/norm_kernel.cc index ef2cf405c1..f2996faccb 100644 --- a/paddle/pten/kernels/cpu/norm_kernel.cc +++ b/paddle/pten/kernels/cpu/norm_kernel.cc @@ -13,11 +13,11 @@ // limitations under the License. 
#include "paddle/pten/kernels/norm_kernel.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/pten/backends/cpu/cpu_context.h" #include "paddle/pten/core/kernel_registry.h" #include "paddle/pten/kernels/funcs/common_shape.h" #include "paddle/pten/kernels/funcs/eigen/eigen_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace pten { diff --git a/paddle/pten/kernels/funcs/CMakeLists.txt b/paddle/pten/kernels/funcs/CMakeLists.txt index 32bdc94b95..e4dd437629 100644 --- a/paddle/pten/kernels/funcs/CMakeLists.txt +++ b/paddle/pten/kernels/funcs/CMakeLists.txt @@ -6,3 +6,51 @@ if(WITH_GPU) elseif(WITH_ROCM) hip_library(pten_transpose_gpu SRCS transpose.cu DEPS dense_tensor malloc pten_context) endif() + +function(math_library TARGET) + # math_library is a function to create math library. + # The interface is the same as cc_library. + # But it handle split GPU/CPU code and link some common library. + set(cc_srcs) + set(cu_srcs) + set(hip_srcs) + set(math_common_deps device_context framework_proto enforce) + if (WITH_GPU) + if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0) + list(APPEND math_common_deps cub) + else() + list(APPEND math_common_deps) + endif() + endif() + set(multiValueArgs DEPS) + cmake_parse_arguments(math_library "${options}" "${oneValueArgs}" + "${multiValueArgs}" ${ARGN}) + + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc) + list(APPEND cc_srcs ${TARGET}.cc) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu) + list(APPEND cu_srcs ${TARGET}.cu) + endif() + if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc) + list(APPEND cu_srcs ${TARGET}.cu.cc) + endif() + + list(LENGTH cc_srcs cc_srcs_len) + if (WITH_GPU) + nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) + elseif (WITH_ROCM) + hip_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) + elseif(${cc_srcs_len} GREATER 0) + cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps}) + endif() +endfunction() + +math_library(math_function DEPS blas dense_tensor tensor) +cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) +if(WITH_GPU) + nv_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function) +endif() +if(WITH_ROCM) + hip_test(math_function_gpu_test SRCS math_function_test.cu DEPS math_function tensor) +endif() diff --git a/paddle/pten/kernels/funcs/elementwise_base.h b/paddle/pten/kernels/funcs/elementwise_base.h index 110b405bbc..8e977f3e73 100644 --- a/paddle/pten/kernels/funcs/elementwise_base.h +++ b/paddle/pten/kernels/funcs/elementwise_base.h @@ -14,12 +14,12 @@ limitations under the License. 
*/ #pragma once -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/transform.h" #include "paddle/pten/backends/all_context.h" #include "paddle/pten/core/dense_tensor.h" #include "paddle/pten/kernels/empty_kernel.h" +#include "paddle/pten/kernels/funcs/math_function.h" #if defined(__NVCC__) || defined(__HIPCC__) #include "paddle/fluid/platform/aligned_vector.h" @@ -394,7 +394,7 @@ static inline void GetDoubleGradSafeTensor(const DeviceContext &dev_ctx, auto meta = pten::DenseTensorMeta(x.dtype(), x.dims(), x.layout()); *ddx_safe = pten::Empty(dev_ctx, std::move(meta)); ddx_safe->mutable_data(dev_ctx.GetPlace()); - paddle::operators::math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(dev_ctx, ddx_safe, static_cast(0)); } } diff --git a/paddle/pten/kernels/funcs/math_function.cc b/paddle/pten/kernels/funcs/math_function.cc new file mode 100644 index 0000000000..550ec23c18 --- /dev/null +++ b/paddle/pten/kernels/funcs/math_function.cc @@ -0,0 +1,342 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/pten/kernels/funcs/math_function.h" + +#ifdef PADDLE_WITH_MKLML +#include "paddle/fluid/platform/dynload/mklml.h" +#endif + +#ifdef PADDLE_USE_OPENBLAS +#include +#endif + +#include +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" +#include "paddle/pten/kernels/funcs/math_function_impl.h" +#include "unsupported/Eigen/CXX11/Tensor" + +namespace pten { +namespace funcs { + +using float16 = paddle::platform::float16; + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + +#ifdef PADDLE_WITH_XPU +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; +#endif + +#define DEFINE_CPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + 
template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose, \ + RANK>; \ + template struct Transpose, \ + RANK>; + +DEFINE_CPU_TRANS(1); +DEFINE_CPU_TRANS(2); +DEFINE_CPU_TRANS(3); +DEFINE_CPU_TRANS(4); +DEFINE_CPU_TRANS(5); +DEFINE_CPU_TRANS(6); + +template +struct TransposeNormal { + void operator()(const paddle::platform::CPUDeviceContext& context, + const paddle::framework::Tensor& in, + paddle::framework::Tensor* out, + const std::vector& axis) { + const int rank = axis.size(); + auto in_stride = paddle::framework::stride(in.dims()); + auto out_stride = paddle::framework::stride(out->dims()); + const T* in_ptr = in.data(); + T* out_ptr = out->data(); + + auto transpose_helper = [&](int64_t beg, int64_t end) { + for (int64_t out_idx = beg; out_idx < end; ++out_idx) { + int64_t in_idx = 0; + int64_t tmp_idx = out_idx; + // calculate the input index + for (int i = 0; i < rank; ++i) { + const int64_t coordinate = tmp_idx / out_stride[i]; + tmp_idx -= coordinate * out_stride[i]; + in_idx += coordinate * in_stride[axis[i]]; + } + out_ptr[out_idx] = in_ptr[in_idx]; + } + }; + transpose_helper(0, out->numel()); + } +}; + +// define transpose normal +#define DEFINE_CPU_TRANS_NORMAL(TYPE) \ + template struct TransposeNormal + +DEFINE_CPU_TRANS_NORMAL(paddle::platform::float16); +DEFINE_CPU_TRANS_NORMAL(paddle::platform::bfloat16); +DEFINE_CPU_TRANS_NORMAL(float); +DEFINE_CPU_TRANS_NORMAL(double); +DEFINE_CPU_TRANS_NORMAL(int); +DEFINE_CPU_TRANS_NORMAL(int64_t); +DEFINE_CPU_TRANS_NORMAL(bool); +DEFINE_CPU_TRANS_NORMAL(int16_t); +DEFINE_CPU_TRANS_NORMAL(uint8_t); +DEFINE_CPU_TRANS_NORMAL(int8_t); +DEFINE_CPU_TRANS_NORMAL(paddle::platform::complex); +DEFINE_CPU_TRANS_NORMAL(paddle::platform::complex); + +struct TensorSetConstantCPU { + TensorSetConstantCPU(paddle::framework::Tensor* tensor, float value) + : tensor_(tensor), value_(value) {} + template + void apply() const { + auto cpu = paddle::platform::CPUPlace(); + auto* begin = tensor_->mutable_data(cpu); + std::fill(begin, begin + tensor_->numel(), static_cast(value_)); + } + paddle::framework::Tensor* tensor_; + float value_; +}; + +template <> +void set_constant_with_place( + const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) { + PADDLE_THROW( + paddle::platform::errors::Unimplemented("XPUPlace is not supported")); +} + +template <> +void set_constant_with_place( + const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) { + PADDLE_THROW( + paddle::platform::errors::Unimplemented("NPUPlace is not supported")); +} + +template <> +void set_constant_with_place( + const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) { + PADDLE_THROW(paddle::platform::errors::Unimplemented( + "NPUPinnedPlace is not supported")); +} + +template <> +void set_constant_with_place( + const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) { + PADDLE_THROW( + paddle::platform::errors::Unimplemented("IPUPlace is not supported")); +} + +template <> +void set_constant_with_place( + const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) { + paddle::framework::VisitDataType(tensor->type(), + TensorSetConstantCPU(tensor, value)); +} + +template <> +void set_constant_with_place( + const paddle::platform::DeviceContext& context, + 
paddle::framework::Tensor* tensor, + float value) { + PADDLE_THROW( + paddle::platform::errors::Unimplemented("MLUPlace is not supported")); +} + +template <> +void set_constant_with_place( + const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) { + paddle::framework::VisitDataType(tensor->type(), + TensorSetConstantCPU(tensor, value)); +} + +struct TensorSetConstantWithPlace : public boost::static_visitor { + TensorSetConstantWithPlace(const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) + : context_(context), tensor_(tensor), value_(value) {} + + template + void operator()(Place place) const { + set_constant_with_place(context_, tensor_, value_); + } + + const paddle::platform::DeviceContext& context_; + paddle::framework::Tensor* tensor_; + float value_; +}; + +void set_constant(const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) { + TensorSetConstantWithPlace func(context, tensor, value); +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + // tensor->place().apply_visitor(func); + paddle::platform::VisitPlace(tensor->place(), func); +#else + func(paddle::platform::CPUPlace()); +#endif +} + +template +struct RowwiseAdd { + void operator()(const paddle::platform::CPUDeviceContext& context, + const paddle::framework::Tensor& input, + const paddle::framework::Tensor& vector, + paddle::framework::Tensor* output) { + auto in_dims = input.dims(); + auto out_dims = output->dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ( + vector.numel(), + size, + paddle::platform::errors::InvalidArgument( + "The input vector size" + " should be equal to the size of each row of input tensor." + " Expected vector size=%d, but received %d", + size, + vector.numel())); + const char* in_dims_cstr = in_dims.to_str().c_str(); + const char* out_dims_cstr = out_dims.to_str().c_str(); + PADDLE_ENFORCE_EQ(out_dims, + in_dims, + paddle::platform::errors::InvalidArgument( + "The output tensor shape should be same as the input" + " tensor shape. Expected output tensor shape: %s," + " but received %s", + in_dims_cstr, + out_dims_cstr)); + + auto in = paddle::framework::EigenMatrix::From(input); + auto vec = paddle::framework::EigenVector::Flatten(vector); + auto out = paddle::framework::EigenMatrix::From(*output); + + for (int64_t i = 0; i < in_dims[0]; ++i) { + out.chip(i, 0) = in.chip(i, 0) + vec; + } + } +}; + +template struct RowwiseAdd; +template struct RowwiseAdd; + +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; + +template struct RowwiseSum; +template struct RowwiseSum; + +template struct RowwiseMean; +template struct RowwiseMean; + +template +struct ElementwiseAddTo { + void operator()(paddle::platform::CPUDeviceContext* ctx, + const paddle::framework::Tensor& src, + paddle::framework::Tensor* dst) { + auto in = paddle::framework::EigenVector::Flatten(src); + auto out = paddle::framework::EigenVector::Flatten(*dst); + auto& place = *(ctx->eigen_device()); + out.device(place) = out + in; + } +}; + +template struct ElementwiseAddTo; + +} // namespace funcs +} // namespace pten diff --git a/paddle/pten/kernels/funcs/math_function.cu b/paddle/pten/kernels/funcs/math_function.cu new file mode 100644 index 0000000000..76bc5f806d --- /dev/null +++ b/paddle/pten/kernels/funcs/math_function.cu @@ -0,0 +1,380 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. 
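// A minimal usage sketch of the place-dispatching set_constant helper defined
// in the new math_function.cc above. `FillWithTen` is a hypothetical wrapper
// mirroring the moved unit test, which fills a CPU tensor with 10; the float
// value is cast to the tensor's real dtype by TensorSetConstantCPU.
#include "paddle/pten/kernels/funcs/math_function.h"

void FillWithTen(const paddle::platform::CPUDeviceContext& ctx,
                 paddle::framework::Tensor* t) {
  t->Resize(paddle::framework::make_ddim({2, 2}));
  t->mutable_data<float>(paddle::platform::CPUPlace());
  // set_constant visits the tensor's place; on CPUPlace it dispatches to
  // TensorSetConstantCPU over the tensor's dtype.
  pten::funcs::set_constant(ctx, t, 10.0f);
}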
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/memory/malloc.h" +#include "paddle/fluid/memory/memcpy.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/platform/bfloat16.h" +#include "paddle/fluid/platform/float16.h" +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/kernels/funcs/eigen/common.h" +#include "paddle/pten/kernels/funcs/math_function.h" +#include "paddle/pten/kernels/funcs/math_function_impl.h" + +namespace pten { +namespace funcs { + +using float16 = paddle::platform::float16; +using bfloat16 = paddle::platform::bfloat16; + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant; +template struct SetConstant>; +template struct SetConstant>; + +#define DEFINE_GPU_TRANS(RANK) \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose; \ + template struct Transpose, \ + RANK>; \ + template struct Transpose, \ + RANK>; + +DEFINE_GPU_TRANS(1); +DEFINE_GPU_TRANS(2); +DEFINE_GPU_TRANS(3); +DEFINE_GPU_TRANS(4); +DEFINE_GPU_TRANS(5); +DEFINE_GPU_TRANS(6); + +#define REINTERPRET(T, DST_PTR, SRC_PTR) \ + T* DST_PTR = reinterpret_cast(SRC_PTR) + +template +__global__ void TransposeNormalKernel(const T* in_ptr, + T* out_ptr, + int64_t element, + const int64_t* in_stride_ptr, + const int64_t* out_stride_ptr, + const int64_t* axis_ptr, + int rank) { + CUDA_KERNEL_LOOP(out_idx, element) { + int64_t in_idx = 0; + int64_t tmp_idx = out_idx; + for (int i = 0; i < rank; ++i) { + const int64_t coordinate = tmp_idx / out_stride_ptr[i]; + tmp_idx -= coordinate * out_stride_ptr[i]; + in_idx += coordinate * in_stride_ptr[axis_ptr[i]]; + } + out_ptr[out_idx] = in_ptr[in_idx]; + } +} + +template +struct TransposeNormal { + void operator()(const paddle::platform::CUDADeviceContext& context, + const paddle::framework::Tensor& in, + paddle::framework::Tensor* out, + const std::vector& axis) { + const int rank = axis.size(); + auto in_stride 
= paddle::framework::stride(in.dims()); + auto out_stride = paddle::framework::stride(out->dims()); + auto* in_ptr = in.data(); + auto* out_ptr = out->data(); + + // copy in_stride, out_stride, axis to gpu device + const paddle::platform::CUDAPlace& cuda_place = context.GetPlace(); + paddle::platform::CPUPlace cpu_place = paddle::platform::CPUPlace(); + size_t size = 3 * rank * sizeof(int64_t); + auto cpu_buf_holder = paddle::memory::Alloc(cpu_place, size); + auto cuda_buf_holder = paddle::memory::Alloc(cuda_place, size); + REINTERPRET(int64_t, cpu_buf, cpu_buf_holder->ptr()); + REINTERPRET(int64_t, cuda_buf, cuda_buf_holder->ptr()); + for (int i = 0; i < rank; ++i) { + cpu_buf[i] = in_stride[i]; + cpu_buf[rank + i] = out_stride[i]; + cpu_buf[2 * rank + i] = axis[i]; + } + paddle::memory::Copy( + cuda_place, cuda_buf, cpu_place, cpu_buf, size, context.stream()); + REINTERPRET(const int64_t, in_stride_ptr, cuda_buf); + REINTERPRET(const int64_t, out_stride_ptr, cuda_buf + rank); + REINTERPRET(const int64_t, axis_ptr, cuda_buf + 2 * rank); + + const int MAX_BLOCK_DIM = context.GetMaxThreadsPerBlock(); + const int MAX_GRID_DIM = + context.GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM; + int64_t elements = in.numel(); + int block_size = (elements >= MAX_BLOCK_DIM) + ? MAX_BLOCK_DIM + : (1 << static_cast(std::log2(elements))); + int grid_size = elements / block_size; + grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size; + TransposeNormalKernel<<>>( + in_ptr, + out_ptr, + elements, + in_stride_ptr, + out_stride_ptr, + axis_ptr, + rank); + } +}; + +// define transpose normal +#define DEFINE_GPU_TRANS_NORMAL(TYPE) \ + template struct TransposeNormal + +DEFINE_GPU_TRANS_NORMAL(float16); +DEFINE_GPU_TRANS_NORMAL(bfloat16); +DEFINE_GPU_TRANS_NORMAL(float); +DEFINE_GPU_TRANS_NORMAL(double); +DEFINE_GPU_TRANS_NORMAL(int); +DEFINE_GPU_TRANS_NORMAL(int64_t); +DEFINE_GPU_TRANS_NORMAL(bool); +DEFINE_GPU_TRANS_NORMAL(int16_t); +DEFINE_GPU_TRANS_NORMAL(uint8_t); +DEFINE_GPU_TRANS_NORMAL(int8_t); +DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex); +DEFINE_GPU_TRANS_NORMAL(paddle::platform::complex); + +struct TensorSetConstantGPU { + TensorSetConstantGPU(const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) + : context_(context), tensor_(tensor), value_(value) {} + + template + void apply() const { + SetConstant functor; + functor( + reinterpret_cast(context_), + tensor_, + static_cast(value_)); + } + + const paddle::platform::DeviceContext& context_; + paddle::framework::Tensor* tensor_; + float value_; +}; + +template <> +void set_constant_with_place( + const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value) { + paddle::framework::VisitDataType( + tensor->type(), TensorSetConstantGPU(context, tensor, value)); +} + +template +__global__ void RowwiseAddKernel( + const T* a, const T* b, T* c, int width, int num) { + T tmp = 1.0 / width; + CUDA_KERNEL_LOOP(i, num) { + int h = i * tmp; + int w = i - h * width; + c[i] = a[i] + b[w]; + } +} + +template +struct RowwiseAdd { + void operator()(const paddle::platform::CUDADeviceContext& context, + const paddle::framework::Tensor& input, + const paddle::framework::Tensor& vector, + paddle::framework::Tensor* output) { + auto in_dims = input.dims(); + auto out_dims = output->dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ( + vector.numel(), + size, + paddle::platform::errors::InvalidArgument( + "The input vector size" + " should be equal to the 
size of each row of input tensor." + " Expected vector size=%d, but received %d", + size, + vector.numel())); + const char* in_dims_cstr = in_dims.to_str().c_str(); + const char* out_dims_cstr = out_dims.to_str().c_str(); + PADDLE_ENFORCE_EQ( + out_dims, + in_dims, + paddle::platform::errors::InvalidArgument( + "The output tensor shape should be same as the input tensor" + " shape. Expected output tensor shape: %s," + " but received %s", + in_dims_cstr, + out_dims_cstr)); + int blocks = 512; + int grids = (input.numel() + blocks - 1) / blocks; + RowwiseAddKernel<<>>( + input.data(), + vector.data(), + output->data(), + static_cast(in_dims[1]), + static_cast(input.numel())); + } +}; + +template struct RowwiseAdd; +template struct RowwiseAdd; +template struct ColwiseSum; +template struct ColwiseSum; +template struct ColwiseSum; +// template struct ColwiseSum; +// The ColwiseSum failed in debug +// mode, +// and only failed for this case. So reimplemented it. +template <> +void ColwiseSum::operator()( + const paddle::platform::CUDADeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* vector) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector->numel(), + size, + paddle::platform::errors::InvalidArgument( + "The size of input vector" + " should be equal to the size of input tensor column" + " dimension. Expected vector size=%d, but received %d", + size, + vector->numel())); + paddle::framework::Tensor one; + one.mutable_data({in_dims[0]}, context.GetPlace()); + SetConstant set; + set(context, &one, static_cast(1.0)); + paddle::operators::math::GetBlas( + context) + .GEMV(true, + static_cast(in_dims[0]), + static_cast(in_dims[1]), + 1.0, + input.data(), + one.data(), + 0.0, + vector->data()); +} + +template struct RowwiseSum; +// template struct RowwiseSum; +// TODO(zcd): Following ColwiseSum format, need to confirm. +// The RowwiseSum failed in debug +// mode, +// and only failed for this case. So reimplemented it. +template <> +void RowwiseSum::operator()( + const paddle::platform::CUDADeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* vector) { + auto in_dims = input.dims(); + auto size = input.numel() / in_dims[0]; + PADDLE_ENFORCE_EQ(vector->numel(), + in_dims[0], + paddle::platform::errors::InvalidArgument( + "The size of input vector" + " should be equal to the size of input tensor row" + " dimension. 
Expected vector size=%d, but received %d", + in_dims[0], + vector->numel())); + paddle::framework::Tensor one; + one.mutable_data({size}, context.GetPlace()); + SetConstant set; + set(context, &one, static_cast(1.0)); + paddle::operators::math::GetBlas( + context) + .GEMV(true, + static_cast(in_dims[1]), + static_cast(in_dims[0]), + 1.0, + one.data(), + input.data(), + 0.0, + vector->data()); +} + +template struct RowwiseMean; +template struct RowwiseMean; + +template +struct ElementwiseAddTo { + void operator()(paddle::platform::CUDADeviceContext* ctx, + const paddle::framework::Tensor& src, + paddle::framework::Tensor* dst) { + auto in = paddle::framework::EigenVector::Flatten(src); + auto out = paddle::framework::EigenVector::Flatten(*dst); + auto& place = *(ctx->eigen_device()); + out.device(place) = out + in; + } +}; + +template struct ElementwiseAddTo; + +} // namespace funcs +} // namespace pten diff --git a/paddle/pten/kernels/funcs/math_function.h b/paddle/pten/kernels/funcs/math_function.h new file mode 100644 index 0000000000..8208c0afb0 --- /dev/null +++ b/paddle/pten/kernels/funcs/math_function.h @@ -0,0 +1,127 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include +#include + +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/framework/tensor_util.h" +#include "paddle/fluid/platform/device_context.h" +#include "paddle/fluid/platform/enforce.h" +#include "paddle/pten/core/dense_tensor.h" + +namespace pten { +namespace funcs { + +template +struct TransposeNormal { + // for dims >= 7 situation + void operator()(const DeviceContext& context, + const paddle::framework::Tensor& in, + paddle::framework::Tensor* out, + const std::vector& axis); +}; + +template +struct Transpose { + void operator()(const DeviceContext& context, + const paddle::framework::Tensor& in, + paddle::framework::Tensor* out, + const std::vector& axis); +}; + +template +struct SetConstant { + void operator()(const DeviceContext& context, + paddle::framework::Tensor* tensor, + T num); +}; + +template +void set_constant_with_place(const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value); + +void set_constant(const paddle::platform::DeviceContext& context, + paddle::framework::Tensor* tensor, + float value); + +template +struct RowwiseAdd { + void operator()(const DeviceContext& context, + const paddle::framework::Tensor& input, + const paddle::framework::Tensor& vec, + paddle::framework::Tensor* output); +}; + +template +struct ElementwiseAddTo { + // dst = dst + src + void operator()(DeviceContext* ctx, + const paddle::framework::Tensor& src, + paddle::framework::Tensor* dst); +}; + +template +struct ColwiseSum { + void operator()(const DeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* vec); +}; + +template +struct RowwiseSum { + void 
operator()(const DeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* vec); +}; + +template +struct RowwiseMean { + void operator()(const DeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* vec); +}; + +#ifdef PADDLE_WITH_XPU +template +struct TensorSetConstantXPU { + TensorSetConstantXPU(paddle::framework::Tensor* tensor, + U value, + paddle::platform::Place place) + : tensor_(tensor), value_(value), place_(place) {} + template + void apply() const { + auto* begin = tensor_->mutable_data(place_); + int numel = tensor_->numel(); + std::unique_ptr data_cpu(new T[numel]); + std::fill(data_cpu.get(), data_cpu.get() + numel, static_cast(value_)); + paddle::memory::Copy(place_, + begin, + paddle::platform::CPUPlace(), + static_cast(data_cpu.get()), + numel * sizeof(T)); + } + paddle::framework::Tensor* tensor_; + U value_; + paddle::platform::Place place_; +}; +#endif + +} // namespace funcs +} // namespace pten diff --git a/paddle/fluid/operators/math/math_function_impl.h b/paddle/pten/kernels/funcs/math_function_impl.h similarity index 54% rename from paddle/fluid/operators/math/math_function_impl.h rename to paddle/pten/kernels/funcs/math_function_impl.h index 0e44f90304..286f694ce5 100644 --- a/paddle/fluid/operators/math/math_function_impl.h +++ b/paddle/pten/kernels/funcs/math_function_impl.h @@ -16,47 +16,47 @@ limitations under the License. */ #include #include #include "paddle/fluid/framework/data_type.h" -#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" -namespace paddle { -namespace operators { -namespace math { +namespace pten { +namespace funcs { -using framework::To32BitIndex; +using paddle::framework::To32BitIndex; template -void SetConstant::operator()(const DeviceContext& context, - framework::Tensor* tensor, - T num) { +void SetConstant::operator()( + const DeviceContext& context, paddle::framework::Tensor* tensor, T num) { bool xpu_place = false; #ifdef PADDLE_WITH_XPU - if (platform::is_xpu_place(context.GetPlace())) { + if (paddle::platform::is_xpu_place(context.GetPlace())) { xpu_place = true; - framework::VisitDataType( + paddle::framework::VisitDataType( tensor->type(), TensorSetConstantXPU(tensor, num, context.GetPlace())); } #endif if (!xpu_place) { - auto t = framework::EigenVector::Flatten(*tensor); + auto t = paddle::framework::EigenVector::Flatten(*tensor); t.device(*context.eigen_device()) = t.constant(static_cast(num)); } } template void Transpose::operator()( - const DeviceContext& context, const framework::Tensor& in, - framework::Tensor* out, const std::vector& axis) { + const DeviceContext& context, + const paddle::framework::Tensor& in, + paddle::framework::Tensor* out, + const std::vector& axis) { Eigen::array permute; for (int i = 0; i < Rank; i++) { permute[i] = axis[i]; } - auto eigen_in = framework::EigenTensor::From(in); - auto eigen_out = framework::EigenTensor::From(*out); + auto eigen_in = paddle::framework::EigenTensor::From(in); + auto eigen_out = paddle::framework::EigenTensor::From(*out); auto* dev = context.eigen_device(); // use 32bit index to speed up computation bool use_32bit_index = eigen_out.size() < Eigen::NumTraits::highest(); - bool is_gpu_place = platform::is_gpu_place(context.GetPlace()); + bool is_gpu_place = paddle::platform::is_gpu_place(context.GetPlace()); if (use_32bit_index && is_gpu_place) { To32BitIndex(eigen_out).device(*dev) = To32BitIndex(eigen_in).shuffle(permute); @@ -66,20 
+66,23 @@ void Transpose::operator()( } template -void ColwiseSum::operator()(const DeviceContext& context, - const framework::Tensor& input, - framework::Tensor* out) { +void ColwiseSum::operator()( + const DeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* out) { auto in_dims = input.dims(); auto size = input.numel() / in_dims[0]; - PADDLE_ENFORCE_EQ(out->numel(), size, - platform::errors::InvalidArgument( + PADDLE_ENFORCE_EQ(out->numel(), + size, + paddle::platform::errors::InvalidArgument( "The size of output tensor " "should be equal to the size of input tensor column" " dimension. Expected output size=%d, but received %d", - size, out->numel())); + size, + out->numel())); - auto in = framework::EigenMatrix::From(input); - auto vec = framework::EigenVector::Flatten(*out); + auto in = paddle::framework::EigenMatrix::From(input); + auto vec = paddle::framework::EigenVector::Flatten(*out); vec.device(*context.eigen_device()) = in.sum(Eigen::array({{0}})); } @@ -88,20 +91,23 @@ void ColwiseSum::operator()(const DeviceContext& context, // colwise-sum can be easily implemented. General reduce has a huge overhead in // CPU template -class ColwiseSum { +class ColwiseSum { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { + void operator()(const paddle::platform::CPUDeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* out) { auto& in_dims = input.dims(); auto height = in_dims[0]; auto size = in_dims[1]; PADDLE_ENFORCE_EQ( - out->numel(), size, - platform::errors::InvalidArgument( + out->numel(), + size, + paddle::platform::errors::InvalidArgument( "The size of output tensor " "should be equal to the size of input tensor column" " dimension. Expected output size=%d, but received %d", - size, out->numel())); + size, + out->numel())); T* out_buf = out->mutable_data(out->place()); const T* in_buf = input.data(); @@ -119,23 +125,28 @@ class ColwiseSum { }; template -void RowwiseMean::operator()(const DeviceContext& context, - const framework::Tensor& input, - framework::Tensor* out) { +void RowwiseMean::operator()( + const DeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* out) { auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument( - "The rank of input tensor " - "should be 2, but received %d", - in_dims.size())); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0], - platform::errors::InvalidArgument( + PADDLE_ENFORCE_EQ( + in_dims.size(), + 2U, + paddle::platform::errors::InvalidArgument("The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); + PADDLE_ENFORCE_EQ(out->numel(), + in_dims[0], + paddle::platform::errors::InvalidArgument( "The size of output tensor " "should be equal to the size of input tensor row" " dimension. Expected output size=%d, but received %d", - in_dims[0], out->numel())); + in_dims[0], + out->numel())); - auto in = framework::EigenMatrix::From(input); - auto vec = framework::EigenVector::Flatten(*out); + auto in = paddle::framework::EigenMatrix::From(input); + auto vec = paddle::framework::EigenVector::Flatten(*out); vec.device(*context.eigen_device()) = in.mean(Eigen::array({{1}})); } @@ -144,24 +155,29 @@ void RowwiseMean::operator()(const DeviceContext& context, // rowwise-sum can be easily implemented. 
General reduce has a huge overhead in // CPU template -class RowwiseMean { +class RowwiseMean { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { + void operator()(const paddle::platform::CPUDeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* out) { auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument( - "The rank of input tensor " - "should be 2, but received %d", - in_dims.size())); + PADDLE_ENFORCE_EQ(in_dims.size(), + 2U, + paddle::platform::errors::InvalidArgument( + "The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); auto height = in_dims[0]; auto size = in_dims[1]; PADDLE_ENFORCE_EQ( - out->numel(), height, - platform::errors::InvalidArgument( + out->numel(), + height, + paddle::platform::errors::InvalidArgument( "The size of output tensor " "should be equal to the size of input tensor row" " dimension. Expected output size=%d, but received %d", - height, out->numel())); + height, + out->numel())); auto inv_size = 1.0 / size; T* out_buf = out->mutable_data(out->place()); const T* in_buf = input.data(); @@ -177,23 +193,28 @@ class RowwiseMean { }; template -void RowwiseSum::operator()(const DeviceContext& context, - const framework::Tensor& input, - framework::Tensor* out) { +void RowwiseSum::operator()( + const DeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* out) { auto in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument( - "The rank of input tensor " - "should be 2, but received %d", - in_dims.size())); - PADDLE_ENFORCE_EQ(out->numel(), in_dims[0], - platform::errors::InvalidArgument( + PADDLE_ENFORCE_EQ( + in_dims.size(), + 2U, + paddle::platform::errors::InvalidArgument("The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); + PADDLE_ENFORCE_EQ(out->numel(), + in_dims[0], + paddle::platform::errors::InvalidArgument( "The size of output tensor " "should be equal to the size of input tensor row" " dimension. Expected output size=%d, but received %d", - in_dims[0], out->numel())); + in_dims[0], + out->numel())); - auto in = framework::EigenMatrix::From(input); - auto vec = framework::EigenVector::Flatten(*out); + auto in = paddle::framework::EigenMatrix::From(input); + auto vec = paddle::framework::EigenVector::Flatten(*out); vec.device(*context.eigen_device()) = in.sum(Eigen::array({{1}})); } @@ -202,24 +223,29 @@ void RowwiseSum::operator()(const DeviceContext& context, // rowwise-sum can be easily implemented. 
General reduce has a huge overhead in // CPU template -class RowwiseSum { +class RowwiseSum { public: - void operator()(const platform::CPUDeviceContext& context, - const framework::Tensor& input, framework::Tensor* out) { + void operator()(const paddle::platform::CPUDeviceContext& context, + const paddle::framework::Tensor& input, + paddle::framework::Tensor* out) { auto& in_dims = input.dims(); - PADDLE_ENFORCE_EQ(in_dims.size(), 2U, platform::errors::InvalidArgument( - "The rank of input tensor " - "should be 2, but received %d", - in_dims.size())); + PADDLE_ENFORCE_EQ(in_dims.size(), + 2U, + paddle::platform::errors::InvalidArgument( + "The rank of input tensor " + "should be 2, but received %d", + in_dims.size())); auto height = in_dims[0]; auto size = in_dims[1]; PADDLE_ENFORCE_EQ( - out->numel(), height, - platform::errors::InvalidArgument( + out->numel(), + height, + paddle::platform::errors::InvalidArgument( "The size of output tensor " "should be equal to the size of input tensor row" " dimension. Expected output size=%d, but received %d", - height, out->numel())); + height, + out->numel())); T* out_buf = out->mutable_data(out->place()); const T* in_buf = input.data(); @@ -234,6 +260,5 @@ class RowwiseSum { } }; -} // namespace math -} // namespace operators -} // namespace paddle +} // namespace funcs +} // namespace pten diff --git a/paddle/fluid/operators/math/math_function_test.cc b/paddle/pten/kernels/funcs/math_function_test.cc similarity index 69% rename from paddle/fluid/operators/math/math_function_test.cc rename to paddle/pten/kernels/funcs/math_function_test.cc index 91a4f2746e..6ef8c6b689 100644 --- a/paddle/fluid/operators/math/math_function_test.cc +++ b/paddle/pten/kernels/funcs/math_function_test.cc @@ -11,7 +11,7 @@ // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. 
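// A minimal sketch of the reduction-functor contract migrated in
// math_function_impl.h above: a 2-D input of shape [h, w] and an output
// pre-sized to w column sums, as the PADDLE_ENFORCE_EQ checks require.
// `SumColumns` is a hypothetical helper; float data is assumed.
#include "paddle/pten/kernels/funcs/math_function.h"

void SumColumns(const paddle::platform::CPUDeviceContext& ctx,
                const paddle::framework::Tensor& mat,   // shape [h, w]
                paddle::framework::Tensor* col_sums) {  // numel() == w
  col_sums->Resize(paddle::framework::make_ddim({mat.dims()[1]}));
  col_sums->mutable_data<float>(paddle::platform::CPUPlace());
  pten::funcs::ColwiseSum<paddle::platform::CPUDeviceContext, float> colwise_sum;
  colwise_sum(ctx, mat, col_sums);
}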
-#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/pten/kernels/funcs/math_function.h" #include "gtest/gtest.h" #include "paddle/fluid/operators/math/blas.h" @@ -42,8 +42,19 @@ TEST(math_function, gemm_notrans_cblas) { memcpy(input3_ptr, arr3, 8 * sizeof(float)); paddle::platform::CPUDeviceContext context(*cpu_place); - GetBlas(context).GEMM(false, false, m, n, k, 1, input1_ptr, 3, - input2_ptr + 1, 4, 1, input3_ptr + 1, 4); + GetBlas(context).GEMM(false, + false, + m, + n, + k, + 1, + input1_ptr, + 3, + input2_ptr + 1, + 4, + 1, + input3_ptr + 1, + 4); EXPECT_EQ(input3_ptr[0], 0); EXPECT_EQ(input3_ptr[1], 24); @@ -83,15 +94,36 @@ void MklSmmCompare(int m, int n, int k) { auto smm = [&, m, n, k, lda, ldb, ldc, alpha, beta]() { const char transa = 'N'; const char transb = 'N'; - paddle::operators::math::CBlas::SMM_GEMM(&transa, &transb, &n, &m, &k, - &alpha, B, &ldb, A, &lda, &beta, - CSMM, &ldc); + paddle::operators::math::CBlas::SMM_GEMM(&transa, + &transb, + &n, + &m, + &k, + &alpha, + B, + &ldb, + A, + &lda, + &beta, + CSMM, + &ldc); }; auto mkl = [&, m, n, k, lda, ldb, ldc, alpha, beta]() { - paddle::operators::math::CBlas::GEMM(CblasRowMajor, CblasNoTrans, - CblasNoTrans, m, n, k, alpha, A, - lda, B, ldb, beta, CMKL, ldc); + paddle::operators::math::CBlas::GEMM(CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + m, + n, + k, + alpha, + A, + lda, + B, + ldb, + beta, + CMKL, + ldc); }; smm(); @@ -131,8 +163,19 @@ TEST(math_function, gemm_trans_cblas) { memcpy(input3_ptr, arr3, 8 * sizeof(float)); paddle::platform::CPUDeviceContext context(*cpu_place); - GetBlas(context).GEMM(false, true, m, n, k, 1, input1_ptr, 3, - input2_ptr + 3, 3, 1, input3_ptr + 1, 4); + GetBlas(context).GEMM(false, + true, + m, + n, + k, + 1, + input1_ptr, + 3, + input2_ptr + 3, + 3, + 1, + input3_ptr + 1, + 4); delete cpu_place; cpu_place = NULL; @@ -151,9 +194,7 @@ TEST(math_function, zero) { auto* cpu_place = new paddle::platform::CPUPlace(); float* t = tensor.mutable_data({2, 2}, *cpu_place); paddle::platform::CPUDeviceContext context(*cpu_place); - paddle::operators::math::SetConstant - functor; + pten::funcs::SetConstant functor; functor(context, &tensor, 0); EXPECT_EQ(t[0], 0); EXPECT_EQ(t[1], 0); @@ -188,8 +229,14 @@ void GemvTest(int m, int n, bool trans) { } paddle::platform::CPUDeviceContext context(*cpu_place); - GetBlas(context).GEMV(trans, static_cast(m), static_cast(n), 1., - data_a, data_b, 0., data_c); + GetBlas(context).GEMV(trans, + static_cast(m), + static_cast(n), + 1., + data_a, + data_b, + 0., + data_c); if (!trans) { for (int i = 0; i < m; ++i) { @@ -224,9 +271,10 @@ TEST(math_funciton, set_constant) { t.mutable_data(paddle::platform::CPUPlace()); auto* ctx = new paddle::platform::CPUDeviceContext(); ctx->Init(); - paddle::operators::math::set_constant(*ctx, &t, 10); + pten::funcs::set_constant(*ctx, &t, 10); for (int64_t i = 0; i < t.numel(); ++i) { - PADDLE_ENFORCE_EQ(10, t.data()[i], + PADDLE_ENFORCE_EQ(10, + t.data()[i], paddle::platform::errors::InvalidArgument( "Each value of input tensor should be 10, " "but received %d.", @@ -262,16 +310,27 @@ void GemmWarpTest(int m, int n, int k, T alpha, T beta) { // this would call gemm_warp paddle::platform::CPUDeviceContext context(*cpu_place); - GetBlas(context).GEMM(CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, B, - beta, CREF); + GetBlas(context).GEMM( + CblasNoTrans, CblasNoTrans, m, n, k, alpha, A, B, beta, CREF); // lda,ldb,ldc follow RowMajor int lda = k; int ldb = n; int ldc = n; - 
paddle::operators::math::CBlas::GEMM(CblasRowMajor, CblasNoTrans, - CblasNoTrans, m, n, k, alpha, A, lda, - B, ldb, beta, CMKL, ldc); + paddle::operators::math::CBlas::GEMM(CblasRowMajor, + CblasNoTrans, + CblasNoTrans, + m, + n, + k, + alpha, + A, + lda, + B, + ldb, + beta, + CMKL, + ldc); for (int i = 0; i < mat_c_mkl.numel(); ++i) { EXPECT_FLOAT_EQ(CREF[i], CMKL[i]); diff --git a/paddle/fluid/operators/math/math_function_test.cu b/paddle/pten/kernels/funcs/math_function_test.cu similarity index 90% rename from paddle/fluid/operators/math/math_function_test.cu rename to paddle/pten/kernels/funcs/math_function_test.cu index 39c91e96a7..87f11c47a4 100644 --- a/paddle/fluid/operators/math/math_function_test.cu +++ b/paddle/pten/kernels/funcs/math_function_test.cu @@ -13,17 +13,20 @@ // limitations under the License. #include "gtest/gtest.h" #include "paddle/fluid/operators/math/blas.h" -#include "paddle/fluid/operators/math/math_function.h" #include "paddle/fluid/platform/device_context.h" +#include "paddle/pten/kernels/funcs/math_function.h" -void fill_fp16_data(paddle::platform::float16* in_ptr, size_t size, +void fill_fp16_data(paddle::platform::float16* in_ptr, + size_t size, const std::vector& data) { PADDLE_ENFORCE_EQ( - size, data.size(), + size, + data.size(), paddle::platform::errors::InvalidArgument( "The size of argument data should" " be equal to the argument size. Expected %d, but received %d.", - size, data.size())); + size, + data.size())); for (size_t i = 0; i < data.size(); ++i) { in_ptr[i] = paddle::platform::float16(data[i]); } @@ -59,8 +62,8 @@ TEST(math_function, notrans_mul_trans_fp32) { paddle::framework::TensorCopySync(input1, gpu_place, &input2_gpu); out_gpu.mutable_data({2, 2}, gpu_place); - GetBlas(context).MatMul(input1_gpu, false, input2_gpu, true, 1, - &out_gpu, 0); + GetBlas(context).MatMul( + input1_gpu, false, input2_gpu, true, 1, &out_gpu, 0); paddle::framework::TensorCopySync(out_gpu, cpu_place, &out); @@ -102,8 +105,13 @@ TEST(math_function, notrans_mul_trans_fp16) { out_gpu.mutable_data({2, 2}, gpu_place); GetBlas(context).MatMul( - input1_gpu, false, input2_gpu, true, paddle::platform::float16(1), - &out_gpu, paddle::platform::float16(0)); + input1_gpu, + false, + input2_gpu, + true, + paddle::platform::float16(1), + &out_gpu, + paddle::platform::float16(0)); paddle::framework::TensorCopySync(out_gpu, cpu_place, &out); @@ -139,8 +147,8 @@ TEST(math_function, trans_mul_notrans_fp32) { out_gpu.mutable_data({3, 3}, gpu_place); - GetBlas(context).MatMul(input1_gpu, true, input2_gpu, false, 1, - &out_gpu, 0); + GetBlas(context).MatMul( + input1_gpu, true, input2_gpu, false, 1, &out_gpu, 0); paddle::framework::TensorCopySync(out_gpu, cpu_place, &out); @@ -187,8 +195,13 @@ TEST(math_function, trans_mul_notrans_fp16) { out_gpu.mutable_data({3, 3}, gpu_place); GetBlas(context).MatMul( - input1_gpu, true, input2_gpu, false, paddle::platform::float16(1), - &out_gpu, paddle::platform::float16(0)); + input1_gpu, + true, + input2_gpu, + false, + paddle::platform::float16(1), + &out_gpu, + paddle::platform::float16(0)); paddle::framework::TensorCopySync(out_gpu, cpu_place, &out); @@ -241,8 +254,8 @@ TEST(math_function, gemm_notrans_cublas_fp32) { float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(gpu_place); - GetBlas(context).GEMM(false, false, m, n, k, 1, a, 3, b + 1, 4, 1, - c + 1, 4); + GetBlas(context).GEMM( + false, false, m, n, k, 1, a, 3, b + 1, 4, 1, c + 1, 4); paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3); @@ -292,8 +305,8 
@@ TEST(math_function, gemm_notrans_cublas_fp16) { fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); paddle::platform::float16* input2_ptr = input2.mutable_data({3, 4}, cpu_place); - fill_fp16_data(input2_ptr, input2.numel(), - {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); + fill_fp16_data( + input2_ptr, input2.numel(), {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11}); paddle::platform::float16* input3_ptr = input3.mutable_data({2, 4}, cpu_place); fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); @@ -307,8 +320,19 @@ TEST(math_function, gemm_notrans_cublas_fp16) { input3_gpu.mutable_data(gpu_place); GetBlas(context).GEMM( - false, false, m, n, k, static_cast(1), a, 3, - b + 1, 4, static_cast(1), c + 1, 4); + false, + false, + m, + n, + k, + static_cast(1), + a, + 3, + b + 1, + 4, + static_cast(1), + c + 1, + 4); paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3); @@ -365,8 +389,8 @@ TEST(math_function, gemm_trans_cublas_fp32) { float* b = input2_gpu.data(); float* c = input3_gpu.mutable_data(gpu_place); - GetBlas(context).GEMM(false, true, m, n, k, 1, a, 3, b + 3, 3, 1, - c + 1, 4); + GetBlas(context).GEMM( + false, true, m, n, k, 1, a, 3, b + 3, 3, 1, c + 1, 4); paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3); @@ -410,8 +434,8 @@ TEST(math_function, gemm_trans_cublas_fp16) { fill_fp16_data(input1_ptr, input1.numel(), {0, 1, 2, 3, 4, 5}); paddle::platform::float16* input2_ptr = input2.mutable_data({4, 3}, cpu_place); - fill_fp16_data(input2_ptr, input2.numel(), - {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}); + fill_fp16_data( + input2_ptr, input2.numel(), {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11}); paddle::platform::float16* input3_ptr = input3.mutable_data({2, 4}, cpu_place); fill_fp16_data(input3_ptr, input3.numel(), {0, 1, 2, 3, 4, 5, 6, 7}); @@ -425,8 +449,19 @@ TEST(math_function, gemm_trans_cublas_fp16) { input3_gpu.mutable_data(gpu_place); GetBlas(context).GEMM( - false, true, m, n, k, static_cast(1), a, 3, - b + 3, 3, static_cast(1), c + 1, 4); + false, + true, + m, + n, + k, + static_cast(1), + a, + 3, + b + 3, + 3, + static_cast(1), + c + 1, + 4); paddle::framework::TensorCopySync(input3_gpu, cpu_place, &input3); @@ -476,8 +511,14 @@ void GemvTest(int m, int n, bool trans) { paddle::framework::TensorCopySync(mat_a, gpu_place, &g_mat_a); paddle::framework::TensorCopySync(vec_b, gpu_place, &g_vec_b); - GetBlas(context).GEMV(trans, static_cast(m), static_cast(n), 1., - g_data_a, g_data_b, 0., g_data_c); + GetBlas(context).GEMV(trans, + static_cast(m), + static_cast(n), + 1., + g_data_a, + g_data_b, + 0., + g_data_c); paddle::framework::TensorCopySync(g_vec_c, cpu_place, &vec_c); diff --git a/paddle/pten/kernels/gpu/trace_kernel.cu b/paddle/pten/kernels/gpu/trace_kernel.cu index 155bfbd02a..f552386faf 100644 --- a/paddle/pten/kernels/gpu/trace_kernel.cu +++ b/paddle/pten/kernels/gpu/trace_kernel.cu @@ -36,7 +36,7 @@ void TraceKernel(const Context& ctx, kernels::TensorReduceImpl>( ctx, diag, out, kps::IdentityFunctor(), reduce_dims, stream); } else { - paddle::operators::math::SetConstant functor; + pten::funcs::SetConstant functor; functor(ctx, out, static_cast(0)); } } diff --git a/paddle/pten/kernels/impl/trace_kernel_impl.h b/paddle/pten/kernels/impl/trace_kernel_impl.h index 4dbba9bc69..1b499681bb 100644 --- a/paddle/pten/kernels/impl/trace_kernel_impl.h +++ b/paddle/pten/kernels/impl/trace_kernel_impl.h @@ -22,8 +22,9 @@ #include #include "paddle/fluid/framework/eigen.h" -#include "paddle/fluid/operators/math/math_function.h" 
+#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/platform/for_range.h" +#include "paddle/pten/kernels/funcs/math_function.h" namespace pten { template @@ -196,7 +197,7 @@ void TraceGradKernel(const Context& ctx, auto* out_data = out_grad.data(); T* x_data = in_grad->mutable_data(ctx.GetPlace()); - paddle::operators::math::SetConstant set_zero; + pten::funcs::SetConstant set_zero; set_zero(ctx, in_grad, static_cast(0.0)); auto dim1 = axis1; -- GitLab