Unverified commit 8c7ee8c2, authored by Feiyu Chan and committed by GitHub

[Pten] blas and lapack migration (#39587)

* move blas related files
* move lapack related files
Parent 1d6fd81d
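The change is mechanical throughout: the BLAS and LAPACK wrappers keep their interfaces but move from the paddle::operators::math namespace (header paddle/fluid/operators/math/blas.h) to pten::funcs (header paddle/pten/kernels/funcs/blas/blas.h, and likewise for lapack). A minimal before/after sketch of a typical call site follows; AxpyExample is a hypothetical helper for illustration, not a file from this commit:

// Before: BLAS helpers lived in paddle::operators::math.
//   #include "paddle/fluid/operators/math/blas.h"
//   auto blas =
//       paddle::operators::math::GetBlas<platform::CPUDeviceContext, T>(ctx);
// After: the same helpers are reached through pten::funcs.
#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"

template <typename T>
void AxpyExample(const paddle::platform::CPUDeviceContext& ctx, int n,
                 const T* x, T* y) {
  // GetBlas returns a thin wrapper that dispatches to the device's BLAS.
  auto blas = pten::funcs::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
  blas.AXPY(n, static_cast<T>(1.0), x, y);  // y[i] += x[i]
}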
@@ -1036,3 +1036,42 @@ function(generate_dummy_static_lib)
   add_library(${dummy_LIB_NAME} STATIC ${dummy_FILE_PATH})
 endfunction()
+
+function(math_library TARGET)
+  # math_library is a function to create a math library.
+  # The interface is the same as cc_library, but it handles
+  # split GPU/CPU code and links some common libraries.
+  set(cc_srcs)
+  set(cu_srcs)
+  set(hip_srcs)
+  set(math_common_deps device_context framework_proto enforce)
+  if (WITH_GPU)
+    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+      list(APPEND math_common_deps cub)
+    else()
+      list(APPEND math_common_deps)
+    endif()
+  endif()
+  set(multiValueArgs DEPS)
+  cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
+    list(APPEND cc_srcs ${TARGET}.cc)
+  endif()
+  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
+    list(APPEND cu_srcs ${TARGET}.cu)
+  endif()
+  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
+    list(APPEND cu_srcs ${TARGET}.cu.cc)
+  endif()
+  list(LENGTH cc_srcs cc_srcs_len)
+  if (WITH_GPU)
+    nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
+  elseif (WITH_ROCM)
+    hip_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
+  elseif(${cc_srcs_len} GREATER 0)
+    cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
+  endif()
+endfunction()
@@ -24,18 +24,16 @@
 #include <utility>
 #include <vector>
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace distributed {
 template <typename T>
-inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
-GetBlas() {
+inline pten::funcs::BlasT<paddle::platform::CPUDeviceContext, T> GetBlas() {
   paddle::platform::CPUDeviceContext cpu_ctx;
-  return paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
-                                          T>(cpu_ctx);
+  return pten::funcs::GetBlas<paddle::platform::CPUDeviceContext, T>(cpu_ctx);
 }
 template <typename T>
......
@@ -1161,8 +1161,7 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) {
   t_delta->mutable_data<float>(t_latest.dims(), cpu_ctx.GetPlace());
   auto blas =
-      paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
-          cpu_ctx);
+      pten::funcs::GetBlas<platform::CPUDeviceContext, float>(cpu_ctx);
   blas.VSUB(t_latest.numel(), t_latest.data<float>(),
             t_timestamp->data<float>(), t_delta->data<float>());
@@ -1201,8 +1200,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
   t_delta->mutable_data<float>(t_latest->dims(), cpu_ctx.GetPlace());
   auto blas =
-      paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
-          cpu_ctx);
+      pten::funcs::GetBlas<platform::CPUDeviceContext, float>(cpu_ctx);
   blas.VSUB(t_latest->numel(), t_pserver.data<float>(), t_old->data<float>(),
             t_delta->data<float>());
   blas.VADD(t_latest->numel(), t_latest->data<float>(),
@@ -1303,9 +1301,7 @@ void GeoCommunicator::SendSparse(const std::string &varname,
   t_delta->set_rows(sparse_ids);
   t_delta->set_height(t_latest.dims()[0]);
-  auto blas =
-      paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
-          cpu_ctx);
+  auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, float>(cpu_ctx);
   float coefficient = 1.0 / static_cast<float>(trainers_);
   std::vector<float *> push_g_vec;
@@ -1371,9 +1367,7 @@ void GeoCommunicator::RecvSparse(const std::string &varname, int table_id,
   v_delta.resize(numel);
   paddle::platform::CPUDeviceContext cpu_ctx;
-  auto blas =
-      paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
-          cpu_ctx);
+  auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, float>(cpu_ctx);
   for (auto j = 0; j < static_cast<int>(keys.size()); ++j) {
     VLOG(5) << "DEBUG GeoCommunicator::RecvSparse recv sparse key" << keys[j]
......
@@ -34,12 +34,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/framework/variable_helper.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/split.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 #include "paddle/fluid/distributed/ps/service/ps_client.h"
......
@@ -17,7 +17,7 @@
 #include <string>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace framework {
@@ -121,14 +121,14 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
   // broadcast biases
   std::vector<float> ones(m, 1.0f);
-  paddle::operators::math::CBlas<float>::GEMM(
+  pten::funcs::CBlas<float>::GEMM(
       CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, 1, alpha, &ones[0], 1,
       &combined_biases[0], n, 0.0f, embeddings_data, n);
   // Wx*embeddings + biases
-  paddle::operators::math::CBlas<float>::GEMM(
-      CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha,
-      embedding_data, k, weightx_data, n, beta, embeddings_data, n);
+  pten::funcs::CBlas<float>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                                  m, n, k, alpha, embedding_data, k,
+                                  weightx_data, n, beta, embeddings_data, n);
   op_desc.SetInput("Embeddings", {embeddings});
   op_desc.SetInput("H0", {});
......
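The first GEMM in the fuse pass above is a broadcast trick rather than a real matrix product: with k = 1, multiplying an m-by-1 column of ones by the 1-by-n row of combined biases copies that row into every one of the m output rows (beta = 0.0f overwrites the output); the second GEMM then adds Wx * embeddings on top. A standalone sketch of the trick with the migrated raw-pointer interface; BroadcastBias is a hypothetical helper, not part of this commit:

#include <vector>
#include "paddle/pten/kernels/funcs/blas/blas.h"

// Fills each of the m rows of out (m x n, row-major) with the bias row.
void BroadcastBias(int m, int n, const float* bias, float* out) {
  std::vector<float> ones(m, 1.0f);
  // out(m x n) = 1.0 * ones(m x 1) * bias(1 x n) + 0.0 * out
  pten::funcs::CBlas<float>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, m,
                                  n, /*k=*/1, /*alpha=*/1.0f, ones.data(),
                                  /*lda=*/1, bias, /*ldb=*/n, /*beta=*/0.0f,
                                  out, /*ldc=*/n);
}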
@@ -22,13 +22,13 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/imperative/layer.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 #ifdef PADDLE_WITH_XPU
 #include "xpu/refactor/math.h"
@@ -86,7 +86,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
   void operator()(const platform::CPUPlace& place) const {
     platform::CPUDeviceContext* ctx = dynamic_cast<platform::CPUDeviceContext*>(
         platform::DeviceContextPool::Instance().Get(place));
-    auto blas = operators::math::GetBlas<platform::CPUDeviceContext, T>(*ctx);
+    auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(*ctx);
     blas.AXPY(numel_, 1., x_, y_);
   }
@@ -118,7 +118,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
     platform::CUDADeviceContext* ctx =
         dynamic_cast<platform::CUDADeviceContext*>(
            platform::DeviceContextPool::Instance().Get(place));
-    auto blas = operators::math::GetBlas<platform::CUDADeviceContext, T>(*ctx);
+    auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(*ctx);
     blas.AXPY(numel_, 1., x_, y_);
   }
 #else
......
@@ -22,8 +22,8 @@
 #include "paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
 #include "paddle/fluid/operators/math/bert_encoder_functor.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace inference {
......
@@ -28,9 +28,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
......
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 namespace ops = paddle::operators;
@@ -94,7 +94,7 @@ class AddMMKernel : public framework::OpKernel<T> {
     float alpha = context.template Attr<float>("Alpha");
     float beta = context.template Attr<float>("Beta");
-    auto blas = math::GetBlas<DeviceContext, T>(context);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
     // calc broadcast dim
     Array2 bcast_dims;
@@ -146,7 +146,7 @@ class AddMMGradKernel : public framework::OpKernel<T> {
     }
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     if (dinput) {
       dinput->mutable_data<T>(ctx.GetPlace());
       total_elems = in_dims[0] * in_dims[1];
......
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 namespace paddle {
@@ -122,7 +122,7 @@ class AffineGridOpKernel : public framework::OpKernel<T> {
     GetIdxMap<DeviceContext, T>(n, h, w, align_corners, &grid, ctx);
     // output = grid * theta.T
     // TODO(wanghaoshuang): Refine batched matrix multiply
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
     for (int i = 0; i < n; ++i) {
       Tensor sliced_grid = grid.Slice(i, i + 1).Resize(
           {static_cast<int64_t>(h) * static_cast<int64_t>(w), 3});
@@ -165,7 +165,7 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> {
     GetIdxMap<DeviceContext, T>(n, h, w, align_corners, &grid, ctx);
     // output = grid * theta.T
     // TODO(wanghaoshuang): Refine batched matrix multiply
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
     for (int i = 0; i < n; ++i) {
       Tensor sliced_grid = grid.Slice(i, i + 1).Resize(
           {static_cast<int64_t>(h) * static_cast<int64_t>(w), 3});
......
@@ -17,10 +17,10 @@
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace operators {
......
@@ -14,10 +14,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/attention_lstm_op.h"
 #include <string>
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc.h"
 #include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace operators {
@@ -373,7 +373,7 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
     T* lstm_x_data = lstm_x->mutable_data<T>(ctx.GetPlace());
     T* lstm_out_data = lstm_out->mutable_data<T>(ctx.GetPlace());
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(ctx);
     // x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1
     auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
......
@@ -15,9 +15,9 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/batch_fc_op.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace operators {
@@ -112,7 +112,7 @@ class BatchFCCUDAKernel : public framework::OpKernel<T> {
     int64_t strideA = ins_num * in_dim;
     int64_t strideB = in_dim * out_dim;
-    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
     blas.BatchedGEMM(transA, transB, ins_num, out_dim, in_dim, alpha, in_data,
                      w_data, beta, out_data, slot_pairs_num, strideA, strideB);
     add_bias<T>(ctx.cuda_device_context().stream(), out_data, slot_pairs_num,
@@ -165,7 +165,7 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel<T> {
     add_bias_grad<T>(ctx.cuda_device_context().stream(), dout_data,
                      slot_pairs_num, ins_num, out_dim, db_data);
-    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
     T alpha = 1;
     T beta = 0;
......
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace operators {
@@ -61,7 +61,7 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> {
     auto output_col_vec = output_mat.chip(i, 1);
     Tensor weight_mat =
         weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim}));
-    math::GetBlas<DeviceContext, T>(dev_ctx).GEMM(
+    pten::funcs::GetBlas<DeviceContext, T>(dev_ctx).GEMM(
         CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, x->data<T>(),
         weight_mat.data<T>(), 0, left_mul.data<T>());
     output_col_vec.device(place) =
@@ -127,7 +127,7 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
       d_weight->mutable_data<T>(ctx.GetPlace());
     }
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
     // Calculate the Output(X@Grad) and Output(Y@Grad).
     if (d_x || d_y || d_weight) {
......
@@ -20,7 +20,7 @@
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 namespace paddle {
 namespace operators {
@@ -28,7 +28,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 static void ReshapeTensorIntoMatrixSequence(
-    framework::Tensor *x, const math::MatDescriptor &descriptor) {
+    framework::Tensor *x, const pten::funcs::MatDescriptor &descriptor) {
   int64_t h, w;
   h = descriptor.height_;
   w = descriptor.width_;
@@ -45,8 +45,8 @@ static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x,
                                            bool trans_y) {
   auto x_dim = x->dims();
   auto y_dim = y->dims();
-  auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, false);
-  auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, false);
+  auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(x_dim, 0, false);
+  auto mat_dim_y = pten::funcs::CreateMatrixDescriptor(y_dim, 0, false);
   out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_),
                mat_dim_x.height_, mat_dim_y.width_});
@@ -68,10 +68,10 @@ class BmmKernel : public framework::OpKernel<T> {
       return;
     }
-    auto blas = math::GetBlas<DeviceContext, T>(context);
-    auto mat_dim_a = math::CreateMatrixDescriptor(x.dims(), 0, false);
-    auto mat_dim_b = math::CreateMatrixDescriptor(y.dims(), 0, false);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
+    auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(x.dims(), 0, false);
+    auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(y.dims(), 0, false);
     // auto scale = static_cast<T>(context.Attr<float>("alpha"));
     blas.MatMul(x, mat_dim_a, y, mat_dim_b, T(1), out, T(0));
@@ -86,9 +86,9 @@ class BmmGradKernel : public framework::OpKernel<T> {
                   const framework::Tensor &b, bool trans_b,
                   framework::Tensor *out) const {
     out->mutable_data<T>(context.GetPlace());
-    auto blas = math::GetBlas<DeviceContext, T>(context);
-    auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a);
-    auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
+    auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a);
+    auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(b.dims(), 0, trans_b);
     blas.MatMul(a, mat_dim_a, b, mat_dim_b, T(1), out, T(0));
   }
......
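The bmm kernel above is the clearest picture of the migrated descriptor API: pten::funcs::CreateMatrixDescriptor(dims, num_flatten_cols, trans) packs a tensor's shape into a MatDescriptor (height_, width_, batch_size_, stride_, trans_), and Blas::MatMul consumes a descriptor pair. A hedged sketch of the same pattern as a free function; BatchedMatMul is illustrative, assuming x: [b, m, k], y: [b, k, n], and a pre-allocated out: [b, m, n]:

#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"

// out[i] = x[i] @ y[i] for every matrix i in the batch.
template <typename T>
void BatchedMatMul(const paddle::platform::CPUDeviceContext& dev_ctx,
                   const paddle::framework::Tensor& x,
                   const paddle::framework::Tensor& y,
                   paddle::framework::Tensor* out) {
  auto blas =
      pten::funcs::GetBlas<paddle::platform::CPUDeviceContext, T>(dev_ctx);
  // num_flatten_cols = 0 keeps the batch dimension; false = no transpose.
  auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(x.dims(), 0, false);
  auto mat_dim_y = pten::funcs::CreateMatrixDescriptor(y.dims(), 0, false);
  // out = 1 * (x @ y) + 0 * out, batched over the leading dimension.
  blas.MatMul(x, mat_dim_x, y, mat_dim_y, T(1), out, T(0));
}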
@@ -19,8 +19,8 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/transform.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace operators {
@@ -84,7 +84,7 @@ class CenterLossKernel : public framework::OpKernel<T> {
     int numel = centers_diffacc.numel();
     std::memset(centers_diffacc_data, 0, sizeof(T) * numel);
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
     int tLabel;
     const T *x_index;
......
@@ -19,9 +19,9 @@ limitations under the License. */
 #include "Eigen/Cholesky"
 #include "Eigen/Core"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/transpose_op.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace operators {
@@ -323,9 +323,9 @@ class CholeskyGradKernel : public framework::OpKernel<T> {
     /*! phi = matmul(L.transpose(-1, -2), grad) */
     Tensor middle;
     auto* middle_data = middle.mutable_data<T>(dims, context.GetPlace());
-    auto trans_desc = math::CreateMatrixDescriptor(dims, 0, true);
-    auto no_trans_desc = math::CreateMatrixDescriptor(dims, 0, false);
-    auto blas = math::GetBlas<DeviceContext, T>(context);
+    auto trans_desc = pten::funcs::CreateMatrixDescriptor(dims, 0, true);
+    auto no_trans_desc = pten::funcs::CreateMatrixDescriptor(dims, 0, false);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
     blas.MatMul(l, trans_desc, l_grad, no_trans_desc, T(1), &middle, T(0));
     /*! phi.tril_().diagonal(0, -2, -1).mul_(0.5) */
......
@@ -15,11 +15,11 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/lapack_function.h"
 #include "paddle/fluid/operators/solve_op.h"
 #include "paddle/fluid/operators/svd_helper.h"
 #include "paddle/fluid/operators/triangular_solve_op.h"
 #include "paddle/fluid/platform/complex.h"
+#include "paddle/pten/kernels/funcs/lapack/lapack_function.h"
 #include "paddle/pten/kernels/math_kernel.h"
 namespace paddle {
@@ -38,8 +38,8 @@ class CholeskySolveFunctor<paddle::platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext &dev_ctx, bool upper, int n,
                   int nrhs, T *Adata, int lda, T *Bdata, int *devInfo) {
     char uplo = upper ? 'U' : 'L';
-    math::lapackCholeskySolve<T>(uplo, n, nrhs, Adata, lda, Bdata, lda,
-                                 devInfo);
+    pten::funcs::lapackCholeskySolve<T>(uplo, n, nrhs, Adata, lda, Bdata, lda,
+                                        devInfo);
   }
 };
@@ -168,7 +168,7 @@ class CholeskySolveGradKernel : public framework::OpKernel<T> {
       db->Resize(bin->dims());
     }
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
     // calculate out's conjugate for complex
     framework::Tensor out_conj(out->type());
@@ -182,8 +182,8 @@ class CholeskySolveGradKernel : public framework::OpKernel<T> {
     framework::Tensor commonterm(out->type());
     auto outdims = out_conj.dims();
     auto dbdims = db_bst.dims();
-    auto mat_dim_a = math::CreateMatrixDescriptor(outdims, 0, false);
-    auto mat_dim_b = math::CreateMatrixDescriptor(dbdims, 0, false);
+    auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(outdims, 0, false);
+    auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(dbdims, 0, false);
     auto cmtdim = outdims;
     cmtdim[cmtdim.size() - 2] = dbdims[dbdims.size() - 2];
     commonterm.Resize(cmtdim);
@@ -207,9 +207,10 @@ class CholeskySolveGradKernel : public framework::OpKernel<T> {
                                       DeviceContext>::TYPE &>(dev_ctx),
           commonterm, commonterm_conj, -1, &commonterm);
-      auto mat_dim_u = math::CreateMatrixDescriptor(u_bst.dims(), 0, false);
+      auto mat_dim_u =
+          pten::funcs::CreateMatrixDescriptor(u_bst.dims(), 0, false);
       auto mat_dim_c =
-          math::CreateMatrixDescriptor(commonterm.dims(), 0, false);
+          pten::funcs::CreateMatrixDescriptor(commonterm.dims(), 0, false);
       Tensor du_bst(uin->type());
       // get upper or lower triangular
......
@@ -21,10 +21,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/layout_utils.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/depthwise_conv.h"
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/fluid/operators/math/vol2col.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace operators {
@@ -332,7 +332,7 @@ class GemmConvKernel : public framework::OpKernel<T> {
     math::Vol2ColFunctor<DeviceContext, T> vol2col;
     math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     for (int i = 0; i < batch_size; i++) {
       Tensor in_batch =
           transformed_input.Slice(i, i + 1).Resize(in_matrix_shape);
@@ -486,7 +486,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
     }
     pten::funcs::SetConstant<DeviceContext, T> set_zero;
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     if (input_grad) {
       input_grad->mutable_data<T>(context.GetPlace());
@@ -693,7 +693,7 @@ class GemmConvDoubleGradKernel : public framework::OpKernel<T> {
     }
     pten::funcs::SetConstant<DeviceContext, T> set_zero;
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     // dx convolution double grad: gemm + col2im(col2vol)
     // dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout,
......
@@ -20,11 +20,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/conv_op.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/math/depthwise_conv.h"
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/fluid/operators/math/vol2col.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace operators {
@@ -228,7 +228,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
     output->mutable_data<T>(context.GetPlace());
     pten::funcs::SetConstant<DeviceContext, T> set_zero;
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     set_zero(dev_ctx, output, static_cast<T>(0));
     int in_step =
@@ -425,7 +425,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
     // im2col + gemm (similar to conv-forward)
     // input need to compute gradient
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     if (input_grad || filter_grad) {
       Tensor col;
       col.mutable_data<T>(col_shape, context.GetPlace());
......
@@ -22,7 +22,7 @@
 // \author Yi Li, Guodong Zhang, Jifeng Dai
 #pragma once
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 template <typename T>
......
@@ -22,8 +22,8 @@
 // \author Yi Li, Guodong Zhang, Jifeng Dai
 #pragma once
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/pten/core/hostdevice.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 template <typename T>
......
@@ -25,8 +25,8 @@
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/deformable_conv_op.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 namespace paddle {
@@ -513,7 +513,7 @@ class DeformableConvCUDAKernel : public framework::OpKernel<T> {
     int input_offset_dim = offset.numel() / offset.dims()[0];
     int input_mask_dim = mask.numel() / mask.dims()[0];
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     const T* input_ptr = input->data<T>();
     const T* offset_ptr = offset.data<T>();
@@ -624,7 +624,7 @@ class DeformableConvGradCUDAKernel : public framework::OpKernel<T> {
     col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);
     pten::funcs::SetConstant<DeviceContext, T> set_zero;
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     col_buffer.mutable_data<T>(ctx.GetPlace());
     col_buffer_3d.mutable_data<T>(ctx.GetPlace());
......
@@ -26,7 +26,7 @@
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/deformable_conv_func.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 namespace paddle {
@@ -382,7 +382,7 @@ class DeformableConvCPUKernel : public framework::OpKernel<T> {
     int input_dim = input->numel() / input->dims()[0];
     int input_offset_dim = offset->numel() / offset->dims()[0];
     int input_mask_dim = mask->numel() / mask->dims()[0];
-    auto blas = math::GetBlas<CPUDeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<CPUDeviceContext, T>(dev_ctx);
     const T* input_ptr = input->data<T>();
     const T* offset_ptr = offset->data<T>();
     const T* mask_ptr = mask->data<T>();
@@ -490,7 +490,7 @@ class DeformableConvGradCPUKernel : public framework::OpKernel<T> {
     col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);
     pten::funcs::SetConstant<CPUDeviceContext, T> set_zero;
-    auto blas = math::GetBlas<CPUDeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<CPUDeviceContext, T>(dev_ctx);
     col_buffer.mutable_data<T>(ctx.GetPlace());
     col_buffer_3d.mutable_data<T>(ctx.GetPlace());
......
@@ -28,8 +28,8 @@
 #include "paddle/fluid/operators/deformable_conv_filter.cu.h"
 #include "paddle/fluid/operators/deformable_conv_func.h"
 #include "paddle/fluid/operators/deformable_conv_v1_op.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 namespace paddle {
@@ -381,7 +381,7 @@ class DeformableConvV1CUDAKernel : public framework::OpKernel<T> {
     int input_dim = input->numel() / input->dims()[0];
     int input_offset_dim = offset.numel() / offset.dims()[0];
-    auto blas = math::GetBlas<CUDADeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<CUDADeviceContext, T>(dev_ctx);
     const T* input_ptr = input->data<T>();
     const T* offset_ptr = offset.data<T>();
@@ -490,7 +490,7 @@ class DeformableConvV1GradCUDAKernel : public framework::OpKernel<T> {
     col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);
     pten::funcs::SetConstant<CUDADeviceContext, T> set_zero;
-    auto blas = math::GetBlas<CUDADeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<CUDADeviceContext, T>(dev_ctx);
     col_buffer.mutable_data<T>(ctx.GetPlace());
     col_buffer_3d.mutable_data<T>(ctx.GetPlace());
......
@@ -27,7 +27,7 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/deformable_conv_func.h"
 #include "paddle/fluid/operators/deformable_conv_op.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 namespace paddle {
@@ -348,7 +348,7 @@ class DeformableConvV1CPUKernel : public framework::OpKernel<T> {
     std::vector<int64_t> input_shape_vec = framework::vectorize(input_shape);
     int input_dim = input->numel() / input->dims()[0];
     int input_offset_dim = offset->numel() / offset->dims()[0];
-    auto blas = math::GetBlas<CPUDeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<CPUDeviceContext, T>(dev_ctx);
     const T* input_ptr = input->data<T>();
     const T* offset_ptr = offset->data<T>();
     col_buffer.mutable_data<T>(ctx.GetPlace());
@@ -452,7 +452,7 @@ class DeformableConvV1GradCPUKernel : public framework::OpKernel<T> {
     col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);
     pten::funcs::SetConstant<CPUDeviceContext, T> set_zero;
-    auto blas = math::GetBlas<CPUDeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<CPUDeviceContext, T>(dev_ctx);
     col_buffer.mutable_data<T>(ctx.GetPlace());
     col_buffer_3d.mutable_data<T>(ctx.GetPlace());
......
@@ -16,7 +16,7 @@
 #include <iostream>
 #include <memory>
 #include <vector>
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace operators {
......
@@ -30,8 +30,8 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/deformable_psroi_pooling_op.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 namespace paddle {
......
@@ -26,7 +26,7 @@
 #include <iostream>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 namespace paddle {
......
@@ -17,12 +17,12 @@
 #include <math.h>
 #include <algorithm>
 #include <complex>
-#include "paddle/fluid/operators/math/lapack_function.h"
 #include "paddle/fluid/operators/math/matrix_solve.h"
 #include "paddle/fluid/operators/svd_helper.h"
 #include "paddle/fluid/operators/transpose_op.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/pten/kernels/funcs/complex_functors.h"
+#include "paddle/pten/kernels/funcs/lapack/lapack_function.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 #define EPSILON 1e-6
@@ -94,7 +94,7 @@ void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info,
   // call lapackEig once to compute the size of work;
   T computed_work_size;
-  math::lapackEig<T, pten::funcs::Real<T>>(
+  pten::funcs::lapackEig<T, pten::funcs::Real<T>>(
       jobvl, jobvr, order, input_data, lda, values_data, lvector_data, ldvl,
      rvector_data, ldvr, &computed_work_size, lwork, rwork_data, &info);
@@ -109,7 +109,7 @@ void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info,
     T* current_values = &values_data[i * values_stride];
     T* current_rvectors = &rvector_data[i * matrix_stride];
-    math::lapackEig<T, pten::funcs::Real<T>>(
+    pten::funcs::lapackEig<T, pten::funcs::Real<T>>(
         jobvl, jobvr, order, current_matrix, lda, current_values, lvector_data,
         ldvl, current_rvectors, ldvr, work_data, lwork, rwork_data, &info);
     PADDLE_ENFORCE_EQ(
......
@@ -20,9 +20,9 @@
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/allocation/allocator.h"
-#include "paddle/fluid/operators/math/lapack_function.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/pten/kernels/funcs/complex_functors.h"
+#include "paddle/pten/kernels/funcs/lapack/lapack_function.h"
 namespace paddle {
 namespace operators {
@@ -103,11 +103,11 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input,
                         required_work_mem, work_mem));
   int info = 0;
-  math::lapackEig<T>('N', 'N', static_cast<int>(n_dim), a.template data<T>(),
-                     static_cast<int>(n_dim), w_data, NULL, 1, NULL, 1,
-                     work->template data<T>(),
-                     static_cast<int>(work_mem / sizeof(T)),
-                     static_cast<T*>(NULL), &info);
+  pten::funcs::lapackEig<T>('N', 'N', static_cast<int>(n_dim),
+                            a.template data<T>(), static_cast<int>(n_dim),
+                            w_data, NULL, 1, NULL, 1, work->template data<T>(),
+                            static_cast<int>(work_mem / sizeof(T)),
+                            static_cast<T*>(NULL), &info);
   std::string name = "framework::platform::dynload::dgeev_";
   if (framework::TransToProtoVarType(input.dtype()) ==
@@ -153,7 +153,7 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input,
                         required_rwork_mem, rwork_mem));
   int info = 0;
-  math::lapackEig<T, pten::funcs::Real<T>>(
+  pten::funcs::lapackEig<T, pten::funcs::Real<T>>(
       'N', 'N', static_cast<int>(n_dim), a.template data<T>(),
      static_cast<int>(n_dim), output->template data<T>(), NULL, 1, NULL, 1,
      work->template data<T>(), static_cast<int>(work_mem / sizeof(T)),
@@ -187,10 +187,10 @@ class EigvalsKernel : public framework::OpKernel<T> {
     // query workspace size
     T qwork;
     int info;
-    math::lapackEig<T, pten::funcs::Real<T>>(
+    pten::funcs::lapackEig<T, pten::funcs::Real<T>>(
         'N', 'N', static_cast<int>(n_dim), input_matrices[0].template data<T>(),
         static_cast<int>(n_dim), NULL, NULL, 1, NULL, 1, &qwork, -1,
-        static_cast<pten::funcs::Real<T>*>(NULL), &info);
+        static_cast<Real<T>*>(NULL), &info);
     int64_t lwork = static_cast<int64_t>(qwork);
     Tensor work, rwork;
......
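The eigvals kernel above follows LAPACK's two-call workspace protocol: a first lapackEig call with lwork = -1 computes nothing but stores the optimal workspace size in the first work element (qwork); the caller then allocates that buffer and repeats the call to do the real computation. A minimal sketch of the protocol with the migrated wrapper, assuming the signature used in this diff; EigvalsSketch is hypothetical, a is an n x n column-major double matrix (overwritten), and w receives the eigenvalues:

#include <vector>
#include "paddle/pten/kernels/funcs/lapack/lapack_function.h"

void EigvalsSketch(int n, double* a, double* w) {
  int info = 0;
  double qwork = 0.0;
  // Workspace query: lwork = -1 makes *geev report the optimal size in qwork.
  pten::funcs::lapackEig<double>('N', 'N', n, a, n, w, nullptr, 1, nullptr, 1,
                                 &qwork, -1, static_cast<double*>(nullptr),
                                 &info);
  std::vector<double> work(static_cast<size_t>(qwork));
  // Real call: 'N', 'N' asks for eigenvalues only, no eigenvectors.
  pten::funcs::lapackEig<double>('N', 'N', n, a, n, w, nullptr, 1, nullptr, 1,
                                 work.data(), static_cast<int>(work.size()),
                                 static_cast<double*>(nullptr), &info);
}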
@@ -28,7 +28,7 @@ struct SameDimsElemwiseMul<
   void operator()(const framework::ExecutionContext &ctx,
                   const framework::Tensor *x, const framework::Tensor *y,
                   framework::Tensor *z) {
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(ctx);
     blas.VMUL(x->numel(), x->data<T>(), y->data<T>(), z->data<T>());
   }
 };
......
@@ -19,9 +19,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/transform.h"
 #include "paddle/pten/core/hostdevice.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace operators {
......
@@ -16,12 +16,12 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/pten_utils.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/pooling.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/pten/kernels/empty_kernel.h"
 #include "paddle/pten/kernels/flatten_grad_kernel.h"
 #include "paddle/pten/kernels/flatten_kernel.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 namespace paddle {
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/pten/kernels/funcs/math_function.h"
namespace paddle { namespace paddle {
...@@ -39,16 +39,16 @@ class FSPOpKernel : public framework::OpKernel<T> { ...@@ -39,16 +39,16 @@ class FSPOpKernel : public framework::OpKernel<T> {
auto height = x_dims[2]; auto height = x_dims[2];
auto width = x_dims[3]; auto width = x_dims[3];
auto blas = math::GetBlas<DeviceContext, T>(context); auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
math::MatDescriptor x_mat_desc; pten::funcs::MatDescriptor x_mat_desc;
x_mat_desc.height_ = x_channel; x_mat_desc.height_ = x_channel;
x_mat_desc.width_ = height * width; x_mat_desc.width_ = height * width;
x_mat_desc.batch_size_ = batch_size; x_mat_desc.batch_size_ = batch_size;
x_mat_desc.stride_ = x_channel * height * width; x_mat_desc.stride_ = x_channel * height * width;
x_mat_desc.trans_ = false; x_mat_desc.trans_ = false;
math::MatDescriptor y_mat_desc; pten::funcs::MatDescriptor y_mat_desc;
y_mat_desc.height_ = height * width; y_mat_desc.height_ = height * width;
y_mat_desc.width_ = y_channel; y_mat_desc.width_ = y_channel;
y_mat_desc.batch_size_ = batch_size; y_mat_desc.batch_size_ = batch_size;
...@@ -78,7 +78,7 @@ class FSPGradOpKernel : public framework::OpKernel<T> { ...@@ -78,7 +78,7 @@ class FSPGradOpKernel : public framework::OpKernel<T> {
int64_t h = 0; int64_t h = 0;
int64_t w = 0; int64_t w = 0;
auto blas = math::GetBlas<DeviceContext, T>(context); auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
pten::funcs::SetConstant<DeviceContext, T> set_zero; pten::funcs::SetConstant<DeviceContext, T> set_zero;
if (d_x != nullptr) { if (d_x != nullptr) {
d_x->mutable_data<T>(context.GetPlace()); d_x->mutable_data<T>(context.GetPlace());
...@@ -89,14 +89,14 @@ class FSPGradOpKernel : public framework::OpKernel<T> { ...@@ -89,14 +89,14 @@ class FSPGradOpKernel : public framework::OpKernel<T> {
h = y_dims[2]; h = y_dims[2];
w = y_dims[3]; w = y_dims[3];
math::MatDescriptor d_out_mat_desc; pten::funcs::MatDescriptor d_out_mat_desc;
d_out_mat_desc.height_ = x_channel; d_out_mat_desc.height_ = x_channel;
d_out_mat_desc.width_ = y_channel; d_out_mat_desc.width_ = y_channel;
d_out_mat_desc.batch_size_ = batch_size; d_out_mat_desc.batch_size_ = batch_size;
d_out_mat_desc.stride_ = x_channel * y_channel; d_out_mat_desc.stride_ = x_channel * y_channel;
d_out_mat_desc.trans_ = false; d_out_mat_desc.trans_ = false;
math::MatDescriptor y_mat_desc; pten::funcs::MatDescriptor y_mat_desc;
y_mat_desc.height_ = y_channel; y_mat_desc.height_ = y_channel;
y_mat_desc.width_ = h * w; y_mat_desc.width_ = h * w;
y_mat_desc.batch_size_ = batch_size; y_mat_desc.batch_size_ = batch_size;
...@@ -116,14 +116,14 @@ class FSPGradOpKernel : public framework::OpKernel<T> { ...@@ -116,14 +116,14 @@ class FSPGradOpKernel : public framework::OpKernel<T> {
h = x_dims[2]; h = x_dims[2];
w = x_dims[3]; w = x_dims[3];
math::MatDescriptor d_out_mat_desc; pten::funcs::MatDescriptor d_out_mat_desc;
d_out_mat_desc.height_ = y_channel; d_out_mat_desc.height_ = y_channel;
d_out_mat_desc.width_ = x_channel; d_out_mat_desc.width_ = x_channel;
d_out_mat_desc.batch_size_ = batch_size; d_out_mat_desc.batch_size_ = batch_size;
d_out_mat_desc.stride_ = x_channel * y_channel; d_out_mat_desc.stride_ = x_channel * y_channel;
d_out_mat_desc.trans_ = true; d_out_mat_desc.trans_ = true;
math::MatDescriptor x_mat_desc; pten::funcs::MatDescriptor x_mat_desc;
x_mat_desc.height_ = x_channel; x_mat_desc.height_ = x_channel;
x_mat_desc.width_ = h * w; x_mat_desc.width_ = h * w;
x_mat_desc.batch_size_ = batch_size; x_mat_desc.batch_size_ = batch_size;
......
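For the fsp_op hunks above, the only data structure touched is MatDescriptor, which tells blas.MatMul how to read a flat buffer as a batch of matrices. A sketch of how the fields fit together, mirroring the x_mat_desc setup in the hunk (variable names illustrative):

pten::funcs::MatDescriptor x_desc;
x_desc.height_     = channels;                   // rows of each matrix in the batch
x_desc.width_      = height * width;             // columns of each matrix
x_desc.batch_size_ = batch_size;                 // how many matrices are stacked
x_desc.stride_     = channels * height * width;  // element offset between consecutive matrices
x_desc.trans_      = false;                      // transpose before multiplying?
// blas.MatMul(x, x_desc, y, y_desc, alpha, out, beta) then issues one batched GEMM.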
...@@ -15,8 +15,8 @@ limitations under the License. */ ...@@ -15,8 +15,8 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/operators/fused/attn_bias_add.cu.h" #include "paddle/fluid/operators/fused/attn_bias_add.cu.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -47,7 +47,7 @@ class FeedForward { ...@@ -47,7 +47,7 @@ class FeedForward {
// column-major: (m,n,k) = output_size,bsz_seq,input_size (weight*input=out) // column-major: (m,n,k) = output_size,bsz_seq,input_size (weight*input=out)
// here: (m,n,k) = bsz_seq,output_size,input_size (input*weight=out) // here: (m,n,k) = bsz_seq,output_size,input_size (input*weight=out)
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_); auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_);
blas.GEMM(transA, transB, bsz_seq_, output_size_, input_size_, alpha, blas.GEMM(transA, transB, bsz_seq_, output_size_, input_size_, alpha,
input_data, weight_data, beta, output_data); input_data, weight_data, beta, output_data);
if (compute_bias_) { if (compute_bias_) {
...@@ -60,7 +60,7 @@ class FeedForward { ...@@ -60,7 +60,7 @@ class FeedForward {
T* d_weight, T* d_bias) { T* d_weight, T* d_bias) {
T alpha = static_cast<T>(1.0); T alpha = static_cast<T>(1.0);
T beta = static_cast<T>(0.0); T beta = static_cast<T>(0.0);
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_); auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_);
// column-major: gemm-nt, get d_weight. // column-major: gemm-nt, get d_weight.
CBLAS_TRANSPOSE transA = CblasTrans; CBLAS_TRANSPOSE transA = CblasTrans;
......
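The FeedForward forward pass above is a single row-major GEMM: out[bsz_seq, output_size] = input[bsz_seq, input_size] x weight[input_size, output_size]. A sketch with the (m, n, k) mapping made explicit (pointer names illustrative):

auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_);
blas.GEMM(CblasNoTrans, CblasNoTrans,
          /*m=*/bsz_seq, /*n=*/output_size, /*k=*/input_size,
          /*alpha=*/static_cast<T>(1.0), input_data, weight_data,
          /*beta=*/static_cast<T>(0.0), output_data);
// The backward pass reuses the same GEMM with CblasTrans operands:
// d_weight = input^T * d_out and d_input = d_out * weight^T.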
...@@ -11,8 +11,8 @@ limitations under the License. */ ...@@ -11,8 +11,8 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
...@@ -56,7 +56,7 @@ class AttnMatMul { ...@@ -56,7 +56,7 @@ class AttnMatMul {
T beta = static_cast<T>(0.0); T beta = static_cast<T>(0.0);
// here: (m, n, k) = bsz_seq, output_size, input_size, (input, weight, out) // here: (m, n, k) = bsz_seq, output_size, input_size, (input, weight, out)
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_); auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_);
blas.GEMM(transA, transB, bsz_seq_, output_size_, input_size_, alpha, blas.GEMM(transA, transB, bsz_seq_, output_size_, input_size_, alpha,
input->data<T>(), weight->data<T>(), beta, output->data<T>()); input->data<T>(), weight->data<T>(), beta, output->data<T>());
if (compute_bias_) { if (compute_bias_) {
...@@ -80,7 +80,7 @@ class AttnMatMul { ...@@ -80,7 +80,7 @@ class AttnMatMul {
framework::Tensor* d_bias) { framework::Tensor* d_bias) {
T alpha = static_cast<T>(1.0); T alpha = static_cast<T>(1.0);
T beta = static_cast<T>(0.0); T beta = static_cast<T>(0.0);
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_); auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_);
CBLAS_TRANSPOSE dB_transA = CblasNoTrans; CBLAS_TRANSPOSE dB_transA = CblasNoTrans;
CBLAS_TRANSPOSE dB_transB = CblasNoTrans; CBLAS_TRANSPOSE dB_transB = CblasNoTrans;
......
...@@ -99,7 +99,7 @@ class FMHARef { ...@@ -99,7 +99,7 @@ class FMHARef {
// q*k^t, batched_gemm // q*k^t, batched_gemm
CBLAS_TRANSPOSE transA = CblasNoTrans; CBLAS_TRANSPOSE transA = CblasNoTrans;
CBLAS_TRANSPOSE transB = CblasTrans; CBLAS_TRANSPOSE transB = CblasTrans;
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_); auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_);
int gemm_batch_size = batch_size_ * num_head_; int gemm_batch_size = batch_size_ * num_head_;
int gemm_m = seq_len_; int gemm_m = seq_len_;
int gemm_n = seq_len_; int gemm_n = seq_len_;
...@@ -174,7 +174,7 @@ class FMHARef { ...@@ -174,7 +174,7 @@ class FMHARef {
Tensor* softmax_out_grad_tensor, Tensor* src_mask_out_grad_tensor, Tensor* softmax_out_grad_tensor, Tensor* src_mask_out_grad_tensor,
Tensor* qk_out_grad_tensor, Tensor* transpose_2_out_grad_tensor, Tensor* qk_out_grad_tensor, Tensor* transpose_2_out_grad_tensor,
Tensor* src_mask_grad_tensor, Tensor* qkv_input_grad_tensor) { Tensor* src_mask_grad_tensor, Tensor* qkv_input_grad_tensor) {
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_); auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_);
int q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; int q_size = batch_size_ * seq_len_ * num_head_ * head_dim_;
int k_size = q_size; int k_size = q_size;
int softmax_axis = -1; int softmax_axis = -1;
......
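In the FMHARef hunks above, the attention scores q*k^T are computed with one strided batched GEMM over batch_size * num_head slices. A rough sketch, assuming the BatchedGEMM overload with explicit strides (the exact overload is an assumption; names illustrative):

int gemm_batch_size = batch_size * num_head;
int m = seq_len, n = seq_len, k = head_dim;
int64_t stride_a = static_cast<int64_t>(m) * k;  // one q slice per (batch, head)
int64_t stride_b = static_cast<int64_t>(k) * n;  // one k slice per (batch, head)
blas.BatchedGEMM(CblasNoTrans, CblasTrans, m, n, k, alpha,
                 q_ptr, k_ptr, beta, qk_out_ptr,
                 gemm_batch_size, stride_a, stride_b);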
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/math/bert_encoder_functor.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,10 +14,10 @@ limitations under the License. */ ...@@ -14,10 +14,10 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h" #include "paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h"
#include <string> #include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -364,7 +364,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> { ...@@ -364,7 +364,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
T* xx_data = xx->mutable_data<T>(place); T* xx_data = xx->mutable_data<T>(place);
T* h_out_data = hidden_out->mutable_data<T>(place); T* h_out_data = hidden_out->mutable_data<T>(place);
T* c_out_data = cell_out->mutable_data<T>(place); T* c_out_data = cell_out->mutable_data<T>(place);
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
for (int64_t i = 0; i < ids_numel; ++i) { for (int64_t i = 0; i < ids_numel; ++i) {
PADDLE_ENFORCE_LT( PADDLE_ENFORCE_LT(
...@@ -475,7 +475,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> { ...@@ -475,7 +475,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch; math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
for (int64_t i = 0; i < ids_numel; ++i) { for (int64_t i = 0; i < ids_numel; ++i) {
PADDLE_ENFORCE_LT( PADDLE_ENFORCE_LT(
......
...@@ -23,7 +23,7 @@ limitations under the License. */ ...@@ -23,7 +23,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -179,7 +179,7 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> { ...@@ -179,7 +179,7 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
const int m = batch_size * idx_width; const int m = batch_size * idx_width;
const int n = table_width; const int n = table_width;
const int k = table_height; const int k = table_height;
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context); auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
blas.CSRMM(&transa, &m, &n, &k, &alpha, matdescra, (const T *)csr_vals, blas.CSRMM(&transa, &m, &n, &k, &alpha, matdescra, (const T *)csr_vals,
(const int *)csr_colmuns, (const int *)csr_row_idx, (const int *)csr_colmuns, (const int *)csr_row_idx,
(const int *)csr_row_idx + 1, weights, &n, &beta, output, &n); (const int *)csr_row_idx + 1, weights, &n, &beta, output, &n);
...@@ -277,7 +277,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> { ...@@ -277,7 +277,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
csr_colmuns, csr_row_idx, padding_idx); csr_colmuns, csr_row_idx, padding_idx);
auto *d_output_data = d_output->data<T>(); auto *d_output_data = d_output->data<T>();
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context); auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
int width = static_cast<int>(table_dim[1]); int width = static_cast<int>(table_dim[1]);
int num_seq = batch_size * idx_width; int num_seq = batch_size * idx_width;
LOG(INFO) << "num seq = " << num_seq << " width = " << width; LOG(INFO) << "num seq = " << num_seq << " width = " << width;
......
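The CSRMM call above multiplies a sparse selection matrix (built from the lookup ids) by the dense embedding table. For reference, a sketch of the three CSR arrays it consumes, shown for a toy 2x3 matrix [[1,0,2],[0,3,0]] (values illustrative):

T   csr_vals[]    = {1, 2, 3};  // non-zero values, scanned row by row
int csr_colmuns[] = {0, 2, 1};  // column index of each non-zero
int csr_row_idx[] = {0, 2, 3};  // row i owns entries [csr_row_idx[i], csr_row_idx[i+1])
// CSRMM computes output[m,n] = alpha * sparse[m,k] * weights[k,n] + beta * output[m,n],
// which is why the kernel passes csr_row_idx and csr_row_idx + 1 as the two row pointers.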
...@@ -21,9 +21,9 @@ namespace cub = hipcub; ...@@ -21,9 +21,9 @@ namespace cub = hipcub;
#endif #endif
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -150,7 +150,7 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> { ...@@ -150,7 +150,7 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
T* out_data = out->mutable_data<T>(ctx.GetPlace()); T* out_data = out->mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
blas.GEMM(false, false, M, N, K, static_cast<T>(1.0), x_data, K, w_data, N, blas.GEMM(false, false, M, N, K, static_cast<T>(1.0), x_data, K, w_data, N,
static_cast<T>(0.0), out_data, N); static_cast<T>(0.0), out_data, N);
......
...@@ -17,8 +17,8 @@ limitations under the License. */ ...@@ -17,8 +17,8 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/matmul_v2_op.h" #include "paddle/fluid/operators/matmul_v2_op.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -49,8 +49,8 @@ class FusedFeedForwardOp : public framework::OperatorWithKernel { ...@@ -49,8 +49,8 @@ class FusedFeedForwardOp : public framework::OperatorWithKernel {
"fused_feedforward"); "fused_feedforward");
auto dim_x = context->GetInputDim("X"); auto dim_x = context->GetInputDim("X");
auto mat_dim_x = auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(
math::CreateMatrixDescriptor(RowMatrixFromVector(dim_x), 0, false); RowMatrixFromVector(dim_x), 0, false);
// verify for the pre layer_norm, the feature size must be larger than 1 // verify for the pre layer_norm, the feature size must be larger than 1
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
mat_dim_x.width_, static_cast<size_t>(1), mat_dim_x.width_, static_cast<size_t>(1),
......
...@@ -14,8 +14,8 @@ limitations under the License. */ ...@@ -14,8 +14,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/matmul_v2_op.h" #include "paddle/fluid/operators/matmul_v2_op.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h"
...@@ -32,11 +32,11 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> { ...@@ -32,11 +32,11 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
void MatMul(const platform::CUDADeviceContext& ctx, void MatMul(const platform::CUDADeviceContext& ctx,
const framework::Tensor& a, const framework::Tensor& b, const framework::Tensor& a, const framework::Tensor& b,
framework::Tensor* c) const { framework::Tensor* c) const {
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
auto a_2d = FoldInitDims(a); auto a_2d = FoldInitDims(a);
auto b_2d = FoldInitDims(b); auto b_2d = FoldInitDims(b);
auto mat_dim_a = math::CreateMatrixDescriptor(a_2d.dims(), 0, false); auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(a_2d.dims(), 0, false);
auto mat_dim_b = math::CreateMatrixDescriptor(b_2d.dims(), 0, false); auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(b_2d.dims(), 0, false);
T alpha = static_cast<T>(1.0); T alpha = static_cast<T>(1.0);
blas.MatMul(a, mat_dim_a, b, mat_dim_b, alpha, c, T(0)); blas.MatMul(a, mat_dim_a, b, mat_dim_b, alpha, c, T(0));
} }
...@@ -173,8 +173,8 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> { ...@@ -173,8 +173,8 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
dropout2_out->mutable_data<T>(place); dropout2_out->mutable_data<T>(place);
auto x_dim = x->dims(); auto x_dim = x->dims();
auto mat_dim_x = auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(
math::CreateMatrixDescriptor(RowMatrixFromVector(x_dim), 0, false); RowMatrixFromVector(x_dim), 0, false);
auto dim = linear1_weight->dims(); auto dim = linear1_weight->dims();
int d_model = dim[0]; int d_model = dim[0];
...@@ -197,12 +197,13 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> { ...@@ -197,12 +197,13 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
const framework::Tensor& d_out, const framework::Tensor& a, const framework::Tensor& d_out, const framework::Tensor& a,
const framework::Tensor& b, framework::Tensor* d_a, const framework::Tensor& b, framework::Tensor* d_a,
framework::Tensor* d_b) const { framework::Tensor* d_b) const {
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
auto a_2d = FoldInitDims(a); auto a_2d = FoldInitDims(a);
auto b_2d = FoldInitDims(b); auto b_2d = FoldInitDims(b);
auto mat_dim_a = math::CreateMatrixDescriptor(a_2d.dims(), 0, true); auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(a_2d.dims(), 0, true);
auto mat_dim_b = math::CreateMatrixDescriptor(b_2d.dims(), 0, true); auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(b_2d.dims(), 0, true);
auto mat_dim_dout = math::CreateMatrixDescriptor(d_out.dims(), 0, false); auto mat_dim_dout =
pten::funcs::CreateMatrixDescriptor(d_out.dims(), 0, false);
T alpha = static_cast<T>(1.0); T alpha = static_cast<T>(1.0);
blas.MatMul(d_out, mat_dim_dout, b, mat_dim_b, alpha, d_a, T(0)); blas.MatMul(d_out, mat_dim_dout, b, mat_dim_b, alpha, d_a, T(0));
blas.MatMul(a, mat_dim_a, d_out, mat_dim_dout, alpha, d_b, T(0)); blas.MatMul(a, mat_dim_a, d_out, mat_dim_dout, alpha, d_b, T(0));
...@@ -403,8 +404,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> { ...@@ -403,8 +404,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
d_linear2_weight->mutable_data<T>(place); d_linear2_weight->mutable_data<T>(place);
auto x_dim = x.dims(); auto x_dim = x.dims();
auto mat_dim_x = auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(
math::CreateMatrixDescriptor(RowMatrixFromVector(x_dim), 0, false); RowMatrixFromVector(x_dim), 0, false);
auto linear1_weight_dim = linear1_weight.dims(); auto linear1_weight_dim = linear1_weight.dims();
int d_model = linear1_weight_dim[0]; int d_model = linear1_weight_dim[0];
......
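The MatMulGrad helper above leans on the standard identities for C = A*B: dL/dA = (dL/dC) * B^T and dL/dB = A^T * (dL/dC). Rather than materializing transposes, the kernel flags them in the descriptors; a sketch of the grad path shown in the hunk:

// trans = true folds the transpose into the GEMM call, so no transpose
// kernel ever runs.
auto mat_dim_a    = pten::funcs::CreateMatrixDescriptor(a_2d.dims(), 0, /*trans=*/true);
auto mat_dim_b    = pten::funcs::CreateMatrixDescriptor(b_2d.dims(), 0, /*trans=*/true);
auto mat_dim_dout = pten::funcs::CreateMatrixDescriptor(d_out.dims(), 0, /*trans=*/false);
blas.MatMul(d_out, mat_dim_dout, b, mat_dim_b, alpha, d_a, T(0));  // d_a = d_out * b^T
blas.MatMul(a, mat_dim_a, d_out, mat_dim_dout, alpha, d_b, T(0));  // d_b = a^T * d_out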
...@@ -18,9 +18,9 @@ limitations under the License. */ ...@@ -18,9 +18,9 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/math/fc.h"
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#endif #endif
...@@ -295,7 +295,7 @@ class FusionGRUKernel : public framework::OpKernel<T> { ...@@ -295,7 +295,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
const T* h0_data = h0 ? h0->data<T>() : nullptr; const T* h0_data = h0 ? h0->data<T>() : nullptr;
const T* wh_state_data = wh_data + D * D2; const T* wh_state_data = wh_data + D * D2;
T* hidden_out_data = hidden_out->mutable_data<T>(place); T* hidden_out_data = hidden_out->mutable_data<T>(place);
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::FCFunctor<DeviceContext, T> fc; math::FCFunctor<DeviceContext, T> fc;
...@@ -367,7 +367,7 @@ class FusionGRUKernel : public framework::OpKernel<T> { ...@@ -367,7 +367,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
T* batched_out_data = batched_out->mutable_data<T>(place); T* batched_out_data = batched_out->mutable_data<T>(place);
hidden_out->mutable_data<T>(place); hidden_out->mutable_data<T>(place);
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch; math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
math::FCFunctor<DeviceContext, T> fc; math::FCFunctor<DeviceContext, T> fc;
......
...@@ -15,9 +15,9 @@ limitations under the License. */ ...@@ -15,9 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_lstm_op.h" #include "paddle/fluid/operators/fused/fusion_lstm_op.h"
#include <string> #include <string>
#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/math/fc.h"
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#endif #endif
...@@ -343,7 +343,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -343,7 +343,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
T* xx_data = xx->mutable_data<T>(place); T* xx_data = xx->mutable_data<T>(place);
T* h_out_data = hidden_out->mutable_data<T>(place); T* h_out_data = hidden_out->mutable_data<T>(place);
T* c_out_data = cell_out->mutable_data<T>(place); T* c_out_data = cell_out->mutable_data<T>(place);
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::FCFunctor<DeviceContext, T> fc; math::FCFunctor<DeviceContext, T> fc;
...@@ -423,7 +423,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -423,7 +423,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch; math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
math::FCFunctor<DeviceContext, T> fc; math::FCFunctor<DeviceContext, T> fc;
if (M > D4) { if (M > D4) {
fc(dev_ctx, x_dims[0], D4, M, x_data, wx_data, xx_data, bias->data<T>()); fc(dev_ctx, x_dims[0], D4, M, x_data, wx_data, xx_data, bias->data<T>());
......
...@@ -15,8 +15,8 @@ limitations under the License. */ ...@@ -15,8 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h" #include "paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h"
#include <algorithm> // for min, max #include <algorithm> // for min, max
#include <string> #include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/math/fc.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,10 +14,10 @@ limitations under the License. */ ...@@ -14,10 +14,10 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h" #include "paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h"
#include <string> #include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/math/fc.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -209,7 +209,7 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> { ...@@ -209,7 +209,7 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
T* out_data = out->mutable_data<T>(ctx.GetPlace()); T* out_data = out->mutable_data<T>(ctx.GetPlace());
T* fc_out_data = fc_out->mutable_data<T>(ctx.GetPlace()); T* fc_out_data = fc_out->mutable_data<T>(ctx.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>(); auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
math::FCFunctor<DeviceContext, T> fc; math::FCFunctor<DeviceContext, T> fc;
......
...@@ -18,9 +18,9 @@ limitations under the License. */ ...@@ -18,9 +18,9 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/math/fc.h"
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#endif #endif
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/math/bert_encoder_functor.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -211,7 +211,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> { ...@@ -211,7 +211,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
auto *temp_out_data = temp_out_tensor.mutable_data<T>(context.GetPlace()); auto *temp_out_data = temp_out_tensor.mutable_data<T>(context.GetPlace());
// (B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H) // (B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H)
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(device_ctx); auto blas =
pten::funcs::GetBlas<platform::CUDADeviceContext, T>(device_ctx);
blas.MatMul(input_matrix, w_matrix, &temp_out_tensor); blas.MatMul(input_matrix, w_matrix, &temp_out_tensor);
// temp_out_tensor.Resize(temp_out_dims); // temp_out_tensor.Resize(temp_out_dims);
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/math/bert_encoder_functor.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -20,8 +20,8 @@ limitations under the License. */ ...@@ -20,8 +20,8 @@ limitations under the License. */
#include <cmath> #include <cmath>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
...@@ -63,13 +63,13 @@ struct GeluFunctor { ...@@ -63,13 +63,13 @@ struct GeluFunctor {
int n = std::min(x.size(), out.size()); int n = std::min(x.size(), out.size());
std::memset(out_data, 0, n * sizeof(T)); std::memset(out_data, 0, n * sizeof(T));
math::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1, out_data, pten::funcs::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1,
1); out_data, 1);
math::CBlas<T>::VMERF(n, out_data, out_data, VML_LA); pten::funcs::CBlas<T>::VMERF(n, out_data, out_data, VML_LA);
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
out_data[i] += static_cast<T>(1); out_data[i] += static_cast<T>(1);
} }
math::CBlas<T>::VMUL(n, x_data, out_data, out_data); pten::funcs::CBlas<T>::VMUL(n, x_data, out_data, out_data);
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
out_data[i] *= static_cast<T>(0.5); out_data[i] *= static_cast<T>(0.5);
} }
...@@ -138,24 +138,25 @@ struct GeluGradFunctor { ...@@ -138,24 +138,25 @@ struct GeluGradFunctor {
std::memset(second, 0, n * sizeof(T)); std::memset(second, 0, n * sizeof(T));
// first = (0.5 * (1 + erf(x / sqrt(2)))) // first = (0.5 * (1 + erf(x / sqrt(2))))
math::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1, first, 1); pten::funcs::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1,
math::CBlas<T>::VMERF(n, first, first, VML_LA); first, 1);
pten::funcs::CBlas<T>::VMERF(n, first, first, VML_LA);
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
first[i] += static_cast<T>(1); first[i] += static_cast<T>(1);
} }
math::CBlas<T>::SCAL(n, static_cast<T>(0.5), first, 1); pten::funcs::CBlas<T>::SCAL(n, static_cast<T>(0.5), first, 1);
// second = (0.5 * 2/sqrt(pi) * 1/sqrt(2) * x * exp(-0.5 * x^2)) // second = (0.5 * 2/sqrt(pi) * 1/sqrt(2) * x * exp(-0.5 * x^2))
math::CBlas<T>::VSQUARE(n, x_data, second); pten::funcs::CBlas<T>::VSQUARE(n, x_data, second);
math::CBlas<T>::SCAL(n, -static_cast<T>(0.5), second, 1); pten::funcs::CBlas<T>::SCAL(n, -static_cast<T>(0.5), second, 1);
math::CBlas<T>::VEXP(n, second, second); pten::funcs::CBlas<T>::VEXP(n, second, second);
math::CBlas<T>::VMUL(n, x_data, second, second); pten::funcs::CBlas<T>::VMUL(n, x_data, second, second);
math::CBlas<T>::SCAL(n, static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2), pten::funcs::CBlas<T>::SCAL(
second, 1); n, static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2), second, 1);
// dx = dout * (first + second); // dx = dout * (first + second);
math::CBlas<T>::VADD(n, first, second, first); pten::funcs::CBlas<T>::VADD(n, first, second, first);
math::CBlas<T>::VMUL(n, dout_data, first, dx_data); pten::funcs::CBlas<T>::VMUL(n, dout_data, first, dx_data);
std::free(first); std::free(first);
std::free(second); std::free(second);
......
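The GeluFunctor/GeluGradFunctor hunks above evaluate the exact erf-based GELU with vectorized CBlas primitives (AXPY scales by 1/sqrt(2), VMERF applies erf, and so on). A scalar reference for what the pipeline computes, as a sketch (requires <cmath>; this is not the kernel itself):

#include <cmath>

// GELU(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
double gelu(double x) { return 0.5 * x * (1.0 + std::erf(x * M_SQRT1_2)); }

// GELU'(x) = 0.5 * (1 + erf(x / sqrt(2))) + x * exp(-0.5 * x^2) / sqrt(2 * pi);
// note 0.5 * M_2_SQRTPI * M_SQRT1_2 == 1 / sqrt(2 * pi), matching the SCAL above.
double gelu_grad(double x) {
  double first  = 0.5 * (1.0 + std::erf(x * M_SQRT1_2));
  double second = x * 0.5 * M_2_SQRTPI * M_SQRT1_2 * std::exp(-0.5 * x * x);
  return first + second;  // the kernel then computes dx = dout * (first + second)
}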
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/pten/kernels/funcs/math_function.h"
namespace paddle { namespace paddle {
......
...@@ -15,9 +15,9 @@ limitations under the License. */ ...@@ -15,9 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/gru_op.h" #include "paddle/fluid/operators/gru_op.h"
#include <memory> #include <memory>
#include <string> #include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h" #include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h" #include "paddle/fluid/operators/math/detail/gru_kernel.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
DECLARE_int32(paddle_num_threads); DECLARE_int32(paddle_num_threads);
...@@ -355,7 +355,7 @@ class GRUCPUKernel : public framework::OpKernel<T> { ...@@ -355,7 +355,7 @@ class GRUCPUKernel : public framework::OpKernel<T> {
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
// use MKL packed to speed up GEMM // use MKL packed to speed up GEMM
if (FLAGS_paddle_num_threads >= 4) { if (FLAGS_paddle_num_threads >= 4) {
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/, T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
frame_size * 2 /*width of weight*/, frame_size * 2 /*width of weight*/,
frame_size /*height of weight*/); frame_size /*height of weight*/);
......
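The gru_op fast path above packs the weight matrix once so the per-step GEMMs skip MKL's internal layout conversion. GEMM_ALLOC is visible in the hunk; the sketch below assumes its usual companions GEMM_PACK / GEMM_COMPUTE / GEMM_FREE from the same Blas wrapper (verify the exact signatures against blas.h before relying on them):

auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
// Reserve MKL-internal storage for a packed copy of the gate weights.
T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
                                 frame_size * 2 /*width of weight*/,
                                 frame_size /*height of weight*/);
// Convert the layout once; every subsequent step-GEMM reads the packed copy.
blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1, frame_size * 2, frame_size,
               T(1), weight_data, frame_size * 2, packed_gate);
// ... per-timestep GEMM_COMPUTE calls against packed_gate ...
blas.GEMM_FREE(packed_gate);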
...@@ -17,8 +17,8 @@ limitations under the License. */ ...@@ -17,8 +17,8 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -87,7 +87,7 @@ class GRUUnitKernel : public framework::OpKernel<T> { ...@@ -87,7 +87,7 @@ class GRUUnitKernel : public framework::OpKernel<T> {
const T* weight_data = weight->data<T>(); const T* weight_data = weight->data<T>();
T* gate_data = gate->data<T>(); T* gate_data = gate->data<T>();
T* reset_hidden_prev_data = reset_hidden_prev->data<T>(); T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
auto blas = math::GetBlas<DeviceContext, T>(context); auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
blas.GEMM(false, false, batch_size, 2 * frame_size, frame_size, 1, blas.GEMM(false, false, batch_size, 2 * frame_size, frame_size, 1,
hidden_prev_data, frame_size, weight_data, frame_size * 2, 1, hidden_prev_data, frame_size, weight_data, frame_size * 2, 1,
gate_data, frame_size * 3); gate_data, frame_size * 3);
...@@ -204,7 +204,7 @@ class GRUUnitGradKernel : public framework::OpKernel<T> { ...@@ -204,7 +204,7 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
d_g.slice(c_offsets, extents), d_h * u); d_g.slice(c_offsets, extents), d_h * u);
} }
// backward for reset_hidden_prev // backward for reset_hidden_prev
auto blas = math::GetBlas<DeviceContext, T>(context); auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
blas.GEMM(false, true, batch_size, frame_size, frame_size, 1, blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
gate_grad_data + frame_size * 2, frame_size * 3, gate_grad_data + frame_size * 2, frame_size * 3,
weight_data + frame_size * frame_size * 2, frame_size, 0, weight_data + frame_size * frame_size * 2, frame_size, 0,
......
...@@ -166,7 +166,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> { ...@@ -166,7 +166,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
// softrelu derivative // softrelu derivative
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
auto* pre_out_grad_data = pre_out_grad.data<T>(); auto* pre_out_grad_data = pre_out_grad.data<T>();
auto* pre_out_data = pre_out.template data<T>(); auto* pre_out_data = pre_out.template data<T>();
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
#pragma once #pragma once
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/pten/kernels/funcs/math_function.h"
namespace paddle { namespace paddle {
...@@ -141,7 +141,7 @@ struct IndexSelectAdd< ...@@ -141,7 +141,7 @@ struct IndexSelectAdd<
typename std::enable_if<std::is_floating_point<T>::value>::type> { typename std::enable_if<std::is_floating_point<T>::value>::type> {
void operator()(const framework::ExecutionContext& ctx, int slice_size, void operator()(const framework::ExecutionContext& ctx, int slice_size,
const T* src_pointer, const T* p_pointer, T* dist_pointer) { const T* src_pointer, const T* p_pointer, T* dist_pointer) {
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
blas.VADD(slice_size, src_pointer, p_pointer, dist_pointer); blas.VADD(slice_size, src_pointer, p_pointer, dist_pointer);
} }
}; };
......
...@@ -15,8 +15,8 @@ limitations under the License. */ ...@@ -15,8 +15,8 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/matrix_inverse.h" #include "paddle/fluid/operators/math/matrix_inverse.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -48,19 +48,22 @@ class InverseGradKernel : public framework::OpKernel<T> { ...@@ -48,19 +48,22 @@ class InverseGradKernel : public framework::OpKernel<T> {
if (a_grad) { if (a_grad) {
a_grad->mutable_data<T>(context.GetPlace()); a_grad->mutable_data<T>(context.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(context); auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
auto& dev_ctx = context.template device_context<DeviceContext>(); auto& dev_ctx = context.template device_context<DeviceContext>();
framework::Tensor tmp_out = framework::Tensor tmp_out =
context.AllocateTmpTensor<T, DeviceContext>(a_inv->dims(), dev_ctx); context.AllocateTmpTensor<T, DeviceContext>(a_inv->dims(), dev_ctx);
auto mat_dim_a0 = auto mat_dim_a0 =
math::CreateMatrixDescriptor(a_inv_grad->dims(), 0, false); pten::funcs::CreateMatrixDescriptor(a_inv_grad->dims(), 0, false);
auto mat_dim_b0 = math::CreateMatrixDescriptor(a_inv->dims(), 0, true); auto mat_dim_b0 =
pten::funcs::CreateMatrixDescriptor(a_inv->dims(), 0, true);
blas.MatMul(*a_inv_grad, mat_dim_a0, *a_inv, mat_dim_b0, T(1), &tmp_out, blas.MatMul(*a_inv_grad, mat_dim_a0, *a_inv, mat_dim_b0, T(1), &tmp_out,
T(0)); T(0));
auto mat_dim_a1 = math::CreateMatrixDescriptor(a_inv->dims(), 0, true); auto mat_dim_a1 =
auto mat_dim_b1 = math::CreateMatrixDescriptor(tmp_out.dims(), 0, false); pten::funcs::CreateMatrixDescriptor(a_inv->dims(), 0, true);
auto mat_dim_b1 =
pten::funcs::CreateMatrixDescriptor(tmp_out.dims(), 0, false);
blas.MatMul(*a_inv, mat_dim_a1, tmp_out, mat_dim_b1, T(-1), a_grad, T(0)); blas.MatMul(*a_inv, mat_dim_a1, tmp_out, mat_dim_b1, T(-1), a_grad, T(0));
} }
} }
......
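The InverseGradKernel above implements the matrix-inverse gradient. From d(A^{-1}) = -A^{-1} dA A^{-1} it follows that

\frac{\partial L}{\partial A} = -A^{-\top}\,\frac{\partial L}{\partial A^{-1}}\,A^{-\top}

which is exactly the two MatMul calls in the hunk: tmp_out = (dL/dA^{-1}) * A^{-T} with alpha = 1, then a_grad = A^{-T} * tmp_out with alpha = -1, the transposes again carried by the descriptors rather than materialized.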
...@@ -20,7 +20,7 @@ limitations under the License. */ ...@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ #if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \
!defined(__OSX__) !defined(__OSX__)
#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/jit/kernels.h"
...@@ -61,7 +61,7 @@ class RowwiseMean2D<platform::CUDADeviceContext, T> { ...@@ -61,7 +61,7 @@ class RowwiseMean2D<platform::CUDADeviceContext, T> {
} }
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) { const framework::Tensor& input, framework::Tensor* out) {
math::GetBlas<platform::CUDADeviceContext, T>(context).GEMV( pten::funcs::GetBlas<platform::CUDADeviceContext, T>(context).GEMV(
false, left_, right_, 1., input.data<T>(), divisor_.data<T>(), 0., false, left_, right_, 1., input.data<T>(), divisor_.data<T>(), 0.,
out->data<T>()); out->data<T>());
} }
...@@ -108,7 +108,7 @@ class ColwiseSum2D<platform::CUDADeviceContext, T> { ...@@ -108,7 +108,7 @@ class ColwiseSum2D<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) { const framework::Tensor& input, framework::Tensor* out) {
math::GetBlas<platform::CUDADeviceContext, T>(context).GEMV( pten::funcs::GetBlas<platform::CUDADeviceContext, T>(context).GEMV(
true, left_, right_, 1., input.data<T>(), divisor_.data<T>(), 0., true, left_, right_, 1., input.data<T>(), divisor_.data<T>(), 0.,
out->data<T>()); out->data<T>());
} }
......
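RowwiseMean2D and ColwiseSum2D above express whole reductions as a single GEMV against a constant divisor_ vector, presumably filled with 1/right_ (respectively 1) at construction; that initialization is outside the hunk, so treat it as an assumption. A sketch of the row-mean case (names illustrative):

// y = 1.0 * X * d + 0.0 * y  with  d_j = 1 / right   =>   y_i = mean of row i of X
pten::funcs::GetBlas<platform::CUDADeviceContext, T>(context).GEMV(
    /*trans=*/false, left, right, static_cast<T>(1.0),
    x_data, divisor_data, static_cast<T>(0.0), mean_out_data);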
...@@ -22,7 +22,7 @@ limitations under the License. */ ...@@ -22,7 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -109,8 +109,8 @@ class LookupTableKernel : public framework::OpKernel<T> { ...@@ -109,8 +109,8 @@ class LookupTableKernel : public framework::OpKernel<T> {
memcpy(output + i * row_width, table + id_index * row_width, memcpy(output + i * row_width, table + id_index * row_width,
row_width * sizeof(T)); row_width * sizeof(T));
} else { } else {
auto blas = auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(
math::GetBlas<platform::CPUDeviceContext, T>(context); context);
blas.VCOPY(row_width, table + id_index * row_width, blas.VCOPY(row_width, table + id_index * row_width,
output + i * row_width); output + i * row_width);
} }
...@@ -137,7 +137,8 @@ class LookupTableKernel : public framework::OpKernel<T> { ...@@ -137,7 +137,8 @@ class LookupTableKernel : public framework::OpKernel<T> {
memcpy(output + i * row_width, table + id_index * row_width, memcpy(output + i * row_width, table + id_index * row_width,
row_width * sizeof(T)); row_width * sizeof(T));
} else { } else {
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context); auto blas =
pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
blas.VCOPY(row_width, table + id_index * row_width, blas.VCOPY(row_width, table + id_index * row_width,
output + i * row_width); output + i * row_width);
} }
......
...@@ -22,7 +22,7 @@ limitations under the License. */ ...@@ -22,7 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -124,7 +124,8 @@ struct LookupTableV2CPUFunctor { ...@@ -124,7 +124,8 @@ struct LookupTableV2CPUFunctor {
memcpy(output + i * row_width, table + id_index * row_width, memcpy(output + i * row_width, table + id_index * row_width,
row_width * sizeof(T)); row_width * sizeof(T));
} else { } else {
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context_); auto blas =
pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context_);
blas.VCOPY(row_width, table + id_index * row_width, blas.VCOPY(row_width, table + id_index * row_width,
output + i * row_width); output + i * row_width);
} }
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/pten/kernels/funcs/math_function.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
...@@ -35,7 +35,7 @@ struct LRNFunctor<platform::CPUDeviceContext, T> { ...@@ -35,7 +35,7 @@ struct LRNFunctor<platform::CPUDeviceContext, T> {
framework::Tensor* mid, int N, int C, int H, int W, int n, framework::Tensor* mid, int N, int C, int H, int W, int n,
T k, T alpha, T beta, const DataLayout data_layout) { T k, T alpha, T beta, const DataLayout data_layout) {
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(ctx);
pten::funcs::Transpose<platform::CPUDeviceContext, T, 4> transpose; pten::funcs::Transpose<platform::CPUDeviceContext, T, 4> transpose;
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>(); auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
Tensor in_transpose, mid_transpose, out_transpose; Tensor in_transpose, mid_transpose, out_transpose;
......
...@@ -15,10 +15,10 @@ limitations under the License. */ ...@@ -15,10 +15,10 @@ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -128,7 +128,7 @@ class LSTMKernel : public framework::OpKernel<T> { ...@@ -128,7 +128,7 @@ class LSTMKernel : public framework::OpKernel<T> {
auto cand_act = math::detail::GetActivationType( auto cand_act = math::detail::GetActivationType(
ctx.Attr<std::string>("candidate_activation")); ctx.Attr<std::string>("candidate_activation"));
auto blas = math::GetBlas<DeviceContext, T>(device_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(device_ctx);
for (size_t n = 0; n < num_batch; n++) { for (size_t n = 0; n < num_batch; n++) {
int bstart = static_cast<int>(batch_starts[n]); int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]); int bend = static_cast<int>(batch_starts[n + 1]);
...@@ -302,7 +302,7 @@ class LSTMGradKernel : public framework::OpKernel<T> { ...@@ -302,7 +302,7 @@ class LSTMGradKernel : public framework::OpKernel<T> {
auto batch_starts = batch_gate->lod()[0]; auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1; size_t num_batch = batch_starts.size() - 1;
auto blas = math::GetBlas<DeviceContext, T>(device_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(device_ctx);
for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) { for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
int bstart = static_cast<int>(batch_starts[n]); int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]); int bend = static_cast<int>(batch_starts[n + 1]);
......
...@@ -18,12 +18,12 @@ limitations under the License. */ ...@@ -18,12 +18,12 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/transform.h" #include "paddle/fluid/platform/transform.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -185,7 +185,7 @@ class LSTMPKernel : public framework::OpKernel<T> { ...@@ -185,7 +185,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
auto proj_act = math::detail::GetActivationType( auto proj_act = math::detail::GetActivationType(
ctx.Attr<std::string>("proj_activation")); ctx.Attr<std::string>("proj_activation"));
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto blas = math::GetBlas<DeviceContext, T>(device_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(device_ctx);
for (size_t n = 0; n < num_batch; n++) { for (size_t n = 0; n < num_batch; n++) {
int bstart = static_cast<int>(batch_starts[n]); int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]); int bend = static_cast<int>(batch_starts[n + 1]);
...@@ -405,7 +405,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> { ...@@ -405,7 +405,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
auto batch_starts = batch_gate->lod()[0]; auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1; size_t num_batch = batch_starts.size() - 1;
auto blas = math::GetBlas<DeviceContext, T>(device_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(device_ctx);
for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) { for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
int bstart = static_cast<int>(batch_starts[n]); int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]); int bend = static_cast<int>(batch_starts[n + 1]);
......
@@ -19,13 +19,13 @@
#include <complex>
#include "paddle/fluid/operators/eig_op.h"
#include "paddle/fluid/operators/math/eigen_values_vectors.h"
-#include "paddle/fluid/operators/math/lapack_function.h"
#include "paddle/fluid/operators/math/matrix_solve.h"
#include "paddle/fluid/operators/svd_helper.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/operators/triangular_solve_op.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/pten/kernels/funcs/complex_functors.h"
+#include "paddle/pten/kernels/funcs/lapack/lapack_function.h"
#include "paddle/pten/kernels/funcs/math_function.h"
#define EPSILON 1e-6
@@ -153,20 +153,21 @@ class LstsqCPUKernel : public framework::OpKernel<T> {
    int iwkopt = 0;
    if (driver == LapackDriverType::Gels) {
-     math::lapackGels('N', m, n, nrhs, x_vector, lda, y_vector, ldb, &wkopt,
-                      lwork, &info);
+     pten::funcs::lapackGels('N', m, n, nrhs, x_vector, lda, y_vector, ldb,
+                             &wkopt, lwork, &info);
    } else if (driver == LapackDriverType::Gelsd) {
-     math::lapackGelsd(m, n, nrhs, x_vector, lda, y_vector, ldb, s_working_ptr,
-                       static_cast<ValueType>(rcond), &rank_32, &wkopt, lwork,
-                       &rwkopt, &iwkopt, &info);
+     pten::funcs::lapackGelsd(m, n, nrhs, x_vector, lda, y_vector, ldb,
+                              s_working_ptr, static_cast<ValueType>(rcond),
+                              &rank_32, &wkopt, lwork, &rwkopt, &iwkopt,
+                              &info);
    } else if (driver == LapackDriverType::Gelsy) {
-     math::lapackGelsy(m, n, nrhs, x_vector, lda, y_vector, ldb, jpvt_data,
-                       static_cast<ValueType>(rcond), &rank_32, &wkopt, lwork,
-                       &rwkopt, &info);
+     pten::funcs::lapackGelsy(m, n, nrhs, x_vector, lda, y_vector, ldb,
+                              jpvt_data, static_cast<ValueType>(rcond),
+                              &rank_32, &wkopt, lwork, &rwkopt, &info);
    } else if (driver == LapackDriverType::Gelss) {
-     math::lapackGelss(m, n, nrhs, x_vector, lda, y_vector, ldb, s_working_ptr,
-                       static_cast<ValueType>(rcond), &rank_32, &wkopt, lwork,
-                       &rwkopt, &info);
+     pten::funcs::lapackGelss(m, n, nrhs, x_vector, lda, y_vector, ldb,
+                              s_working_ptr, static_cast<ValueType>(rcond),
+                              &rank_32, &wkopt, lwork, &rwkopt, &info);
    }
    lwork = std::max<int>(1, static_cast<int>(pten::funcs::Real<T>(wkopt)));
@@ -206,20 +207,21 @@ class LstsqCPUKernel : public framework::OpKernel<T> {
      s_working_ptr = s_working_ptr ? &s_data[i * s_stride] : nullptr;
      if (driver == LapackDriverType::Gels) {
-       math::lapackGels('N', m, n, nrhs, x_input, lda, y_input, ldb, work_data,
-                        lwork, &info);
+       pten::funcs::lapackGels('N', m, n, nrhs, x_input, lda, y_input, ldb,
+                               work_data, lwork, &info);
      } else if (driver == LapackDriverType::Gelsd) {
-       math::lapackGelsd(m, n, nrhs, x_input, lda, y_input, ldb, s_working_ptr,
-                         static_cast<ValueType>(rcond), &rank_32, work_data,
-                         lwork, rwork_data, iwork_data, &info);
+       pten::funcs::lapackGelsd(m, n, nrhs, x_input, lda, y_input, ldb,
+                                s_working_ptr, static_cast<ValueType>(rcond),
+                                &rank_32, work_data, lwork, rwork_data,
+                                iwork_data, &info);
      } else if (driver == LapackDriverType::Gelsy) {
-       math::lapackGelsy(m, n, nrhs, x_input, lda, y_input, ldb, jpvt_data,
-                         static_cast<ValueType>(rcond), &rank_32, work_data,
-                         lwork, rwork_data, &info);
+       pten::funcs::lapackGelsy(m, n, nrhs, x_input, lda, y_input, ldb,
+                                jpvt_data, static_cast<ValueType>(rcond),
+                                &rank_32, work_data, lwork, rwork_data, &info);
      } else if (driver == LapackDriverType::Gelss) {
-       math::lapackGelss(m, n, nrhs, x_input, lda, y_input, ldb, s_working_ptr,
-                         static_cast<ValueType>(rcond), &rank_32, work_data,
-                         lwork, rwork_data, &info);
+       pten::funcs::lapackGelss(m, n, nrhs, x_input, lda, y_input, ldb,
+                                s_working_ptr, static_cast<ValueType>(rcond),
+                                &rank_32, work_data, lwork, rwork_data, &info);
      }
      PADDLE_ENFORCE_EQ(
...
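The Lstsq hunks keep LAPACK's two-phase workspace convention: the first call is a workspace query (LAPACK treats lwork == -1 as "report the optimal size in wkopt"), and the real solve runs only after the work buffer is allocated. A sketch of that convention against the relocated wrapper, assuming this commit's headers (the function name and data are illustrative):

#include <algorithm>
#include <vector>
#include "paddle/pten/kernels/funcs/lapack/lapack_function.h"

void GelsExample(int m, int n, int nrhs, double* a, int lda, double* b,
                 int ldb) {
  int info = 0;
  int lwork = -1;      // -1 = workspace query only
  double wkopt = 0.0;  // receives the optimal workspace size
  pten::funcs::lapackGels('N', m, n, nrhs, a, lda, b, ldb, &wkopt, lwork,
                          &info);
  lwork = std::max(1, static_cast<int>(wkopt));
  std::vector<double> work(lwork);
  // Second call performs the actual least-squares solve, in place in b.
  pten::funcs::lapackGels('N', m, n, nrhs, a, lda, b, ldb, work.data(), lwork,
                          &info);
}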
@@ -142,8 +142,8 @@ class LUKernel : public framework::OpKernel<T> {
      auto out_data_item = &out_data[b * m * n];
      int *info_data_item = &info_data[b];
      int *ipiv_data_item = &ipiv_data[b * std::min(m, n)];
-     math::lapackLu<T>(m, n, out_data_item, lda, ipiv_data_item,
-                       info_data_item);
+     pten::funcs::lapackLu<T>(m, n, out_data_item, lda, ipiv_data_item,
+                              info_data_item);
    }
    *out = helper.Transpose(*out);
  }
...
@@ -15,11 +15,11 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/lapack_function.h"
#include "paddle/fluid/operators/set_value_op.h"
#include "paddle/fluid/operators/svd_helper.h"
#include "paddle/fluid/operators/triangular_solve_op.h"
#include "paddle/fluid/operators/tril_triu_op.h"
+#include "paddle/pten/kernels/funcs/lapack/lapack_function.h"
#include "paddle/pten/kernels/math_kernel.h"
namespace paddle {
@@ -489,7 +489,7 @@ class LUGradKernel : public framework::OpKernel<T> {
    const auto& dev_ctx = ctx.template device_context<DeviceContext>();
    math::DeviceIndependenceTensorOperations<DeviceContext, T> helper(ctx);
-   auto blas = math::GetBlas<DeviceContext, T>(ctx);
+   auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
    auto xdims = xin->dims();
    int xrank = xdims.size();
@@ -519,9 +519,9 @@ class LUGradKernel : public framework::OpKernel<T> {
      phi_L.mutable_data<T>(ctx.GetPlace());
      phi_U.Resize(UmHdims);
      phi_U.mutable_data<T>(ctx.GetPlace());
-     auto mat_dim_l = math::CreateMatrixDescriptor(LmHdims, 0, false);
-     auto mat_dim_u = math::CreateMatrixDescriptor(UmHdims, 0, false);
-     auto mat_dim_g = math::CreateMatrixDescriptor(graddims, 0, false);
+     auto mat_dim_l = pten::funcs::CreateMatrixDescriptor(LmHdims, 0, false);
+     auto mat_dim_u = pten::funcs::CreateMatrixDescriptor(UmHdims, 0, false);
+     auto mat_dim_g = pten::funcs::CreateMatrixDescriptor(graddims, 0, false);
      blas.MatMul(L_narrow_mH, mat_dim_l, grad_narrow, mat_dim_g,
                  static_cast<T>(1), &phi_L, static_cast<T>(0));
@@ -567,10 +567,10 @@ class LUGradKernel : public framework::OpKernel<T> {
      Tensor_Conj<DeviceContext, T>(dev_ctx, U_complement_mH,
                                    &U_complement_mH);
-     auto mat_dim_g =
-         math::CreateMatrixDescriptor(U_grad_complement.dims(), 0, false);
-     auto mat_dim_u =
-         math::CreateMatrixDescriptor(U_complement_mH.dims(), 0, false);
+     auto mat_dim_g = pten::funcs::CreateMatrixDescriptor(
+         U_grad_complement.dims(), 0, false);
+     auto mat_dim_u = pten::funcs::CreateMatrixDescriptor(
+         U_complement_mH.dims(), 0, false);
      auto phidims = UmHdims;
      phidims[UmHdims.size() - 2] = k;
      phidims[UmHdims.size() - 1] = k;
@@ -623,8 +623,10 @@ class LUGradKernel : public framework::OpKernel<T> {
      triangular_solve<DeviceContext, T>(dev_ctx, L_narrow_mH, psi, &psi_tmp,
                                         true, false, true);
-     auto mat_dim_p = math::CreateMatrixDescriptor(Pmat.dims(), 0, false);
-     auto mat_dim_b = math::CreateMatrixDescriptor(psi_tmp.dims(), 0, false);
+     auto mat_dim_p =
+         pten::funcs::CreateMatrixDescriptor(Pmat.dims(), 0, false);
+     auto mat_dim_b =
+         pten::funcs::CreateMatrixDescriptor(psi_tmp.dims(), 0, false);
      blas.MatMul(Pmat, mat_dim_p, psi_tmp, mat_dim_b, static_cast<T>(1), dx,
                  static_cast<T>(0));
    } else {
@@ -636,10 +638,10 @@ class LUGradKernel : public framework::OpKernel<T> {
      framework::Tensor L_complement_mH = helper.Transpose(L_complement);
      Tensor_Conj<DeviceContext, T>(dev_ctx, L_complement_mH, &L_complement_mH);
-     auto mat_dim_g =
-         math::CreateMatrixDescriptor(L_grad_complement.dims(), 0, false);
-     auto mat_dim_u =
-         math::CreateMatrixDescriptor(L_complement_mH.dims(), 0, false);
+     auto mat_dim_g = pten::funcs::CreateMatrixDescriptor(
+         L_grad_complement.dims(), 0, false);
+     auto mat_dim_u =
+         pten::funcs::CreateMatrixDescriptor(L_complement_mH.dims(), 0, false);
      auto phidims = LmHdims;
      phidims[LmHdims.size() - 2] = k;
      phidims[LmHdims.size() - 1] = k;
@@ -685,8 +687,10 @@ class LUGradKernel : public framework::OpKernel<T> {
      psi_tmp.Resize(psi.dims());
      psi_tmp.mutable_data<T>(ctx.GetPlace());
-     auto mat_dim_p = math::CreateMatrixDescriptor(Pmat.dims(), 0, false);
-     auto mat_dim_b = math::CreateMatrixDescriptor(psi.dims(), 0, false);
+     auto mat_dim_p =
+         pten::funcs::CreateMatrixDescriptor(Pmat.dims(), 0, false);
+     auto mat_dim_b =
+         pten::funcs::CreateMatrixDescriptor(psi.dims(), 0, false);
      blas.MatMul(Pmat, mat_dim_p, psi, mat_dim_b, static_cast<T>(1), &psi_tmp,
                  static_cast<T>(0));
      psi_tmp = helper.Transpose(psi_tmp);
...
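All the CreateMatrixDescriptor call sites above change only their namespace; the descriptor still folds every dimension before the trailing two into batch_size_ and records height, width and transposition for blas.MatMul. A minimal sketch under the new namespace, assuming this commit's headers (the function name is illustrative):

#include "paddle/fluid/framework/tensor.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"

template <typename DeviceContext, typename T>
void MatMulExample(const DeviceContext& ctx,
                   const paddle::framework::Tensor& a,
                   const paddle::framework::Tensor& b,
                   paddle::framework::Tensor* out) {
  auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
  // num_flatten_cols == 0: dims beyond the trailing two become the batch.
  auto mat_a = pten::funcs::CreateMatrixDescriptor(a.dims(), 0, /*trans=*/false);
  auto mat_b = pten::funcs::CreateMatrixDescriptor(b.dims(), 0, /*trans=*/false);
  // out = 1 * a * b + 0 * out
  blas.MatMul(a, mat_a, b, mat_b, static_cast<T>(1), out, static_cast<T>(0));
}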
@@ -249,7 +249,7 @@ class CPUMatchMatrixTensorOPKernel : public framework::OpKernel<T> {
    memset(bottom_l_trans_data, 0.0,
           tmp->dims()[0] * tmp->dims()[1] * sizeof(T));
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(ctx);
    call_gemm(blas, CblasNoTrans, CblasNoTrans, x->dims()[0], dim_t * dim_in,
              dim_in, 1.0f, bottom_l_data, t_data, 0.0f, bottom_l_trans_data);
@@ -262,7 +262,7 @@ class CPUMatchMatrixTensorOPKernel : public framework::OpKernel<T> {
        const auto* l_t_data =
            bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in;
        const auto* r_data = bottom_r_data + offset_r[b] * dim_in;
-       auto blas_2 = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
+       auto blas_2 = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(ctx);
        call_gemm_with_lda(blas_2, CblasNoTrans, CblasTrans, len_l, len_r,
                           dim_in, 1.0f, l_t_data, r_data, 0.0f, top_data,
                           dim_t * dim_in);
@@ -346,7 +346,7 @@ class CPUMatchMatrixTensorOPGradKernel : public framework::OpKernel<T> {
      }
    }
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(ctx);
    auto* t_data = w->data<T>();
    auto* d_w = ctx.Output<Tensor>(framework::GradVarName("W"));
...
add_subdirectory(detail)
-function(math_library TARGET)
-  # math_library is a function to create math library.
-  # The interface is the same as cc_library.
-  # But it handle split GPU/CPU code and link some common library.
-  set(cc_srcs)
-  set(cu_srcs)
-  set(hip_srcs)
-  set(math_common_deps device_context framework_proto enforce)
-  if (WITH_GPU)
-    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
-      list(APPEND math_common_deps cub)
-    else()
-      list(APPEND math_common_deps)
-    endif()
-  endif()
-  set(multiValueArgs DEPS)
-  cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
-                        "${multiValueArgs}" ${ARGN})
-  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
-    list(APPEND cc_srcs ${TARGET}.cc)
-  endif()
-  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
-    list(APPEND cu_srcs ${TARGET}.cu)
-  endif()
-  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
-    list(APPEND cu_srcs ${TARGET}.cu.cc)
-  endif()
-  list(LENGTH cc_srcs cc_srcs_len)
-  if (WITH_GPU)
-    nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
-  elseif (WITH_ROCM)
-    hip_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
-  elseif(${cc_srcs_len} GREATER 0)
-    cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
-  endif()
-endfunction()
if (WITH_ASCEND_CL)
  cc_library(beam_search_npu SRCS beam_search_npu.cc DEPS npu_op_runner)
endif()
@@ -59,9 +20,6 @@ math_library(sampler DEPS generator)
math_library(gru_compute DEPS activation_functions math_function)
math_library(lstm_compute DEPS activation_functions)
-cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
-# math_library(math_function DEPS blas dense_tensor tensor)
math_library(maxouting)
math_library(pooling)
@@ -82,8 +40,6 @@ else()
  math_library(beam_search DEPS math_function)
endif()
math_library(fc DEPS blas)
-math_library(lapack_function DEPS dynload_lapack)
math_library(matrix_bit_code)
math_library(unpooling)
...
@@ -17,8 +17,8 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/bert_encoder_functor.h"
-#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_cuda_utils.h"
namespace paddle {
@@ -502,7 +502,7 @@ inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context,
  typedef typename CUDATypeTraits<T>::TYPE run_type;
-  auto blas =
-      operators::math::GetBlas<platform::CUDADeviceContext, run_type>(context);
+  auto blas =
+      pten::funcs::GetBlas<platform::CUDADeviceContext, run_type>(context);
  auto stream = context.stream();
  blas.BatchedGEMM(
@@ -568,7 +568,7 @@ inline void MatMulWithHeadQKV(const platform::CUDADeviceContext &context,
  typedef typename CUDATypeTraits<T>::TYPE run_type;
-  auto blas =
-      operators::math::GetBlas<platform::CUDADeviceContext, run_type>(context);
+  auto blas =
+      pten::funcs::GetBlas<platform::CUDADeviceContext, run_type>(context);
  auto stream = context.stream();
  CBLAS_TRANSPOSE transA = !qk_trans ? CblasNoTrans : CblasTrans;
  CBLAS_TRANSPOSE transB = !v_trans ? CblasNoTrans : CblasTrans;
...
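MatMulWithHeadQK and MatMulWithHeadQKV call BatchedGEMM, which also survives the move unchanged apart from the namespace. A sketch of the strided-batch form on a CPU context, assuming this commit's headers (the function name and sizes are illustrative):

#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"

void BatchedGemmExample(const paddle::platform::CPUDeviceContext& ctx) {
  const int batch = 2, M = 2, N = 2, K = 2;
  std::vector<float> a(batch * M * K, 1.0f), b(batch * K * N, 1.0f);
  std::vector<float> c(batch * M * N, 0.0f);
  auto blas =
      pten::funcs::GetBlas<paddle::platform::CPUDeviceContext, float>(ctx);
  // One GEMM per batch entry; consecutive matrices are M*K and K*N apart.
  blas.BatchedGEMM(CblasNoTrans, CblasNoTrans, M, N, K, 1.0f, a.data(),
                   b.data(), 0.0f, c.data(), batch, M * K, K * N);
}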
@@ -18,8 +18,8 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/im2col.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle {
namespace operators {
@@ -223,7 +223,7 @@ class ContextProjectGradFunctor {
    int input_row_begin, input_row_end;
    int sequence_height, sequence_width;
    sequence_width = in.dims()[1];
-   auto blas = math::GetBlas<DeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
    if (input_grad) {
      for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
...
@@ -15,8 +15,8 @@
#pragma once
#include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/operators/math/lapack_function.h"
#include "paddle/fluid/operators/svd_helper.h"
+#include "paddle/pten/kernels/funcs/lapack/lapack_function.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cusolver.h"
#endif  // PADDLE_WITH_CUDA
@@ -98,9 +98,9 @@ struct MatrixEighFunctor<platform::CPUDeviceContext, T> {
    int info = 0;
    // Call lapackEigh to get the optimal size of work data
-   math::lapackEigh<T, ValueType>(jobz, uplo, n, input_vector, lda, out_value,
-                                  &lwork_opt, lwork, &rwork_opt, lrwork,
-                                  &iwork_opt, liwork, &info);
+   pten::funcs::lapackEigh<T, ValueType>(
+       jobz, uplo, n, input_vector, lda, out_value, &lwork_opt, lwork,
+       &rwork_opt, lrwork, &iwork_opt, liwork, &info);
    lwork = std::max<int>(1, static_cast<int>(lwork_opt));
    liwork = std::max<int>(1, iwork_opt);
@@ -123,7 +123,7 @@ struct MatrixEighFunctor<platform::CPUDeviceContext, T> {
    for (auto i = 0; i < batch_size; i++) {
      auto *value_data = out_value + i * values_stride;
      auto *input_data = input_vector + i * vector_stride;
-     math::lapackEigh<T, pten::funcs::Real<T>>(
+     pten::funcs::lapackEigh<T, pten::funcs::Real<T>>(
          jobz, uplo, n, input_data, lda, value_data, work_data, lwork,
          rwork_data, lrwork, iwork_data, liwork, &info);
      CheckEighResult(i, info);
...
@@ -15,7 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/fc.h"
#include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle {
namespace operators {
@@ -28,7 +28,7 @@ class FCFunctor<platform::CPUDeviceContext, T> {
                  const int N, const int K, const T* X, const T* W, T* Y,
                  const T* B = nullptr, bool relu = false,
                  bool padding_weights = false) {
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
    framework::Tensor Y1;
    T* Y1_data = nullptr;
    if (padding_weights) {
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
-#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle {
namespace operators {
@@ -85,7 +85,7 @@ class FCFunctor<platform::CUDADeviceContext, T> {
                      padding_weights, false,
                      platform::errors::PermissionDenied(
                          "Weight padding in fc can not be used in GPU scope."));
-   auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(context);
    blas.GEMM(false, false, M, N, K, static_cast<T>(1.0), X, K, W, N,
              static_cast<T>(0.0), Y, N);
    if (B == NULL) {
...
@@ -11,9 +11,9 @@ limitations under the License. */
#include "paddle/fluid/operators/math/gru_compute.h"
-#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle {
namespace platform {
@@ -33,7 +33,7 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
                      const detail::ActivationType active_gate,
                      bool origin_mode) {
#if !defined(__NVCC__) && !defined(__HIPCC___)
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
    if (value.prev_out_value) {
      blas.GEMM(false, false, batch_size, frame_size * 2, frame_size, 1,
                value.prev_out_value, frame_size, value.gate_weight,
@@ -70,7 +70,7 @@ struct GRUUnitGradFunctor<platform::CPUDeviceContext, T> {
    detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
                                grad, frame_size, batch_size, active_node,
                                origin_mode);
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
    if (value.prev_out_value && grad.prev_out_grad) {
      blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
                grad.gate_grad + frame_size * 2, frame_size * 3,
@@ -109,7 +109,7 @@ struct GRUUnitFunctorV2<platform::CPUDeviceContext, T> {
                      const detail::ActivationType active_node,
                      const detail::ActivationType active_gate) {
#if !defined(__NVCC__) && !defined(__HIPCC___)
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
    if (value.prev_out_value) {
      blas.GEMM(CblasNoTrans, CblasTrans, batch_size, frame_size, frame_size, 1,
                value.prev_out_value, value.state_weight, 0,
@@ -147,7 +147,7 @@ struct GRUUnitGradFunctorV2<platform::CPUDeviceContext, T> {
    // grad_reset_output, grad_reset_gate
    detail::cpu_gru_backward(context, detail::backward::gru<T>(), value, grad,
                             frame_size, batch_size, active_node, active_gate);
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
    if (grad.prev_out_grad && value.prev_out_value) {
      // update prev_out_grad
      blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
...
@@ -10,10 +10,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/fluid/platform/device_context.h>
-#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/gru_gpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h"
#include "paddle/fluid/operators/math/gru_compute.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle {
namespace operators {
@@ -77,7 +77,7 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
      threads = dim3(32, 32);
      grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
    }
-   auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(context);
    if (value.prev_out_value) {
      blas.GEMM(false, false, batch_size, frame_size * 2, frame_size, 1,
                value.prev_out_value, frame_size, value.gate_weight,
@@ -162,7 +162,7 @@ struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
          grad.output_grad, frame_size, batch_size, active_node, origin_mode);
    }
-   auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(context);
    if (value.prev_out_value && grad.prev_out_grad) {
      blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
...
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/math/lapack_function.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/dynload/lapack.h"
namespace paddle {
namespace operators {
namespace math {
// LU (for example)
template <>
void lapackLu<double>(int m, int n, double *a, int lda, int *ipiv, int *info) {
platform::dynload::dgetrf_(&m, &n, a, &lda, ipiv, info);
}
template <>
void lapackLu<float>(int m, int n, float *a, int lda, int *ipiv, int *info) {
platform::dynload::sgetrf_(&m, &n, a, &lda, ipiv, info);
}
// eigh
template <>
void lapackEigh<float>(char jobz, char uplo, int n, float *a, int lda, float *w,
float *work, int lwork, float *rwork, int lrwork,
int *iwork, int liwork, int *info) {
(void)rwork; // unused
(void)lrwork; // unused
platform::dynload::ssyevd_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, iwork,
&liwork, info);
}
template <>
void lapackEigh<double>(char jobz, char uplo, int n, double *a, int lda,
double *w, double *work, int lwork, double *rwork,
int lrwork, int *iwork, int liwork, int *info) {
(void)rwork; // unused
(void)lrwork; // unused
platform::dynload::dsyevd_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, iwork,
&liwork, info);
}
template <>
void lapackEigh<platform::complex<float>, float>(
char jobz, char uplo, int n, platform::complex<float> *a, int lda, float *w,
platform::complex<float> *work, int lwork, float *rwork, int lrwork,
int *iwork, int liwork, int *info) {
platform::dynload::cheevd_(&jobz, &uplo, &n,
reinterpret_cast<std::complex<float> *>(a), &lda,
w, reinterpret_cast<std::complex<float> *>(work),
&lwork, rwork, &lrwork, iwork, &liwork, info);
}
template <>
void lapackEigh<platform::complex<double>, double>(
char jobz, char uplo, int n, platform::complex<double> *a, int lda,
double *w, platform::complex<double> *work, int lwork, double *rwork,
int lrwork, int *iwork, int liwork, int *info) {
platform::dynload::zheevd_(&jobz, &uplo, &n,
reinterpret_cast<std::complex<double> *>(a), &lda,
w, reinterpret_cast<std::complex<double> *>(work),
&lwork, rwork, &lrwork, iwork, &liwork, info);
}
// Eig
template <>
void lapackEig<double>(char jobvl, char jobvr, int n, double *a, int lda,
double *w, double *vl, int ldvl, double *vr, int ldvr,
double *work, int lwork, double *rwork, int *info) {
double *wr = w;
double *wi = w + n;
(void)rwork; // unused
platform::dynload::dgeev_(&jobvl, &jobvr, &n, a, &lda, wr, wi, vl, &ldvl, vr,
&ldvr, work, &lwork, info);
}
template <>
void lapackEig<float>(char jobvl, char jobvr, int n, float *a, int lda,
float *w, float *vl, int ldvl, float *vr, int ldvr,
float *work, int lwork, float *rwork, int *info) {
float *wr = w;
float *wi = w + n;
(void)rwork; // unused
platform::dynload::sgeev_(&jobvl, &jobvr, &n, a, &lda, wr, wi, vl, &ldvl, vr,
&ldvr, work, &lwork, info);
}
template <>
void lapackEig<platform::complex<double>, double>(
char jobvl, char jobvr, int n, platform::complex<double> *a, int lda,
platform::complex<double> *w, platform::complex<double> *vl, int ldvl,
platform::complex<double> *vr, int ldvr, platform::complex<double> *work,
int lwork, double *rwork, int *info) {
platform::dynload::zgeev_(
&jobvl, &jobvr, &n, reinterpret_cast<std::complex<double> *>(a), &lda,
reinterpret_cast<std::complex<double> *>(w),
reinterpret_cast<std::complex<double> *>(vl), &ldvl,
reinterpret_cast<std::complex<double> *>(vr), &ldvr,
reinterpret_cast<std::complex<double> *>(work), &lwork, rwork, info);
}
template <>
void lapackEig<platform::complex<float>, float>(
char jobvl, char jobvr, int n, platform::complex<float> *a, int lda,
platform::complex<float> *w, platform::complex<float> *vl, int ldvl,
platform::complex<float> *vr, int ldvr, platform::complex<float> *work,
int lwork, float *rwork, int *info) {
platform::dynload::cgeev_(
&jobvl, &jobvr, &n, reinterpret_cast<std::complex<float> *>(a), &lda,
reinterpret_cast<std::complex<float> *>(w),
reinterpret_cast<std::complex<float> *>(vl), &ldvl,
reinterpret_cast<std::complex<float> *>(vr), &ldvr,
reinterpret_cast<std::complex<float> *>(work), &lwork, rwork, info);
}
template <>
void lapackGels<double>(char trans, int m, int n, int nrhs, double *a, int lda,
double *b, int ldb, double *work, int lwork,
int *info) {
platform::dynload::dgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, work,
&lwork, info);
}
template <>
void lapackGels<float>(char trans, int m, int n, int nrhs, float *a, int lda,
float *b, int ldb, float *work, int lwork, int *info) {
platform::dynload::sgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, work,
&lwork, info);
}
template <>
void lapackGelsd<double>(int m, int n, int nrhs, double *a, int lda, double *b,
int ldb, double *s, double rcond, int *rank,
double *work, int lwork, double *rwork, int *iwork,
int *info) {
platform::dynload::dgelsd_(&m, &n, &nrhs, a, &lda, b, &ldb, s, &rcond, rank,
work, &lwork, iwork, info);
}
template <>
void lapackGelsd<float>(int m, int n, int nrhs, float *a, int lda, float *b,
int ldb, float *s, float rcond, int *rank, float *work,
int lwork, float *rwork, int *iwork, int *info) {
platform::dynload::sgelsd_(&m, &n, &nrhs, a, &lda, b, &ldb, s, &rcond, rank,
work, &lwork, iwork, info);
}
template <>
void lapackGelsy<double>(int m, int n, int nrhs, double *a, int lda, double *b,
int ldb, int *jpvt, double rcond, int *rank,
double *work, int lwork, double *rwork, int *info) {
platform::dynload::dgelsy_(&m, &n, &nrhs, a, &lda, b, &ldb, jpvt, &rcond,
rank, work, &lwork, info);
}
template <>
void lapackGelsy<float>(int m, int n, int nrhs, float *a, int lda, float *b,
int ldb, int *jpvt, float rcond, int *rank, float *work,
int lwork, float *rwork, int *info) {
platform::dynload::sgelsy_(&m, &n, &nrhs, a, &lda, b, &ldb, jpvt, &rcond,
rank, work, &lwork, info);
}
template <>
void lapackGelss<double>(int m, int n, int nrhs, double *a, int lda, double *b,
int ldb, double *s, double rcond, int *rank,
double *work, int lwork, double *rwork, int *info) {
platform::dynload::dgelss_(&m, &n, &nrhs, a, &lda, b, &ldb, s, &rcond, rank,
work, &lwork, info);
}
template <>
void lapackGelss<float>(int m, int n, int nrhs, float *a, int lda, float *b,
int ldb, float *s, float rcond, int *rank, float *work,
int lwork, float *rwork, int *info) {
platform::dynload::sgelss_(&m, &n, &nrhs, a, &lda, b, &ldb, s, &rcond, rank,
work, &lwork, info);
}
template <>
void lapackCholeskySolve<platform::complex<double>>(
char uplo, int n, int nrhs, platform::complex<double> *a, int lda,
platform::complex<double> *b, int ldb, int *info) {
platform::dynload::zpotrs_(
&uplo, &n, &nrhs, reinterpret_cast<std::complex<double> *>(a), &lda,
reinterpret_cast<std::complex<double> *>(b), &ldb, info);
}
template <>
void lapackCholeskySolve<platform::complex<float>>(char uplo, int n, int nrhs,
platform::complex<float> *a,
int lda,
platform::complex<float> *b,
int ldb, int *info) {
platform::dynload::cpotrs_(
&uplo, &n, &nrhs, reinterpret_cast<std::complex<float> *>(a), &lda,
reinterpret_cast<std::complex<float> *>(b), &ldb, info);
}
template <>
void lapackCholeskySolve<double>(char uplo, int n, int nrhs, double *a, int lda,
double *b, int ldb, int *info) {
platform::dynload::dpotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info);
}
template <>
void lapackCholeskySolve<float>(char uplo, int n, int nrhs, float *a, int lda,
float *b, int ldb, int *info) {
platform::dynload::spotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info);
}
} // namespace math
} // namespace operators
} // namespace paddle
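The wrappers in the file removed above forward directly to the dynloaded Fortran symbols, so they inherit LAPACK's column-major, in-place and 1-based-pivot conventions; after this commit the same entry points live under pten::funcs (as the LUKernel hunk earlier shows). A small sketch of the getrf wrapper at its new location, assuming this commit's headers (the function name and values are illustrative):

#include <vector>
#include "paddle/pten/kernels/funcs/lapack/lapack_function.h"

void LuExample() {
  // 2x2 matrix in column-major order: {{4, 3}, {6, 3}}.
  std::vector<double> a = {4.0, 6.0, 3.0, 3.0};
  std::vector<int> ipiv(2);  // pivot indices, 1-based as in Fortran
  int info = 0;
  // In-place LU factorization: on return, `a` holds L (unit diagonal,
  // strictly below) and U (on and above the diagonal).
  pten::funcs::lapackLu<double>(/*m=*/2, /*n=*/2, a.data(), /*lda=*/2,
                                ipiv.data(), &info);
  // info == 0 on success; info > 0 flags a zero pivot U(i, i).
}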
@@ -135,8 +135,8 @@ struct MatrixBitCodeFunctorMul : public boost::static_visitor<void> {
  template <typename CodeTable>
  void operator()(const CodeTable &code_table) {
-   auto blas =
-       GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(
+       platform::CPUDeviceContext());
    size_t num_samples = tmat_->dims()[0];
    size_t tmat_width = tmat_->dims()[1];
    size_t input_width = input_.dims()[1];
@@ -183,8 +183,8 @@ struct MatrixBitCodeFunctorMulGradWeight : public boost::static_visitor<void> {
      : tmat_(tmat), weight_(weight), input_(input) {}
  template <typename CodeTable>
  void operator()(const CodeTable &code_table) {
-   auto blas =
-       GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(
+       platform::CPUDeviceContext());
    size_t num_samples = tmat_.dims()[0];
    size_t input_width = input_.dims()[1];
    size_t tmat_width = tmat_.dims()[1];
@@ -237,8 +237,8 @@ struct MatrixBitCodeFunctorMulGradWeightSR
  template <typename CodeTable>
  void operator()(const CodeTable &code_table) {
-   auto blas =
-       GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(
+       platform::CPUDeviceContext());
    size_t num_samples = tmat_.dims()[0];
    size_t input_width = input_.dims()[1];
    size_t tmat_width = tmat_.dims()[1];
...
@@ -21,9 +21,9 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/variant.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
#if defined(_WIN32)
#include <intrin.h>
...
@@ -15,7 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/matrix_inverse.h"
#include "Eigen/Core"
#include "Eigen/LU"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle {
namespace operators {
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/matrix_inverse.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle {
namespace platform {
@@ -72,7 +72,7 @@ class MatrixInverseFunctor<platform::CUDADeviceContext, T> {
        memory::Alloc(context, num_ints * sizeof(int));
    int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());
-   auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(context);
    std::vector<int> info;  // only for singular checking
    info.resize(batch_size);
...
@@ -15,7 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/matrix_solve.h"
#include "Eigen/Core"
#include "Eigen/LU"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle {
namespace operators {
@@ -62,7 +62,7 @@ class TriangularSolveFunctor<platform::CPUDeviceContext, T> {
      batch_size *= a_dim[i];
    }
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
    for (int i = 0; i < batch_size; i++) {
      blas.TRSM(side, uplo, transA, diag, M, N, T(1), a_data + i * M * M, lda,
                b_data + i * N * M, ldb);
...
@@ -14,9 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/math/matrix_solve.h"
#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/solve_op.h"
#include "paddle/fluid/platform/device_context.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_function.h"
namespace paddle {
@@ -105,7 +105,7 @@ class MatrixSolveFunctor<platform::CUDADeviceContext, T> {
        memory::Alloc(context, num_ints * sizeof(int));
    int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());
-   auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(context);
    // only for singular checking
    std::vector<int> info;
@@ -189,7 +189,7 @@ class TriangularSolveFunctor<platform::CUDADeviceContext, T> {
      batch_size *= a_dim[i];
    }
-   auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(context);
    if (batch_size <= 8 && M >= 64) {
      for (auto i = 0; i < batch_size; i++) {
        blas.TRSM(side, uplo, transA, diag, M, N, static_cast<T>(1.0),
...
@@ -224,7 +224,7 @@ struct SelectedRowsSumTo<platform::CPUDeviceContext, T> {
    auto* in2_value = input2->mutable_value();
    auto* in2_data = in2_value->data<T>();
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
    size_t offset = 0u;
    for (size_t i = 0u; i != input1.size(); ++i) {
      auto& in_value = input1[i]->value();
@@ -295,15 +295,15 @@ namespace scatter {
template <typename T>
typename std::enable_if<!std::is_integral<T>::value>::type elementwise_add_to(
-   BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len, const T* in,
-   T* out) {
+   pten::funcs::BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
+   const T* in, T* out) {
  blas->AXPY(data_len, T(1.f), in, out);
}
template <typename T>
typename std::enable_if<std::is_integral<T>::value>::type elementwise_add_to(
-   BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len, const T* in,
-   T* out) {
+   pten::funcs::BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
+   const T* in, T* out) {
  for (size_t i = 0; i < data_len; i++) {
    out[i] += in[i];
  }
@@ -316,7 +316,7 @@ add_sparse_inputs(const std::vector<const pten::SelectedRows*>& inputs,
                  int64_t input_width,
                  const platform::CPUDeviceContext& context, T* out_data) {
#ifndef PADDLE_WITH_MKLDNN
-  auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+  auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
#endif
  for (auto* input : inputs) {
    if (input->rows().size() == 0) {
@@ -350,7 +350,7 @@ add_sparse_inputs(const std::vector<const pten::SelectedRows*>& inputs,
                  int64_t input_width,
                  const platform::CPUDeviceContext& context, T* out_data) {
  VLOG(4) << "[CPU] add_sparse_inputs <" << typeid(T).name();
-  auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+  auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
  for (auto* input : inputs) {
    if (input->rows().size() == 0) {
      continue;
@@ -697,7 +697,7 @@ struct MergeAverage<platform::CPUDeviceContext, T> {
      rows_to_id[merge_rows[i]] = i;
    }
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
    for (auto* input : inputs) {
      if (input->rows().size() == 0) {
        continue;
...
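The two elementwise_add_to overloads above dispatch at compile time: the non-integral overload forwards to BLAS AXPY, while integral element types, which BLAS does not cover, fall back to a plain loop. The same std::enable_if dispatch in isolation, as a self-contained sketch with illustrative names:

#include <cstddef>
#include <type_traits>

// Chosen when T is non-integral; the kernel above calls blas->AXPY here.
template <typename T>
typename std::enable_if<!std::is_integral<T>::value>::type add_to(
    std::size_t n, const T* in, T* out) {
  for (std::size_t i = 0; i < n; i++) out[i] += in[i];  // stand-in for AXPY
}

// Chosen when T is integral (e.g. int32/int64 rows), where no BLAS routine
// exists; only one overload has a usable type, so the call never clashes.
template <typename T>
typename std::enable_if<std::is_integral<T>::value>::type add_to(
    std::size_t n, const T* in, T* out) {
  for (std::size_t i = 0; i < n; i++) out[i] += in[i];
}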
@@ -18,8 +18,8 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/selected_rows_utils.h"
-#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/device_context.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_function.h"
#define INLINE_FOR2(sizei, sizej) \
...
@@ -15,8 +15,8 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/sequence_pooling.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_function.h"
namespace paddle {
@@ -289,7 +289,7 @@ class SumSeqPoolGradFunctor {
                          in_w, out_w, in_w, out_w));
    const T* out_g_data = out_grad.data<T>();
    T* in_g_data = in_grad->mutable_data<T>(context.GetPlace());
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
      int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
      if (h == 0) continue;
...
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#endif #endif
...@@ -25,7 +25,8 @@ namespace operators { ...@@ -25,7 +25,8 @@ namespace operators {
/** /**
* Printing shape information into a string is easy to use. * Printing shape information into a string is easy to use.
*/ */
inline static std::string DumpMatrixShape(const math::MatDescriptor &desc) { inline static std::string DumpMatrixShape(
const pten::funcs::MatDescriptor &desc) {
std::stringstream buffer; std::stringstream buffer;
buffer << "[" << desc.batch_size_ << ", " << desc.height_ << ", " buffer << "[" << desc.batch_size_ << ", " << desc.height_ << ", "
<< desc.width_ << "]"; << desc.width_ << "]";
...@@ -65,10 +66,10 @@ class MatMulKernel : public framework::OpKernel<T> { ...@@ -65,10 +66,10 @@ class MatMulKernel : public framework::OpKernel<T> {
auto *out = context.Output<framework::Tensor>("Out"); auto *out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace()); out->mutable_data<T>(context.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(context); auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
auto mat_dim_a = math::CreateMatrixDescriptor( auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(
RowMatrixFromVector(x.dims()), 0, context.Attr<bool>("transpose_X")); RowMatrixFromVector(x.dims()), 0, context.Attr<bool>("transpose_X"));
auto mat_dim_b = math::CreateMatrixDescriptor( auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(
ColumnMatrixFromVector(y.dims()), 0, context.Attr<bool>("transpose_Y")); ColumnMatrixFromVector(y.dims()), 0, context.Attr<bool>("transpose_Y"));
auto scale = static_cast<T>(context.Attr<float>("alpha")); auto scale = static_cast<T>(context.Attr<float>("alpha"));
...@@ -142,7 +143,7 @@ static framework::Tensor FoldHeadAndLastDims(const DeviceContext &context, ...@@ -142,7 +143,7 @@ static framework::Tensor FoldHeadAndLastDims(const DeviceContext &context,
* If transposed, `H,W` will be swapped. * If transposed, `H,W` will be swapped.
*/ */
static void ReshapeTensorIntoMatrixSequence( static void ReshapeTensorIntoMatrixSequence(
framework::Tensor *x, const math::MatDescriptor &descriptor) { framework::Tensor *x, const pten::funcs::MatDescriptor &descriptor) {
int64_t h, w; int64_t h, w;
h = descriptor.height_; h = descriptor.height_;
w = descriptor.width_; w = descriptor.width_;
...@@ -176,8 +177,8 @@ static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x, ...@@ -176,8 +177,8 @@ static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x,
bool trans_y) { bool trans_y) {
auto x_dim = RowMatrixFromVector(x->dims()); auto x_dim = RowMatrixFromVector(x->dims());
auto y_dim = ColumnMatrixFromVector(y->dims()); auto y_dim = ColumnMatrixFromVector(y->dims());
auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x); auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(x_dim, 0, trans_x);
auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y); auto mat_dim_y = pten::funcs::CreateMatrixDescriptor(y_dim, 0, trans_y);
if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) {
out->Resize({mat_dim_x.height_, mat_dim_y.width_}); out->Resize({mat_dim_x.height_, mat_dim_y.width_});
} else { } else {
...@@ -222,9 +223,9 @@ class MatMulGradKernel : public framework::OpKernel<T> { ...@@ -222,9 +223,9 @@ class MatMulGradKernel : public framework::OpKernel<T> {
const framework::Tensor &b, bool trans_b, const framework::Tensor &b, bool trans_b,
framework::Tensor *out) const { framework::Tensor *out) const {
out->mutable_data<T>(context.GetPlace()); out->mutable_data<T>(context.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(context); auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a); auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a);
auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b); auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(b.dims(), 0, trans_b);
int head_number = 1; int head_number = 1;
#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \
...@@ -404,9 +405,9 @@ class MatMulDoubleGradKernel : public framework::OpKernel<T> { ...@@ -404,9 +405,9 @@ class MatMulDoubleGradKernel : public framework::OpKernel<T> {
const framework::Tensor &b, bool trans_b, bool flag, const framework::Tensor &b, bool trans_b, bool flag,
framework::Tensor *out) const { framework::Tensor *out) const {
out->mutable_data<T>(context.GetPlace()); out->mutable_data<T>(context.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(context); auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a); auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a);
auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b); auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(b.dims(), 0, trans_b);
int head_number = 1; int head_number = 1;
#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \
...@@ -584,12 +585,12 @@ class MatMulOp : public framework::OperatorWithKernel { ...@@ -584,12 +585,12 @@ class MatMulOp : public framework::OperatorWithKernel {
auto dim_x = GetDimForInput(*context, "X"); auto dim_x = GetDimForInput(*context, "X");
auto dim_y = GetDimForInput(*context, "Y"); auto dim_y = GetDimForInput(*context, "Y");
auto mat_dim_x = auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(
math::CreateMatrixDescriptor(RowMatrixFromVector(dim_x), 0, RowMatrixFromVector(dim_x), 0,
context->Attrs().Get<bool>("transpose_X")); context->Attrs().Get<bool>("transpose_X"));
auto mat_dim_y = auto mat_dim_y = pten::funcs::CreateMatrixDescriptor(
math::CreateMatrixDescriptor(ColumnMatrixFromVector(dim_y), 0, ColumnMatrixFromVector(dim_y), 0,
context->Attrs().Get<bool>("transpose_Y")); context->Attrs().Get<bool>("transpose_Y"));
if (mat_dim_x.width_ == -1) { if (mat_dim_x.width_ == -1) {
mat_dim_x.width_ = mat_dim_y.height_; mat_dim_x.width_ = mat_dim_y.height_;
......
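
The hunks above migrate matmul_op.cc from the old math:: helpers to pten::funcs without changing behavior: GetBlas, CreateMatrixDescriptor, and MatDescriptor simply move namespaces. For orientation, a minimal sketch of the post-migration call pattern; the tensor names and trans flags are illustrative assumptions, not code from this PR:

// Hedged sketch: multiply a (M x K) by b (K x N) into *out via the migrated
// pten::funcs API, assuming `context` is the kernel's DeviceContext and the
// inputs are already initialized. The MatMul overload mirrors the calls
// visible elsewhere in this diff.
auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(a.dims(), 0, /*trans_a=*/false);
auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(b.dims(), 0, /*trans_b=*/false);
out->mutable_data<T>(context.GetPlace());
blas.MatMul(a, mat_dim_a, b, mat_dim_b, static_cast<T>(1), out, static_cast<T>(0));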
...@@ -19,8 +19,8 @@ limitations under the License. */ ...@@ -19,8 +19,8 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/xpu_api_wrapper.h" #include "paddle/fluid/operators/xpu_api_wrapper.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -53,7 +53,7 @@ static framework::DDim ColumnMatrixFromVector(const framework::DDim &y_dim) { ...@@ -53,7 +53,7 @@ static framework::DDim ColumnMatrixFromVector(const framework::DDim &y_dim) {
} }
static void ReshapeTensorIntoMatrixSequence( static void ReshapeTensorIntoMatrixSequence(
framework::Tensor *x, const math::MatDescriptor &descriptor) { framework::Tensor *x, const pten::funcs::MatDescriptor &descriptor) {
int64_t h, w; int64_t h, w;
h = descriptor.height_; h = descriptor.height_;
w = descriptor.width_; w = descriptor.width_;
...@@ -86,8 +86,8 @@ static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x, ...@@ -86,8 +86,8 @@ static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x,
bool trans_y) { bool trans_y) {
auto x_dim = RowMatrixFromVector(x->dims()); auto x_dim = RowMatrixFromVector(x->dims());
auto y_dim = ColumnMatrixFromVector(y->dims()); auto y_dim = ColumnMatrixFromVector(y->dims());
auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x); auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(x_dim, 0, trans_x);
auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y); auto mat_dim_y = pten::funcs::CreateMatrixDescriptor(y_dim, 0, trans_y);
if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) {
out->Resize({mat_dim_x.height_, mat_dim_y.width_}); out->Resize({mat_dim_x.height_, mat_dim_y.width_});
} else { } else {
...@@ -109,10 +109,10 @@ static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out, ...@@ -109,10 +109,10 @@ static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out,
auto &dev_ctx = auto &dev_ctx =
ctx.template device_context<paddle::platform::XPUDeviceContext>(); ctx.template device_context<paddle::platform::XPUDeviceContext>();
auto mat_dim_a = auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(
math::CreateMatrixDescriptor(RowMatrixFromVector(x_dims), 0, trans_x); RowMatrixFromVector(x_dims), 0, trans_x);
auto mat_dim_b = auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(
math::CreateMatrixDescriptor(ColumnMatrixFromVector(y_dims), 0, trans_y); ColumnMatrixFromVector(y_dims), 0, trans_y);
if (x_dims.size() == 3 && y_dims.size() <= 2) { if (x_dims.size() == 3 && y_dims.size() <= 2) {
// if transpose_X is true, the transpose costs much time // if transpose_X is true, the transpose costs much time
......
...@@ -21,8 +21,8 @@ limitations under the License. */ ...@@ -21,8 +21,8 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/dot_op.h" #include "paddle/fluid/operators/dot_op.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/complex_functors.h" #include "paddle/pten/kernels/funcs/complex_functors.h"
// only can include the headers in paddle/pten/api dirs // only can include the headers in paddle/pten/api dirs
...@@ -77,7 +77,7 @@ static framework::DDim ColumnMatrixFromVector(const framework::DDim& y_dim) { ...@@ -77,7 +77,7 @@ static framework::DDim ColumnMatrixFromVector(const framework::DDim& y_dim) {
* If transposed, `H,W` will be swapped. * If transposed, `H,W` will be swapped.
*/ */
static void ReshapeTensorIntoMatrixSequence( static void ReshapeTensorIntoMatrixSequence(
framework::Tensor* x, const math::MatDescriptor& descriptor) { framework::Tensor* x, const pten::funcs::MatDescriptor& descriptor) {
int64_t h, w; int64_t h, w;
h = descriptor.height_; h = descriptor.height_;
w = descriptor.width_; w = descriptor.width_;
...@@ -97,8 +97,8 @@ static void ReshapeXYOutIntoMatrixSequence(framework::Tensor* x, ...@@ -97,8 +97,8 @@ static void ReshapeXYOutIntoMatrixSequence(framework::Tensor* x,
bool trans_y) { bool trans_y) {
auto x_dim = RowMatrixFromVector(x->dims()); auto x_dim = RowMatrixFromVector(x->dims());
auto y_dim = ColumnMatrixFromVector(y->dims()); auto y_dim = ColumnMatrixFromVector(y->dims());
auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x); auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(x_dim, 0, trans_x);
auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y); auto mat_dim_y = pten::funcs::CreateMatrixDescriptor(y_dim, 0, trans_y);
if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) {
out->Resize({mat_dim_x.height_, mat_dim_y.width_}); out->Resize({mat_dim_x.height_, mat_dim_y.width_});
} else { } else {
......
...@@ -33,10 +33,10 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out, ...@@ -33,10 +33,10 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out,
auto& dev_ctx = auto& dev_ctx =
ctx.template device_context<paddle::platform::XPUDeviceContext>(); ctx.template device_context<paddle::platform::XPUDeviceContext>();
auto mat_dim_a = auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(
math::CreateMatrixDescriptor(RowMatrixFromVector(x_dims), 0, trans_x); RowMatrixFromVector(x_dims), 0, trans_x);
auto mat_dim_b = auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(
math::CreateMatrixDescriptor(ColumnMatrixFromVector(y_dims), 0, trans_y); ColumnMatrixFromVector(y_dims), 0, trans_y);
if (x_dims.size() == 3 && y_dims.size() <= 2) { if (x_dims.size() == 3 && y_dims.size() <= 2) {
// if transpose_X is true, the transpose cost much time // if transpose_X is true, the transpose cost much time
......
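
The x_dims.size() == 3 && y_dims.size() <= 2 branch above (and its counterpart earlier) exists because a [B, M, K] input times a [K, N] matrix can be folded into one (B*M, K) x (K, N) GEMM when X is not transposed, which is why the comment warns that transpose_X makes this path expensive. A hedged, self-contained check of that equivalence with hypothetical sizes:

#include <cstdio>
int main() {
  const int B = 2, M = 3, K = 4, N = 5;
  float x[B * M * K], y[K * N], batched[B * M * N], folded[B * M * N];
  for (int i = 0; i < B * M * K; ++i) x[i] = i * 0.5f;
  for (int i = 0; i < K * N; ++i) y[i] = i * 0.25f;
  // batched: out[b] = x[b] * y, one GEMM per batch
  for (int b = 0; b < B; ++b)
    for (int m = 0; m < M; ++m)
      for (int n = 0; n < N; ++n) {
        float acc = 0;
        for (int k = 0; k < K; ++k) acc += x[(b * M + m) * K + k] * y[k * N + n];
        batched[(b * M + m) * N + n] = acc;
      }
  // folded: one (B*M, K) x (K, N) GEMM over the same storage (a pure view change)
  for (int r = 0; r < B * M; ++r)
    for (int n = 0; n < N; ++n) {
      float acc = 0;
      for (int k = 0; k < K; ++k) acc += x[r * K + k] * y[k * N + n];
      folded[r * N + n] = acc;
    }
  int mismatch = 0;
  for (int i = 0; i < B * M * N; ++i) mismatch += (batched[i] != folded[i]);
  std::printf("mismatches: %d\n", mismatch);  // prints 0
}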
...@@ -18,9 +18,9 @@ limitations under the License. */ ...@@ -18,9 +18,9 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/matrix_inverse.h" #include "paddle/fluid/operators/math/matrix_inverse.h"
#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/for_range.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -58,7 +58,7 @@ void MatrixPowerFunction(const Tensor* X, const int n, Tensor* Out, ...@@ -58,7 +58,7 @@ void MatrixPowerFunction(const Tensor* X, const int n, Tensor* Out,
return; return;
} }
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
Tensor new_x = ctx.AllocateTmpTensor<T, DeviceContext>(X->dims(), dev_ctx); Tensor new_x = ctx.AllocateTmpTensor<T, DeviceContext>(X->dims(), dev_ctx);
int new_n = n; int new_n = n;
...@@ -77,7 +77,7 @@ void MatrixPowerFunction(const Tensor* X, const int n, Tensor* Out, ...@@ -77,7 +77,7 @@ void MatrixPowerFunction(const Tensor* X, const int n, Tensor* Out,
return; return;
} }
auto no_trans_desc = math::CreateMatrixDescriptor(x_dims, 0, false); auto no_trans_desc = pten::funcs::CreateMatrixDescriptor(x_dims, 0, false);
if (new_n == 2) { if (new_n == 2) {
// Out = newX * newX // Out = newX * newX
...@@ -166,7 +166,7 @@ void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out, ...@@ -166,7 +166,7 @@ void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out,
const auto& x_dims = X->dims(); const auto& x_dims = X->dims();
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
if (n == 0) { if (n == 0) {
// \nabla X = O // \nabla X = O
...@@ -179,8 +179,8 @@ void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out, ...@@ -179,8 +179,8 @@ void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out,
return; return;
} }
auto trans_desc = math::CreateMatrixDescriptor(x_dims, 0, true); auto trans_desc = pten::funcs::CreateMatrixDescriptor(x_dims, 0, true);
auto no_trans_desc = math::CreateMatrixDescriptor(x_dims, 0, false); auto no_trans_desc = pten::funcs::CreateMatrixDescriptor(x_dims, 0, false);
if (n == -1) { if (n == -1) {
// \nabla X = Out^{T} * \nabla Out * Out^{T} // \nabla X = Out^{T} * \nabla Out * Out^{T}
......
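
MatrixPowerFunction above reduces everything to repeated blas.MatMul calls on square matrices (with n == 2 as a single Out = newX * newX product, and negative n going through the inverse), and the gradient path reuses the same trans/no_trans descriptors. A hedged, framework-free sketch of the semantics for a positive power; the real kernel is smarter about reusing intermediate products, so this naive version only pins down what is computed:

#include <cstdio>
int main() {
  const int d = 2, n = 3;                                  // A^3 for a 2x2 matrix
  double a[d * d] = {1, 1, 0, 1}, out[d * d] = {1, 0, 0, 1};  // out starts as I
  for (int p = 0; p < n; ++p) {                            // out = out * a, n times
    double tmp[d * d] = {0};
    for (int i = 0; i < d; ++i)
      for (int j = 0; j < d; ++j)
        for (int k = 0; k < d; ++k) tmp[i * d + j] += out[i * d + k] * a[k * d + j];
    for (int i = 0; i < d * d; ++i) out[i] = tmp[i];
  }
  std::printf("%g %g / %g %g\n", out[0], out[1], out[2], out[3]);  // 1 3 / 0 1
}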
...@@ -113,10 +113,8 @@ class MatMulMKLDNNHandler ...@@ -113,10 +113,8 @@ class MatMulMKLDNNHandler
float scale) float scale)
: paddle::platform::MKLDNNHandlerNoCachingT<XT, dnnl::matmul>(engine, : paddle::platform::MKLDNNHandlerNoCachingT<XT, dnnl::matmul>(engine,
cpu_place) { cpu_place) {
auto mat_dim_x = auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(x->dims(), 0, trans_x);
paddle::operators::math::CreateMatrixDescriptor(x->dims(), 0, trans_x); auto mat_dim_y = pten::funcs::CreateMatrixDescriptor(y->dims(), 0, trans_y);
auto mat_dim_y =
paddle::operators::math::CreateMatrixDescriptor(y->dims(), 0, trans_y);
memory::dim x_bs = mat_dim_x.batch_size_; memory::dim x_bs = mat_dim_x.batch_size_;
memory::dim y_bs = mat_dim_y.batch_size_; memory::dim y_bs = mat_dim_y.batch_size_;
...@@ -237,8 +235,8 @@ class MatMulMKLDNNHandler ...@@ -237,8 +235,8 @@ class MatMulMKLDNNHandler
out_strides; out_strides;
}; };
std::pair<paddle::operators::math::MatDescriptor, memory::dims> std::pair<pten::funcs::MatDescriptor, memory::dims> GetInputDimsAndStrides(
GetInputDimsAndStrides(const ExecutionContext& ctx, std::string input_name) { const ExecutionContext& ctx, std::string input_name) {
auto shape = ctx.Attr<std::vector<int>>("fused_reshape_" + input_name); auto shape = ctx.Attr<std::vector<int>>("fused_reshape_" + input_name);
auto axis = ctx.Attr<std::vector<int>>("fused_transpose_" + input_name); auto axis = ctx.Attr<std::vector<int>>("fused_transpose_" + input_name);
auto input_dims = ctx.Input<Tensor>(input_name)->dims(); auto input_dims = ctx.Input<Tensor>(input_name)->dims();
...@@ -279,10 +277,9 @@ class MatMulMKLDNNHandler ...@@ -279,10 +277,9 @@ class MatMulMKLDNNHandler
auto& MatrixDimsFromVector = input_name == "X" ? RowMatrixDimsFromVector auto& MatrixDimsFromVector = input_name == "X" ? RowMatrixDimsFromVector
: ColumnMatrixDimsFromVector; : ColumnMatrixDimsFromVector;
paddle::operators::math::MatDescriptor mat_dim = pten::funcs::MatDescriptor mat_dim = pten::funcs::CreateMatrixDescriptor(
paddle::operators::math::CreateMatrixDescriptor( MatrixDimsFromVector(new_dims), 0,
MatrixDimsFromVector(new_dims), 0, ctx.Attr<bool>("transpose_" + input_name));
ctx.Attr<bool>("transpose_" + input_name));
memory::dims strides; memory::dims strides;
if (!shape.empty()) { if (!shape.empty()) {
...@@ -324,10 +321,10 @@ class MatMulMKLDNNHandler ...@@ -324,10 +321,10 @@ class MatMulMKLDNNHandler
} }
MatMulDims GetMatmulDims(const ExecutionContext& ctx) { MatMulDims GetMatmulDims(const ExecutionContext& ctx) {
paddle::operators::math::MatDescriptor mat_dim_x; pten::funcs::MatDescriptor mat_dim_x;
memory::dims strides_x; memory::dims strides_x;
std::tie(mat_dim_x, strides_x) = GetInputDimsAndStrides(ctx, "X"); std::tie(mat_dim_x, strides_x) = GetInputDimsAndStrides(ctx, "X");
paddle::operators::math::MatDescriptor mat_dim_y; pten::funcs::MatDescriptor mat_dim_y;
memory::dims strides_y; memory::dims strides_y;
std::tie(mat_dim_y, strides_y) = GetInputDimsAndStrides(ctx, "Y"); std::tie(mat_dim_y, strides_y) = GetInputDimsAndStrides(ctx, "Y");
...@@ -431,7 +428,7 @@ class MatMulMKLDNNHandler ...@@ -431,7 +428,7 @@ class MatMulMKLDNNHandler
* If transposed, `H,W` will be swapped. * If transposed, `H,W` will be swapped.
*/ */
static void ReshapeTensorToMatrixSequence( static void ReshapeTensorToMatrixSequence(
Tensor* x, const paddle::operators::math::MatDescriptor& descriptor) { Tensor* x, const pten::funcs::MatDescriptor& descriptor) {
int64_t h, w; int64_t h, w;
h = descriptor.height_; h = descriptor.height_;
w = descriptor.width_; w = descriptor.width_;
...@@ -463,10 +460,8 @@ static void ReshapeXYOutToMatrixSequence(Tensor* x, Tensor* y, Tensor* out, ...@@ -463,10 +460,8 @@ static void ReshapeXYOutToMatrixSequence(Tensor* x, Tensor* y, Tensor* out,
bool trans_x, bool trans_y) { bool trans_x, bool trans_y) {
auto x_dim = RowMatrixDimsFromVector(x->dims()); auto x_dim = RowMatrixDimsFromVector(x->dims());
auto y_dim = ColumnMatrixDimsFromVector(y->dims()); auto y_dim = ColumnMatrixDimsFromVector(y->dims());
auto mat_dim_x = auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(x_dim, 0, trans_x);
paddle::operators::math::CreateMatrixDescriptor(x_dim, 0, trans_x); auto mat_dim_y = pten::funcs::CreateMatrixDescriptor(y_dim, 0, trans_y);
auto mat_dim_y =
paddle::operators::math::CreateMatrixDescriptor(y_dim, 0, trans_y);
if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) {
out->Resize({mat_dim_x.height_, mat_dim_y.width_}); out->Resize({mat_dim_x.height_, mat_dim_y.width_});
} else { } else {
......
...@@ -16,8 +16,8 @@ limitations under the License. */ ...@@ -16,8 +16,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/mkldnn_reuse.h" #include "paddle/fluid/platform/mkldnn_reuse.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -84,11 +84,10 @@ std::vector<int64_t> GetInputStrides(const ExecutionContext& ctx, ...@@ -84,11 +84,10 @@ std::vector<int64_t> GetInputStrides(const ExecutionContext& ctx,
auto& MatrixDimsFromVector = auto& MatrixDimsFromVector =
input_name == "X" ? RowMatrixDimsFromVector : ColumnMatrixDimsFromVector; input_name == "X" ? RowMatrixDimsFromVector : ColumnMatrixDimsFromVector;
paddle::operators::math::MatDescriptor mat_dim = pten::funcs::MatDescriptor mat_dim = pten::funcs::CreateMatrixDescriptor(
paddle::operators::math::CreateMatrixDescriptor( MatrixDimsFromVector(new_dims), 0,
MatrixDimsFromVector(new_dims), 0, ctx.Attr<bool>(std::string("trans_") +
ctx.Attr<bool>(std::string("trans_") + static_cast<char>(std::tolower(input_name[0]))));
static_cast<char>(std::tolower(input_name[0]))));
std::vector<int64_t> strides; std::vector<int64_t> strides;
if (!shape.empty()) { if (!shape.empty()) {
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/pten/kernels/funcs/math_function.h"
namespace paddle { namespace paddle {
...@@ -51,7 +51,7 @@ class MulKernel : public framework::OpKernel<T> { ...@@ -51,7 +51,7 @@ class MulKernel : public framework::OpKernel<T> {
z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
} }
auto blas = math::GetBlas<DeviceContext, T>(context); auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
blas.MatMul(x_matrix, y_matrix, z); blas.MatMul(x_matrix, y_matrix, z);
if (z_dim.size() != 2) { if (z_dim.size() != 2) {
...@@ -92,7 +92,7 @@ class MulGradKernel : public framework::OpKernel<T> { ...@@ -92,7 +92,7 @@ class MulGradKernel : public framework::OpKernel<T> {
} }
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
if (dx) { if (dx) {
dx->mutable_data<T>(ctx.GetPlace()); dx->mutable_data<T>(ctx.GetPlace());
Tensor dx_matrix = dx->dims().size() > 2 Tensor dx_matrix = dx->dims().size() > 2
...@@ -153,7 +153,7 @@ class MulDoubleGradKernel : public framework::OpKernel<T> { ...@@ -153,7 +153,7 @@ class MulDoubleGradKernel : public framework::OpKernel<T> {
} }
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
// a flag to specify whether ddout value has been set, if flag // a flag to specify whether ddout value has been set, if flag
// is false, MatMul beta should be 0 to set ddout, if flag is // is false, MatMul beta should be 0 to set ddout, if flag is
// true, MatMul beta should be 1 to add result to ddout. // true, MatMul beta should be 1 to add result to ddout.
......
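
The ddout flag described in the comment above maps directly onto the GEMM beta argument: beta = 0 makes MatMul overwrite ddout on the first contribution, beta = 1 makes later contributions accumulate. A hedged scalar illustration of that contract (plain doubles stand in for tensors):

#include <cstdio>
// c = alpha * a*b + beta * c for 1x1 "matrices", just to show the beta switch.
void gemm1(double alpha, double a, double b, double beta, double* c) {
  *c = alpha * a * b + beta * (*c);
}
int main() {
  double ddout = 7.0;
  gemm1(1.0, 2.0, 3.0, /*beta=*/0.0, &ddout);  // first write: ddout = 6
  gemm1(1.0, 4.0, 5.0, /*beta=*/1.0, &ddout);  // accumulate:  ddout = 26
  std::printf("%g\n", ddout);
}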
...@@ -18,9 +18,9 @@ limitations under the License. */ ...@@ -18,9 +18,9 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/operators/utils.h" #include "paddle/fluid/operators/utils.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -95,15 +95,15 @@ inline framework::Tensor MatMul(const framework::ExecutionContext& ctx, ...@@ -95,15 +95,15 @@ inline framework::Tensor MatMul(const framework::ExecutionContext& ctx,
const framework::DDim& a_dim, const framework::DDim& a_dim,
const framework::DDim& b_dim) { const framework::DDim& b_dim) {
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
framework::Tensor matrix_c; framework::Tensor matrix_c;
framework::DDim c_dim = framework::make_ddim({a_dim[0], b_dim[1]}); framework::DDim c_dim = framework::make_ddim({a_dim[0], b_dim[1]});
matrix_c.Resize(c_dim); matrix_c.Resize(c_dim);
matrix_c.mutable_data<T>(place); matrix_c.mutable_data<T>(place);
auto mat_dim_a = math::CreateMatrixDescriptor(a_dim, 0, false); auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(a_dim, 0, false);
auto mat_dim_b = math::CreateMatrixDescriptor(b_dim, 0, false); auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(b_dim, 0, false);
const T alpha = static_cast<T>(1.0); const T alpha = static_cast<T>(1.0);
blas.MatMul(matrix_a, mat_dim_a, matrix_b, mat_dim_b, alpha, &matrix_c, T(0)); blas.MatMul(matrix_a, mat_dim_a, matrix_b, mat_dim_b, alpha, &matrix_c, T(0));
return matrix_c; return matrix_c;
...@@ -269,7 +269,7 @@ class MultiDotKernel : public framework::OpKernel<T> { ...@@ -269,7 +269,7 @@ class MultiDotKernel : public framework::OpKernel<T> {
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
out->mutable_data<T>(place); out->mutable_data<T>(place);
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
auto n = ins.size(); auto n = ins.size();
std::vector<framework::DDim> ins_dims(n); std::vector<framework::DDim> ins_dims(n);
...@@ -277,8 +277,10 @@ class MultiDotKernel : public framework::OpKernel<T> { ...@@ -277,8 +277,10 @@ class MultiDotKernel : public framework::OpKernel<T> {
const T scale = static_cast<T>(1.0); const T scale = static_cast<T>(1.0);
if (n == 2) { if (n == 2) {
auto mat_dim_a = math::CreateMatrixDescriptor(ins_dims[0], 0, false); auto mat_dim_a =
auto mat_dim_b = math::CreateMatrixDescriptor(ins_dims[1], 0, false); pten::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false);
auto mat_dim_b =
pten::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false);
blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, out, T(0)); blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, out, T(0));
} else if (n == 3) { } else if (n == 3) {
const auto Ma = ins_dims[0][0]; const auto Ma = ins_dims[0][0];
...@@ -287,16 +289,20 @@ class MultiDotKernel : public framework::OpKernel<T> { ...@@ -287,16 +289,20 @@ class MultiDotKernel : public framework::OpKernel<T> {
const auto Nc = ins_dims[2][1]; const auto Nc = ins_dims[2][1];
const uint64_t cost1 = Ma * Nb * (Ka + Nc); const uint64_t cost1 = Ma * Nb * (Ka + Nc);
const uint64_t cost2 = Ka * Nc * (Nb + Ma); const uint64_t cost2 = Ka * Nc * (Nb + Ma);
auto mat_dim_a = math::CreateMatrixDescriptor(ins_dims[0], 0, false); auto mat_dim_a =
auto mat_dim_b = math::CreateMatrixDescriptor(ins_dims[1], 0, false); pten::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false);
auto mat_dim_c = math::CreateMatrixDescriptor(ins_dims[2], 0, false); auto mat_dim_b =
pten::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false);
auto mat_dim_c =
pten::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false);
if (cost1 < cost2) { if (cost1 < cost2) {
framework::Tensor tmp_out; framework::Tensor tmp_out;
tmp_out.mutable_data<T>(place, Ma * Nb * sizeof(T)); tmp_out.mutable_data<T>(place, Ma * Nb * sizeof(T));
framework::DDim tmp_dim = framework::make_ddim({Ma, Nb}); framework::DDim tmp_dim = framework::make_ddim({Ma, Nb});
blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, &tmp_out, blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, &tmp_out,
T(0)); T(0));
auto mat_dim_tmp = math::CreateMatrixDescriptor(tmp_dim, 0, false); auto mat_dim_tmp =
pten::funcs::CreateMatrixDescriptor(tmp_dim, 0, false);
blas.MatMul(tmp_out, mat_dim_tmp, *ins[2], mat_dim_c, scale, out, T(0)); blas.MatMul(tmp_out, mat_dim_tmp, *ins[2], mat_dim_c, scale, out, T(0));
} else { } else {
framework::Tensor tmp_out; framework::Tensor tmp_out;
...@@ -304,7 +310,8 @@ class MultiDotKernel : public framework::OpKernel<T> { ...@@ -304,7 +310,8 @@ class MultiDotKernel : public framework::OpKernel<T> {
framework::DDim tmp_dim = framework::make_ddim({Ka, Nc}); framework::DDim tmp_dim = framework::make_ddim({Ka, Nc});
blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, scale, &tmp_out, blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, scale, &tmp_out,
T(0)); T(0));
auto mat_dim_tmp = math::CreateMatrixDescriptor(tmp_dim, 0, false); auto mat_dim_tmp =
pten::funcs::CreateMatrixDescriptor(tmp_dim, 0, false);
blas.MatMul(*ins[0], mat_dim_a, tmp_out, mat_dim_tmp, scale, out, T(0)); blas.MatMul(*ins[0], mat_dim_a, tmp_out, mat_dim_tmp, scale, out, T(0));
} }
} else { } else {
...@@ -348,11 +355,11 @@ class MultiDotGradKernel : public framework::OpKernel<T> { ...@@ -348,11 +355,11 @@ class MultiDotGradKernel : public framework::OpKernel<T> {
const framework::Tensor& B, const framework::DDim& dout_dim, const framework::Tensor& B, const framework::DDim& dout_dim,
const framework::DDim& a_dim, const framework::DDim& b_dim, const framework::DDim& a_dim, const framework::DDim& b_dim,
framework::Tensor* dA, framework::Tensor* dB) const { framework::Tensor* dA, framework::Tensor* dB) const {
auto mat_dim_dout = math::CreateMatrixDescriptor(dout_dim, 0, false); auto mat_dim_dout = pten::funcs::CreateMatrixDescriptor(dout_dim, 0, false);
auto mat_dim_a = math::CreateMatrixDescriptor(a_dim, 0, true); auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(a_dim, 0, true);
auto mat_dim_b = math::CreateMatrixDescriptor(b_dim, 0, true); auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(b_dim, 0, true);
T alpha = static_cast<T>(1.0); T alpha = static_cast<T>(1.0);
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
blas.MatMul(A, mat_dim_a, dout, mat_dim_dout, alpha, dB, T(0)); blas.MatMul(A, mat_dim_a, dout, mat_dim_dout, alpha, dB, T(0));
blas.MatMul(dout, mat_dim_dout, B, mat_dim_b, alpha, dA, T(0)); blas.MatMul(dout, mat_dim_dout, B, mat_dim_b, alpha, dA, T(0));
} }
...@@ -433,7 +440,7 @@ class MultiDotGradKernel : public framework::OpKernel<T> { ...@@ -433,7 +440,7 @@ class MultiDotGradKernel : public framework::OpKernel<T> {
auto dout = *ctx.Input<framework::Tensor>(framework::GradVarName("Out")); auto dout = *ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto dx = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X")); auto dx = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
const auto n = ins.size(); const auto n = ins.size();
...@@ -458,7 +465,7 @@ class MultiDotGradKernel : public framework::OpKernel<T> { ...@@ -458,7 +465,7 @@ class MultiDotGradKernel : public framework::OpKernel<T> {
} }
T alpha = static_cast<T>(1); T alpha = static_cast<T>(1);
auto mat_dim_dout = math::CreateMatrixDescriptor(dout_dim, 0, false); auto mat_dim_dout = pten::funcs::CreateMatrixDescriptor(dout_dim, 0, false);
if (n == 2) { if (n == 2) {
CalcGrad(ctx, dout, *ins[0], *ins[1], dout_dim, ins_dims[0], ins_dims[1], CalcGrad(ctx, dout, *ins[0], *ins[1], dout_dim, ins_dims[0], ins_dims[1],
dx[0], dx[1]); dx[0], dx[1]);
...@@ -469,9 +476,12 @@ class MultiDotGradKernel : public framework::OpKernel<T> { ...@@ -469,9 +476,12 @@ class MultiDotGradKernel : public framework::OpKernel<T> {
const auto Nc = ins_dims[2][1]; const auto Nc = ins_dims[2][1];
const uint64_t cost1 = Ma * Nb * (Ka + Nc); const uint64_t cost1 = Ma * Nb * (Ka + Nc);
const uint64_t cost2 = Ka * Nc * (Nb + Ma); const uint64_t cost2 = Ka * Nc * (Nb + Ma);
auto mat_dim_a = math::CreateMatrixDescriptor(ins_dims[0], 0, false); auto mat_dim_a =
auto mat_dim_b = math::CreateMatrixDescriptor(ins_dims[1], 0, false); pten::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false);
auto mat_dim_c = math::CreateMatrixDescriptor(ins_dims[2], 0, false); auto mat_dim_b =
pten::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false);
auto mat_dim_c =
pten::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false);
if (cost1 < cost2) { if (cost1 < cost2) {
framework::Tensor tmp_out, tmp_dout; framework::Tensor tmp_out, tmp_dout;
tmp_out.Resize({Ma, Nb}); tmp_out.Resize({Ma, Nb});
......
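
Both the forward and gradient multi_dot kernels above choose an order for the three-matrix chain by comparing cost1 = Ma * Nb * (Ka + Nc) with cost2 = Ka * Nc * (Nb + Ma); these are exactly the scalar-multiplication counts of (A*B)*C and A*(B*C). A worked check with hypothetical shapes A: 10x100, B: 100x5, C: 5x50, where cost1 = 7500 beats cost2 = 75000:

#include <cstdint>
#include <cstdio>
int main() {
  const uint64_t Ma = 10, Ka = 100, Nb = 5, Nc = 50;
  const uint64_t cost1 = Ma * Nb * (Ka + Nc);  // (A*B)*C: Ma*Ka*Nb + Ma*Nb*Nc = 7500
  const uint64_t cost2 = Ka * Nc * (Nb + Ma);  // A*(B*C): Ka*Nb*Nc + Ma*Ka*Nc = 75000
  std::printf("%s\n", cost1 < cost2 ? "(A*B)*C" : "A*(B*C)");  // prints (A*B)*C
}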
...@@ -59,7 +59,7 @@ class MVGradKernel<platform::CUDADeviceContext, T> ...@@ -59,7 +59,7 @@ class MVGradKernel<platform::CUDADeviceContext, T>
auto &dev_ctx = auto &dev_ctx =
context.template device_context<platform::CUDADeviceContext>(); context.template device_context<platform::CUDADeviceContext>();
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
auto stream = context.cuda_device_context().stream(); auto stream = context.cuda_device_context().stream();
auto config = GetGpuLaunchConfig1D(dev_ctx, m * n); auto config = GetGpuLaunchConfig1D(dev_ctx, m * n);
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#endif #endif
...@@ -45,7 +45,7 @@ class MVKernel : public framework::OpKernel<T> { ...@@ -45,7 +45,7 @@ class MVKernel : public framework::OpKernel<T> {
T *out_data = out->mutable_data<T>(context.GetPlace()); T *out_data = out->mutable_data<T>(context.GetPlace());
auto &dev_ctx = context.template device_context<DeviceContext>(); auto &dev_ctx = context.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
blas.GEMV(false, dim_x[0], dim_x[1], static_cast<T>(1), x_data, vec_data, blas.GEMV(false, dim_x[0], dim_x[1], static_cast<T>(1), x_data, vec_data,
static_cast<T>(0), out_data); static_cast<T>(0), out_data);
...@@ -93,7 +93,7 @@ class MVGradKernel : public framework::OpKernel<T> { ...@@ -93,7 +93,7 @@ class MVGradKernel : public framework::OpKernel<T> {
T *dvec_data = dvec->mutable_data<T>(context.GetPlace()); T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
auto &dev_ctx = context.template device_context<DeviceContext>(); auto &dev_ctx = context.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data, blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
static_cast<T>(0), dvec_data); static_cast<T>(0), dvec_data);
......
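
In the mv_op hunks above, the forward pass calls GEMV with trans = false to get out = X * vec, while the gradient calls it with trans = true so that dvec = X^T * dout. A hedged reference for the GEMV(trans, m, n, alpha, a, x, beta, y) argument order as used here, assuming row-major storage for a (m x n); this is a reading aid, not the library implementation:

// Naive GEMV with the same argument order as the blas.GEMV calls above.
void gemv_ref(bool trans, int m, int n, float alpha, const float* a,
              const float* x, float beta, float* y) {
  if (!trans) {                       // y (m elems) = alpha * A * x + beta * y
    for (int i = 0; i < m; ++i) {
      float acc = 0;
      for (int j = 0; j < n; ++j) acc += a[i * n + j] * x[j];
      y[i] = alpha * acc + beta * y[i];
    }
  } else {                            // y (n elems) = alpha * A^T * x + beta * y
    for (int j = 0; j < n; ++j) {
      float acc = 0;
      for (int i = 0; i < m; ++i) acc += a[i * n + j] * x[i];
      y[j] = alpha * acc + beta * y[j];
    }
  }
}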
...@@ -14,11 +14,11 @@ limitations under the License. */ ...@@ -14,11 +14,11 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/rank_attention.cu.h" #include "paddle/fluid/operators/rank_attention.cu.h"
#include "paddle/fluid/operators/rank_attention_op.h" #include "paddle/fluid/operators/rank_attention_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -114,7 +114,7 @@ class RankAttentionCUDAKernel : public framework::OpKernel<T> { ...@@ -114,7 +114,7 @@ class RankAttentionCUDAKernel : public framework::OpKernel<T> {
int64_t strideA = block_matrix_row; int64_t strideA = block_matrix_row;
int64_t strideB = block_matrix_row * para_col; int64_t strideB = block_matrix_row * para_col;
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
blas.BatchedGEMM(transA, transB, 1, para_col, block_matrix_row, alpha, blas.BatchedGEMM(transA, transB, 1, para_col, block_matrix_row, alpha,
input_help_data, param_help_data, beta, out_data, ins_num, input_help_data, param_help_data, beta, out_data, ins_num,
strideA, strideB); strideA, strideB);
...@@ -170,7 +170,7 @@ class RankAttentionGradOpCUDAKernel : public framework::OpKernel<T> { ...@@ -170,7 +170,7 @@ class RankAttentionGradOpCUDAKernel : public framework::OpKernel<T> {
const T *ins_rank_data = ins_rank->data<T>(); const T *ins_rank_data = ins_rank->data<T>();
T *param_grad_data = param_grad.data<T>(); T *param_grad_data = param_grad.data<T>();
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
T alpha = 1; T alpha = 1;
T beta = 0; T beta = 0;
......
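
The rank_attention kernels above issue BatchedGEMM with M = 1, N = para_col, K = block_matrix_row and per-batch strides strideA/strideB, i.e. ins_num independent row-vector-times-matrix products. A hedged reference loop for that call shape; the row-major layout, absence of transposes, and per-batch output stride of N are assumptions for illustration, not guarantees from this diff:

#include <cstdint>
// for each batch b: C_b (1 x N) = alpha * A_b (1 x K) * B_b (K x N) + beta * C_b
template <typename T>
void batched_gemm_ref(int batch, int N, int K, T alpha, const T* A, const T* B,
                      T beta, T* C, int64_t strideA, int64_t strideB) {
  for (int b = 0; b < batch; ++b) {
    const T* a = A + b * strideA;
    const T* bmat = B + b * strideB;
    T* c = C + b * N;                 // assumed output stride: M * N = N
    for (int n = 0; n < N; ++n) {
      T acc = 0;
      for (int k = 0; k < K; ++k) acc += a[k] * bmat[k * N + n];
      c[n] = alpha * acc + beta * c[n];
    }
  }
}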
(Eight more file diffs in this commit are collapsed and not shown here.)