Unverified commit 8c7ee8c2, authored by Feiyu Chan and committed by GitHub

[Pten] blas and lapack migration (#39587)

* move blas related files
* move lapack related files
Parent 1d6fd81d
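The change is mechanical throughout: the BLAS and LAPACK wrappers keep their interfaces but move from the paddle::operators::math namespace (header paddle/fluid/operators/math/blas.h) to pten::funcs (header paddle/pten/kernels/funcs/blas/blas.h, and likewise for lapack). A minimal before/after sketch of a typical call site follows; AxpyExample is a hypothetical helper for illustration, not a file from this commit:

// Before: BLAS helpers lived in paddle::operators::math.
//   #include "paddle/fluid/operators/math/blas.h"
//   auto blas =
//       paddle::operators::math::GetBlas<platform::CPUDeviceContext, T>(ctx);
// After: the same helpers are reached through pten::funcs.
#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"

template <typename T>
void AxpyExample(const paddle::platform::CPUDeviceContext& ctx, int n,
                 const T* x, T* y) {
  // GetBlas returns a thin wrapper that dispatches to the device's BLAS.
  auto blas = pten::funcs::GetBlas<paddle::platform::CPUDeviceContext, T>(ctx);
  blas.AXPY(n, static_cast<T>(1.0), x, y);  // y[i] += x[i]
}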
@@ -1036,3 +1036,42 @@ function(generate_dummy_static_lib)
   add_library(${dummy_LIB_NAME} STATIC ${dummy_FILE_PATH})
 endfunction()
+
+function(math_library TARGET)
+  # math_library is a function to create a math library.
+  # The interface is the same as cc_library, but it handles
+  # split GPU/CPU code and links some common libraries.
+  set(cc_srcs)
+  set(cu_srcs)
+  set(hip_srcs)
+  set(math_common_deps device_context framework_proto enforce)
+  if (WITH_GPU)
+    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
+      list(APPEND math_common_deps cub)
+    else()
+      list(APPEND math_common_deps)
+    endif()
+  endif()
+  set(multiValueArgs DEPS)
+  cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
+                        "${multiValueArgs}" ${ARGN})
+  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
+    list(APPEND cc_srcs ${TARGET}.cc)
+  endif()
+  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
+    list(APPEND cu_srcs ${TARGET}.cu)
+  endif()
+  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
+    list(APPEND cu_srcs ${TARGET}.cu.cc)
+  endif()
+  list(LENGTH cc_srcs cc_srcs_len)
+  if (WITH_GPU)
+    nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
+  elseif (WITH_ROCM)
+    hip_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
+  elseif(${cc_srcs_len} GREATER 0)
+    cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
+  endif()
+endfunction()
@@ -24,18 +24,16 @@
 #include <utility>
 #include <vector>
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace distributed {
 template <typename T>
-inline paddle::operators::math::BlasT<paddle::platform::CPUDeviceContext, T>
-GetBlas() {
+inline pten::funcs::BlasT<paddle::platform::CPUDeviceContext, T> GetBlas() {
   paddle::platform::CPUDeviceContext cpu_ctx;
-  return paddle::operators::math::GetBlas<paddle::platform::CPUDeviceContext,
-                                          T>(cpu_ctx);
+  return pten::funcs::GetBlas<paddle::platform::CPUDeviceContext, T>(cpu_ctx);
 }
 template <typename T>
......
@@ -1161,8 +1161,7 @@ void GeoCommunicator::SendDense(const CommContext &send_ctx) {
   t_delta->mutable_data<float>(t_latest.dims(), cpu_ctx.GetPlace());
   auto blas =
-      paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
-          cpu_ctx);
+      pten::funcs::GetBlas<platform::CPUDeviceContext, float>(cpu_ctx);
   blas.VSUB(t_latest.numel(), t_latest.data<float>(),
             t_timestamp->data<float>(), t_delta->data<float>());
@@ -1201,8 +1200,7 @@ void GeoCommunicator::RecvDense(const CommContext &send_ctx) {
   t_delta->mutable_data<float>(t_latest->dims(), cpu_ctx.GetPlace());
   auto blas =
-      paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
-          cpu_ctx);
+      pten::funcs::GetBlas<platform::CPUDeviceContext, float>(cpu_ctx);
   blas.VSUB(t_latest->numel(), t_pserver.data<float>(), t_old->data<float>(),
             t_delta->data<float>());
   blas.VADD(t_latest->numel(), t_latest->data<float>(),
@@ -1303,9 +1301,7 @@ void GeoCommunicator::SendSparse(const std::string &varname,
   t_delta->set_rows(sparse_ids);
   t_delta->set_height(t_latest.dims()[0]);
-  auto blas =
-      paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
-          cpu_ctx);
+  auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, float>(cpu_ctx);
   float coefficient = 1.0 / static_cast<float>(trainers_);
   std::vector<float *> push_g_vec;
@@ -1371,9 +1367,7 @@ void GeoCommunicator::RecvSparse(const std::string &varname, int table_id,
   v_delta.resize(numel);
   paddle::platform::CPUDeviceContext cpu_ctx;
-  auto blas =
-      paddle::operators::math::GetBlas<platform::CPUDeviceContext, float>(
-          cpu_ctx);
+  auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, float>(cpu_ctx);
   for (auto j = 0; j < static_cast<int>(keys.size()); ++j) {
     VLOG(5) << "DEBUG GeoCommunicator::RecvSparse recv sparse key" << keys[j]
......
@@ -34,12 +34,12 @@ limitations under the License. */
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/variable.h"
 #include "paddle/fluid/framework/variable_helper.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/place.h"
 #include "paddle/fluid/string/split.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 #include "paddle/fluid/distributed/ps/service/ps_client.h"
......
@@ -17,7 +17,7 @@
 #include <string>
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_version_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace framework {
@@ -121,14 +121,14 @@ static int BuildFusion(Graph* graph, const std::string& name_scope,
   // broadcast biases
   std::vector<float> ones(m, 1.0f);
-  paddle::operators::math::CBlas<float>::GEMM(
+  pten::funcs::CBlas<float>::GEMM(
       CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, 1, alpha, &ones[0], 1,
       &combined_biases[0], n, 0.0f, embeddings_data, n);
   // Wx*embeddings + biases
-  paddle::operators::math::CBlas<float>::GEMM(
-      CblasRowMajor, CblasNoTrans, CblasNoTrans, m, n, k, alpha,
-      embedding_data, k, weightx_data, n, beta, embeddings_data, n);
+  pten::funcs::CBlas<float>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans,
+                                  m, n, k, alpha, embedding_data, k,
+                                  weightx_data, n, beta, embeddings_data, n);
   op_desc.SetInput("Embeddings", {embeddings});
   op_desc.SetInput("H0", {});
......
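The first GEMM in the fuse pass above is a broadcast trick rather than a real matrix product: with k = 1, multiplying an m-by-1 column of ones by the 1-by-n row of combined biases copies that row into every one of the m output rows (beta = 0.0f overwrites the output); the second GEMM then adds Wx * embeddings on top. A standalone sketch of the trick with the migrated raw-pointer interface; BroadcastBias is a hypothetical helper, not part of this commit:

#include <vector>
#include "paddle/pten/kernels/funcs/blas/blas.h"

// Fills each of the m rows of out (m x n, row-major) with the bias row.
void BroadcastBias(int m, int n, const float* bias, float* out) {
  std::vector<float> ones(m, 1.0f);
  // out(m x n) = 1.0 * ones(m x 1) * bias(1 x n) + 0.0 * out
  pten::funcs::CBlas<float>::GEMM(CblasRowMajor, CblasNoTrans, CblasNoTrans, m,
                                  n, /*k=*/1, /*alpha=*/1.0f, ones.data(),
                                  /*lda=*/1, bias, /*ldb=*/n, /*beta=*/0.0f,
                                  out, /*ldc=*/n);
}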
@@ -22,13 +22,13 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/selected_rows_utils.h"
 #include "paddle/fluid/imperative/layer.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/selected_rows_functor.h"
 #include "paddle/fluid/platform/bfloat16.h"
 #include "paddle/fluid/platform/complex.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/profiler.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 #ifdef PADDLE_WITH_XPU
 #include "xpu/refactor/math.h"
@@ -86,7 +86,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
   void operator()(const platform::CPUPlace& place) const {
     platform::CPUDeviceContext* ctx = dynamic_cast<platform::CPUDeviceContext*>(
         platform::DeviceContextPool::Instance().Get(place));
-    auto blas = operators::math::GetBlas<platform::CPUDeviceContext, T>(*ctx);
+    auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(*ctx);
     blas.AXPY(numel_, 1., x_, y_);
   }
@@ -118,7 +118,7 @@ class TensorAddFunctor : public boost::static_visitor<> {
     platform::CUDADeviceContext* ctx =
         dynamic_cast<platform::CUDADeviceContext*>(
            platform::DeviceContextPool::Instance().Get(place));
-    auto blas = operators::math::GetBlas<platform::CUDADeviceContext, T>(*ctx);
+    auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(*ctx);
     blas.AXPY(numel_, 1., x_, y_);
   }
 #else
......
@@ -22,8 +22,8 @@
 #include "paddle/fluid/inference/tensorrt/plugin/qkv_to_context_plugin.h"
 #include "paddle/fluid/inference/tensorrt/plugin/trt_plugin_utils.h"
 #include "paddle/fluid/operators/math/bert_encoder_functor.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device_context.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace inference {
......
@@ -28,9 +28,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #ifdef PADDLE_WITH_MKLDNN
 #include "paddle/fluid/platform/mkldnn_helper.h"
 #endif
......
@@ -19,7 +19,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 namespace ops = paddle::operators;
@@ -94,7 +94,7 @@ class AddMMKernel : public framework::OpKernel<T> {
     float alpha = context.template Attr<float>("Alpha");
     float beta = context.template Attr<float>("Beta");
-    auto blas = math::GetBlas<DeviceContext, T>(context);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
     // calc broadcast dim
     Array2 bcast_dims;
@@ -146,7 +146,7 @@ class AddMMGradKernel : public framework::OpKernel<T> {
     }
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     if (dinput) {
       dinput->mutable_data<T>(ctx.GetPlace());
       total_elems = in_dims[0] * in_dims[1];
......
@@ -16,7 +16,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 namespace paddle {
@@ -122,7 +122,7 @@ class AffineGridOpKernel : public framework::OpKernel<T> {
     GetIdxMap<DeviceContext, T>(n, h, w, align_corners, &grid, ctx);
     // output = grid * theta.T
     // TODO(wanghaoshuang): Refine batched matrix multiply
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
     for (int i = 0; i < n; ++i) {
       Tensor sliced_grid = grid.Slice(i, i + 1).Resize(
           {static_cast<int64_t>(h) * static_cast<int64_t>(w), 3});
@@ -165,7 +165,7 @@ class AffineGridGradOpKernel : public framework::OpKernel<T> {
     GetIdxMap<DeviceContext, T>(n, h, w, align_corners, &grid, ctx);
     // output = grid * theta.T
     // TODO(wanghaoshuang): Refine batched matrix multiply
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
     for (int i = 0; i < n; ++i) {
       Tensor sliced_grid = grid.Slice(i, i + 1).Resize(
           {static_cast<int64_t>(h) * static_cast<int64_t>(w), 3});
......
@@ -17,10 +17,10 @@
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/platform/float16.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace operators {
......
@@ -14,10 +14,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/attention_lstm_op.h"
 #include <string>
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/cpu_vec.h"
 #include "paddle/fluid/operators/math/fc.h"
 #include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace operators {
@@ -373,7 +373,7 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
     T* lstm_x_data = lstm_x->mutable_data<T>(ctx.GetPlace());
     T* lstm_out_data = lstm_out->mutable_data<T>(ctx.GetPlace());
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(ctx);
     // x(TxM) * fc (Mx1) part of atten_wgt(M+D)x1
     auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
......
@@ -15,9 +15,9 @@ limitations under the License. */
 #include <string>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/operators/batch_fc_op.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace operators {
@@ -112,7 +112,7 @@ class BatchFCCUDAKernel : public framework::OpKernel<T> {
     int64_t strideA = ins_num * in_dim;
     int64_t strideB = in_dim * out_dim;
-    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
     blas.BatchedGEMM(transA, transB, ins_num, out_dim, in_dim, alpha, in_data,
                      w_data, beta, out_data, slot_pairs_num, strideA, strideB);
     add_bias<T>(ctx.cuda_device_context().stream(), out_data, slot_pairs_num,
@@ -165,7 +165,7 @@ class BatchFCGradOpCUDAKernel : public framework::OpKernel<T> {
     add_bias_grad<T>(ctx.cuda_device_context().stream(), dout_data,
                      slot_pairs_num, ins_num, out_dim, db_data);
-    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
     T alpha = 1;
     T beta = 0;
......
@@ -16,7 +16,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace operators {
@@ -61,7 +61,7 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> {
     auto output_col_vec = output_mat.chip(i, 1);
     Tensor weight_mat =
         weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim}));
-    math::GetBlas<DeviceContext, T>(dev_ctx).GEMM(
+    pten::funcs::GetBlas<DeviceContext, T>(dev_ctx).GEMM(
         CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1, x->data<T>(),
         weight_mat.data<T>(), 0, left_mul.data<T>());
     output_col_vec.device(place) =
@@ -127,7 +127,7 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
       d_weight->mutable_data<T>(ctx.GetPlace());
     }
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
     // Calculate the Output(X@Grad) and Output(Y@Grad).
     if (d_x || d_y || d_weight) {
......
@@ -20,7 +20,7 @@
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/operator.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 namespace paddle {
 namespace operators {
@@ -28,7 +28,7 @@ namespace operators {
 using Tensor = framework::Tensor;
 static void ReshapeTensorIntoMatrixSequence(
-    framework::Tensor *x, const math::MatDescriptor &descriptor) {
+    framework::Tensor *x, const pten::funcs::MatDescriptor &descriptor) {
   int64_t h, w;
   h = descriptor.height_;
   w = descriptor.width_;
@@ -45,8 +45,8 @@ static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x,
                                            bool trans_y) {
   auto x_dim = x->dims();
   auto y_dim = y->dims();
-  auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, false);
-  auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, false);
+  auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(x_dim, 0, false);
+  auto mat_dim_y = pten::funcs::CreateMatrixDescriptor(y_dim, 0, false);
   out->Resize({std::max(mat_dim_x.batch_size_, mat_dim_y.batch_size_),
                mat_dim_x.height_, mat_dim_y.width_});
@@ -68,10 +68,10 @@ class BmmKernel : public framework::OpKernel<T> {
       return;
     }
-    auto blas = math::GetBlas<DeviceContext, T>(context);
-    auto mat_dim_a = math::CreateMatrixDescriptor(x.dims(), 0, false);
-    auto mat_dim_b = math::CreateMatrixDescriptor(y.dims(), 0, false);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
+    auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(x.dims(), 0, false);
+    auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(y.dims(), 0, false);
     // auto scale = static_cast<T>(context.Attr<float>("alpha"));
     blas.MatMul(x, mat_dim_a, y, mat_dim_b, T(1), out, T(0));
@@ -86,9 +86,9 @@ class BmmGradKernel : public framework::OpKernel<T> {
                   const framework::Tensor &b, bool trans_b,
                   framework::Tensor *out) const {
     out->mutable_data<T>(context.GetPlace());
-    auto blas = math::GetBlas<DeviceContext, T>(context);
-    auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a);
-    auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
+    auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a);
+    auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(b.dims(), 0, trans_b);
     blas.MatMul(a, mat_dim_a, b, mat_dim_b, T(1), out, T(0));
   }
......
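The bmm kernel above is the clearest picture of the migrated descriptor API: pten::funcs::CreateMatrixDescriptor(dims, num_flatten_cols, trans) packs a tensor's shape into a MatDescriptor (height_, width_, batch_size_, stride_, trans_), and Blas::MatMul consumes a descriptor pair. A hedged sketch of the same pattern as a free function; BatchedMatMul is illustrative, assuming x: [b, m, k], y: [b, k, n], and a pre-allocated out: [b, m, n]:

#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"

// out[i] = x[i] @ y[i] for every matrix i in the batch.
template <typename T>
void BatchedMatMul(const paddle::platform::CPUDeviceContext& dev_ctx,
                   const paddle::framework::Tensor& x,
                   const paddle::framework::Tensor& y,
                   paddle::framework::Tensor* out) {
  auto blas =
      pten::funcs::GetBlas<paddle::platform::CPUDeviceContext, T>(dev_ctx);
  // num_flatten_cols = 0 keeps the batch dimension; false = no transpose.
  auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(x.dims(), 0, false);
  auto mat_dim_y = pten::funcs::CreateMatrixDescriptor(y.dims(), 0, false);
  // out = 1 * (x @ y) + 0 * out, batched over the leading dimension.
  blas.MatMul(x, mat_dim_x, y, mat_dim_y, T(1), out, T(0));
}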
@@ -19,8 +19,8 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/transform.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace operators {
@@ -84,7 +84,7 @@ class CenterLossKernel : public framework::OpKernel<T> {
     int numel = centers_diffacc.numel();
     std::memset(centers_diffacc_data, 0, sizeof(T) * numel);
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
     int tLabel;
     const T *x_index;
......
@@ -19,9 +19,9 @@ limitations under the License. */
 #include "Eigen/Cholesky"
 #include "Eigen/Core"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/transpose_op.h"
 #include "paddle/fluid/platform/for_range.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace operators {
@@ -323,9 +323,9 @@ class CholeskyGradKernel : public framework::OpKernel<T> {
     /*! phi = matmul(L.transpose(-1, -2), grad) */
     Tensor middle;
     auto* middle_data = middle.mutable_data<T>(dims, context.GetPlace());
-    auto trans_desc = math::CreateMatrixDescriptor(dims, 0, true);
-    auto no_trans_desc = math::CreateMatrixDescriptor(dims, 0, false);
-    auto blas = math::GetBlas<DeviceContext, T>(context);
+    auto trans_desc = pten::funcs::CreateMatrixDescriptor(dims, 0, true);
+    auto no_trans_desc = pten::funcs::CreateMatrixDescriptor(dims, 0, false);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
     blas.MatMul(l, trans_desc, l_grad, no_trans_desc, T(1), &middle, T(0));
     /*! phi.tril_().diagonal(0, -2, -1).mul_(0.5) */
......
@@ -15,11 +15,11 @@ limitations under the License. */
 #pragma once
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/lapack_function.h"
 #include "paddle/fluid/operators/solve_op.h"
 #include "paddle/fluid/operators/svd_helper.h"
 #include "paddle/fluid/operators/triangular_solve_op.h"
 #include "paddle/fluid/platform/complex.h"
+#include "paddle/pten/kernels/funcs/lapack/lapack_function.h"
 #include "paddle/pten/kernels/math_kernel.h"
 namespace paddle {
@@ -38,8 +38,8 @@ class CholeskySolveFunctor<paddle::platform::CPUDeviceContext, T> {
   void operator()(const platform::CPUDeviceContext &dev_ctx, bool upper, int n,
                   int nrhs, T *Adata, int lda, T *Bdata, int *devInfo) {
     char uplo = upper ? 'U' : 'L';
-    math::lapackCholeskySolve<T>(uplo, n, nrhs, Adata, lda, Bdata, lda,
-                                 devInfo);
+    pten::funcs::lapackCholeskySolve<T>(uplo, n, nrhs, Adata, lda, Bdata, lda,
+                                        devInfo);
   }
 };
@@ -168,7 +168,7 @@ class CholeskySolveGradKernel : public framework::OpKernel<T> {
       db->Resize(bin->dims());
     }
-    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
     // calculate out's conjugate for complex
     framework::Tensor out_conj(out->type());
@@ -182,8 +182,8 @@ class CholeskySolveGradKernel : public framework::OpKernel<T> {
     framework::Tensor commonterm(out->type());
     auto outdims = out_conj.dims();
     auto dbdims = db_bst.dims();
-    auto mat_dim_a = math::CreateMatrixDescriptor(outdims, 0, false);
-    auto mat_dim_b = math::CreateMatrixDescriptor(dbdims, 0, false);
+    auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(outdims, 0, false);
+    auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(dbdims, 0, false);
     auto cmtdim = outdims;
     cmtdim[cmtdim.size() - 2] = dbdims[dbdims.size() - 2];
     commonterm.Resize(cmtdim);
@@ -207,9 +207,10 @@ class CholeskySolveGradKernel : public framework::OpKernel<T> {
                                       DeviceContext>::TYPE &>(dev_ctx),
           commonterm, commonterm_conj, -1, &commonterm);
-      auto mat_dim_u = math::CreateMatrixDescriptor(u_bst.dims(), 0, false);
+      auto mat_dim_u =
+          pten::funcs::CreateMatrixDescriptor(u_bst.dims(), 0, false);
       auto mat_dim_c =
-          math::CreateMatrixDescriptor(commonterm.dims(), 0, false);
+          pten::funcs::CreateMatrixDescriptor(commonterm.dims(), 0, false);
       Tensor du_bst(uin->type());
       // get upper or lower triangular
......
@@ -21,10 +21,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/layout_utils.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/depthwise_conv.h"
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/fluid/operators/math/vol2col.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace operators {
@@ -332,7 +332,7 @@ class GemmConvKernel : public framework::OpKernel<T> {
     math::Vol2ColFunctor<DeviceContext, T> vol2col;
     math::Im2ColFunctor<math::ColFormat::kCFO, DeviceContext, T> im2col;
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     for (int i = 0; i < batch_size; i++) {
       Tensor in_batch =
           transformed_input.Slice(i, i + 1).Resize(in_matrix_shape);
@@ -486,7 +486,7 @@ class GemmConvGradKernel : public framework::OpKernel<T> {
     }
     pten::funcs::SetConstant<DeviceContext, T> set_zero;
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     if (input_grad) {
       input_grad->mutable_data<T>(context.GetPlace());
@@ -693,7 +693,7 @@ class GemmConvDoubleGradKernel : public framework::OpKernel<T> {
     }
     pten::funcs::SetConstant<DeviceContext, T> set_zero;
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     // dx convolution double grad: gemm + col2im(col2vol)
     // dx = ddw * dy ==> dx(N, Cin, H, W), ddw(Cout, Cin, kh, kw), dy(N, Cout,
......
@@ -20,11 +20,11 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/conv_op.h"
 #include "paddle/fluid/operators/eigen/eigen_function.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
 #include "paddle/fluid/operators/math/depthwise_conv.h"
 #include "paddle/fluid/operators/math/im2col.h"
 #include "paddle/fluid/operators/math/vol2col.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace operators {
@@ -228,7 +228,7 @@ class GemmConvTransposeKernel : public framework::OpKernel<T> {
     output->mutable_data<T>(context.GetPlace());
     pten::funcs::SetConstant<DeviceContext, T> set_zero;
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     set_zero(dev_ctx, output, static_cast<T>(0));
     int in_step =
@@ -425,7 +425,7 @@ class GemmConvTransposeGradKernel : public framework::OpKernel<T> {
     // im2col + gemm (similar to conv-forward)
     // input need to compute gradient
     auto& dev_ctx = context.template device_context<DeviceContext>();
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     if (input_grad || filter_grad) {
       Tensor col;
       col.mutable_data<T>(col_shape, context.GetPlace());
......
@@ -22,7 +22,7 @@
 // \author Yi Li, Guodong Zhang, Jifeng Dai
 #pragma once
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 template <typename T>
......
@@ -22,8 +22,8 @@
 // \author Yi Li, Guodong Zhang, Jifeng Dai
 #pragma once
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/pten/core/hostdevice.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 template <typename T>
......
@@ -25,8 +25,8 @@
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/deformable_conv_op.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 namespace paddle {
@@ -513,7 +513,7 @@ class DeformableConvCUDAKernel : public framework::OpKernel<T> {
     int input_offset_dim = offset.numel() / offset.dims()[0];
     int input_mask_dim = mask.numel() / mask.dims()[0];
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     const T* input_ptr = input->data<T>();
     const T* offset_ptr = offset.data<T>();
@@ -624,7 +624,7 @@ class DeformableConvGradCUDAKernel : public framework::OpKernel<T> {
     col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);
     pten::funcs::SetConstant<DeviceContext, T> set_zero;
-    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
     col_buffer.mutable_data<T>(ctx.GetPlace());
     col_buffer_3d.mutable_data<T>(ctx.GetPlace());
......
@@ -26,7 +26,7 @@
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/deformable_conv_func.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 namespace paddle {
@@ -382,7 +382,7 @@ class DeformableConvCPUKernel : public framework::OpKernel<T> {
     int input_dim = input->numel() / input->dims()[0];
     int input_offset_dim = offset->numel() / offset->dims()[0];
     int input_mask_dim = mask->numel() / mask->dims()[0];
-    auto blas = math::GetBlas<CPUDeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<CPUDeviceContext, T>(dev_ctx);
     const T* input_ptr = input->data<T>();
     const T* offset_ptr = offset->data<T>();
     const T* mask_ptr = mask->data<T>();
@@ -490,7 +490,7 @@ class DeformableConvGradCPUKernel : public framework::OpKernel<T> {
     col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);
     pten::funcs::SetConstant<CPUDeviceContext, T> set_zero;
-    auto blas = math::GetBlas<CPUDeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<CPUDeviceContext, T>(dev_ctx);
     col_buffer.mutable_data<T>(ctx.GetPlace());
     col_buffer_3d.mutable_data<T>(ctx.GetPlace());
......
@@ -28,8 +28,8 @@
 #include "paddle/fluid/operators/deformable_conv_filter.cu.h"
 #include "paddle/fluid/operators/deformable_conv_func.h"
 #include "paddle/fluid/operators/deformable_conv_v1_op.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 namespace paddle {
@@ -381,7 +381,7 @@ class DeformableConvV1CUDAKernel : public framework::OpKernel<T> {
     int input_dim = input->numel() / input->dims()[0];
     int input_offset_dim = offset.numel() / offset.dims()[0];
-    auto blas = math::GetBlas<CUDADeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<CUDADeviceContext, T>(dev_ctx);
     const T* input_ptr = input->data<T>();
     const T* offset_ptr = offset.data<T>();
@@ -490,7 +490,7 @@ class DeformableConvV1GradCUDAKernel : public framework::OpKernel<T> {
     col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);
     pten::funcs::SetConstant<CUDADeviceContext, T> set_zero;
-    auto blas = math::GetBlas<CUDADeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<CUDADeviceContext, T>(dev_ctx);
     col_buffer.mutable_data<T>(ctx.GetPlace());
     col_buffer_3d.mutable_data<T>(ctx.GetPlace());
......
@@ -27,7 +27,7 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/deformable_conv_func.h"
 #include "paddle/fluid/operators/deformable_conv_op.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 namespace paddle {
@@ -348,7 +348,7 @@ class DeformableConvV1CPUKernel : public framework::OpKernel<T> {
     std::vector<int64_t> input_shape_vec = framework::vectorize(input_shape);
     int input_dim = input->numel() / input->dims()[0];
     int input_offset_dim = offset->numel() / offset->dims()[0];
-    auto blas = math::GetBlas<CPUDeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<CPUDeviceContext, T>(dev_ctx);
     const T* input_ptr = input->data<T>();
     const T* offset_ptr = offset->data<T>();
     col_buffer.mutable_data<T>(ctx.GetPlace());
@@ -452,7 +452,7 @@ class DeformableConvV1GradCPUKernel : public framework::OpKernel<T> {
     col_buffer_3d.ShareDataWith(col_buffer).Resize(col_buffer_3d_shape);
     pten::funcs::SetConstant<CPUDeviceContext, T> set_zero;
-    auto blas = math::GetBlas<CPUDeviceContext, T>(dev_ctx);
+    auto blas = pten::funcs::GetBlas<CPUDeviceContext, T>(dev_ctx);
     col_buffer.mutable_data<T>(ctx.GetPlace());
     col_buffer_3d.mutable_data<T>(ctx.GetPlace());
......
@@ -16,7 +16,7 @@
 #include <iostream>
 #include <memory>
 #include <vector>
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace operators {
......
@@ -30,8 +30,8 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/malloc.h"
 #include "paddle/fluid/operators/deformable_psroi_pooling_op.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 namespace paddle {
......
@@ -26,7 +26,7 @@
 #include <iostream>
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 namespace paddle {
......
@@ -17,12 +17,12 @@
 #include <math.h>
 #include <algorithm>
 #include <complex>
-#include "paddle/fluid/operators/math/lapack_function.h"
 #include "paddle/fluid/operators/math/matrix_solve.h"
 #include "paddle/fluid/operators/svd_helper.h"
 #include "paddle/fluid/operators/transpose_op.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/pten/kernels/funcs/complex_functors.h"
+#include "paddle/pten/kernels/funcs/lapack/lapack_function.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 #define EPSILON 1e-6
@@ -94,7 +94,7 @@ void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info,
   // call lapackEig once to compute the size of work;
   T computed_work_size;
-  math::lapackEig<T, pten::funcs::Real<T>>(
+  pten::funcs::lapackEig<T, pten::funcs::Real<T>>(
       jobvl, jobvr, order, input_data, lda, values_data, lvector_data, ldvl,
      rvector_data, ldvr, &computed_work_size, lwork, rwork_data, &info);
@@ -109,7 +109,7 @@ void LapackEig(Tensor* input, Tensor* values, Tensor* vectors, int info,
     T* current_values = &values_data[i * values_stride];
     T* current_rvectors = &rvector_data[i * matrix_stride];
-    math::lapackEig<T, pten::funcs::Real<T>>(
+    pten::funcs::lapackEig<T, pten::funcs::Real<T>>(
         jobvl, jobvr, order, current_matrix, lda, current_values, lvector_data,
         ldvl, current_rvectors, ldvr, work_data, lwork, rwork_data, &info);
     PADDLE_ENFORCE_EQ(
......
@@ -20,9 +20,9 @@
 #include "paddle/fluid/framework/ddim.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/allocation/allocator.h"
-#include "paddle/fluid/operators/math/lapack_function.h"
 #include "paddle/fluid/platform/for_range.h"
 #include "paddle/pten/kernels/funcs/complex_functors.h"
+#include "paddle/pten/kernels/funcs/lapack/lapack_function.h"
 namespace paddle {
 namespace operators {
@@ -103,11 +103,11 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input,
                         required_work_mem, work_mem));
   int info = 0;
-  math::lapackEig<T>('N', 'N', static_cast<int>(n_dim), a.template data<T>(),
-                     static_cast<int>(n_dim), w_data, NULL, 1, NULL, 1,
-                     work->template data<T>(),
-                     static_cast<int>(work_mem / sizeof(T)),
-                     static_cast<T*>(NULL), &info);
+  pten::funcs::lapackEig<T>('N', 'N', static_cast<int>(n_dim),
+                            a.template data<T>(), static_cast<int>(n_dim),
+                            w_data, NULL, 1, NULL, 1, work->template data<T>(),
+                            static_cast<int>(work_mem / sizeof(T)),
+                            static_cast<T*>(NULL), &info);
   std::string name = "framework::platform::dynload::dgeev_";
   if (framework::TransToProtoVarType(input.dtype()) ==
@@ -153,7 +153,7 @@ LapackEigvals(const framework::ExecutionContext& ctx, const Tensor& input,
                         required_rwork_mem, rwork_mem));
   int info = 0;
-  math::lapackEig<T, pten::funcs::Real<T>>(
+  pten::funcs::lapackEig<T, pten::funcs::Real<T>>(
       'N', 'N', static_cast<int>(n_dim), a.template data<T>(),
      static_cast<int>(n_dim), output->template data<T>(), NULL, 1, NULL, 1,
      work->template data<T>(), static_cast<int>(work_mem / sizeof(T)),
@@ -187,10 +187,10 @@ class EigvalsKernel : public framework::OpKernel<T> {
     // query workspace size
     T qwork;
     int info;
-    math::lapackEig<T, pten::funcs::Real<T>>(
+    pten::funcs::lapackEig<T, pten::funcs::Real<T>>(
         'N', 'N', static_cast<int>(n_dim), input_matrices[0].template data<T>(),
         static_cast<int>(n_dim), NULL, NULL, 1, NULL, 1, &qwork, -1,
-        static_cast<pten::funcs::Real<T>*>(NULL), &info);
+        static_cast<Real<T>*>(NULL), &info);
     int64_t lwork = static_cast<int64_t>(qwork);
     Tensor work, rwork;
......
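The eigvals kernel above follows LAPACK's two-call workspace protocol: a first lapackEig call with lwork = -1 computes nothing but stores the optimal workspace size in the first work element (qwork); the caller then allocates that buffer and repeats the call to do the real computation. A minimal sketch of the protocol with the migrated wrapper, assuming the signature used in this diff; EigvalsSketch is hypothetical, a is an n x n column-major double matrix (overwritten), and w receives the eigenvalues:

#include <vector>
#include "paddle/pten/kernels/funcs/lapack/lapack_function.h"

void EigvalsSketch(int n, double* a, double* w) {
  int info = 0;
  double qwork = 0.0;
  // Workspace query: lwork = -1 makes *geev report the optimal size in qwork.
  pten::funcs::lapackEig<double>('N', 'N', n, a, n, w, nullptr, 1, nullptr, 1,
                                 &qwork, -1, static_cast<double*>(nullptr),
                                 &info);
  std::vector<double> work(static_cast<size_t>(qwork));
  // Real call: 'N', 'N' asks for eigenvalues only, no eigenvectors.
  pten::funcs::lapackEig<double>('N', 'N', n, a, n, w, nullptr, 1, nullptr, 1,
                                 work.data(), static_cast<int>(work.size()),
                                 static_cast<double*>(nullptr), &info);
}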
@@ -28,7 +28,7 @@ struct SameDimsElemwiseMul<
   void operator()(const framework::ExecutionContext &ctx,
                   const framework::Tensor *x, const framework::Tensor *y,
                   framework::Tensor *z) {
-    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
+    auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(ctx);
     blas.VMUL(x->numel(), x->data<T>(), y->data<T>(), z->data<T>());
   }
 };
......
@@ -19,9 +19,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/platform/transform.h"
 #include "paddle/pten/core/hostdevice.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 namespace paddle {
 namespace operators {
......
@@ -16,12 +16,12 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/pten_utils.h"
-#include "paddle/fluid/operators/math/blas.h"
 #include "paddle/fluid/operators/math/pooling.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/pten/kernels/empty_kernel.h"
 #include "paddle/pten/kernels/flatten_grad_kernel.h"
 #include "paddle/pten/kernels/flatten_kernel.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
 #include "paddle/pten/kernels/funcs/math_function.h"
 namespace paddle {
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/pten/kernels/funcs/math_function.h"
namespace paddle { namespace paddle {
...@@ -39,16 +39,16 @@ class FSPOpKernel : public framework::OpKernel<T> { ...@@ -39,16 +39,16 @@ class FSPOpKernel : public framework::OpKernel<T> {
auto height = x_dims[2]; auto height = x_dims[2];
auto width = x_dims[3]; auto width = x_dims[3];
auto blas = math::GetBlas<DeviceContext, T>(context); auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
math::MatDescriptor x_mat_desc; pten::funcs::MatDescriptor x_mat_desc;
x_mat_desc.height_ = x_channel; x_mat_desc.height_ = x_channel;
x_mat_desc.width_ = height * width; x_mat_desc.width_ = height * width;
x_mat_desc.batch_size_ = batch_size; x_mat_desc.batch_size_ = batch_size;
x_mat_desc.stride_ = x_channel * height * width; x_mat_desc.stride_ = x_channel * height * width;
x_mat_desc.trans_ = false; x_mat_desc.trans_ = false;
math::MatDescriptor y_mat_desc; pten::funcs::MatDescriptor y_mat_desc;
y_mat_desc.height_ = height * width; y_mat_desc.height_ = height * width;
y_mat_desc.width_ = y_channel; y_mat_desc.width_ = y_channel;
y_mat_desc.batch_size_ = batch_size; y_mat_desc.batch_size_ = batch_size;
...@@ -78,7 +78,7 @@ class FSPGradOpKernel : public framework::OpKernel<T> { ...@@ -78,7 +78,7 @@ class FSPGradOpKernel : public framework::OpKernel<T> {
int64_t h = 0; int64_t h = 0;
int64_t w = 0; int64_t w = 0;
auto blas = math::GetBlas<DeviceContext, T>(context); auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
pten::funcs::SetConstant<DeviceContext, T> set_zero; pten::funcs::SetConstant<DeviceContext, T> set_zero;
if (d_x != nullptr) { if (d_x != nullptr) {
d_x->mutable_data<T>(context.GetPlace()); d_x->mutable_data<T>(context.GetPlace());
...@@ -89,14 +89,14 @@ class FSPGradOpKernel : public framework::OpKernel<T> { ...@@ -89,14 +89,14 @@ class FSPGradOpKernel : public framework::OpKernel<T> {
h = y_dims[2]; h = y_dims[2];
w = y_dims[3]; w = y_dims[3];
math::MatDescriptor d_out_mat_desc; pten::funcs::MatDescriptor d_out_mat_desc;
d_out_mat_desc.height_ = x_channel; d_out_mat_desc.height_ = x_channel;
d_out_mat_desc.width_ = y_channel; d_out_mat_desc.width_ = y_channel;
d_out_mat_desc.batch_size_ = batch_size; d_out_mat_desc.batch_size_ = batch_size;
d_out_mat_desc.stride_ = x_channel * y_channel; d_out_mat_desc.stride_ = x_channel * y_channel;
d_out_mat_desc.trans_ = false; d_out_mat_desc.trans_ = false;
math::MatDescriptor y_mat_desc; pten::funcs::MatDescriptor y_mat_desc;
y_mat_desc.height_ = y_channel; y_mat_desc.height_ = y_channel;
y_mat_desc.width_ = h * w; y_mat_desc.width_ = h * w;
y_mat_desc.batch_size_ = batch_size; y_mat_desc.batch_size_ = batch_size;
...@@ -116,14 +116,14 @@ class FSPGradOpKernel : public framework::OpKernel<T> { ...@@ -116,14 +116,14 @@ class FSPGradOpKernel : public framework::OpKernel<T> {
h = x_dims[2]; h = x_dims[2];
w = x_dims[3]; w = x_dims[3];
math::MatDescriptor d_out_mat_desc; pten::funcs::MatDescriptor d_out_mat_desc;
d_out_mat_desc.height_ = y_channel; d_out_mat_desc.height_ = y_channel;
d_out_mat_desc.width_ = x_channel; d_out_mat_desc.width_ = x_channel;
d_out_mat_desc.batch_size_ = batch_size; d_out_mat_desc.batch_size_ = batch_size;
d_out_mat_desc.stride_ = x_channel * y_channel; d_out_mat_desc.stride_ = x_channel * y_channel;
d_out_mat_desc.trans_ = true; d_out_mat_desc.trans_ = true;
math::MatDescriptor x_mat_desc; pten::funcs::MatDescriptor x_mat_desc;
x_mat_desc.height_ = x_channel; x_mat_desc.height_ = x_channel;
x_mat_desc.width_ = h * w; x_mat_desc.width_ = h * w;
x_mat_desc.batch_size_ = batch_size; x_mat_desc.batch_size_ = batch_size;
......
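For the fsp_op hunks above, the only data structure touched is MatDescriptor, which tells blas.MatMul how to read a flat buffer as a batch of matrices. A sketch of how the fields fit together, mirroring the x_mat_desc setup in the hunk (variable names illustrative):

pten::funcs::MatDescriptor x_desc;
x_desc.height_     = channels;                   // rows of each matrix in the batch
x_desc.width_      = height * width;             // columns of each matrix
x_desc.batch_size_ = batch_size;                 // how many matrices are stacked
x_desc.stride_     = channels * height * width;  // element offset between consecutive matrices
x_desc.trans_      = false;                      // transpose before multiplying?
// blas.MatMul(x, x_desc, y, y_desc, alpha, out, beta) then issues one batched GEMM.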
...@@ -15,8 +15,8 @@ limitations under the License. */ ...@@ -15,8 +15,8 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/operators/fused/attn_bias_add.cu.h" #include "paddle/fluid/operators/fused/attn_bias_add.cu.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -47,7 +47,7 @@ class FeedForward { ...@@ -47,7 +47,7 @@ class FeedForward {
// column-major: (m,n,k) = output_size,bsz_seq,input_size (weight*input=out) // column-major: (m,n,k) = output_size,bsz_seq,input_size (weight*input=out)
// here: (m,n,k) = bsz_seq,output_size,input_size (input*weight=out) // here: (m,n,k) = bsz_seq,output_size,input_size (input*weight=out)
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_); auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_);
blas.GEMM(transA, transB, bsz_seq_, output_size_, input_size_, alpha, blas.GEMM(transA, transB, bsz_seq_, output_size_, input_size_, alpha,
input_data, weight_data, beta, output_data); input_data, weight_data, beta, output_data);
if (compute_bias_) { if (compute_bias_) {
...@@ -60,7 +60,7 @@ class FeedForward { ...@@ -60,7 +60,7 @@ class FeedForward {
T* d_weight, T* d_bias) { T* d_weight, T* d_bias) {
T alpha = static_cast<T>(1.0); T alpha = static_cast<T>(1.0);
T beta = static_cast<T>(0.0); T beta = static_cast<T>(0.0);
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_); auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_);
// column-major: gemm-nt, get d_weight. // column-major: gemm-nt, get d_weight.
CBLAS_TRANSPOSE transA = CblasTrans; CBLAS_TRANSPOSE transA = CblasTrans;
......
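The FeedForward forward pass above is a single row-major GEMM: out[bsz_seq, output_size] = input[bsz_seq, input_size] x weight[input_size, output_size]. A sketch with the (m, n, k) mapping made explicit (pointer names illustrative):

auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_);
blas.GEMM(CblasNoTrans, CblasNoTrans,
          /*m=*/bsz_seq, /*n=*/output_size, /*k=*/input_size,
          /*alpha=*/static_cast<T>(1.0), input_data, weight_data,
          /*beta=*/static_cast<T>(0.0), output_data);
// The backward pass reuses the same GEMM with CblasTrans operands:
// d_weight = input^T * d_out and d_input = d_out * weight^T.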
...@@ -11,8 +11,8 @@ limitations under the License. */ ...@@ -11,8 +11,8 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h" #include "paddle/fluid/operators/elementwise/elementwise_op_broadcast.cu.h"
#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h" #include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
...@@ -56,7 +56,7 @@ class AttnMatMul { ...@@ -56,7 +56,7 @@ class AttnMatMul {
T beta = static_cast<T>(0.0); T beta = static_cast<T>(0.0);
// here: (m, n, k) = bsz_seq, output_size, input_size, (input, weight, out) // here: (m, n, k) = bsz_seq, output_size, input_size, (input, weight, out)
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_); auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_);
blas.GEMM(transA, transB, bsz_seq_, output_size_, input_size_, alpha, blas.GEMM(transA, transB, bsz_seq_, output_size_, input_size_, alpha,
input->data<T>(), weight->data<T>(), beta, output->data<T>()); input->data<T>(), weight->data<T>(), beta, output->data<T>());
if (compute_bias_) { if (compute_bias_) {
...@@ -80,7 +80,7 @@ class AttnMatMul { ...@@ -80,7 +80,7 @@ class AttnMatMul {
framework::Tensor* d_bias) { framework::Tensor* d_bias) {
T alpha = static_cast<T>(1.0); T alpha = static_cast<T>(1.0);
T beta = static_cast<T>(0.0); T beta = static_cast<T>(0.0);
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_); auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_);
CBLAS_TRANSPOSE dB_transA = CblasNoTrans; CBLAS_TRANSPOSE dB_transA = CblasNoTrans;
CBLAS_TRANSPOSE dB_transB = CblasNoTrans; CBLAS_TRANSPOSE dB_transB = CblasNoTrans;
......
...@@ -99,7 +99,7 @@ class FMHARef { ...@@ -99,7 +99,7 @@ class FMHARef {
// q*k^t, batched_gemm // q*k^t, batched_gemm
CBLAS_TRANSPOSE transA = CblasNoTrans; CBLAS_TRANSPOSE transA = CblasNoTrans;
CBLAS_TRANSPOSE transB = CblasTrans; CBLAS_TRANSPOSE transB = CblasTrans;
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_); auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_);
int gemm_batch_size = batch_size_ * num_head_; int gemm_batch_size = batch_size_ * num_head_;
int gemm_m = seq_len_; int gemm_m = seq_len_;
int gemm_n = seq_len_; int gemm_n = seq_len_;
...@@ -174,7 +174,7 @@ class FMHARef { ...@@ -174,7 +174,7 @@ class FMHARef {
Tensor* softmax_out_grad_tensor, Tensor* src_mask_out_grad_tensor, Tensor* softmax_out_grad_tensor, Tensor* src_mask_out_grad_tensor,
Tensor* qk_out_grad_tensor, Tensor* transpose_2_out_grad_tensor, Tensor* qk_out_grad_tensor, Tensor* transpose_2_out_grad_tensor,
Tensor* src_mask_grad_tensor, Tensor* qkv_input_grad_tensor) { Tensor* src_mask_grad_tensor, Tensor* qkv_input_grad_tensor) {
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_); auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx_);
int q_size = batch_size_ * seq_len_ * num_head_ * head_dim_; int q_size = batch_size_ * seq_len_ * num_head_ * head_dim_;
int k_size = q_size; int k_size = q_size;
int softmax_axis = -1; int softmax_axis = -1;
......
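In the FMHARef hunks above, the attention scores q*k^T are computed with one strided batched GEMM over batch_size * num_head slices. A rough sketch, assuming the BatchedGEMM overload with explicit strides (the exact overload is an assumption; names illustrative):

int gemm_batch_size = batch_size * num_head;
int m = seq_len, n = seq_len, k = head_dim;
int64_t stride_a = static_cast<int64_t>(m) * k;  // one q slice per (batch, head)
int64_t stride_b = static_cast<int64_t>(k) * n;  // one k slice per (batch, head)
blas.BatchedGEMM(CblasNoTrans, CblasTrans, m, n, k, alpha,
                 q_ptr, k_ptr, beta, qk_out_ptr,
                 gemm_batch_size, stride_a, stride_b);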
...@@ -18,7 +18,7 @@ ...@@ -18,7 +18,7 @@
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/math/bert_encoder_functor.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,10 +14,10 @@ limitations under the License. */ ...@@ -14,10 +14,10 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h" #include "paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.h"
#include <string> #include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -364,7 +364,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> { ...@@ -364,7 +364,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
T* xx_data = xx->mutable_data<T>(place); T* xx_data = xx->mutable_data<T>(place);
T* h_out_data = hidden_out->mutable_data<T>(place); T* h_out_data = hidden_out->mutable_data<T>(place);
T* c_out_data = cell_out->mutable_data<T>(place); T* c_out_data = cell_out->mutable_data<T>(place);
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
for (int64_t i = 0; i < ids_numel; ++i) { for (int64_t i = 0; i < ids_numel; ++i) {
PADDLE_ENFORCE_LT( PADDLE_ENFORCE_LT(
...@@ -475,7 +475,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> { ...@@ -475,7 +475,7 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch; math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
for (int64_t i = 0; i < ids_numel; ++i) { for (int64_t i = 0; i < ids_numel; ++i) {
PADDLE_ENFORCE_LT( PADDLE_ENFORCE_LT(
......
...@@ -23,7 +23,7 @@ limitations under the License. */ ...@@ -23,7 +23,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -179,7 +179,7 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> { ...@@ -179,7 +179,7 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
const int m = batch_size * idx_width; const int m = batch_size * idx_width;
const int n = table_width; const int n = table_width;
const int k = table_height; const int k = table_height;
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context); auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
blas.CSRMM(&transa, &m, &n, &k, &alpha, matdescra, (const T *)csr_vals, blas.CSRMM(&transa, &m, &n, &k, &alpha, matdescra, (const T *)csr_vals,
(const int *)csr_colmuns, (const int *)csr_row_idx, (const int *)csr_colmuns, (const int *)csr_row_idx,
(const int *)csr_row_idx + 1, weights, &n, &beta, output, &n); (const int *)csr_row_idx + 1, weights, &n, &beta, output, &n);
...@@ -277,7 +277,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> { ...@@ -277,7 +277,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
csr_colmuns, csr_row_idx, padding_idx); csr_colmuns, csr_row_idx, padding_idx);
auto *d_output_data = d_output->data<T>(); auto *d_output_data = d_output->data<T>();
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context); auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
int width = static_cast<int>(table_dim[1]); int width = static_cast<int>(table_dim[1]);
int num_seq = batch_size * idx_width; int num_seq = batch_size * idx_width;
LOG(INFO) << "num seq = " << num_seq << " width = " << width; LOG(INFO) << "num seq = " << num_seq << " width = " << width;
......
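The CSRMM call above multiplies a sparse selection matrix (built from the lookup ids) by the dense embedding table. For reference, a sketch of the three CSR arrays it consumes, shown for a toy 2x3 matrix [[1,0,2],[0,3,0]] (values illustrative):

T   csr_vals[]    = {1, 2, 3};  // non-zero values, scanned row by row
int csr_colmuns[] = {0, 2, 1};  // column index of each non-zero
int csr_row_idx[] = {0, 2, 3};  // row i owns entries [csr_row_idx[i], csr_row_idx[i+1])
// CSRMM computes output[m,n] = alpha * sparse[m,k] * weights[k,n] + beta * output[m,n],
// which is why the kernel passes csr_row_idx and csr_row_idx + 1 as the two row pointers.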
...@@ -21,9 +21,9 @@ namespace cub = hipcub; ...@@ -21,9 +21,9 @@ namespace cub = hipcub;
#endif #endif
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h" #include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h" #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -150,7 +150,7 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> { ...@@ -150,7 +150,7 @@ class FusedFCElementwiseLayerNormOpKernel : public framework::OpKernel<T> {
T* out_data = out->mutable_data<T>(ctx.GetPlace()); T* out_data = out->mutable_data<T>(ctx.GetPlace());
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>(); auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
blas.GEMM(false, false, M, N, K, static_cast<T>(1.0), x_data, K, w_data, N, blas.GEMM(false, false, M, N, K, static_cast<T>(1.0), x_data, K, w_data, N,
static_cast<T>(0.0), out_data, N); static_cast<T>(0.0), out_data, N);
......
...@@ -17,8 +17,8 @@ limitations under the License. */ ...@@ -17,8 +17,8 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/matmul_v2_op.h" #include "paddle/fluid/operators/matmul_v2_op.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -49,8 +49,8 @@ class FusedFeedForwardOp : public framework::OperatorWithKernel { ...@@ -49,8 +49,8 @@ class FusedFeedForwardOp : public framework::OperatorWithKernel {
"fused_feedforward"); "fused_feedforward");
auto dim_x = context->GetInputDim("X"); auto dim_x = context->GetInputDim("X");
auto mat_dim_x = auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(
math::CreateMatrixDescriptor(RowMatrixFromVector(dim_x), 0, false); RowMatrixFromVector(dim_x), 0, false);
// verify for the pre layer_norm, the feature size must be larger than 1 // verify for the pre layer_norm, the feature size must be larger than 1
PADDLE_ENFORCE_GT( PADDLE_ENFORCE_GT(
mat_dim_x.width_, static_cast<size_t>(1), mat_dim_x.width_, static_cast<size_t>(1),
......
...@@ -14,8 +14,8 @@ limitations under the License. */ ...@@ -14,8 +14,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/matmul_v2_op.h" #include "paddle/fluid/operators/matmul_v2_op.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h" #include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include "paddle/fluid/operators/fused/fused_dropout_helper.h" #include "paddle/fluid/operators/fused/fused_dropout_helper.h"
...@@ -32,11 +32,11 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> { ...@@ -32,11 +32,11 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
void MatMul(const platform::CUDADeviceContext& ctx, void MatMul(const platform::CUDADeviceContext& ctx,
const framework::Tensor& a, const framework::Tensor& b, const framework::Tensor& a, const framework::Tensor& b,
framework::Tensor* c) const { framework::Tensor* c) const {
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
auto a_2d = FoldInitDims(a); auto a_2d = FoldInitDims(a);
auto b_2d = FoldInitDims(b); auto b_2d = FoldInitDims(b);
auto mat_dim_a = math::CreateMatrixDescriptor(a_2d.dims(), 0, false); auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(a_2d.dims(), 0, false);
auto mat_dim_b = math::CreateMatrixDescriptor(b_2d.dims(), 0, false); auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(b_2d.dims(), 0, false);
T alpha = static_cast<T>(1.0); T alpha = static_cast<T>(1.0);
blas.MatMul(a, mat_dim_a, b, mat_dim_b, alpha, c, T(0)); blas.MatMul(a, mat_dim_a, b, mat_dim_b, alpha, c, T(0));
} }
...@@ -173,8 +173,8 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> { ...@@ -173,8 +173,8 @@ class FusedFeedForwardKernel : public framework::OpKernel<T> {
dropout2_out->mutable_data<T>(place); dropout2_out->mutable_data<T>(place);
auto x_dim = x->dims(); auto x_dim = x->dims();
auto mat_dim_x = auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(
math::CreateMatrixDescriptor(RowMatrixFromVector(x_dim), 0, false); RowMatrixFromVector(x_dim), 0, false);
auto dim = linear1_weight->dims(); auto dim = linear1_weight->dims();
int d_model = dim[0]; int d_model = dim[0];
...@@ -197,12 +197,13 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> { ...@@ -197,12 +197,13 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
const framework::Tensor& d_out, const framework::Tensor& a, const framework::Tensor& d_out, const framework::Tensor& a,
const framework::Tensor& b, framework::Tensor* d_a, const framework::Tensor& b, framework::Tensor* d_a,
framework::Tensor* d_b) const { framework::Tensor* d_b) const {
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
auto a_2d = FoldInitDims(a); auto a_2d = FoldInitDims(a);
auto b_2d = FoldInitDims(b); auto b_2d = FoldInitDims(b);
auto mat_dim_a = math::CreateMatrixDescriptor(a_2d.dims(), 0, true); auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(a_2d.dims(), 0, true);
auto mat_dim_b = math::CreateMatrixDescriptor(b_2d.dims(), 0, true); auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(b_2d.dims(), 0, true);
auto mat_dim_dout = math::CreateMatrixDescriptor(d_out.dims(), 0, false); auto mat_dim_dout =
pten::funcs::CreateMatrixDescriptor(d_out.dims(), 0, false);
T alpha = static_cast<T>(1.0); T alpha = static_cast<T>(1.0);
blas.MatMul(d_out, mat_dim_dout, b, mat_dim_b, alpha, d_a, T(0)); blas.MatMul(d_out, mat_dim_dout, b, mat_dim_b, alpha, d_a, T(0));
blas.MatMul(a, mat_dim_a, d_out, mat_dim_dout, alpha, d_b, T(0)); blas.MatMul(a, mat_dim_a, d_out, mat_dim_dout, alpha, d_b, T(0));
...@@ -403,8 +404,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> { ...@@ -403,8 +404,8 @@ class FusedFeedForwardGradKernel : public framework::OpKernel<T> {
d_linear2_weight->mutable_data<T>(place); d_linear2_weight->mutable_data<T>(place);
auto x_dim = x.dims(); auto x_dim = x.dims();
auto mat_dim_x = auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(
math::CreateMatrixDescriptor(RowMatrixFromVector(x_dim), 0, false); RowMatrixFromVector(x_dim), 0, false);
auto linear1_weight_dim = linear1_weight.dims(); auto linear1_weight_dim = linear1_weight.dims();
int d_model = linear1_weight_dim[0]; int d_model = linear1_weight_dim[0];
......
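The MatMulGrad helper above leans on the standard identities for C = A*B: dL/dA = (dL/dC) * B^T and dL/dB = A^T * (dL/dC). Rather than materializing transposes, the kernel flags them in the descriptors; a sketch of the grad path shown in the hunk:

// trans = true folds the transpose into the GEMM call, so no transpose
// kernel ever runs.
auto mat_dim_a    = pten::funcs::CreateMatrixDescriptor(a_2d.dims(), 0, /*trans=*/true);
auto mat_dim_b    = pten::funcs::CreateMatrixDescriptor(b_2d.dims(), 0, /*trans=*/true);
auto mat_dim_dout = pten::funcs::CreateMatrixDescriptor(d_out.dims(), 0, /*trans=*/false);
blas.MatMul(d_out, mat_dim_dout, b, mat_dim_b, alpha, d_a, T(0));  // d_a = d_out * b^T
blas.MatMul(a, mat_dim_a, d_out, mat_dim_dout, alpha, d_b, T(0));  // d_b = a^T * d_out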
...@@ -18,9 +18,9 @@ limitations under the License. */ ...@@ -18,9 +18,9 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/math/fc.h"
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#endif #endif
...@@ -295,7 +295,7 @@ class FusionGRUKernel : public framework::OpKernel<T> { ...@@ -295,7 +295,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
const T* h0_data = h0 ? h0->data<T>() : nullptr; const T* h0_data = h0 ? h0->data<T>() : nullptr;
const T* wh_state_data = wh_data + D * D2; const T* wh_state_data = wh_data + D * D2;
T* hidden_out_data = hidden_out->mutable_data<T>(place); T* hidden_out_data = hidden_out->mutable_data<T>(place);
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::FCFunctor<DeviceContext, T> fc; math::FCFunctor<DeviceContext, T> fc;
...@@ -367,7 +367,7 @@ class FusionGRUKernel : public framework::OpKernel<T> { ...@@ -367,7 +367,7 @@ class FusionGRUKernel : public framework::OpKernel<T> {
T* batched_out_data = batched_out->mutable_data<T>(place); T* batched_out_data = batched_out->mutable_data<T>(place);
hidden_out->mutable_data<T>(place); hidden_out->mutable_data<T>(place);
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch; math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
math::FCFunctor<DeviceContext, T> fc; math::FCFunctor<DeviceContext, T> fc;
......
...@@ -15,9 +15,9 @@ limitations under the License. */ ...@@ -15,9 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_lstm_op.h" #include "paddle/fluid/operators/fused/fusion_lstm_op.h"
#include <string> #include <string>
#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/math/fc.h"
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#endif #endif
...@@ -343,7 +343,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -343,7 +343,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
T* xx_data = xx->mutable_data<T>(place); T* xx_data = xx->mutable_data<T>(place);
T* h_out_data = hidden_out->mutable_data<T>(place); T* h_out_data = hidden_out->mutable_data<T>(place);
T* c_out_data = cell_out->mutable_data<T>(place); T* c_out_data = cell_out->mutable_data<T>(place);
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
math::FCFunctor<DeviceContext, T> fc; math::FCFunctor<DeviceContext, T> fc;
...@@ -423,7 +423,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> { ...@@ -423,7 +423,7 @@ class FuisonLSTMKernel : public framework::OpKernel<T> {
math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch; math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
math::FCFunctor<DeviceContext, T> fc; math::FCFunctor<DeviceContext, T> fc;
if (M > D4) { if (M > D4) {
fc(dev_ctx, x_dims[0], D4, M, x_data, wx_data, xx_data, bias->data<T>()); fc(dev_ctx, x_dims[0], D4, M, x_data, wx_data, xx_data, bias->data<T>());
......
...@@ -15,8 +15,8 @@ limitations under the License. */ ...@@ -15,8 +15,8 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h" #include "paddle/fluid/operators/fused/fusion_seqconv_eltadd_relu_op.h"
#include <algorithm> // for min, max #include <algorithm> // for min, max
#include <string> #include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/math/fc.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -14,10 +14,10 @@ limitations under the License. */ ...@@ -14,10 +14,10 @@ limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h" #include "paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.h"
#include <string> #include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/cpu_vec.h" #include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/math/fc.h"
#include "paddle/fluid/platform/cpu_info.h" #include "paddle/fluid/platform/cpu_info.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -209,7 +209,7 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> { ...@@ -209,7 +209,7 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
T* out_data = out->mutable_data<T>(ctx.GetPlace()); T* out_data = out->mutable_data<T>(ctx.GetPlace());
T* fc_out_data = fc_out->mutable_data<T>(ctx.GetPlace()); T* fc_out_data = fc_out->mutable_data<T>(ctx.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>(); auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
math::FCFunctor<DeviceContext, T> fc; math::FCFunctor<DeviceContext, T> fc;
......
...@@ -18,9 +18,9 @@ limitations under the License. */ ...@@ -18,9 +18,9 @@ limitations under the License. */
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/math/fc.h"
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#endif #endif
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/math/bert_encoder_functor.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -211,7 +211,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> { ...@@ -211,7 +211,8 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
auto *temp_out_data = temp_out_tensor.mutable_data<T>(context.GetPlace()); auto *temp_out_data = temp_out_tensor.mutable_data<T>(context.GetPlace());
// (B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H) // (B * S, hidden) * (hidden, 3 * N * H) -> (B * S * 3 * N * H)
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(device_ctx); auto blas =
pten::funcs::GetBlas<platform::CUDADeviceContext, T>(device_ctx);
blas.MatMul(input_matrix, w_matrix, &temp_out_tensor); blas.MatMul(input_matrix, w_matrix, &temp_out_tensor);
// temp_out_tensor.Resize(temp_out_dims); // temp_out_tensor.Resize(temp_out_dims);
......
...@@ -17,7 +17,7 @@ ...@@ -17,7 +17,7 @@
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/math/bert_encoder_functor.h" #include "paddle/fluid/operators/math/bert_encoder_functor.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -20,8 +20,8 @@ limitations under the License. */ ...@@ -20,8 +20,8 @@ limitations under the License. */
#include <cmath> #include <cmath>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
...@@ -63,13 +63,13 @@ struct GeluFunctor { ...@@ -63,13 +63,13 @@ struct GeluFunctor {
int n = std::min(x.size(), out.size()); int n = std::min(x.size(), out.size());
std::memset(out_data, 0, n * sizeof(T)); std::memset(out_data, 0, n * sizeof(T));
math::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1, out_data, pten::funcs::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1,
1); out_data, 1);
math::CBlas<T>::VMERF(n, out_data, out_data, VML_LA); pten::funcs::CBlas<T>::VMERF(n, out_data, out_data, VML_LA);
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
out_data[i] += static_cast<T>(1); out_data[i] += static_cast<T>(1);
} }
math::CBlas<T>::VMUL(n, x_data, out_data, out_data); pten::funcs::CBlas<T>::VMUL(n, x_data, out_data, out_data);
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
out_data[i] *= static_cast<T>(0.5); out_data[i] *= static_cast<T>(0.5);
} }
...@@ -138,24 +138,25 @@ struct GeluGradFunctor { ...@@ -138,24 +138,25 @@ struct GeluGradFunctor {
std::memset(second, 0, n * sizeof(T)); std::memset(second, 0, n * sizeof(T));
// first = (0.5 * (1 + erf(x / sqrt(2)))) // first = (0.5 * (1 + erf(x / sqrt(2))))
math::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1, first, 1); pten::funcs::CBlas<T>::AXPY(n, static_cast<T>(M_SQRT1_2), x_data, 1,
math::CBlas<T>::VMERF(n, first, first, VML_LA); first, 1);
pten::funcs::CBlas<T>::VMERF(n, first, first, VML_LA);
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
first[i] += static_cast<T>(1); first[i] += static_cast<T>(1);
} }
math::CBlas<T>::SCAL(n, static_cast<T>(0.5), first, 1); pten::funcs::CBlas<T>::SCAL(n, static_cast<T>(0.5), first, 1);
// second = (0.5 * 2/sqrt(pi) * 1/sqrt(2) * x * exp(-0.5 * x^2)) // second = (0.5 * 2/sqrt(pi) * 1/sqrt(2) * x * exp(-0.5 * x^2))
math::CBlas<T>::VSQUARE(n, x_data, second); pten::funcs::CBlas<T>::VSQUARE(n, x_data, second);
math::CBlas<T>::SCAL(n, -static_cast<T>(0.5), second, 1); pten::funcs::CBlas<T>::SCAL(n, -static_cast<T>(0.5), second, 1);
math::CBlas<T>::VEXP(n, second, second); pten::funcs::CBlas<T>::VEXP(n, second, second);
math::CBlas<T>::VMUL(n, x_data, second, second); pten::funcs::CBlas<T>::VMUL(n, x_data, second, second);
math::CBlas<T>::SCAL(n, static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2), pten::funcs::CBlas<T>::SCAL(
second, 1); n, static_cast<T>(0.5 * M_2_SQRTPI * M_SQRT1_2), second, 1);
// dx = dout * (first + second); // dx = dout * (first + second);
math::CBlas<T>::VADD(n, first, second, first); pten::funcs::CBlas<T>::VADD(n, first, second, first);
math::CBlas<T>::VMUL(n, dout_data, first, dx_data); pten::funcs::CBlas<T>::VMUL(n, dout_data, first, dx_data);
std::free(first); std::free(first);
std::free(second); std::free(second);
......
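The GeluFunctor/GeluGradFunctor hunks above evaluate the exact erf-based GELU with vectorized CBlas primitives (AXPY scales by 1/sqrt(2), VMERF applies erf, and so on). A scalar reference for what the pipeline computes, as a sketch (requires <cmath>; this is not the kernel itself):

#include <cmath>

// GELU(x) = 0.5 * x * (1 + erf(x / sqrt(2)))
double gelu(double x) { return 0.5 * x * (1.0 + std::erf(x * M_SQRT1_2)); }

// GELU'(x) = 0.5 * (1 + erf(x / sqrt(2))) + x * exp(-0.5 * x^2) / sqrt(2 * pi);
// note 0.5 * M_2_SQRTPI * M_SQRT1_2 == 1 / sqrt(2 * pi), matching the SCAL above.
double gelu_grad(double x) {
  double first  = 0.5 * (1.0 + std::erf(x * M_SQRT1_2));
  double second = x * 0.5 * M_2_SQRTPI * M_SQRT1_2 * std::exp(-0.5 * x * x);
  return first + second;  // the kernel then computes dx = dout * (first + second)
}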
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/pten/kernels/funcs/math_function.h"
namespace paddle { namespace paddle {
......
...@@ -15,9 +15,9 @@ limitations under the License. */ ...@@ -15,9 +15,9 @@ limitations under the License. */
#include "paddle/fluid/operators/gru_op.h" #include "paddle/fluid/operators/gru_op.h"
#include <memory> #include <memory>
#include <string> #include <string>
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h" #include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h" #include "paddle/fluid/operators/math/detail/gru_kernel.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
DECLARE_int32(paddle_num_threads); DECLARE_int32(paddle_num_threads);
...@@ -355,7 +355,7 @@ class GRUCPUKernel : public framework::OpKernel<T> { ...@@ -355,7 +355,7 @@ class GRUCPUKernel : public framework::OpKernel<T> {
#ifdef PADDLE_WITH_MKLML #ifdef PADDLE_WITH_MKLML
// use MKL packed to speed up GEMM // use MKL packed to speed up GEMM
if (FLAGS_paddle_num_threads >= 4) { if (FLAGS_paddle_num_threads >= 4) {
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/, T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
frame_size * 2 /*width of weight*/, frame_size * 2 /*width of weight*/,
frame_size /*height of weight*/); frame_size /*height of weight*/);
......
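The gru_op fast path above packs the weight matrix once so the per-step GEMMs skip MKL's internal layout conversion. GEMM_ALLOC is visible in the hunk; the sketch below assumes its usual companions GEMM_PACK / GEMM_COMPUTE / GEMM_FREE from the same Blas wrapper (verify the exact signatures against blas.h before relying on them):

auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
// Reserve MKL-internal storage for a packed copy of the gate weights.
T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
                                 frame_size * 2 /*width of weight*/,
                                 frame_size /*height of weight*/);
// Convert the layout once; every subsequent step-GEMM reads the packed copy.
blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1, frame_size * 2, frame_size,
               T(1), weight_data, frame_size * 2, packed_gate);
// ... per-timestep GEMM_COMPUTE calls against packed_gate ...
blas.GEMM_FREE(packed_gate);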
...@@ -17,8 +17,8 @@ limitations under the License. */ ...@@ -17,8 +17,8 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -87,7 +87,7 @@ class GRUUnitKernel : public framework::OpKernel<T> { ...@@ -87,7 +87,7 @@ class GRUUnitKernel : public framework::OpKernel<T> {
const T* weight_data = weight->data<T>(); const T* weight_data = weight->data<T>();
T* gate_data = gate->data<T>(); T* gate_data = gate->data<T>();
T* reset_hidden_prev_data = reset_hidden_prev->data<T>(); T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
auto blas = math::GetBlas<DeviceContext, T>(context); auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
blas.GEMM(false, false, batch_size, 2 * frame_size, frame_size, 1, blas.GEMM(false, false, batch_size, 2 * frame_size, frame_size, 1,
hidden_prev_data, frame_size, weight_data, frame_size * 2, 1, hidden_prev_data, frame_size, weight_data, frame_size * 2, 1,
gate_data, frame_size * 3); gate_data, frame_size * 3);
...@@ -204,7 +204,7 @@ class GRUUnitGradKernel : public framework::OpKernel<T> { ...@@ -204,7 +204,7 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
d_g.slice(c_offsets, extents), d_h * u); d_g.slice(c_offsets, extents), d_h * u);
} }
// backward for reset_hidden_prev // backward for reset_hidden_prev
auto blas = math::GetBlas<DeviceContext, T>(context); auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
blas.GEMM(false, true, batch_size, frame_size, frame_size, 1, blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
gate_grad_data + frame_size * 2, frame_size * 3, gate_grad_data + frame_size * 2, frame_size * 3,
weight_data + frame_size * frame_size * 2, frame_size, 0, weight_data + frame_size * frame_size * 2, frame_size, 0,
......
...@@ -166,7 +166,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> { ...@@ -166,7 +166,7 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel<T> {
// softrelu derivative // softrelu derivative
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
auto* pre_out_grad_data = pre_out_grad.data<T>(); auto* pre_out_grad_data = pre_out_grad.data<T>();
auto* pre_out_data = pre_out.template data<T>(); auto* pre_out_data = pre_out.template data<T>();
......
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
#pragma once #pragma once
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/pten/kernels/funcs/math_function.h"
namespace paddle { namespace paddle {
...@@ -141,7 +141,7 @@ struct IndexSelectAdd< ...@@ -141,7 +141,7 @@ struct IndexSelectAdd<
typename std::enable_if<std::is_floating_point<T>::value>::type> { typename std::enable_if<std::is_floating_point<T>::value>::type> {
void operator()(const framework::ExecutionContext& ctx, int slice_size, void operator()(const framework::ExecutionContext& ctx, int slice_size,
const T* src_pointer, const T* p_pointer, T* dist_pointer) { const T* src_pointer, const T* p_pointer, T* dist_pointer) {
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
blas.VADD(slice_size, src_pointer, p_pointer, dist_pointer); blas.VADD(slice_size, src_pointer, p_pointer, dist_pointer);
} }
}; };
......
...@@ -15,8 +15,8 @@ limitations under the License. */ ...@@ -15,8 +15,8 @@ limitations under the License. */
#pragma once #pragma once
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/matrix_inverse.h" #include "paddle/fluid/operators/math/matrix_inverse.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -48,19 +48,22 @@ class InverseGradKernel : public framework::OpKernel<T> { ...@@ -48,19 +48,22 @@ class InverseGradKernel : public framework::OpKernel<T> {
if (a_grad) { if (a_grad) {
a_grad->mutable_data<T>(context.GetPlace()); a_grad->mutable_data<T>(context.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(context); auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
auto& dev_ctx = context.template device_context<DeviceContext>(); auto& dev_ctx = context.template device_context<DeviceContext>();
framework::Tensor tmp_out = framework::Tensor tmp_out =
context.AllocateTmpTensor<T, DeviceContext>(a_inv->dims(), dev_ctx); context.AllocateTmpTensor<T, DeviceContext>(a_inv->dims(), dev_ctx);
auto mat_dim_a0 = auto mat_dim_a0 =
math::CreateMatrixDescriptor(a_inv_grad->dims(), 0, false); pten::funcs::CreateMatrixDescriptor(a_inv_grad->dims(), 0, false);
auto mat_dim_b0 = math::CreateMatrixDescriptor(a_inv->dims(), 0, true); auto mat_dim_b0 =
pten::funcs::CreateMatrixDescriptor(a_inv->dims(), 0, true);
blas.MatMul(*a_inv_grad, mat_dim_a0, *a_inv, mat_dim_b0, T(1), &tmp_out, blas.MatMul(*a_inv_grad, mat_dim_a0, *a_inv, mat_dim_b0, T(1), &tmp_out,
T(0)); T(0));
auto mat_dim_a1 = math::CreateMatrixDescriptor(a_inv->dims(), 0, true); auto mat_dim_a1 =
auto mat_dim_b1 = math::CreateMatrixDescriptor(tmp_out.dims(), 0, false); pten::funcs::CreateMatrixDescriptor(a_inv->dims(), 0, true);
auto mat_dim_b1 =
pten::funcs::CreateMatrixDescriptor(tmp_out.dims(), 0, false);
blas.MatMul(*a_inv, mat_dim_a1, tmp_out, mat_dim_b1, T(-1), a_grad, T(0)); blas.MatMul(*a_inv, mat_dim_a1, tmp_out, mat_dim_b1, T(-1), a_grad, T(0));
} }
} }
......
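The InverseGradKernel above implements the matrix-inverse gradient. From d(A^{-1}) = -A^{-1} dA A^{-1} it follows that

\frac{\partial L}{\partial A} = -A^{-\top}\,\frac{\partial L}{\partial A^{-1}}\,A^{-\top}

which is exactly the two MatMul calls in the hunk: tmp_out = (dL/dA^{-1}) * A^{-T} with alpha = 1, then a_grad = A^{-T} * tmp_out with alpha = -1, the transposes again carried by the descriptors rather than materialized.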
...@@ -20,7 +20,7 @@ limitations under the License. */ ...@@ -20,7 +20,7 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h" #include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
#if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \ #if !defined(PADDLE_WITH_CUDA) && !defined(_WIN32) && !defined(__APPLE__) && \
!defined(__OSX__) !defined(__OSX__)
#include "paddle/fluid/operators/jit/kernels.h" #include "paddle/fluid/operators/jit/kernels.h"
...@@ -61,7 +61,7 @@ class RowwiseMean2D<platform::CUDADeviceContext, T> { ...@@ -61,7 +61,7 @@ class RowwiseMean2D<platform::CUDADeviceContext, T> {
} }
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) { const framework::Tensor& input, framework::Tensor* out) {
math::GetBlas<platform::CUDADeviceContext, T>(context).GEMV( pten::funcs::GetBlas<platform::CUDADeviceContext, T>(context).GEMV(
false, left_, right_, 1., input.data<T>(), divisor_.data<T>(), 0., false, left_, right_, 1., input.data<T>(), divisor_.data<T>(), 0.,
out->data<T>()); out->data<T>());
} }
...@@ -108,7 +108,7 @@ class ColwiseSum2D<platform::CUDADeviceContext, T> { ...@@ -108,7 +108,7 @@ class ColwiseSum2D<platform::CUDADeviceContext, T> {
void operator()(const platform::CUDADeviceContext& context, void operator()(const platform::CUDADeviceContext& context,
const framework::Tensor& input, framework::Tensor* out) { const framework::Tensor& input, framework::Tensor* out) {
math::GetBlas<platform::CUDADeviceContext, T>(context).GEMV( pten::funcs::GetBlas<platform::CUDADeviceContext, T>(context).GEMV(
true, left_, right_, 1., input.data<T>(), divisor_.data<T>(), 0., true, left_, right_, 1., input.data<T>(), divisor_.data<T>(), 0.,
out->data<T>()); out->data<T>());
} }
......
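RowwiseMean2D and ColwiseSum2D above express whole reductions as a single GEMV against a constant divisor_ vector, presumably filled with 1/right_ (respectively 1) at construction; that initialization is outside the hunk, so treat it as an assumption. A sketch of the row-mean case (names illustrative):

// y = 1.0 * X * d + 0.0 * y  with  d_j = 1 / right   =>   y_i = mean of row i of X
pten::funcs::GetBlas<platform::CUDADeviceContext, T>(context).GEMV(
    /*trans=*/false, left, right, static_cast<T>(1.0),
    x_data, divisor_data, static_cast<T>(0.0), mean_out_data);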
...@@ -22,7 +22,7 @@ limitations under the License. */ ...@@ -22,7 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/framework/var_type_traits.h" #include "paddle/fluid/framework/var_type_traits.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -21,7 +21,7 @@ limitations under the License. */ ...@@ -21,7 +21,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -109,8 +109,8 @@ class LookupTableKernel : public framework::OpKernel<T> { ...@@ -109,8 +109,8 @@ class LookupTableKernel : public framework::OpKernel<T> {
memcpy(output + i * row_width, table + id_index * row_width, memcpy(output + i * row_width, table + id_index * row_width,
row_width * sizeof(T)); row_width * sizeof(T));
} else { } else {
auto blas = auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(
math::GetBlas<platform::CPUDeviceContext, T>(context); context);
blas.VCOPY(row_width, table + id_index * row_width, blas.VCOPY(row_width, table + id_index * row_width,
output + i * row_width); output + i * row_width);
} }
...@@ -137,7 +137,8 @@ class LookupTableKernel : public framework::OpKernel<T> { ...@@ -137,7 +137,8 @@ class LookupTableKernel : public framework::OpKernel<T> {
memcpy(output + i * row_width, table + id_index * row_width, memcpy(output + i * row_width, table + id_index * row_width,
row_width * sizeof(T)); row_width * sizeof(T));
} else { } else {
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context); auto blas =
pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
blas.VCOPY(row_width, table + id_index * row_width, blas.VCOPY(row_width, table + id_index * row_width,
output + i * row_width); output + i * row_width);
} }
......
...@@ -22,7 +22,7 @@ limitations under the License. */ ...@@ -22,7 +22,7 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/selected_rows_utils.h" #include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -124,7 +124,8 @@ struct LookupTableV2CPUFunctor { ...@@ -124,7 +124,8 @@ struct LookupTableV2CPUFunctor {
memcpy(output + i * row_width, table + id_index * row_width, memcpy(output + i * row_width, table + id_index * row_width,
row_width * sizeof(T)); row_width * sizeof(T));
} else { } else {
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context_); auto blas =
pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context_);
blas.VCOPY(row_width, table + id_index * row_width, blas.VCOPY(row_width, table + id_index * row_width,
output + i * row_width); output + i * row_width);
} }
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/pten/kernels/funcs/math_function.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
...@@ -35,7 +35,7 @@ struct LRNFunctor<platform::CPUDeviceContext, T> { ...@@ -35,7 +35,7 @@ struct LRNFunctor<platform::CPUDeviceContext, T> {
framework::Tensor* mid, int N, int C, int H, int W, int n, framework::Tensor* mid, int N, int C, int H, int W, int n,
T k, T alpha, T beta, const DataLayout data_layout) { T k, T alpha, T beta, const DataLayout data_layout) {
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(ctx);
pten::funcs::Transpose<platform::CPUDeviceContext, T, 4> transpose; pten::funcs::Transpose<platform::CPUDeviceContext, T, 4> transpose;
auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>(); auto& dev_ctx = ctx.template device_context<platform::CPUDeviceContext>();
Tensor in_transpose, mid_transpose, out_transpose; Tensor in_transpose, mid_transpose, out_transpose;
......
...@@ -15,10 +15,10 @@ limitations under the License. */ ...@@ -15,10 +15,10 @@ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -128,7 +128,7 @@ class LSTMKernel : public framework::OpKernel<T> { ...@@ -128,7 +128,7 @@ class LSTMKernel : public framework::OpKernel<T> {
auto cand_act = math::detail::GetActivationType( auto cand_act = math::detail::GetActivationType(
ctx.Attr<std::string>("candidate_activation")); ctx.Attr<std::string>("candidate_activation"));
auto blas = math::GetBlas<DeviceContext, T>(device_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(device_ctx);
for (size_t n = 0; n < num_batch; n++) { for (size_t n = 0; n < num_batch; n++) {
int bstart = static_cast<int>(batch_starts[n]); int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]); int bend = static_cast<int>(batch_starts[n + 1]);
...@@ -302,7 +302,7 @@ class LSTMGradKernel : public framework::OpKernel<T> { ...@@ -302,7 +302,7 @@ class LSTMGradKernel : public framework::OpKernel<T> {
auto batch_starts = batch_gate->lod()[0]; auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1; size_t num_batch = batch_starts.size() - 1;
auto blas = math::GetBlas<DeviceContext, T>(device_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(device_ctx);
for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) { for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
int bstart = static_cast<int>(batch_starts[n]); int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]); int bend = static_cast<int>(batch_starts[n + 1]);
......
...@@ -18,12 +18,12 @@ limitations under the License. */ ...@@ -18,12 +18,12 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/activation_op.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h" #include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/lstm_compute.h" #include "paddle/fluid/operators/math/lstm_compute.h"
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
#include "paddle/fluid/platform/place.h" #include "paddle/fluid/platform/place.h"
#include "paddle/fluid/platform/transform.h" #include "paddle/fluid/platform/transform.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -185,7 +185,7 @@ class LSTMPKernel : public framework::OpKernel<T> { ...@@ -185,7 +185,7 @@ class LSTMPKernel : public framework::OpKernel<T> {
auto proj_act = math::detail::GetActivationType( auto proj_act = math::detail::GetActivationType(
ctx.Attr<std::string>("proj_activation")); ctx.Attr<std::string>("proj_activation"));
auto& place = *ctx.template device_context<DeviceContext>().eigen_device(); auto& place = *ctx.template device_context<DeviceContext>().eigen_device();
auto blas = math::GetBlas<DeviceContext, T>(device_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(device_ctx);
for (size_t n = 0; n < num_batch; n++) { for (size_t n = 0; n < num_batch; n++) {
int bstart = static_cast<int>(batch_starts[n]); int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]); int bend = static_cast<int>(batch_starts[n + 1]);
...@@ -405,7 +405,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> { ...@@ -405,7 +405,7 @@ class LSTMPGradKernel : public framework::OpKernel<T> {
auto batch_starts = batch_gate->lod()[0]; auto batch_starts = batch_gate->lod()[0];
size_t num_batch = batch_starts.size() - 1; size_t num_batch = batch_starts.size() - 1;
auto blas = math::GetBlas<DeviceContext, T>(device_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(device_ctx);
for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) { for (int n = static_cast<int>(num_batch) - 1; n >= 0; n--) {
int bstart = static_cast<int>(batch_starts[n]); int bstart = static_cast<int>(batch_starts[n]);
int bend = static_cast<int>(batch_starts[n + 1]); int bend = static_cast<int>(batch_starts[n + 1]);
......
@@ -19,13 +19,13 @@
#include <complex>
#include "paddle/fluid/operators/eig_op.h"
#include "paddle/fluid/operators/math/eigen_values_vectors.h"
-#include "paddle/fluid/operators/math/lapack_function.h"
#include "paddle/fluid/operators/math/matrix_solve.h"
#include "paddle/fluid/operators/svd_helper.h"
#include "paddle/fluid/operators/transpose_op.h"
#include "paddle/fluid/operators/triangular_solve_op.h"
#include "paddle/fluid/platform/for_range.h"
#include "paddle/pten/kernels/funcs/complex_functors.h"
+#include "paddle/pten/kernels/funcs/lapack/lapack_function.h"
#include "paddle/pten/kernels/funcs/math_function.h"
#define EPSILON 1e-6
@@ -153,20 +153,21 @@ class LstsqCPUKernel : public framework::OpKernel<T> {
    int iwkopt = 0;
    if (driver == LapackDriverType::Gels) {
-     math::lapackGels('N', m, n, nrhs, x_vector, lda, y_vector, ldb, &wkopt,
-                      lwork, &info);
+     pten::funcs::lapackGels('N', m, n, nrhs, x_vector, lda, y_vector, ldb,
+                             &wkopt, lwork, &info);
    } else if (driver == LapackDriverType::Gelsd) {
-     math::lapackGelsd(m, n, nrhs, x_vector, lda, y_vector, ldb, s_working_ptr,
-                       static_cast<ValueType>(rcond), &rank_32, &wkopt, lwork,
-                       &rwkopt, &iwkopt, &info);
+     pten::funcs::lapackGelsd(m, n, nrhs, x_vector, lda, y_vector, ldb,
+                              s_working_ptr, static_cast<ValueType>(rcond),
+                              &rank_32, &wkopt, lwork, &rwkopt, &iwkopt,
+                              &info);
    } else if (driver == LapackDriverType::Gelsy) {
-     math::lapackGelsy(m, n, nrhs, x_vector, lda, y_vector, ldb, jpvt_data,
-                       static_cast<ValueType>(rcond), &rank_32, &wkopt, lwork,
-                       &rwkopt, &info);
+     pten::funcs::lapackGelsy(m, n, nrhs, x_vector, lda, y_vector, ldb,
+                              jpvt_data, static_cast<ValueType>(rcond),
+                              &rank_32, &wkopt, lwork, &rwkopt, &info);
    } else if (driver == LapackDriverType::Gelss) {
-     math::lapackGelss(m, n, nrhs, x_vector, lda, y_vector, ldb, s_working_ptr,
-                       static_cast<ValueType>(rcond), &rank_32, &wkopt, lwork,
-                       &rwkopt, &info);
+     pten::funcs::lapackGelss(m, n, nrhs, x_vector, lda, y_vector, ldb,
+                              s_working_ptr, static_cast<ValueType>(rcond),
+                              &rank_32, &wkopt, lwork, &rwkopt, &info);
    }
    lwork = std::max<int>(1, static_cast<int>(pten::funcs::Real<T>(wkopt)));
@@ -206,20 +207,21 @@ class LstsqCPUKernel : public framework::OpKernel<T> {
      s_working_ptr = s_working_ptr ? &s_data[i * s_stride] : nullptr;
      if (driver == LapackDriverType::Gels) {
-       math::lapackGels('N', m, n, nrhs, x_input, lda, y_input, ldb, work_data,
-                        lwork, &info);
+       pten::funcs::lapackGels('N', m, n, nrhs, x_input, lda, y_input, ldb,
+                               work_data, lwork, &info);
      } else if (driver == LapackDriverType::Gelsd) {
-       math::lapackGelsd(m, n, nrhs, x_input, lda, y_input, ldb, s_working_ptr,
-                         static_cast<ValueType>(rcond), &rank_32, work_data,
-                         lwork, rwork_data, iwork_data, &info);
+       pten::funcs::lapackGelsd(m, n, nrhs, x_input, lda, y_input, ldb,
+                                s_working_ptr, static_cast<ValueType>(rcond),
+                                &rank_32, work_data, lwork, rwork_data,
+                                iwork_data, &info);
      } else if (driver == LapackDriverType::Gelsy) {
-       math::lapackGelsy(m, n, nrhs, x_input, lda, y_input, ldb, jpvt_data,
-                         static_cast<ValueType>(rcond), &rank_32, work_data,
-                         lwork, rwork_data, &info);
+       pten::funcs::lapackGelsy(m, n, nrhs, x_input, lda, y_input, ldb,
+                                jpvt_data, static_cast<ValueType>(rcond),
+                                &rank_32, work_data, lwork, rwork_data, &info);
      } else if (driver == LapackDriverType::Gelss) {
-       math::lapackGelss(m, n, nrhs, x_input, lda, y_input, ldb, s_working_ptr,
-                         static_cast<ValueType>(rcond), &rank_32, work_data,
-                         lwork, rwork_data, &info);
+       pten::funcs::lapackGelss(m, n, nrhs, x_input, lda, y_input, ldb,
+                                s_working_ptr, static_cast<ValueType>(rcond),
+                                &rank_32, work_data, lwork, rwork_data, &info);
      }
      PADDLE_ENFORCE_EQ(
...
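The Lstsq hunks keep LAPACK's two-phase workspace convention: the first call is a workspace query (LAPACK treats lwork == -1 as "report the optimal size in wkopt"), and the real solve runs only after the work buffer is allocated. A sketch of that convention against the relocated wrapper, assuming this commit's headers (the function name and data are illustrative):

#include <algorithm>
#include <vector>
#include "paddle/pten/kernels/funcs/lapack/lapack_function.h"

void GelsExample(int m, int n, int nrhs, double* a, int lda, double* b,
                 int ldb) {
  int info = 0;
  int lwork = -1;      // -1 = workspace query only
  double wkopt = 0.0;  // receives the optimal workspace size
  pten::funcs::lapackGels('N', m, n, nrhs, a, lda, b, ldb, &wkopt, lwork,
                          &info);
  lwork = std::max(1, static_cast<int>(wkopt));
  std::vector<double> work(lwork);
  // Second call performs the actual least-squares solve, in place in b.
  pten::funcs::lapackGels('N', m, n, nrhs, a, lda, b, ldb, work.data(), lwork,
                          &info);
}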
@@ -142,8 +142,8 @@ class LUKernel : public framework::OpKernel<T> {
      auto out_data_item = &out_data[b * m * n];
      int *info_data_item = &info_data[b];
      int *ipiv_data_item = &ipiv_data[b * std::min(m, n)];
-     math::lapackLu<T>(m, n, out_data_item, lda, ipiv_data_item,
-                       info_data_item);
+     pten::funcs::lapackLu<T>(m, n, out_data_item, lda, ipiv_data_item,
+                              info_data_item);
    }
    *out = helper.Transpose(*out);
  }
...
@@ -15,11 +15,11 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/lapack_function.h"
#include "paddle/fluid/operators/set_value_op.h"
#include "paddle/fluid/operators/svd_helper.h"
#include "paddle/fluid/operators/triangular_solve_op.h"
#include "paddle/fluid/operators/tril_triu_op.h"
+#include "paddle/pten/kernels/funcs/lapack/lapack_function.h"
#include "paddle/pten/kernels/math_kernel.h"
namespace paddle {
@@ -489,7 +489,7 @@ class LUGradKernel : public framework::OpKernel<T> {
    const auto& dev_ctx = ctx.template device_context<DeviceContext>();
    math::DeviceIndependenceTensorOperations<DeviceContext, T> helper(ctx);
-   auto blas = math::GetBlas<DeviceContext, T>(ctx);
+   auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
    auto xdims = xin->dims();
    int xrank = xdims.size();
@@ -519,9 +519,9 @@ class LUGradKernel : public framework::OpKernel<T> {
      phi_L.mutable_data<T>(ctx.GetPlace());
      phi_U.Resize(UmHdims);
      phi_U.mutable_data<T>(ctx.GetPlace());
-     auto mat_dim_l = math::CreateMatrixDescriptor(LmHdims, 0, false);
-     auto mat_dim_u = math::CreateMatrixDescriptor(UmHdims, 0, false);
-     auto mat_dim_g = math::CreateMatrixDescriptor(graddims, 0, false);
+     auto mat_dim_l = pten::funcs::CreateMatrixDescriptor(LmHdims, 0, false);
+     auto mat_dim_u = pten::funcs::CreateMatrixDescriptor(UmHdims, 0, false);
+     auto mat_dim_g = pten::funcs::CreateMatrixDescriptor(graddims, 0, false);
      blas.MatMul(L_narrow_mH, mat_dim_l, grad_narrow, mat_dim_g,
                  static_cast<T>(1), &phi_L, static_cast<T>(0));
@@ -567,10 +567,10 @@ class LUGradKernel : public framework::OpKernel<T> {
      Tensor_Conj<DeviceContext, T>(dev_ctx, U_complement_mH,
                                    &U_complement_mH);
-     auto mat_dim_g =
-         math::CreateMatrixDescriptor(U_grad_complement.dims(), 0, false);
-     auto mat_dim_u =
-         math::CreateMatrixDescriptor(U_complement_mH.dims(), 0, false);
+     auto mat_dim_g = pten::funcs::CreateMatrixDescriptor(
+         U_grad_complement.dims(), 0, false);
+     auto mat_dim_u = pten::funcs::CreateMatrixDescriptor(
+         U_complement_mH.dims(), 0, false);
      auto phidims = UmHdims;
      phidims[UmHdims.size() - 2] = k;
      phidims[UmHdims.size() - 1] = k;
@@ -623,8 +623,10 @@ class LUGradKernel : public framework::OpKernel<T> {
      triangular_solve<DeviceContext, T>(dev_ctx, L_narrow_mH, psi, &psi_tmp,
                                         true, false, true);
-     auto mat_dim_p = math::CreateMatrixDescriptor(Pmat.dims(), 0, false);
-     auto mat_dim_b = math::CreateMatrixDescriptor(psi_tmp.dims(), 0, false);
+     auto mat_dim_p =
+         pten::funcs::CreateMatrixDescriptor(Pmat.dims(), 0, false);
+     auto mat_dim_b =
+         pten::funcs::CreateMatrixDescriptor(psi_tmp.dims(), 0, false);
      blas.MatMul(Pmat, mat_dim_p, psi_tmp, mat_dim_b, static_cast<T>(1), dx,
                  static_cast<T>(0));
    } else {
@@ -636,10 +638,10 @@ class LUGradKernel : public framework::OpKernel<T> {
      framework::Tensor L_complement_mH = helper.Transpose(L_complement);
      Tensor_Conj<DeviceContext, T>(dev_ctx, L_complement_mH, &L_complement_mH);
-     auto mat_dim_g =
-         math::CreateMatrixDescriptor(L_grad_complement.dims(), 0, false);
-     auto mat_dim_u =
-         math::CreateMatrixDescriptor(L_complement_mH.dims(), 0, false);
+     auto mat_dim_g = pten::funcs::CreateMatrixDescriptor(
+         L_grad_complement.dims(), 0, false);
+     auto mat_dim_u =
+         pten::funcs::CreateMatrixDescriptor(L_complement_mH.dims(), 0, false);
      auto phidims = LmHdims;
      phidims[LmHdims.size() - 2] = k;
      phidims[LmHdims.size() - 1] = k;
@@ -685,8 +687,10 @@ class LUGradKernel : public framework::OpKernel<T> {
      psi_tmp.Resize(psi.dims());
      psi_tmp.mutable_data<T>(ctx.GetPlace());
-     auto mat_dim_p = math::CreateMatrixDescriptor(Pmat.dims(), 0, false);
-     auto mat_dim_b = math::CreateMatrixDescriptor(psi.dims(), 0, false);
+     auto mat_dim_p =
+         pten::funcs::CreateMatrixDescriptor(Pmat.dims(), 0, false);
+     auto mat_dim_b =
+         pten::funcs::CreateMatrixDescriptor(psi.dims(), 0, false);
      blas.MatMul(Pmat, mat_dim_p, psi, mat_dim_b, static_cast<T>(1), &psi_tmp,
                  static_cast<T>(0));
      psi_tmp = helper.Transpose(psi_tmp);
...
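All the CreateMatrixDescriptor call sites above change only their namespace; the descriptor still folds every dimension before the trailing two into batch_size_ and records height, width and transposition for blas.MatMul. A minimal sketch under the new namespace, assuming this commit's headers (the function name is illustrative):

#include "paddle/fluid/framework/tensor.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"

template <typename DeviceContext, typename T>
void MatMulExample(const DeviceContext& ctx,
                   const paddle::framework::Tensor& a,
                   const paddle::framework::Tensor& b,
                   paddle::framework::Tensor* out) {
  auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
  // num_flatten_cols == 0: dims beyond the trailing two become the batch.
  auto mat_a = pten::funcs::CreateMatrixDescriptor(a.dims(), 0, /*trans=*/false);
  auto mat_b = pten::funcs::CreateMatrixDescriptor(b.dims(), 0, /*trans=*/false);
  // out = 1 * a * b + 0 * out
  blas.MatMul(a, mat_a, b, mat_b, static_cast<T>(1), out, static_cast<T>(0));
}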
@@ -249,7 +249,7 @@ class CPUMatchMatrixTensorOPKernel : public framework::OpKernel<T> {
    memset(bottom_l_trans_data, 0.0,
           tmp->dims()[0] * tmp->dims()[1] * sizeof(T));
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(ctx);
    call_gemm(blas, CblasNoTrans, CblasNoTrans, x->dims()[0], dim_t * dim_in,
              dim_in, 1.0f, bottom_l_data, t_data, 0.0f, bottom_l_trans_data);
@@ -262,7 +262,7 @@ class CPUMatchMatrixTensorOPKernel : public framework::OpKernel<T> {
        const auto* l_t_data =
            bottom_l_trans_data + offset_l[b] * dim_t * dim_in + t * dim_in;
        const auto* r_data = bottom_r_data + offset_r[b] * dim_in;
-       auto blas_2 = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
+       auto blas_2 = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(ctx);
        call_gemm_with_lda(blas_2, CblasNoTrans, CblasTrans, len_l, len_r,
                           dim_in, 1.0f, l_t_data, r_data, 0.0f, top_data,
                           dim_t * dim_in);
@@ -346,7 +346,7 @@ class CPUMatchMatrixTensorOPGradKernel : public framework::OpKernel<T> {
      }
    }
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(ctx);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(ctx);
    auto* t_data = w->data<T>();
    auto* d_w = ctx.Output<Tensor>(framework::GradVarName("W"));
...
add_subdirectory(detail)
-function(math_library TARGET)
-  # math_library is a function to create math library.
-  # The interface is the same as cc_library.
-  # But it handle split GPU/CPU code and link some common library.
-  set(cc_srcs)
-  set(cu_srcs)
-  set(hip_srcs)
-  set(math_common_deps device_context framework_proto enforce)
-  if (WITH_GPU)
-    if (${CMAKE_CUDA_COMPILER_VERSION} LESS 11.0)
-      list(APPEND math_common_deps cub)
-    else()
-      list(APPEND math_common_deps)
-    endif()
-  endif()
-  set(multiValueArgs DEPS)
-  cmake_parse_arguments(math_library "${options}" "${oneValueArgs}"
-                        "${multiValueArgs}" ${ARGN})
-  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cc)
-    list(APPEND cc_srcs ${TARGET}.cc)
-  endif()
-  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu)
-    list(APPEND cu_srcs ${TARGET}.cu)
-  endif()
-  if (EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/${TARGET}.cu.cc)
-    list(APPEND cu_srcs ${TARGET}.cu.cc)
-  endif()
-  list(LENGTH cc_srcs cc_srcs_len)
-  if (WITH_GPU)
-    nv_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
-  elseif (WITH_ROCM)
-    hip_library(${TARGET} SRCS ${cc_srcs} ${cu_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
-  elseif(${cc_srcs_len} GREATER 0)
-    cc_library(${TARGET} SRCS ${cc_srcs} DEPS ${math_library_DEPS} ${math_common_deps})
-  endif()
-endfunction()
if (WITH_ASCEND_CL)
  cc_library(beam_search_npu SRCS beam_search_npu.cc DEPS npu_op_runner)
endif()
@@ -59,9 +20,6 @@ math_library(sampler DEPS generator)
math_library(gru_compute DEPS activation_functions math_function)
math_library(lstm_compute DEPS activation_functions)
-cc_library(blas SRCS blas.cc DEPS cblas framework_proto device_context)
-# math_library(math_function DEPS blas dense_tensor tensor)
math_library(maxouting)
math_library(pooling)
@@ -82,8 +40,6 @@ else()
  math_library(beam_search DEPS math_function)
endif()
math_library(fc DEPS blas)
-math_library(lapack_function DEPS dynload_lapack)
math_library(matrix_bit_code)
math_library(unpooling)
...
@@ -17,8 +17,8 @@ limitations under the License. */
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/bert_encoder_functor.h"
-#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/enforce.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_cuda_utils.h"
namespace paddle {
@@ -502,7 +502,7 @@ inline void MatMulWithHeadQK(const platform::CUDADeviceContext &context,
  typedef typename CUDATypeTraits<T>::TYPE run_type;
-  auto blas =
-      operators::math::GetBlas<platform::CUDADeviceContext, run_type>(context);
+  auto blas =
+      pten::funcs::GetBlas<platform::CUDADeviceContext, run_type>(context);
  auto stream = context.stream();
  blas.BatchedGEMM(
@@ -568,7 +568,7 @@ inline void MatMulWithHeadQKV(const platform::CUDADeviceContext &context,
  typedef typename CUDATypeTraits<T>::TYPE run_type;
-  auto blas =
-      operators::math::GetBlas<platform::CUDADeviceContext, run_type>(context);
+  auto blas =
+      pten::funcs::GetBlas<platform::CUDADeviceContext, run_type>(context);
  auto stream = context.stream();
  CBLAS_TRANSPOSE transA = !qk_trans ? CblasNoTrans : CblasTrans;
  CBLAS_TRANSPOSE transB = !v_trans ? CblasNoTrans : CblasTrans;
...
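MatMulWithHeadQK and MatMulWithHeadQKV call BatchedGEMM, which also survives the move unchanged apart from the namespace. A sketch of the strided-batch form on a CPU context, assuming this commit's headers (the function name and sizes are illustrative):

#include <vector>
#include "paddle/fluid/platform/device_context.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"

void BatchedGemmExample(const paddle::platform::CPUDeviceContext& ctx) {
  const int batch = 2, M = 2, N = 2, K = 2;
  std::vector<float> a(batch * M * K, 1.0f), b(batch * K * N, 1.0f);
  std::vector<float> c(batch * M * N, 0.0f);
  auto blas =
      pten::funcs::GetBlas<paddle::platform::CPUDeviceContext, float>(ctx);
  // One GEMM per batch entry; consecutive matrices are M*K and K*N apart.
  blas.BatchedGEMM(CblasNoTrans, CblasNoTrans, M, N, K, 1.0f, a.data(),
                   b.data(), 0.0f, c.data(), batch, M * K, K * N);
}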
@@ -18,8 +18,8 @@ limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
-#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/im2col.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle {
namespace operators {
@@ -223,7 +223,7 @@ class ContextProjectGradFunctor {
    int input_row_begin, input_row_end;
    int sequence_height, sequence_width;
    sequence_width = in.dims()[1];
-   auto blas = math::GetBlas<DeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
    if (input_grad) {
      for (int i = 0; i < static_cast<int>(lod_level_0.size()) - 1; ++i) {
...
@@ -15,8 +15,8 @@
#pragma once
#include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/operators/math/lapack_function.h"
#include "paddle/fluid/operators/svd_helper.h"
+#include "paddle/pten/kernels/funcs/lapack/lapack_function.h"
#ifdef PADDLE_WITH_CUDA
#include "paddle/fluid/platform/dynload/cusolver.h"
#endif  // PADDLE_WITH_CUDA
@@ -98,9 +98,9 @@ struct MatrixEighFunctor<platform::CPUDeviceContext, T> {
    int info = 0;
    // Call lapackEigh to get the optimal size of work data
-   math::lapackEigh<T, ValueType>(jobz, uplo, n, input_vector, lda, out_value,
-                                  &lwork_opt, lwork, &rwork_opt, lrwork,
-                                  &iwork_opt, liwork, &info);
+   pten::funcs::lapackEigh<T, ValueType>(
+       jobz, uplo, n, input_vector, lda, out_value, &lwork_opt, lwork,
+       &rwork_opt, lrwork, &iwork_opt, liwork, &info);
    lwork = std::max<int>(1, static_cast<int>(lwork_opt));
    liwork = std::max<int>(1, iwork_opt);
@@ -123,7 +123,7 @@ struct MatrixEighFunctor<platform::CPUDeviceContext, T> {
    for (auto i = 0; i < batch_size; i++) {
      auto *value_data = out_value + i * values_stride;
      auto *input_data = input_vector + i * vector_stride;
-     math::lapackEigh<T, pten::funcs::Real<T>>(
+     pten::funcs::lapackEigh<T, pten::funcs::Real<T>>(
          jobz, uplo, n, input_data, lda, value_data, work_data, lwork,
          rwork_data, lrwork, iwork_data, liwork, &info);
      CheckEighResult(i, info);
...
@@ -15,7 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/fc.h"
#include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle {
namespace operators {
@@ -28,7 +28,7 @@ class FCFunctor<platform::CPUDeviceContext, T> {
                  const int N, const int K, const T* X, const T* W, T* Y,
                  const T* B = nullptr, bool relu = false,
                  bool padding_weights = false) {
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
    framework::Tensor Y1;
    T* Y1_data = nullptr;
    if (padding_weights) {
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <algorithm>
-#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle {
namespace operators {
@@ -85,7 +85,7 @@ class FCFunctor<platform::CUDADeviceContext, T> {
                      padding_weights, false,
                      platform::errors::PermissionDenied(
                          "Weight padding in fc can not be used in GPU scope."));
-   auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(context);
    blas.GEMM(false, false, M, N, K, static_cast<T>(1.0), X, K, W, N,
              static_cast<T>(0.0), Y, N);
    if (B == NULL) {
...
@@ -11,9 +11,9 @@ limitations under the License. */
#include "paddle/fluid/operators/math/gru_compute.h"
-#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle {
namespace platform {
@@ -33,7 +33,7 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
                      const detail::ActivationType active_gate,
                      bool origin_mode) {
#if !defined(__NVCC__) && !defined(__HIPCC___)
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
    if (value.prev_out_value) {
      blas.GEMM(false, false, batch_size, frame_size * 2, frame_size, 1,
                value.prev_out_value, frame_size, value.gate_weight,
@@ -70,7 +70,7 @@ struct GRUUnitGradFunctor<platform::CPUDeviceContext, T> {
    detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
                                grad, frame_size, batch_size, active_node,
                                origin_mode);
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
    if (value.prev_out_value && grad.prev_out_grad) {
      blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
                grad.gate_grad + frame_size * 2, frame_size * 3,
@@ -109,7 +109,7 @@ struct GRUUnitFunctorV2<platform::CPUDeviceContext, T> {
                      const detail::ActivationType active_node,
                      const detail::ActivationType active_gate) {
#if !defined(__NVCC__) && !defined(__HIPCC___)
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
    if (value.prev_out_value) {
      blas.GEMM(CblasNoTrans, CblasTrans, batch_size, frame_size, frame_size, 1,
                value.prev_out_value, value.state_weight, 0,
@@ -147,7 +147,7 @@ struct GRUUnitGradFunctorV2<platform::CPUDeviceContext, T> {
    // grad_reset_output, grad_reset_gate
    detail::cpu_gru_backward(context, detail::backward::gru<T>(), value, grad,
                             frame_size, batch_size, active_node, active_gate);
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
    if (grad.prev_out_grad && value.prev_out_value) {
      // update prev_out_grad
      blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
...
@@ -10,10 +10,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <paddle/fluid/platform/device_context.h>
-#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/detail/gru_gpu_kernel.h"
#include "paddle/fluid/operators/math/detail/gru_kernel.h"
#include "paddle/fluid/operators/math/gru_compute.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle {
namespace operators {
@@ -77,7 +77,7 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
      threads = dim3(32, 32);
      grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
    }
-   auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(context);
    if (value.prev_out_value) {
      blas.GEMM(false, false, batch_size, frame_size * 2, frame_size, 1,
                value.prev_out_value, frame_size, value.gate_weight,
@@ -162,7 +162,7 @@ struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
          grad.output_grad, frame_size, batch_size, active_node, origin_mode);
    }
-   auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(context);
    if (value.prev_out_value && grad.prev_out_grad) {
      blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
...
// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/math/lapack_function.h"
#include "paddle/fluid/platform/complex.h"
#include "paddle/fluid/platform/dynload/lapack.h"
namespace paddle {
namespace operators {
namespace math {
// LU (for example)
template <>
void lapackLu<double>(int m, int n, double *a, int lda, int *ipiv, int *info) {
platform::dynload::dgetrf_(&m, &n, a, &lda, ipiv, info);
}
template <>
void lapackLu<float>(int m, int n, float *a, int lda, int *ipiv, int *info) {
platform::dynload::sgetrf_(&m, &n, a, &lda, ipiv, info);
}
// eigh
template <>
void lapackEigh<float>(char jobz, char uplo, int n, float *a, int lda, float *w,
float *work, int lwork, float *rwork, int lrwork,
int *iwork, int liwork, int *info) {
(void)rwork; // unused
(void)lrwork; // unused
platform::dynload::ssyevd_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, iwork,
&liwork, info);
}
template <>
void lapackEigh<double>(char jobz, char uplo, int n, double *a, int lda,
double *w, double *work, int lwork, double *rwork,
int lrwork, int *iwork, int liwork, int *info) {
(void)rwork; // unused
(void)lrwork; // unused
platform::dynload::dsyevd_(&jobz, &uplo, &n, a, &lda, w, work, &lwork, iwork,
&liwork, info);
}
template <>
void lapackEigh<platform::complex<float>, float>(
char jobz, char uplo, int n, platform::complex<float> *a, int lda, float *w,
platform::complex<float> *work, int lwork, float *rwork, int lrwork,
int *iwork, int liwork, int *info) {
platform::dynload::cheevd_(&jobz, &uplo, &n,
reinterpret_cast<std::complex<float> *>(a), &lda,
w, reinterpret_cast<std::complex<float> *>(work),
&lwork, rwork, &lrwork, iwork, &liwork, info);
}
template <>
void lapackEigh<platform::complex<double>, double>(
char jobz, char uplo, int n, platform::complex<double> *a, int lda,
double *w, platform::complex<double> *work, int lwork, double *rwork,
int lrwork, int *iwork, int liwork, int *info) {
platform::dynload::zheevd_(&jobz, &uplo, &n,
reinterpret_cast<std::complex<double> *>(a), &lda,
w, reinterpret_cast<std::complex<double> *>(work),
&lwork, rwork, &lrwork, iwork, &liwork, info);
}
// Eig
template <>
void lapackEig<double>(char jobvl, char jobvr, int n, double *a, int lda,
double *w, double *vl, int ldvl, double *vr, int ldvr,
double *work, int lwork, double *rwork, int *info) {
double *wr = w;
double *wi = w + n;
(void)rwork; // unused
platform::dynload::dgeev_(&jobvl, &jobvr, &n, a, &lda, wr, wi, vl, &ldvl, vr,
&ldvr, work, &lwork, info);
}
template <>
void lapackEig<float>(char jobvl, char jobvr, int n, float *a, int lda,
float *w, float *vl, int ldvl, float *vr, int ldvr,
float *work, int lwork, float *rwork, int *info) {
float *wr = w;
float *wi = w + n;
(void)rwork; // unused
platform::dynload::sgeev_(&jobvl, &jobvr, &n, a, &lda, wr, wi, vl, &ldvl, vr,
&ldvr, work, &lwork, info);
}
template <>
void lapackEig<platform::complex<double>, double>(
char jobvl, char jobvr, int n, platform::complex<double> *a, int lda,
platform::complex<double> *w, platform::complex<double> *vl, int ldvl,
platform::complex<double> *vr, int ldvr, platform::complex<double> *work,
int lwork, double *rwork, int *info) {
platform::dynload::zgeev_(
&jobvl, &jobvr, &n, reinterpret_cast<std::complex<double> *>(a), &lda,
reinterpret_cast<std::complex<double> *>(w),
reinterpret_cast<std::complex<double> *>(vl), &ldvl,
reinterpret_cast<std::complex<double> *>(vr), &ldvr,
reinterpret_cast<std::complex<double> *>(work), &lwork, rwork, info);
}
template <>
void lapackEig<platform::complex<float>, float>(
char jobvl, char jobvr, int n, platform::complex<float> *a, int lda,
platform::complex<float> *w, platform::complex<float> *vl, int ldvl,
platform::complex<float> *vr, int ldvr, platform::complex<float> *work,
int lwork, float *rwork, int *info) {
platform::dynload::cgeev_(
&jobvl, &jobvr, &n, reinterpret_cast<std::complex<float> *>(a), &lda,
reinterpret_cast<std::complex<float> *>(w),
reinterpret_cast<std::complex<float> *>(vl), &ldvl,
reinterpret_cast<std::complex<float> *>(vr), &ldvr,
reinterpret_cast<std::complex<float> *>(work), &lwork, rwork, info);
}
template <>
void lapackGels<double>(char trans, int m, int n, int nrhs, double *a, int lda,
double *b, int ldb, double *work, int lwork,
int *info) {
platform::dynload::dgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, work,
&lwork, info);
}
template <>
void lapackGels<float>(char trans, int m, int n, int nrhs, float *a, int lda,
float *b, int ldb, float *work, int lwork, int *info) {
platform::dynload::sgels_(&trans, &m, &n, &nrhs, a, &lda, b, &ldb, work,
&lwork, info);
}
template <>
void lapackGelsd<double>(int m, int n, int nrhs, double *a, int lda, double *b,
int ldb, double *s, double rcond, int *rank,
double *work, int lwork, double *rwork, int *iwork,
int *info) {
platform::dynload::dgelsd_(&m, &n, &nrhs, a, &lda, b, &ldb, s, &rcond, rank,
work, &lwork, iwork, info);
}
template <>
void lapackGelsd<float>(int m, int n, int nrhs, float *a, int lda, float *b,
int ldb, float *s, float rcond, int *rank, float *work,
int lwork, float *rwork, int *iwork, int *info) {
platform::dynload::sgelsd_(&m, &n, &nrhs, a, &lda, b, &ldb, s, &rcond, rank,
work, &lwork, iwork, info);
}
template <>
void lapackGelsy<double>(int m, int n, int nrhs, double *a, int lda, double *b,
int ldb, int *jpvt, double rcond, int *rank,
double *work, int lwork, double *rwork, int *info) {
platform::dynload::dgelsy_(&m, &n, &nrhs, a, &lda, b, &ldb, jpvt, &rcond,
rank, work, &lwork, info);
}
template <>
void lapackGelsy<float>(int m, int n, int nrhs, float *a, int lda, float *b,
int ldb, int *jpvt, float rcond, int *rank, float *work,
int lwork, float *rwork, int *info) {
platform::dynload::sgelsy_(&m, &n, &nrhs, a, &lda, b, &ldb, jpvt, &rcond,
rank, work, &lwork, info);
}
template <>
void lapackGelss<double>(int m, int n, int nrhs, double *a, int lda, double *b,
int ldb, double *s, double rcond, int *rank,
double *work, int lwork, double *rwork, int *info) {
platform::dynload::dgelss_(&m, &n, &nrhs, a, &lda, b, &ldb, s, &rcond, rank,
work, &lwork, info);
}
template <>
void lapackGelss<float>(int m, int n, int nrhs, float *a, int lda, float *b,
int ldb, float *s, float rcond, int *rank, float *work,
int lwork, float *rwork, int *info) {
platform::dynload::sgelss_(&m, &n, &nrhs, a, &lda, b, &ldb, s, &rcond, rank,
work, &lwork, info);
}
template <>
void lapackCholeskySolve<platform::complex<double>>(
char uplo, int n, int nrhs, platform::complex<double> *a, int lda,
platform::complex<double> *b, int ldb, int *info) {
platform::dynload::zpotrs_(
&uplo, &n, &nrhs, reinterpret_cast<std::complex<double> *>(a), &lda,
reinterpret_cast<std::complex<double> *>(b), &ldb, info);
}
template <>
void lapackCholeskySolve<platform::complex<float>>(char uplo, int n, int nrhs,
platform::complex<float> *a,
int lda,
platform::complex<float> *b,
int ldb, int *info) {
platform::dynload::cpotrs_(
&uplo, &n, &nrhs, reinterpret_cast<std::complex<float> *>(a), &lda,
reinterpret_cast<std::complex<float> *>(b), &ldb, info);
}
template <>
void lapackCholeskySolve<double>(char uplo, int n, int nrhs, double *a, int lda,
double *b, int ldb, int *info) {
platform::dynload::dpotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info);
}
template <>
void lapackCholeskySolve<float>(char uplo, int n, int nrhs, float *a, int lda,
float *b, int ldb, int *info) {
platform::dynload::spotrs_(&uplo, &n, &nrhs, a, &lda, b, &ldb, info);
}
} // namespace math
} // namespace operators
} // namespace paddle
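The wrappers in the file removed above forward directly to the dynloaded Fortran symbols, so they inherit LAPACK's column-major, in-place and 1-based-pivot conventions; after this commit the same entry points live under pten::funcs (as the LUKernel hunk earlier shows). A small sketch of the getrf wrapper at its new location, assuming this commit's headers (the function name and values are illustrative):

#include <vector>
#include "paddle/pten/kernels/funcs/lapack/lapack_function.h"

void LuExample() {
  // 2x2 matrix in column-major order: {{4, 3}, {6, 3}}.
  std::vector<double> a = {4.0, 6.0, 3.0, 3.0};
  std::vector<int> ipiv(2);  // pivot indices, 1-based as in Fortran
  int info = 0;
  // In-place LU factorization: on return, `a` holds L (unit diagonal,
  // strictly below) and U (on and above the diagonal).
  pten::funcs::lapackLu<double>(/*m=*/2, /*n=*/2, a.data(), /*lda=*/2,
                                ipiv.data(), &info);
  // info == 0 on success; info > 0 flags a zero pivot U(i, i).
}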
@@ -135,8 +135,8 @@ struct MatrixBitCodeFunctorMul : public boost::static_visitor<void> {
  template <typename CodeTable>
  void operator()(const CodeTable &code_table) {
-   auto blas =
-       GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(
+       platform::CPUDeviceContext());
    size_t num_samples = tmat_->dims()[0];
    size_t tmat_width = tmat_->dims()[1];
    size_t input_width = input_.dims()[1];
@@ -183,8 +183,8 @@ struct MatrixBitCodeFunctorMulGradWeight : public boost::static_visitor<void> {
      : tmat_(tmat), weight_(weight), input_(input) {}
  template <typename CodeTable>
  void operator()(const CodeTable &code_table) {
-   auto blas =
-       GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(
+       platform::CPUDeviceContext());
    size_t num_samples = tmat_.dims()[0];
    size_t input_width = input_.dims()[1];
    size_t tmat_width = tmat_.dims()[1];
@@ -237,8 +237,8 @@ struct MatrixBitCodeFunctorMulGradWeightSR
  template <typename CodeTable>
  void operator()(const CodeTable &code_table) {
-   auto blas =
-       GetBlas<platform::CPUDeviceContext, T>(platform::CPUDeviceContext());
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(
+       platform::CPUDeviceContext());
    size_t num_samples = tmat_.dims()[0];
    size_t input_width = input_.dims()[1];
    size_t tmat_width = tmat_.dims()[1];
...
@@ -21,9 +21,9 @@ limitations under the License. */
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/selected_rows_utils.h"
#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/variant.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
#if defined(_WIN32)
#include <intrin.h>
...
@@ -15,7 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/matrix_inverse.h"
#include "Eigen/Core"
#include "Eigen/LU"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle {
namespace operators {
...
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/math/matrix_inverse.h"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle {
namespace platform {
@@ -72,7 +72,7 @@ class MatrixInverseFunctor<platform::CUDADeviceContext, T> {
        memory::Alloc(context, num_ints * sizeof(int));
    int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());
-   auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(context);
    std::vector<int> info;  // only for singular checking
    info.resize(batch_size);
...
@@ -15,7 +15,7 @@ limitations under the License. */
#include "paddle/fluid/operators/math/matrix_solve.h"
#include "Eigen/Core"
#include "Eigen/LU"
-#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle {
namespace operators {
@@ -62,7 +62,7 @@ class TriangularSolveFunctor<platform::CPUDeviceContext, T> {
      batch_size *= a_dim[i];
    }
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
    for (int i = 0; i < batch_size; i++) {
      blas.TRSM(side, uplo, transA, diag, M, N, T(1), a_data + i * M * M, lda,
                b_data + i * N * M, ldb);
...
@@ -14,9 +14,9 @@ limitations under the License. */
#include "paddle/fluid/operators/math/matrix_solve.h"
#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/solve_op.h"
#include "paddle/fluid/platform/device_context.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_function.h"
namespace paddle {
@@ -105,7 +105,7 @@ class MatrixSolveFunctor<platform::CUDADeviceContext, T> {
        memory::Alloc(context, num_ints * sizeof(int));
    int* gpu_info_ptr = reinterpret_cast<int*>(tmp_gpu_info_data->ptr());
-   auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(context);
    // only for singular checking
    std::vector<int> info;
@@ -189,7 +189,7 @@ class TriangularSolveFunctor<platform::CUDADeviceContext, T> {
      batch_size *= a_dim[i];
    }
-   auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(context);
    if (batch_size <= 8 && M >= 64) {
      for (auto i = 0; i < batch_size; i++) {
        blas.TRSM(side, uplo, transA, diag, M, N, static_cast<T>(1.0),
...
@@ -224,7 +224,7 @@ struct SelectedRowsSumTo<platform::CPUDeviceContext, T> {
    auto* in2_value = input2->mutable_value();
    auto* in2_data = in2_value->data<T>();
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
    size_t offset = 0u;
    for (size_t i = 0u; i != input1.size(); ++i) {
      auto& in_value = input1[i]->value();
@@ -295,15 +295,15 @@ namespace scatter {
template <typename T>
typename std::enable_if<!std::is_integral<T>::value>::type elementwise_add_to(
-   BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len, const T* in,
-   T* out) {
+   pten::funcs::BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
+   const T* in, T* out) {
  blas->AXPY(data_len, T(1.f), in, out);
}
template <typename T>
typename std::enable_if<std::is_integral<T>::value>::type elementwise_add_to(
-   BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len, const T* in,
-   T* out) {
+   pten::funcs::BlasT<platform::CPUDeviceContext, T>* blas, size_t data_len,
+   const T* in, T* out) {
  for (size_t i = 0; i < data_len; i++) {
    out[i] += in[i];
  }
@@ -316,7 +316,7 @@ add_sparse_inputs(const std::vector<const pten::SelectedRows*>& inputs,
                  int64_t input_width,
                  const platform::CPUDeviceContext& context, T* out_data) {
#ifndef PADDLE_WITH_MKLDNN
-  auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+  auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
#endif
  for (auto* input : inputs) {
    if (input->rows().size() == 0) {
@@ -350,7 +350,7 @@ add_sparse_inputs(const std::vector<const pten::SelectedRows*>& inputs,
                  int64_t input_width,
                  const platform::CPUDeviceContext& context, T* out_data) {
  VLOG(4) << "[CPU] add_sparse_inputs <" << typeid(T).name();
-  auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+  auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
  for (auto* input : inputs) {
    if (input->rows().size() == 0) {
      continue;
@@ -697,7 +697,7 @@ struct MergeAverage<platform::CPUDeviceContext, T> {
      rows_to_id[merge_rows[i]] = i;
    }
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
    for (auto* input : inputs) {
      if (input->rows().size() == 0) {
        continue;
...
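The two elementwise_add_to overloads above dispatch at compile time: the non-integral overload forwards to BLAS AXPY, while integral element types, which BLAS does not cover, fall back to a plain loop. The same std::enable_if dispatch in isolation, as a self-contained sketch with illustrative names:

#include <cstddef>
#include <type_traits>

// Chosen when T is non-integral; the kernel above calls blas->AXPY here.
template <typename T>
typename std::enable_if<!std::is_integral<T>::value>::type add_to(
    std::size_t n, const T* in, T* out) {
  for (std::size_t i = 0; i < n; i++) out[i] += in[i];  // stand-in for AXPY
}

// Chosen when T is integral (e.g. int32/int64 rows), where no BLAS routine
// exists; only one overload has a usable type, so the call never clashes.
template <typename T>
typename std::enable_if<std::is_integral<T>::value>::type add_to(
    std::size_t n, const T* in, T* out) {
  for (std::size_t i = 0; i < n; i++) out[i] += in[i];
}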
@@ -18,8 +18,8 @@ limitations under the License. */
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/selected_rows_utils.h"
-#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/device_context.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_function.h"
#define INLINE_FOR2(sizei, sizej) \
...
@@ -15,8 +15,8 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/sequence_pooling.h"
+#include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_function.h"
namespace paddle {
@@ -289,7 +289,7 @@ class SumSeqPoolGradFunctor {
                          in_w, out_w, in_w, out_w));
    const T* out_g_data = out_grad.data<T>();
    T* in_g_data = in_grad->mutable_data<T>(context.GetPlace());
-   auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+   auto blas = pten::funcs::GetBlas<platform::CPUDeviceContext, T>(context);
    for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
      int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
      if (h == 0) continue;
...
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#endif #endif
...@@ -25,7 +25,8 @@ namespace operators { ...@@ -25,7 +25,8 @@ namespace operators {
/** /**
* Printing shape information into a string is easy to use. * Printing shape information into a string is easy to use.
*/ */
inline static std::string DumpMatrixShape(const math::MatDescriptor &desc) { inline static std::string DumpMatrixShape(
const pten::funcs::MatDescriptor &desc) {
std::stringstream buffer; std::stringstream buffer;
buffer << "[" << desc.batch_size_ << ", " << desc.height_ << ", " buffer << "[" << desc.batch_size_ << ", " << desc.height_ << ", "
<< desc.width_ << "]"; << desc.width_ << "]";
...@@ -65,10 +66,10 @@ class MatMulKernel : public framework::OpKernel<T> { ...@@ -65,10 +66,10 @@ class MatMulKernel : public framework::OpKernel<T> {
auto *out = context.Output<framework::Tensor>("Out"); auto *out = context.Output<framework::Tensor>("Out");
out->mutable_data<T>(context.GetPlace()); out->mutable_data<T>(context.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(context); auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
auto mat_dim_a = math::CreateMatrixDescriptor( auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(
RowMatrixFromVector(x.dims()), 0, context.Attr<bool>("transpose_X")); RowMatrixFromVector(x.dims()), 0, context.Attr<bool>("transpose_X"));
auto mat_dim_b = math::CreateMatrixDescriptor( auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(
ColumnMatrixFromVector(y.dims()), 0, context.Attr<bool>("transpose_Y")); ColumnMatrixFromVector(y.dims()), 0, context.Attr<bool>("transpose_Y"));
auto scale = static_cast<T>(context.Attr<float>("alpha")); auto scale = static_cast<T>(context.Attr<float>("alpha"));
...@@ -142,7 +143,7 @@ static framework::Tensor FoldHeadAndLastDims(const DeviceContext &context, ...@@ -142,7 +143,7 @@ static framework::Tensor FoldHeadAndLastDims(const DeviceContext &context,
* If transposed, `H,W` will be swapped. * If transposed, `H,W` will be swapped.
*/ */
static void ReshapeTensorIntoMatrixSequence( static void ReshapeTensorIntoMatrixSequence(
framework::Tensor *x, const math::MatDescriptor &descriptor) { framework::Tensor *x, const pten::funcs::MatDescriptor &descriptor) {
int64_t h, w; int64_t h, w;
h = descriptor.height_; h = descriptor.height_;
w = descriptor.width_; w = descriptor.width_;
...@@ -176,8 +177,8 @@ static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x, ...@@ -176,8 +177,8 @@ static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x,
bool trans_y) { bool trans_y) {
auto x_dim = RowMatrixFromVector(x->dims()); auto x_dim = RowMatrixFromVector(x->dims());
auto y_dim = ColumnMatrixFromVector(y->dims()); auto y_dim = ColumnMatrixFromVector(y->dims());
auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x); auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(x_dim, 0, trans_x);
auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y); auto mat_dim_y = pten::funcs::CreateMatrixDescriptor(y_dim, 0, trans_y);
if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) {
out->Resize({mat_dim_x.height_, mat_dim_y.width_}); out->Resize({mat_dim_x.height_, mat_dim_y.width_});
} else { } else {
...@@ -222,9 +223,9 @@ class MatMulGradKernel : public framework::OpKernel<T> { ...@@ -222,9 +223,9 @@ class MatMulGradKernel : public framework::OpKernel<T> {
const framework::Tensor &b, bool trans_b, const framework::Tensor &b, bool trans_b,
framework::Tensor *out) const { framework::Tensor *out) const {
out->mutable_data<T>(context.GetPlace()); out->mutable_data<T>(context.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(context); auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a); auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a);
auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b); auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(b.dims(), 0, trans_b);
int head_number = 1; int head_number = 1;
#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \
...@@ -404,9 +405,9 @@ class MatMulDoubleGradKernel : public framework::OpKernel<T> { ...@@ -404,9 +405,9 @@ class MatMulDoubleGradKernel : public framework::OpKernel<T> {
const framework::Tensor &b, bool trans_b, bool flag, const framework::Tensor &b, bool trans_b, bool flag,
framework::Tensor *out) const { framework::Tensor *out) const {
out->mutable_data<T>(context.GetPlace()); out->mutable_data<T>(context.GetPlace());
auto blas = math::GetBlas<DeviceContext, T>(context); auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
auto mat_dim_a = math::CreateMatrixDescriptor(a.dims(), 0, trans_a); auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(a.dims(), 0, trans_a);
auto mat_dim_b = math::CreateMatrixDescriptor(b.dims(), 0, trans_b); auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(b.dims(), 0, trans_b);
int head_number = 1; int head_number = 1;
#if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \ #if defined(PADDLE_WITH_MKLML) && !defined(PADDLE_WITH_CUDA) && \
...@@ -584,12 +585,12 @@ class MatMulOp : public framework::OperatorWithKernel { ...@@ -584,12 +585,12 @@ class MatMulOp : public framework::OperatorWithKernel {
auto dim_x = GetDimForInput(*context, "X"); auto dim_x = GetDimForInput(*context, "X");
auto dim_y = GetDimForInput(*context, "Y"); auto dim_y = GetDimForInput(*context, "Y");
auto mat_dim_x = auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(
math::CreateMatrixDescriptor(RowMatrixFromVector(dim_x), 0, RowMatrixFromVector(dim_x), 0,
context->Attrs().Get<bool>("transpose_X")); context->Attrs().Get<bool>("transpose_X"));
auto mat_dim_y = auto mat_dim_y = pten::funcs::CreateMatrixDescriptor(
math::CreateMatrixDescriptor(ColumnMatrixFromVector(dim_y), 0, ColumnMatrixFromVector(dim_y), 0,
context->Attrs().Get<bool>("transpose_Y")); context->Attrs().Get<bool>("transpose_Y"));
if (mat_dim_x.width_ == -1) { if (mat_dim_x.width_ == -1) {
mat_dim_x.width_ = mat_dim_y.height_; mat_dim_x.width_ = mat_dim_y.height_;
......
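
The hunks above migrate matmul_op.cc from the old math:: helpers to pten::funcs without changing behavior: GetBlas, CreateMatrixDescriptor, and MatDescriptor simply move namespaces. For orientation, a minimal sketch of the post-migration call pattern; the tensor names and trans flags are illustrative assumptions, not code from this PR:

// Hedged sketch: multiply a (M x K) by b (K x N) into *out via the migrated
// pten::funcs API, assuming `context` is the kernel's DeviceContext and the
// inputs are already initialized. The MatMul overload mirrors the calls
// visible elsewhere in this diff.
auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(a.dims(), 0, /*trans_a=*/false);
auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(b.dims(), 0, /*trans_b=*/false);
out->mutable_data<T>(context.GetPlace());
blas.MatMul(a, mat_dim_a, b, mat_dim_b, static_cast<T>(1), out, static_cast<T>(0));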
...@@ -19,8 +19,8 @@ limitations under the License. */ ...@@ -19,8 +19,8 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/xpu_api_wrapper.h" #include "paddle/fluid/operators/xpu_api_wrapper.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -53,7 +53,7 @@ static framework::DDim ColumnMatrixFromVector(const framework::DDim &y_dim) { ...@@ -53,7 +53,7 @@ static framework::DDim ColumnMatrixFromVector(const framework::DDim &y_dim) {
} }
static void ReshapeTensorIntoMatrixSequence( static void ReshapeTensorIntoMatrixSequence(
framework::Tensor *x, const math::MatDescriptor &descriptor) { framework::Tensor *x, const pten::funcs::MatDescriptor &descriptor) {
int64_t h, w; int64_t h, w;
h = descriptor.height_; h = descriptor.height_;
w = descriptor.width_; w = descriptor.width_;
...@@ -86,8 +86,8 @@ static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x, ...@@ -86,8 +86,8 @@ static void ReshapeXYOutIntoMatrixSequence(framework::Tensor *x,
bool trans_y) { bool trans_y) {
auto x_dim = RowMatrixFromVector(x->dims()); auto x_dim = RowMatrixFromVector(x->dims());
auto y_dim = ColumnMatrixFromVector(y->dims()); auto y_dim = ColumnMatrixFromVector(y->dims());
auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x); auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(x_dim, 0, trans_x);
auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y); auto mat_dim_y = pten::funcs::CreateMatrixDescriptor(y_dim, 0, trans_y);
if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) {
out->Resize({mat_dim_x.height_, mat_dim_y.width_}); out->Resize({mat_dim_x.height_, mat_dim_y.width_});
} else { } else {
...@@ -109,10 +109,10 @@ static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out, ...@@ -109,10 +109,10 @@ static void MatMulXPUFunction(const Tensor *x, const Tensor *y, Tensor *out,
auto &dev_ctx = auto &dev_ctx =
ctx.template device_context<paddle::platform::XPUDeviceContext>(); ctx.template device_context<paddle::platform::XPUDeviceContext>();
auto mat_dim_a = auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(
math::CreateMatrixDescriptor(RowMatrixFromVector(x_dims), 0, trans_x); RowMatrixFromVector(x_dims), 0, trans_x);
auto mat_dim_b = auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(
math::CreateMatrixDescriptor(ColumnMatrixFromVector(y_dims), 0, trans_y); ColumnMatrixFromVector(y_dims), 0, trans_y);
if (x_dims.size() == 3 && y_dims.size() <= 2) { if (x_dims.size() == 3 && y_dims.size() <= 2) {
// if transpose_X is true, the transpose costs much time // if transpose_X is true, the transpose costs much time
......
...@@ -21,8 +21,8 @@ limitations under the License. */ ...@@ -21,8 +21,8 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h" #include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/dot_op.h" #include "paddle/fluid/operators/dot_op.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h" #include "paddle/fluid/operators/reduce_ops/reduce_sum_op.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/complex_functors.h" #include "paddle/pten/kernels/funcs/complex_functors.h"
// only can include the headers in paddle/pten/api dirs // only can include the headers in paddle/pten/api dirs
...@@ -77,7 +77,7 @@ static framework::DDim ColumnMatrixFromVector(const framework::DDim& y_dim) { ...@@ -77,7 +77,7 @@ static framework::DDim ColumnMatrixFromVector(const framework::DDim& y_dim) {
* If transposed, `H,W` will be swapped. * If transposed, `H,W` will be swapped.
*/ */
static void ReshapeTensorIntoMatrixSequence( static void ReshapeTensorIntoMatrixSequence(
framework::Tensor* x, const math::MatDescriptor& descriptor) { framework::Tensor* x, const pten::funcs::MatDescriptor& descriptor) {
int64_t h, w; int64_t h, w;
h = descriptor.height_; h = descriptor.height_;
w = descriptor.width_; w = descriptor.width_;
...@@ -97,8 +97,8 @@ static void ReshapeXYOutIntoMatrixSequence(framework::Tensor* x, ...@@ -97,8 +97,8 @@ static void ReshapeXYOutIntoMatrixSequence(framework::Tensor* x,
bool trans_y) { bool trans_y) {
auto x_dim = RowMatrixFromVector(x->dims()); auto x_dim = RowMatrixFromVector(x->dims());
auto y_dim = ColumnMatrixFromVector(y->dims()); auto y_dim = ColumnMatrixFromVector(y->dims());
auto mat_dim_x = math::CreateMatrixDescriptor(x_dim, 0, trans_x); auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(x_dim, 0, trans_x);
auto mat_dim_y = math::CreateMatrixDescriptor(y_dim, 0, trans_y); auto mat_dim_y = pten::funcs::CreateMatrixDescriptor(y_dim, 0, trans_y);
if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) {
out->Resize({mat_dim_x.height_, mat_dim_y.width_}); out->Resize({mat_dim_x.height_, mat_dim_y.width_});
} else { } else {
......
...@@ -33,10 +33,10 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out, ...@@ -33,10 +33,10 @@ static void MatMulXPUFunction(const Tensor* x, const Tensor* y, Tensor* out,
auto& dev_ctx = auto& dev_ctx =
ctx.template device_context<paddle::platform::XPUDeviceContext>(); ctx.template device_context<paddle::platform::XPUDeviceContext>();
auto mat_dim_a = auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(
math::CreateMatrixDescriptor(RowMatrixFromVector(x_dims), 0, trans_x); RowMatrixFromVector(x_dims), 0, trans_x);
auto mat_dim_b = auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(
math::CreateMatrixDescriptor(ColumnMatrixFromVector(y_dims), 0, trans_y); ColumnMatrixFromVector(y_dims), 0, trans_y);
if (x_dims.size() == 3 && y_dims.size() <= 2) { if (x_dims.size() == 3 && y_dims.size() <= 2) {
// if transpose_X is true, the transpose cost much time // if transpose_X is true, the transpose cost much time
......
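
The x_dims.size() == 3 && y_dims.size() <= 2 branch above (and its counterpart earlier) exists because a [B, M, K] input times a [K, N] matrix can be folded into one (B*M, K) x (K, N) GEMM when X is not transposed, which is why the comment warns that transpose_X makes this path expensive. A hedged, self-contained check of that equivalence with hypothetical sizes:

#include <cstdio>
int main() {
  const int B = 2, M = 3, K = 4, N = 5;
  float x[B * M * K], y[K * N], batched[B * M * N], folded[B * M * N];
  for (int i = 0; i < B * M * K; ++i) x[i] = i * 0.5f;
  for (int i = 0; i < K * N; ++i) y[i] = i * 0.25f;
  // batched: out[b] = x[b] * y, one GEMM per batch
  for (int b = 0; b < B; ++b)
    for (int m = 0; m < M; ++m)
      for (int n = 0; n < N; ++n) {
        float acc = 0;
        for (int k = 0; k < K; ++k) acc += x[(b * M + m) * K + k] * y[k * N + n];
        batched[(b * M + m) * N + n] = acc;
      }
  // folded: one (B*M, K) x (K, N) GEMM over the same storage (a pure view change)
  for (int r = 0; r < B * M; ++r)
    for (int n = 0; n < N; ++n) {
      float acc = 0;
      for (int k = 0; k < K; ++k) acc += x[r * K + k] * y[k * N + n];
      folded[r * N + n] = acc;
    }
  int mismatch = 0;
  for (int i = 0; i < B * M * N; ++i) mismatch += (batched[i] != folded[i]);
  std::printf("mismatches: %d\n", mismatch);  // prints 0
}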
...@@ -18,9 +18,9 @@ limitations under the License. */ ...@@ -18,9 +18,9 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h" #include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/matrix_inverse.h" #include "paddle/fluid/operators/math/matrix_inverse.h"
#include "paddle/fluid/platform/for_range.h" #include "paddle/fluid/platform/for_range.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -58,7 +58,7 @@ void MatrixPowerFunction(const Tensor* X, const int n, Tensor* Out, ...@@ -58,7 +58,7 @@ void MatrixPowerFunction(const Tensor* X, const int n, Tensor* Out,
return; return;
} }
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
Tensor new_x = ctx.AllocateTmpTensor<T, DeviceContext>(X->dims(), dev_ctx); Tensor new_x = ctx.AllocateTmpTensor<T, DeviceContext>(X->dims(), dev_ctx);
int new_n = n; int new_n = n;
...@@ -77,7 +77,7 @@ void MatrixPowerFunction(const Tensor* X, const int n, Tensor* Out, ...@@ -77,7 +77,7 @@ void MatrixPowerFunction(const Tensor* X, const int n, Tensor* Out,
return; return;
} }
auto no_trans_desc = math::CreateMatrixDescriptor(x_dims, 0, false); auto no_trans_desc = pten::funcs::CreateMatrixDescriptor(x_dims, 0, false);
if (new_n == 2) { if (new_n == 2) {
// Out = newX * newX // Out = newX * newX
...@@ -166,7 +166,7 @@ void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out, ...@@ -166,7 +166,7 @@ void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out,
const auto& x_dims = X->dims(); const auto& x_dims = X->dims();
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
if (n == 0) { if (n == 0) {
// \nabla X = O // \nabla X = O
...@@ -179,8 +179,8 @@ void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out, ...@@ -179,8 +179,8 @@ void MatrixPowerGradFunction(const Tensor* X, const Tensor* Out,
return; return;
} }
auto trans_desc = math::CreateMatrixDescriptor(x_dims, 0, true); auto trans_desc = pten::funcs::CreateMatrixDescriptor(x_dims, 0, true);
auto no_trans_desc = math::CreateMatrixDescriptor(x_dims, 0, false); auto no_trans_desc = pten::funcs::CreateMatrixDescriptor(x_dims, 0, false);
if (n == -1) { if (n == -1) {
// \nabla X = Out^{T} * \nabla Out * Out^{T} // \nabla X = Out^{T} * \nabla Out * Out^{T}
......
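
MatrixPowerFunction above reduces everything to repeated blas.MatMul calls on square matrices (with n == 2 as a single Out = newX * newX product, and negative n going through the inverse), and the gradient path reuses the same trans/no_trans descriptors. A hedged, framework-free sketch of the semantics for a positive power; the real kernel is smarter about reusing intermediate products, so this naive version only pins down what is computed:

#include <cstdio>
int main() {
  const int d = 2, n = 3;                                  // A^3 for a 2x2 matrix
  double a[d * d] = {1, 1, 0, 1}, out[d * d] = {1, 0, 0, 1};  // out starts as I
  for (int p = 0; p < n; ++p) {                            // out = out * a, n times
    double tmp[d * d] = {0};
    for (int i = 0; i < d; ++i)
      for (int j = 0; j < d; ++j)
        for (int k = 0; k < d; ++k) tmp[i * d + j] += out[i * d + k] * a[k * d + j];
    for (int i = 0; i < d * d; ++i) out[i] = tmp[i];
  }
  std::printf("%g %g / %g %g\n", out[0], out[1], out[2], out[3]);  // 1 3 / 0 1
}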
...@@ -113,10 +113,8 @@ class MatMulMKLDNNHandler ...@@ -113,10 +113,8 @@ class MatMulMKLDNNHandler
float scale) float scale)
: paddle::platform::MKLDNNHandlerNoCachingT<XT, dnnl::matmul>(engine, : paddle::platform::MKLDNNHandlerNoCachingT<XT, dnnl::matmul>(engine,
cpu_place) { cpu_place) {
auto mat_dim_x = auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(x->dims(), 0, trans_x);
paddle::operators::math::CreateMatrixDescriptor(x->dims(), 0, trans_x); auto mat_dim_y = pten::funcs::CreateMatrixDescriptor(y->dims(), 0, trans_y);
auto mat_dim_y =
paddle::operators::math::CreateMatrixDescriptor(y->dims(), 0, trans_y);
memory::dim x_bs = mat_dim_x.batch_size_; memory::dim x_bs = mat_dim_x.batch_size_;
memory::dim y_bs = mat_dim_y.batch_size_; memory::dim y_bs = mat_dim_y.batch_size_;
...@@ -237,8 +235,8 @@ class MatMulMKLDNNHandler ...@@ -237,8 +235,8 @@ class MatMulMKLDNNHandler
out_strides; out_strides;
}; };
std::pair<paddle::operators::math::MatDescriptor, memory::dims> std::pair<pten::funcs::MatDescriptor, memory::dims> GetInputDimsAndStrides(
GetInputDimsAndStrides(const ExecutionContext& ctx, std::string input_name) { const ExecutionContext& ctx, std::string input_name) {
auto shape = ctx.Attr<std::vector<int>>("fused_reshape_" + input_name); auto shape = ctx.Attr<std::vector<int>>("fused_reshape_" + input_name);
auto axis = ctx.Attr<std::vector<int>>("fused_transpose_" + input_name); auto axis = ctx.Attr<std::vector<int>>("fused_transpose_" + input_name);
auto input_dims = ctx.Input<Tensor>(input_name)->dims(); auto input_dims = ctx.Input<Tensor>(input_name)->dims();
...@@ -279,10 +277,9 @@ class MatMulMKLDNNHandler ...@@ -279,10 +277,9 @@ class MatMulMKLDNNHandler
auto& MatrixDimsFromVector = input_name == "X" ? RowMatrixDimsFromVector auto& MatrixDimsFromVector = input_name == "X" ? RowMatrixDimsFromVector
: ColumnMatrixDimsFromVector; : ColumnMatrixDimsFromVector;
paddle::operators::math::MatDescriptor mat_dim = pten::funcs::MatDescriptor mat_dim = pten::funcs::CreateMatrixDescriptor(
paddle::operators::math::CreateMatrixDescriptor( MatrixDimsFromVector(new_dims), 0,
MatrixDimsFromVector(new_dims), 0, ctx.Attr<bool>("transpose_" + input_name));
ctx.Attr<bool>("transpose_" + input_name));
memory::dims strides; memory::dims strides;
if (!shape.empty()) { if (!shape.empty()) {
...@@ -324,10 +321,10 @@ class MatMulMKLDNNHandler ...@@ -324,10 +321,10 @@ class MatMulMKLDNNHandler
} }
MatMulDims GetMatmulDims(const ExecutionContext& ctx) { MatMulDims GetMatmulDims(const ExecutionContext& ctx) {
paddle::operators::math::MatDescriptor mat_dim_x; pten::funcs::MatDescriptor mat_dim_x;
memory::dims strides_x; memory::dims strides_x;
std::tie(mat_dim_x, strides_x) = GetInputDimsAndStrides(ctx, "X"); std::tie(mat_dim_x, strides_x) = GetInputDimsAndStrides(ctx, "X");
paddle::operators::math::MatDescriptor mat_dim_y; pten::funcs::MatDescriptor mat_dim_y;
memory::dims strides_y; memory::dims strides_y;
std::tie(mat_dim_y, strides_y) = GetInputDimsAndStrides(ctx, "Y"); std::tie(mat_dim_y, strides_y) = GetInputDimsAndStrides(ctx, "Y");
...@@ -431,7 +428,7 @@ class MatMulMKLDNNHandler ...@@ -431,7 +428,7 @@ class MatMulMKLDNNHandler
* If transposed, `H,W` will be swapped. * If transposed, `H,W` will be swapped.
*/ */
static void ReshapeTensorToMatrixSequence( static void ReshapeTensorToMatrixSequence(
Tensor* x, const paddle::operators::math::MatDescriptor& descriptor) { Tensor* x, const pten::funcs::MatDescriptor& descriptor) {
int64_t h, w; int64_t h, w;
h = descriptor.height_; h = descriptor.height_;
w = descriptor.width_; w = descriptor.width_;
...@@ -463,10 +460,8 @@ static void ReshapeXYOutToMatrixSequence(Tensor* x, Tensor* y, Tensor* out, ...@@ -463,10 +460,8 @@ static void ReshapeXYOutToMatrixSequence(Tensor* x, Tensor* y, Tensor* out,
bool trans_x, bool trans_y) { bool trans_x, bool trans_y) {
auto x_dim = RowMatrixDimsFromVector(x->dims()); auto x_dim = RowMatrixDimsFromVector(x->dims());
auto y_dim = ColumnMatrixDimsFromVector(y->dims()); auto y_dim = ColumnMatrixDimsFromVector(y->dims());
auto mat_dim_x = auto mat_dim_x = pten::funcs::CreateMatrixDescriptor(x_dim, 0, trans_x);
paddle::operators::math::CreateMatrixDescriptor(x_dim, 0, trans_x); auto mat_dim_y = pten::funcs::CreateMatrixDescriptor(y_dim, 0, trans_y);
auto mat_dim_y =
paddle::operators::math::CreateMatrixDescriptor(y_dim, 0, trans_y);
if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) { if (mat_dim_x.batch_size_ == 0 && mat_dim_y.batch_size_ == 0) {
out->Resize({mat_dim_x.height_, mat_dim_y.width_}); out->Resize({mat_dim_x.height_, mat_dim_y.width_});
} else { } else {
......
...@@ -16,8 +16,8 @@ limitations under the License. */ ...@@ -16,8 +16,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/platform/mkldnn_reuse.h" #include "paddle/fluid/platform/mkldnn_reuse.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
......
...@@ -84,11 +84,10 @@ std::vector<int64_t> GetInputStrides(const ExecutionContext& ctx, ...@@ -84,11 +84,10 @@ std::vector<int64_t> GetInputStrides(const ExecutionContext& ctx,
auto& MatrixDimsFromVector = auto& MatrixDimsFromVector =
input_name == "X" ? RowMatrixDimsFromVector : ColumnMatrixDimsFromVector; input_name == "X" ? RowMatrixDimsFromVector : ColumnMatrixDimsFromVector;
paddle::operators::math::MatDescriptor mat_dim = pten::funcs::MatDescriptor mat_dim = pten::funcs::CreateMatrixDescriptor(
paddle::operators::math::CreateMatrixDescriptor( MatrixDimsFromVector(new_dims), 0,
MatrixDimsFromVector(new_dims), 0, ctx.Attr<bool>(std::string("trans_") +
ctx.Attr<bool>(std::string("trans_") + static_cast<char>(std::tolower(input_name[0]))));
static_cast<char>(std::tolower(input_name[0]))));
std::vector<int64_t> strides; std::vector<int64_t> strides;
if (!shape.empty()) { if (!shape.empty()) {
......
...@@ -16,7 +16,7 @@ limitations under the License. */ ...@@ -16,7 +16,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h" #include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
#include "paddle/pten/kernels/funcs/math_function.h" #include "paddle/pten/kernels/funcs/math_function.h"
namespace paddle { namespace paddle {
...@@ -51,7 +51,7 @@ class MulKernel : public framework::OpKernel<T> { ...@@ -51,7 +51,7 @@ class MulKernel : public framework::OpKernel<T> {
z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]}); z->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
} }
auto blas = math::GetBlas<DeviceContext, T>(context); auto blas = pten::funcs::GetBlas<DeviceContext, T>(context);
blas.MatMul(x_matrix, y_matrix, z); blas.MatMul(x_matrix, y_matrix, z);
if (z_dim.size() != 2) { if (z_dim.size() != 2) {
...@@ -92,7 +92,7 @@ class MulGradKernel : public framework::OpKernel<T> { ...@@ -92,7 +92,7 @@ class MulGradKernel : public framework::OpKernel<T> {
} }
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
if (dx) { if (dx) {
dx->mutable_data<T>(ctx.GetPlace()); dx->mutable_data<T>(ctx.GetPlace());
Tensor dx_matrix = dx->dims().size() > 2 Tensor dx_matrix = dx->dims().size() > 2
...@@ -153,7 +153,7 @@ class MulDoubleGradKernel : public framework::OpKernel<T> { ...@@ -153,7 +153,7 @@ class MulDoubleGradKernel : public framework::OpKernel<T> {
} }
auto& dev_ctx = ctx.template device_context<DeviceContext>(); auto& dev_ctx = ctx.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
// a flag to specify whether ddout value has been set, if flag // a flag to specify whether ddout value has been set, if flag
// is false, MatMul beta should be 0 to set ddout, if flag is // is false, MatMul beta should be 0 to set ddout, if flag is
// true, MatMul beta should be 1 to add result to ddout. // true, MatMul beta should be 1 to add result to ddout.
......
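
The ddout flag described in the comment above maps directly onto the GEMM beta argument: beta = 0 makes MatMul overwrite ddout on the first contribution, beta = 1 makes later contributions accumulate. A hedged scalar illustration of that contract (plain doubles stand in for tensors):

#include <cstdio>
// c = alpha * a*b + beta * c for 1x1 "matrices", just to show the beta switch.
void gemm1(double alpha, double a, double b, double beta, double* c) {
  *c = alpha * a * b + beta * (*c);
}
int main() {
  double ddout = 7.0;
  gemm1(1.0, 2.0, 3.0, /*beta=*/0.0, &ddout);  // first write: ddout = 6
  gemm1(1.0, 4.0, 5.0, /*beta=*/1.0, &ddout);  // accumulate:  ddout = 26
  std::printf("%g\n", ddout);
}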
...@@ -18,9 +18,9 @@ limitations under the License. */ ...@@ -18,9 +18,9 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/strided_memcpy.h" #include "paddle/fluid/operators/strided_memcpy.h"
#include "paddle/fluid/operators/utils.h" #include "paddle/fluid/operators/utils.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -95,15 +95,15 @@ inline framework::Tensor MatMul(const framework::ExecutionContext& ctx, ...@@ -95,15 +95,15 @@ inline framework::Tensor MatMul(const framework::ExecutionContext& ctx,
const framework::DDim& a_dim, const framework::DDim& a_dim,
const framework::DDim& b_dim) { const framework::DDim& b_dim) {
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
framework::Tensor matrix_c; framework::Tensor matrix_c;
framework::DDim c_dim = framework::make_ddim({a_dim[0], b_dim[1]}); framework::DDim c_dim = framework::make_ddim({a_dim[0], b_dim[1]});
matrix_c.Resize(c_dim); matrix_c.Resize(c_dim);
matrix_c.mutable_data<T>(place); matrix_c.mutable_data<T>(place);
auto mat_dim_a = math::CreateMatrixDescriptor(a_dim, 0, false); auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(a_dim, 0, false);
auto mat_dim_b = math::CreateMatrixDescriptor(b_dim, 0, false); auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(b_dim, 0, false);
const T alpha = static_cast<T>(1.0); const T alpha = static_cast<T>(1.0);
blas.MatMul(matrix_a, mat_dim_a, matrix_b, mat_dim_b, alpha, &matrix_c, T(0)); blas.MatMul(matrix_a, mat_dim_a, matrix_b, mat_dim_b, alpha, &matrix_c, T(0));
return matrix_c; return matrix_c;
...@@ -269,7 +269,7 @@ class MultiDotKernel : public framework::OpKernel<T> { ...@@ -269,7 +269,7 @@ class MultiDotKernel : public framework::OpKernel<T> {
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
out->mutable_data<T>(place); out->mutable_data<T>(place);
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
auto n = ins.size(); auto n = ins.size();
std::vector<framework::DDim> ins_dims(n); std::vector<framework::DDim> ins_dims(n);
...@@ -277,8 +277,10 @@ class MultiDotKernel : public framework::OpKernel<T> { ...@@ -277,8 +277,10 @@ class MultiDotKernel : public framework::OpKernel<T> {
const T scale = static_cast<T>(1.0); const T scale = static_cast<T>(1.0);
if (n == 2) { if (n == 2) {
auto mat_dim_a = math::CreateMatrixDescriptor(ins_dims[0], 0, false); auto mat_dim_a =
auto mat_dim_b = math::CreateMatrixDescriptor(ins_dims[1], 0, false); pten::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false);
auto mat_dim_b =
pten::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false);
blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, out, T(0)); blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, out, T(0));
} else if (n == 3) { } else if (n == 3) {
const auto Ma = ins_dims[0][0]; const auto Ma = ins_dims[0][0];
...@@ -287,16 +289,20 @@ class MultiDotKernel : public framework::OpKernel<T> { ...@@ -287,16 +289,20 @@ class MultiDotKernel : public framework::OpKernel<T> {
const auto Nc = ins_dims[2][1]; const auto Nc = ins_dims[2][1];
const uint64_t cost1 = Ma * Nb * (Ka + Nc); const uint64_t cost1 = Ma * Nb * (Ka + Nc);
const uint64_t cost2 = Ka * Nc * (Nb + Ma); const uint64_t cost2 = Ka * Nc * (Nb + Ma);
auto mat_dim_a = math::CreateMatrixDescriptor(ins_dims[0], 0, false); auto mat_dim_a =
auto mat_dim_b = math::CreateMatrixDescriptor(ins_dims[1], 0, false); pten::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false);
auto mat_dim_c = math::CreateMatrixDescriptor(ins_dims[2], 0, false); auto mat_dim_b =
pten::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false);
auto mat_dim_c =
pten::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false);
if (cost1 < cost2) { if (cost1 < cost2) {
framework::Tensor tmp_out; framework::Tensor tmp_out;
tmp_out.mutable_data<T>(place, Ma * Nb * sizeof(T)); tmp_out.mutable_data<T>(place, Ma * Nb * sizeof(T));
framework::DDim tmp_dim = framework::make_ddim({Ma, Nb}); framework::DDim tmp_dim = framework::make_ddim({Ma, Nb});
blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, &tmp_out, blas.MatMul(*ins[0], mat_dim_a, *ins[1], mat_dim_b, scale, &tmp_out,
T(0)); T(0));
auto mat_dim_tmp = math::CreateMatrixDescriptor(tmp_dim, 0, false); auto mat_dim_tmp =
pten::funcs::CreateMatrixDescriptor(tmp_dim, 0, false);
blas.MatMul(tmp_out, mat_dim_tmp, *ins[2], mat_dim_c, scale, out, T(0)); blas.MatMul(tmp_out, mat_dim_tmp, *ins[2], mat_dim_c, scale, out, T(0));
} else { } else {
framework::Tensor tmp_out; framework::Tensor tmp_out;
...@@ -304,7 +310,8 @@ class MultiDotKernel : public framework::OpKernel<T> { ...@@ -304,7 +310,8 @@ class MultiDotKernel : public framework::OpKernel<T> {
framework::DDim tmp_dim = framework::make_ddim({Ka, Nc}); framework::DDim tmp_dim = framework::make_ddim({Ka, Nc});
blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, scale, &tmp_out, blas.MatMul(*ins[1], mat_dim_b, *ins[2], mat_dim_c, scale, &tmp_out,
T(0)); T(0));
auto mat_dim_tmp = math::CreateMatrixDescriptor(tmp_dim, 0, false); auto mat_dim_tmp =
pten::funcs::CreateMatrixDescriptor(tmp_dim, 0, false);
blas.MatMul(*ins[0], mat_dim_a, tmp_out, mat_dim_tmp, scale, out, T(0)); blas.MatMul(*ins[0], mat_dim_a, tmp_out, mat_dim_tmp, scale, out, T(0));
} }
} else { } else {
...@@ -348,11 +355,11 @@ class MultiDotGradKernel : public framework::OpKernel<T> { ...@@ -348,11 +355,11 @@ class MultiDotGradKernel : public framework::OpKernel<T> {
const framework::Tensor& B, const framework::DDim& dout_dim, const framework::Tensor& B, const framework::DDim& dout_dim,
const framework::DDim& a_dim, const framework::DDim& b_dim, const framework::DDim& a_dim, const framework::DDim& b_dim,
framework::Tensor* dA, framework::Tensor* dB) const { framework::Tensor* dA, framework::Tensor* dB) const {
auto mat_dim_dout = math::CreateMatrixDescriptor(dout_dim, 0, false); auto mat_dim_dout = pten::funcs::CreateMatrixDescriptor(dout_dim, 0, false);
auto mat_dim_a = math::CreateMatrixDescriptor(a_dim, 0, true); auto mat_dim_a = pten::funcs::CreateMatrixDescriptor(a_dim, 0, true);
auto mat_dim_b = math::CreateMatrixDescriptor(b_dim, 0, true); auto mat_dim_b = pten::funcs::CreateMatrixDescriptor(b_dim, 0, true);
T alpha = static_cast<T>(1.0); T alpha = static_cast<T>(1.0);
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
blas.MatMul(A, mat_dim_a, dout, mat_dim_dout, alpha, dB, T(0)); blas.MatMul(A, mat_dim_a, dout, mat_dim_dout, alpha, dB, T(0));
blas.MatMul(dout, mat_dim_dout, B, mat_dim_b, alpha, dA, T(0)); blas.MatMul(dout, mat_dim_dout, B, mat_dim_b, alpha, dA, T(0));
} }
...@@ -433,7 +440,7 @@ class MultiDotGradKernel : public framework::OpKernel<T> { ...@@ -433,7 +440,7 @@ class MultiDotGradKernel : public framework::OpKernel<T> {
auto dout = *ctx.Input<framework::Tensor>(framework::GradVarName("Out")); auto dout = *ctx.Input<framework::Tensor>(framework::GradVarName("Out"));
auto dx = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X")); auto dx = ctx.MultiOutput<framework::Tensor>(framework::GradVarName("X"));
auto blas = math::GetBlas<DeviceContext, T>(ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(ctx);
auto place = ctx.GetPlace(); auto place = ctx.GetPlace();
const auto n = ins.size(); const auto n = ins.size();
...@@ -458,7 +465,7 @@ class MultiDotGradKernel : public framework::OpKernel<T> { ...@@ -458,7 +465,7 @@ class MultiDotGradKernel : public framework::OpKernel<T> {
} }
T alpha = static_cast<T>(1); T alpha = static_cast<T>(1);
auto mat_dim_dout = math::CreateMatrixDescriptor(dout_dim, 0, false); auto mat_dim_dout = pten::funcs::CreateMatrixDescriptor(dout_dim, 0, false);
if (n == 2) { if (n == 2) {
CalcGrad(ctx, dout, *ins[0], *ins[1], dout_dim, ins_dims[0], ins_dims[1], CalcGrad(ctx, dout, *ins[0], *ins[1], dout_dim, ins_dims[0], ins_dims[1],
dx[0], dx[1]); dx[0], dx[1]);
...@@ -469,9 +476,12 @@ class MultiDotGradKernel : public framework::OpKernel<T> { ...@@ -469,9 +476,12 @@ class MultiDotGradKernel : public framework::OpKernel<T> {
const auto Nc = ins_dims[2][1]; const auto Nc = ins_dims[2][1];
const uint64_t cost1 = Ma * Nb * (Ka + Nc); const uint64_t cost1 = Ma * Nb * (Ka + Nc);
const uint64_t cost2 = Ka * Nc * (Nb + Ma); const uint64_t cost2 = Ka * Nc * (Nb + Ma);
auto mat_dim_a = math::CreateMatrixDescriptor(ins_dims[0], 0, false); auto mat_dim_a =
auto mat_dim_b = math::CreateMatrixDescriptor(ins_dims[1], 0, false); pten::funcs::CreateMatrixDescriptor(ins_dims[0], 0, false);
auto mat_dim_c = math::CreateMatrixDescriptor(ins_dims[2], 0, false); auto mat_dim_b =
pten::funcs::CreateMatrixDescriptor(ins_dims[1], 0, false);
auto mat_dim_c =
pten::funcs::CreateMatrixDescriptor(ins_dims[2], 0, false);
if (cost1 < cost2) { if (cost1 < cost2) {
framework::Tensor tmp_out, tmp_dout; framework::Tensor tmp_out, tmp_dout;
tmp_out.Resize({Ma, Nb}); tmp_out.Resize({Ma, Nb});
......
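
Both the forward and gradient multi_dot kernels above choose an order for the three-matrix chain by comparing cost1 = Ma * Nb * (Ka + Nc) with cost2 = Ka * Nc * (Nb + Ma); these are exactly the scalar-multiplication counts of (A*B)*C and A*(B*C). A worked check with hypothetical shapes A: 10x100, B: 100x5, C: 5x50, where cost1 = 7500 beats cost2 = 75000:

#include <cstdint>
#include <cstdio>
int main() {
  const uint64_t Ma = 10, Ka = 100, Nb = 5, Nc = 50;
  const uint64_t cost1 = Ma * Nb * (Ka + Nc);  // (A*B)*C: Ma*Ka*Nb + Ma*Nb*Nc = 7500
  const uint64_t cost2 = Ka * Nc * (Nb + Ma);  // A*(B*C): Ka*Nb*Nc + Ma*Ka*Nc = 75000
  std::printf("%s\n", cost1 < cost2 ? "(A*B)*C" : "A*(B*C)");  // prints (A*B)*C
}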
...@@ -59,7 +59,7 @@ class MVGradKernel<platform::CUDADeviceContext, T> ...@@ -59,7 +59,7 @@ class MVGradKernel<platform::CUDADeviceContext, T>
auto &dev_ctx = auto &dev_ctx =
context.template device_context<platform::CUDADeviceContext>(); context.template device_context<platform::CUDADeviceContext>();
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
auto stream = context.cuda_device_context().stream(); auto stream = context.cuda_device_context().stream();
auto config = GetGpuLaunchConfig1D(dev_ctx, m * n); auto config = GetGpuLaunchConfig1D(dev_ctx, m * n);
......
...@@ -18,7 +18,7 @@ limitations under the License. */ ...@@ -18,7 +18,7 @@ limitations under the License. */
#include <utility> #include <utility>
#include <vector> #include <vector>
#include "paddle/fluid/framework/op_registry.h" #include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/blas.h" #include "paddle/pten/kernels/funcs/blas/blas.h"
#ifdef PADDLE_WITH_MKLDNN #ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/mkldnn_helper.h"
#endif #endif
...@@ -45,7 +45,7 @@ class MVKernel : public framework::OpKernel<T> { ...@@ -45,7 +45,7 @@ class MVKernel : public framework::OpKernel<T> {
T *out_data = out->mutable_data<T>(context.GetPlace()); T *out_data = out->mutable_data<T>(context.GetPlace());
auto &dev_ctx = context.template device_context<DeviceContext>(); auto &dev_ctx = context.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
blas.GEMV(false, dim_x[0], dim_x[1], static_cast<T>(1), x_data, vec_data, blas.GEMV(false, dim_x[0], dim_x[1], static_cast<T>(1), x_data, vec_data,
static_cast<T>(0), out_data); static_cast<T>(0), out_data);
...@@ -93,7 +93,7 @@ class MVGradKernel : public framework::OpKernel<T> { ...@@ -93,7 +93,7 @@ class MVGradKernel : public framework::OpKernel<T> {
T *dvec_data = dvec->mutable_data<T>(context.GetPlace()); T *dvec_data = dvec->mutable_data<T>(context.GetPlace());
auto &dev_ctx = context.template device_context<DeviceContext>(); auto &dev_ctx = context.template device_context<DeviceContext>();
auto blas = math::GetBlas<DeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<DeviceContext, T>(dev_ctx);
blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data, blas.GEMV(true, dim_x[0], dim_x[1], static_cast<T>(1), x_data, dout_data,
static_cast<T>(0), dvec_data); static_cast<T>(0), dvec_data);
......
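
In the mv_op hunks above, the forward pass calls GEMV with trans = false to get out = X * vec, while the gradient calls it with trans = true so that dvec = X^T * dout. A hedged reference for the GEMV(trans, m, n, alpha, a, x, beta, y) argument order as used here, assuming row-major storage for a (m x n); this is a reading aid, not the library implementation:

// Naive GEMV with the same argument order as the blas.GEMV calls above.
void gemv_ref(bool trans, int m, int n, float alpha, const float* a,
              const float* x, float beta, float* y) {
  if (!trans) {                       // y (m elems) = alpha * A * x + beta * y
    for (int i = 0; i < m; ++i) {
      float acc = 0;
      for (int j = 0; j < n; ++j) acc += a[i * n + j] * x[j];
      y[i] = alpha * acc + beta * y[i];
    }
  } else {                            // y (n elems) = alpha * A^T * x + beta * y
    for (int j = 0; j < n; ++j) {
      float acc = 0;
      for (int i = 0; i < m; ++i) acc += a[i * n + j] * x[i];
      y[j] = alpha * acc + beta * y[j];
    }
  }
}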
...@@ -14,11 +14,11 @@ limitations under the License. */ ...@@ -14,11 +14,11 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/rank_attention.cu.h" #include "paddle/fluid/operators/rank_attention.cu.h"
#include "paddle/fluid/operators/rank_attention_op.h" #include "paddle/fluid/operators/rank_attention_op.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_primitives.h" #include "paddle/fluid/platform/device/gpu/gpu_primitives.h"
#include "paddle/pten/kernels/funcs/blas/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -114,7 +114,7 @@ class RankAttentionCUDAKernel : public framework::OpKernel<T> { ...@@ -114,7 +114,7 @@ class RankAttentionCUDAKernel : public framework::OpKernel<T> {
int64_t strideA = block_matrix_row; int64_t strideA = block_matrix_row;
int64_t strideB = block_matrix_row * para_col; int64_t strideB = block_matrix_row * para_col;
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
blas.BatchedGEMM(transA, transB, 1, para_col, block_matrix_row, alpha, blas.BatchedGEMM(transA, transB, 1, para_col, block_matrix_row, alpha,
input_help_data, param_help_data, beta, out_data, ins_num, input_help_data, param_help_data, beta, out_data, ins_num,
strideA, strideB); strideA, strideB);
...@@ -170,7 +170,7 @@ class RankAttentionGradOpCUDAKernel : public framework::OpKernel<T> { ...@@ -170,7 +170,7 @@ class RankAttentionGradOpCUDAKernel : public framework::OpKernel<T> {
const T *ins_rank_data = ins_rank->data<T>(); const T *ins_rank_data = ins_rank->data<T>();
T *param_grad_data = param_grad.data<T>(); T *param_grad_data = param_grad.data<T>();
auto blas = math::GetBlas<platform::CUDADeviceContext, T>(dev_ctx); auto blas = pten::funcs::GetBlas<platform::CUDADeviceContext, T>(dev_ctx);
T alpha = 1; T alpha = 1;
T beta = 0; T beta = 0;
......
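
The rank_attention kernels above issue BatchedGEMM with M = 1, N = para_col, K = block_matrix_row and per-batch strides strideA/strideB, i.e. ins_num independent row-vector-times-matrix products. A hedged reference loop for that call shape; the row-major layout, absence of transposes, and per-batch output stride of N are assumptions for illustration, not guarantees from this diff:

#include <cstdint>
// for each batch b: C_b (1 x N) = alpha * A_b (1 x K) * B_b (K x N) + beta * C_b
template <typename T>
void batched_gemm_ref(int batch, int N, int K, T alpha, const T* A, const T* B,
                      T beta, T* C, int64_t strideA, int64_t strideB) {
  for (int b = 0; b < batch; ++b) {
    const T* a = A + b * strideA;
    const T* bmat = B + b * strideB;
    T* c = C + b * N;                 // assumed output stride: M * N = N
    for (int n = 0; n < N; ++n) {
      T acc = 0;
      for (int k = 0; k < K; ++k) acc += a[k] * bmat[k * N + n];
      c[n] = alpha * acc + beta * c[n];
    }
  }
}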
(Eight more file diffs in this commit are collapsed and not shown here.)