Commit c888e016
Authored by Yu Yang on April 28, 2018
Parent: c93a624b

Refactor GEMM in blas
Showing 10 changed files with 357 additions and 335 deletions (+357 −335).
paddle/fluid/operators/bilinear_tensor_product_op.h   +11  −12
paddle/fluid/operators/gru_unit_op.h                  +23  −29
paddle/fluid/operators/math/blas_impl.cu.h            +145 −0
paddle/fluid/operators/math/blas_impl.h               +68  −0
paddle/fluid/operators/math/gru_compute.cc            +23  −27
paddle/fluid/operators/math/gru_compute.cu            +25  −26
paddle/fluid/operators/math/math_function.cc          +8   −74
paddle/fluid/operators/math/math_function.cu          +6   −157
paddle/fluid/operators/math/math_function.h           +45  −8
paddle/fluid/operators/math/matmul.h                  +3   −2
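Editor's note: the pattern repeated throughout the files below replaces the free function math::gemm&lt;DeviceContext, T&gt;(dev_ctx, ...) with a call on a Blas object obtained from math::GetBlas. A rough before/after sketch of the call-site change (the identifiers M, N, K, a_ptr, b_ptr, c_ptr are illustrative, not taken from any one file):

    // Before: free-function template, device context passed on every call.
    math::gemm<DeviceContext, T>(dev_ctx, CblasNoTrans, CblasNoTrans, M, N, K,
                                 static_cast<T>(1), a_ptr, b_ptr,
                                 static_cast<T>(0), c_ptr);

    // After: build a Blas wrapper once, then issue GEMM calls on it.
    auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
    blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K, static_cast<T>(1),
              a_ptr, b_ptr, static_cast<T>(0), c_ptr);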
paddle/fluid/operators/bilinear_tensor_product_op.h

@@ -61,9 +61,9 @@ class BilinearTensorProductKernel : public framework::OpKernel<T> {
       auto output_col_vec = output_mat.chip(i, 1);
       Tensor weight_mat =
           weight->Slice(i, i + 1).Resize(framework::make_ddim({x_dim, y_dim}));
-      math::gemm<DeviceContext, T>(dev_ctx, CblasNoTrans, CblasNoTrans,
-                                   batch_size, y_dim, x_dim, 1, x->data<T>(),
-                                   weight_mat.data<T>(), 0, left_mul.data<T>());
+      math::GetBlas<DeviceContext, T>(dev_ctx).GEMM(
+          CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1,
+          x->data<T>(), weight_mat.data<T>(), 0, left_mul.data<T>());
       output_col_vec.device(place) =
           (left_mul_mat * y_mat).sum(Eigen::DSizes<int, 1>(1));
     }

@@ -125,6 +125,8 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
       set_zero(dev_ctx, d_y, static_cast<T>(0));
     }

+    auto blas = math::GetBlas<DeviceContext, T>(ctx);
+
     // Caculate the Output(X@Grad) and Output(Y@Grad).
     if (d_x || d_y) {
       Eigen::DSizes<int, 2> bcast_for_x(1, y_dim);

@@ -138,18 +140,16 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
           output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
               .broadcast(bcast_for_x) *
           y_mat;
-      math::gemm<DeviceContext, T>(dev_ctx, CblasNoTrans, CblasTrans,
-                                   batch_size, x_dim, y_dim, 1,
-                                   y_scale.data<T>(), weight_i.data<T>(), 1,
-                                   d_x->data<T>());
+      blas.GEMM(CblasNoTrans, CblasTrans, batch_size, x_dim, y_dim, 1,
+                y_scale.data<T>(), weight_i.data<T>(), 1, d_x->data<T>());
     }

     if (d_y) {
       x_scale_mat.device(place) =
           output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
               .broadcast(bcast_for_y) *
           x_mat;
-      math::gemm<DeviceContext, T>(dev_ctx, CblasNoTrans, CblasNoTrans,
-                                   batch_size, y_dim, x_dim, 1,
-                                   x_scale.data<T>(), weight_i.data<T>(), 1,
-                                   d_y->data<T>());
+      blas.GEMM(CblasNoTrans, CblasNoTrans, batch_size, y_dim, x_dim, 1,
+                x_scale.data<T>(), weight_i.data<T>(), 1, d_y->data<T>());
     }
   }
 }

@@ -166,9 +166,8 @@ class BilinearTensorProductGradKernel : public framework::OpKernel<T> {
           output_vec.reshape(Eigen::DSizes<int, 2>(batch_size, 1))
               .broadcast(bcast_for_weight) *
           x_mat;
-      math::gemm<DeviceContext, T>(dev_ctx, CblasTrans, CblasNoTrans, x_dim,
-                                   y_dim, batch_size, 1, x_scale.data<T>(),
-                                   y->data<T>(), 0, d_weight_i.data<T>());
+      blas.GEMM(CblasTrans, CblasNoTrans, x_dim, y_dim, batch_size, 1,
+                x_scale.data<T>(), y->data<T>(), 0, d_weight_i.data<T>());
     }
   }
paddle/fluid/operators/gru_unit_op.h

@@ -87,10 +87,10 @@ class GRUUnitKernel : public framework::OpKernel<T> {
     const T* weight_data = weight->data<T>();
     T* gate_data = gate->data<T>();
     T* reset_hidden_prev_data = reset_hidden_prev->data<T>();
-    math::gemm<DeviceContext, T>(
-        context.template device_context<DeviceContext>(), false, false,
-        batch_size, 2 * frame_size, frame_size, 1, hidden_prev_data,
-        frame_size, weight_data, frame_size * 2, 1, gate_data, frame_size * 3);
+    auto blas = math::GetBlas<DeviceContext, T>(context);
+    blas.GEMM(false, false, batch_size, 2 * frame_size, frame_size, 1,
+              hidden_prev_data, frame_size, weight_data, frame_size * 2, 1,
+              gate_data, frame_size * 3);

     // calculate activited gate
     Eigen::array<int, 2> extents({{batch_size, frame_size}});

@@ -103,11 +103,10 @@ class GRUUnitKernel : public framework::OpKernel<T> {
                g.slice(r_offsets, extents), g.slice(r_offsets, extents));
     auto r = g.slice(r_offsets, extents);  // reset gate
     r_h_p.device(place) = r * h_p;         // reset previous hidden state
-    math::gemm<DeviceContext, T>(
-        context.template device_context<DeviceContext>(), false, false,
-        batch_size, frame_size, frame_size, 1, reset_hidden_prev_data,
-        frame_size, weight_data + frame_size * frame_size * 2, frame_size, 1,
-        gate_data + frame_size * 2, frame_size * 3);
+    blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
+              reset_hidden_prev_data, frame_size,
+              weight_data + frame_size * frame_size * 2, frame_size, 1,
+              gate_data + frame_size * 2, frame_size * 3);

     Eigen::array<int, 2> c_offsets({{0, frame_size * 2}});
     ActCompute(context.Attr<int>("activation"), place, ...

@@ -188,11 +187,11 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     ActGradCompute(context.Attr<int>("activation"), place, c, c,
                    d_g.slice(c_offsets, extents), d_h * u);
     // backward for reset_hidden_prev
-    math::gemm<DeviceContext, T>(
-        context.template device_context<DeviceContext>(), false, true,
-        batch_size, frame_size, frame_size, 1, gate_grad_data + frame_size * 2,
-        frame_size * 3, weight_data + frame_size * frame_size * 2, frame_size,
-        0, reset_hidden_prev_grad_data, frame_size);
+    auto blas = math::GetBlas<DeviceContext, T>(context);
+    blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
+              gate_grad_data + frame_size * 2, frame_size * 3,
+              weight_data + frame_size * frame_size * 2, frame_size, 0,
+              reset_hidden_prev_grad_data, frame_size);
     // backward for unactivated reset gate
     ActGradCompute(context.Attr<int>("gate_activation"), place, r, r,
                    d_g.slice(r_offsets, extents), d_r_h_p * h_p);

@@ -200,18 +199,15 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
     if (weight_grad) {
       T* weight_grad_data = weight_grad->mutable_data<T>(context.GetPlace());
       // backward for state_weight
-      math::gemm<DeviceContext, T>(
-          context.template device_context<DeviceContext>(), true, false,
-          frame_size, frame_size, batch_size, 1, reset_hidden_prev_data,
-          frame_size, gate_grad_data + frame_size * 2, frame_size * 3, 0,
-          weight_grad_data + frame_size * frame_size * 2, frame_size);
+      blas.GEMM(true, false, frame_size, frame_size, batch_size, 1,
+                reset_hidden_prev_data, frame_size,
+                gate_grad_data + frame_size * 2, frame_size * 3, 0,
+                weight_grad_data + frame_size * frame_size * 2, frame_size);

       // backward for update_gate_weight and reset_gate_weight
-      math::gemm<DeviceContext, T>(
-          context.template device_context<DeviceContext>(), true, false,
-          frame_size, frame_size * 2, batch_size, 1, hidden_prev_data,
-          frame_size, gate_grad_data, frame_size * 3, 0, weight_grad_data,
-          frame_size * 2);
+      blas.GEMM(true, false, frame_size, frame_size * 2, batch_size, 1,
+                hidden_prev_data, frame_size, gate_grad_data, frame_size * 3,
+                0, weight_grad_data, frame_size * 2);
     }
     // backward for hidden_prev
     if (hidden_prev_grad) {

@@ -219,11 +215,9 @@ class GRUUnitGradKernel : public framework::OpKernel<T> {
           hidden_prev_grad->mutable_data<T>(context.GetPlace());
       auto d_h_p = EigenMatrix<T>::From(*hidden_prev_grad);
       d_h_p.device(place) = d_r_h_p * r + d_h * (u.constant(T(1)) - u);
-      math::gemm<DeviceContext, T>(
-          context.template device_context<DeviceContext>(), false, true,
-          batch_size, frame_size, frame_size * 2, 1, gate_grad_data,
-          frame_size * 3, weight_data, frame_size * 2, 1,
-          hidden_prev_grad_data, frame_size);
+      blas.GEMM(false, true, batch_size, frame_size, frame_size * 2, 1,
+                gate_grad_data, frame_size * 3, weight_data, frame_size * 2, 1,
+                hidden_prev_grad_data, frame_size);
     }
     // backward for input
     if (input_grad) {
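Editor's note: the GRU kernels above use the bool-transpose GEMM overload, which takes explicit leading dimensions, because the gate buffers and weight slices are sub-blocks of wider row-major matrices (hence leading dimensions such as frame_size * 2 and frame_size * 3 rather than the block's own column count). A minimal standalone sketch of that addressing, with made-up sizes, not taken from the commit:

    #include <vector>

    int main() {
      // A K x N block embedded in a wider row-major buffer whose rows are
      // ld floats long (ld > N). Element (i, j) of the block lives at
      // block[i * ld + j], so BLAS needs ld, not N, to step between rows.
      const int K = 4, N = 3, ld = 8;          // hypothetical sizes
      std::vector<float> buffer(K * ld, 0.f);  // wide backing buffer
      float* block = buffer.data() + 2;        // block starts at column 2
      block[1 * ld + 0] = 1.f;                 // row 1, col 0 of the block
      return 0;
    }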
paddle/fluid/operators/math/blas_impl.cu.h (new file, mode 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/dynload/cublas.h"

namespace paddle {
namespace operators {
namespace math {

template <typename T>
struct CUBlas;

template <>
struct CUBlas<float> {
  template <typename... ARGS>
  static void GEMM(ARGS... args) {
    PADDLE_ENFORCE(platform::dynload::cublasSgemm(args...));
  }
};

template <>
struct CUBlas<double> {
  template <typename... ARGS>
  static void GEMM(ARGS... args) {
    PADDLE_ENFORCE(platform::dynload::cublasDgemm(args...));
  }
};

template <>
struct CUBlas<platform::float16> {
  template <typename... ARGS>
  static void GEMM(ARGS... args) {
    PADDLE_ENFORCE(platform::dynload::cublasHgemm(args...));
  }
};

template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::GEMM(
    const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M,
    const int N, const int K, const T alpha, const T* A, const T* B,
    const T beta, T* C) const {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  cublasOperation_t cuTransA =
      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  CUBlas<T>::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K,
                  &alpha, B, ldb, A, lda, &beta, C, N);
}

template <>
template <>
inline void Blas<platform::CUDADeviceContext>::GEMM(
    const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M,
    const int N, const int K, const platform::float16 alpha,
    const platform::float16* A, const platform::float16* B,
    const platform::float16 beta, platform::float16* C) const {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  cublasOperation_t cuTransA =
      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB =
      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;

  // TODO(kexinzhao): add processing code for compute capability < 53 case
  PADDLE_ENFORCE_GE(context_.GetComputeCapability(), 53,
                    "cublas fp16 gemm requires GPU compute capability >= 53");

#if CUDA_VERSION >= 8000
  float h_alpha = static_cast<float>(alpha);
  float h_beta = static_cast<float>(beta);

  cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
#if CUDA_VERSION >= 9000
  if (context_.GetComputeCapability() >= 70) {
    PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(
        context_.cublas_handle(), CUBLAS_TENSOR_OP_MATH));
    algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
  } else {
    PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(
        context_.cublas_handle(), CUBLAS_DEFAULT_MATH));
  }
#endif  // CUDA_VERSION >= 9000

  // cublasHgemm does true FP16 computation which is slow for non-Volta
  // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
  // input/output in fp16, computation in fp32, which can also be accelerated
  // using tensor cores in volta GPUs.
  PADDLE_ENFORCE(platform::dynload::cublasGemmEx(
      context_.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B,
      CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N,
      CUDA_R_32F, algo));
#else
  // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
  const half h_alpha = static_cast<const half>(alpha);
  const half h_beta = static_cast<const half>(beta);
  const half* h_A = reinterpret_cast<const half*>(A);
  const half* h_B = reinterpret_cast<const half*>(B);
  half* h_C = reinterpret_cast<half*>(C);

  CUBlas<platform::float16>(context_.cublas_handle(), cuTransB, cuTransA, N,
                            M, K, &h_alpha, h_B, ldb, h_A, lda, &h_beta, h_C,
                            N);
#endif  // CUDA_VERSION >= 8000
}

template <>
template <typename T>
void Blas<platform::CUDADeviceContext>::GEMM(
    const bool transA, const bool transB, const int M, const int N,
    const int K, const T alpha, const T* A, const int lda, const T* B,
    const int ldb, const T beta, T* C, const int ldc) const {
  // Note that cublas follows fortran order, so the order is different from
  // the cblas convention.
  cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
  cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
  CUBlas<T>::GEMM(context_.cublas_handle(), cuTransB, cuTransA, N, M, K,
                  &alpha, B, ldb, A, lda, &beta, C, ldc);
}

}  // namespace math
}  // namespace operators
}  // namespace paddle
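Editor's note: the repeated comment about cuBLAS following Fortran order is worth unpacking once. cuBLAS expects column-major matrices, while Paddle stores row-major; the wrapper therefore passes B before A and swaps the M/N dimensions. A one-line identity (my explanation, not part of the commit) shows why this yields the right result:

    % Row-major C = A * B can be computed by a column-major BLAS because a
    % row-major matrix is the transpose of the same bytes read column-major:
    C = A B \;\Longleftrightarrow\; C^{\top} = B^{\top} A^{\top}
    % so the wrapper calls cuBLAS with (B, A) and dimensions (N, M, K).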
paddle/fluid/operators/math/blas_impl.h (new file, mode 100644)

// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "paddle/fluid/operators/math/math_function.h"

namespace paddle {
namespace operators {
namespace math {

template <typename T>
struct CBlas;

template <>
struct CBlas<float> {
  static constexpr auto GEMM = cblas_sgemm;
};

template <>
struct CBlas<double> {
  static constexpr auto GEMM = cblas_dgemm;
};

template <>
struct CBlas<platform::float16> {
  void GEMM(...) { PADDLE_THROW("float16 GEMM not supported on CPU"); }
};

template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::GEMM(
    const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB, const int M,
    const int N, const int K, const T alpha, const T* A, const T* B,
    const T beta, T* C) const {
  int lda = (transA == CblasNoTrans) ? K : M;
  int ldb = (transB == CblasNoTrans) ? N : K;
  int ldc = N;
  CBlas<T>::GEMM(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B,
                 ldb, beta, C, ldc);
}

template <>
template <typename T>
void Blas<platform::CPUDeviceContext>::GEMM(
    const bool transA, const bool transB, const int M, const int N,
    const int K, const T alpha, const T* A, const int lda, const T* B,
    const int ldb, const T beta, T* C, const int ldc) const {
  CBlas<T>::GEMM(CblasRowMajor,
                 transA == false ? CblasNoTrans : CblasTrans,
                 transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha,
                 A, lda, B, ldb, beta, C, ldc);
}

}  // namespace math
}  // namespace operators
}  // namespace paddle
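Editor's note: the CPU side dispatches through the small CBlas&lt;T&gt; traits struct, which binds GEMM to the matching cblas_* symbol at compile time, so Blas&lt;CPUDeviceContext&gt;::GEMM&lt;T&gt; needs no runtime branching on the element type. A self-contained sketch of that idiom (illustrative only; the printf-based functions are hypothetical, not from the commit):

    #include <cstdio>

    static void impl_float(float x) { std::printf("float: %f\n", x); }
    static void impl_double(double x) { std::printf("double: %f\n", x); }

    // Traits struct: each specialization holds a constexpr pointer to the
    // type-appropriate implementation, mirroring CBlas<float>::GEMM etc.
    template <typename T> struct Dispatch;
    template <> struct Dispatch<float> { static constexpr auto Run = impl_float; };
    template <> struct Dispatch<double> { static constexpr auto Run = impl_double; };

    int main() {
      Dispatch<float>::Run(1.0f);   // resolves to impl_float at compile time
      Dispatch<double>::Run(2.0);   // resolves to impl_double
      return 0;
    }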
paddle/fluid/operators/math/gru_compute.cc

@@ -25,21 +25,21 @@ struct GRUUnitFunctor<platform::CPUDeviceContext, T> {
                       const detail::ActivationType active_node,
                       const detail::ActivationType active_gate) {
 #ifndef __NVCC__
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
     if (value.prev_out_value) {
-      math::gemm<platform::CPUDeviceContext, T>(
-          context, false, false, batch_size, frame_size * 2, frame_size, 1,
-          value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
-          1, value.gate_value, frame_size * 3);
+      blas.GEMM(false, false, batch_size, frame_size * 2, frame_size, 1,
+                value.prev_out_value, frame_size, value.gate_weight,
+                frame_size * 2, 1, value.gate_value, frame_size * 3);
     }

     detail::forward_reset_output(detail::forward::gru_resetOutput<T>(), value,
                                  frame_size, batch_size, active_gate);

     if (value.prev_out_value) {
-      math::gemm<platform::CPUDeviceContext, T>(
-          context, false, false, batch_size, frame_size, frame_size, 1,
-          value.reset_output_value, frame_size, value.state_weight,
-          frame_size, 1, value.gate_value + frame_size * 2, frame_size * 3);
+      blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
+                value.reset_output_value, frame_size, value.state_weight,
+                frame_size, 1, value.gate_value + frame_size * 2,
+                frame_size * 3);
     }

     detail::forward_final_output(detail::forward::gru_finalOutput<T>(), value, ...

@@ -58,36 +58,32 @@ struct GRUUnitGradFunctor<platform::CPUDeviceContext, T> {
 #ifndef __NVCC__
     detail::backward_state_grad(detail::backward::gru_stateGrad<T>(), value,
                                 grad, frame_size, batch_size, active_node);
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
     if (value.prev_out_value && grad.prev_out_grad) {
-      math::gemm<platform::CPUDeviceContext, T>(
-          context, false, true, batch_size, frame_size, frame_size, 1,
-          grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
-          frame_size, 0, grad.reset_output_grad, frame_size);
+      blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
+                grad.gate_grad + frame_size * 2, frame_size * 3,
+                value.state_weight, frame_size, 0, grad.reset_output_grad,
+                frame_size);

       if (grad.state_weight_grad) {
-        math::gemm<platform::CPUDeviceContext, T>(
-            context, true, false, frame_size, frame_size, batch_size, 1,
-            value.reset_output_value, frame_size,
-            grad.gate_grad + frame_size * 2, frame_size * 3, 1,
-            grad.state_weight_grad, frame_size);
+        blas.GEMM(true, false, frame_size, frame_size, batch_size, 1,
+                  value.reset_output_value, frame_size,
+                  grad.gate_grad + frame_size * 2, frame_size * 3, 1,
+                  grad.state_weight_grad, frame_size);
       }
     }

     detail::backward_reset_grad(detail::backward::gru_resetGrad<T>(), value,
                                 grad, frame_size, batch_size, active_gate);
     if (grad.prev_out_grad && value.prev_out_value) {
-      math::gemm<platform::CPUDeviceContext, T>(
-          context, false, true, batch_size, frame_size, frame_size * 2, 1,
-          grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2,
-          1, grad.prev_out_grad, frame_size);
+      blas.GEMM(false, true, batch_size, frame_size, frame_size * 2, 1,
+                grad.gate_grad, frame_size * 3, value.gate_weight,
+                frame_size * 2, 1, grad.prev_out_grad, frame_size);

       if (grad.gate_weight_grad) {
-        math::gemm<platform::CPUDeviceContext, T>(
-            context, true, false, frame_size, frame_size * 2, batch_size, 1,
-            value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3,
-            1, grad.gate_weight_grad, frame_size * 2);
+        blas.GEMM(true, false, frame_size, frame_size * 2, batch_size, 1,
+                  value.prev_out_value, frame_size, grad.gate_grad,
+                  frame_size * 3, 1, grad.gate_weight_grad, frame_size * 2);
       }
     }
 #endif
paddle/fluid/operators/math/gru_compute.cu

@@ -9,6 +9,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <paddle/fluid/platform/device_context.h>
 #include "paddle/fluid/operators/math/detail/gru_gpu_kernel.h"
 #include "paddle/fluid/operators/math/detail/gru_kernel.h"
 #include "paddle/fluid/operators/math/gru_compute.h"

@@ -36,12 +37,11 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
       threads = dim3(32, 32);
       grid = dim3((frame_size + 32 - 1) / 32, (batch_size + 32 - 1) / 32);
     }

+    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
     if (value.prev_out_value) {
-      math::gemm<platform::CUDADeviceContext, T>(
-          context, false, false, batch_size, frame_size * 2, frame_size, 1,
-          value.prev_out_value, frame_size, value.gate_weight, frame_size * 2,
-          1, value.gate_value, frame_size * 3);
+      blas.GEMM(false, false, batch_size, frame_size * 2, frame_size, 1,
+                value.prev_out_value, frame_size, value.gate_weight,
+                frame_size * 2, 1, value.gate_value, frame_size * 3);
     }

     if (batch_size == 1) {

@@ -61,10 +61,10 @@ struct GRUUnitFunctor<platform::CUDADeviceContext, T> {
     }

     if (value.prev_out_value) {
-      math::gemm<platform::CUDADeviceContext, T>(
-          context, false, false, batch_size, frame_size, frame_size, 1,
-          value.reset_output_value, frame_size, value.state_weight,
-          frame_size, 1, value.gate_value + frame_size * 2, frame_size * 3);
+      blas.GEMM(false, false, batch_size, frame_size, frame_size, 1,
+                value.reset_output_value, frame_size, value.state_weight,
+                frame_size, 1, value.gate_value + frame_size * 2,
+                frame_size * 3);
     }

     if (batch_size == 1) {

@@ -121,18 +121,19 @@ struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
           grad.output_grad, frame_size, batch_size, active_node);
     }

+    auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
     if (value.prev_out_value && grad.prev_out_grad) {
-      math::gemm<platform::CUDADeviceContext, T>(
-          context, false, true, batch_size, frame_size, frame_size, 1,
-          grad.gate_grad + frame_size * 2, frame_size * 3, value.state_weight,
-          frame_size, 0, grad.reset_output_grad, frame_size);
+      blas.GEMM(false, true, batch_size, frame_size, frame_size, 1,
+                grad.gate_grad + frame_size * 2, frame_size * 3,
+                value.state_weight, frame_size, 0, grad.reset_output_grad,
+                frame_size);

       if (grad.state_weight_grad) {
-        math::gemm<platform::CUDADeviceContext, T>(
-            context, true, false, frame_size, frame_size, batch_size, 1,
-            value.reset_output_value, frame_size,
-            grad.gate_grad + frame_size * 2, frame_size * 3, 1,
-            grad.state_weight_grad, frame_size);
+        blas.GEMM(true, false, frame_size, frame_size, batch_size, 1,
+                  value.reset_output_value, frame_size,
+                  grad.gate_grad + frame_size * 2, frame_size * 3, 1,
+                  grad.state_weight_grad, frame_size);
       }
     }

@@ -153,16 +154,14 @@ struct GRUUnitGradFunctor<platform::CUDADeviceContext, T> {
     }

     if (grad.prev_out_grad && value.prev_out_value) {
-      math::gemm<platform::CUDADeviceContext, T>(
-          context, false, true, batch_size, frame_size, frame_size * 2, 1,
-          grad.gate_grad, frame_size * 3, value.gate_weight, frame_size * 2,
-          1, grad.prev_out_grad, frame_size);
+      blas.GEMM(false, true, batch_size, frame_size, frame_size * 2, 1,
+                grad.gate_grad, frame_size * 3, value.gate_weight,
+                frame_size * 2, 1, grad.prev_out_grad, frame_size);

       if (grad.gate_weight_grad) {
-        math::gemm<platform::CUDADeviceContext, T>(
-            context, true, false, frame_size, frame_size * 2, batch_size, 1,
-            value.prev_out_value, frame_size, grad.gate_grad, frame_size * 3,
-            1, grad.gate_weight_grad, frame_size * 2);
+        blas.GEMM(true, false, frame_size, frame_size * 2, batch_size, 1,
+                  value.prev_out_value, frame_size, grad.gate_grad,
+                  frame_size * 3, 1, grad.gate_weight_grad, frame_size * 2);
       }
     }
   }
paddle/fluid/operators/math/math_function.cc

@@ -24,72 +24,6 @@ namespace math {
 using float16 = paddle::platform::float16;

-template <>
-void gemm<platform::CPUDeviceContext, float16>(
-    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
-    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const float16 alpha, const float16* A, const float16* B,
-    const float16 beta, float16* C) {
-  PADDLE_THROW("float16 GEMM not supported on CPU");
-}
-
-template <>
-void gemm<platform::CPUDeviceContext, float>(
-    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
-    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const float alpha, const float* A, const float* B, const float beta,
-    float* C) {
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  cblas_sgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
-              beta, C, ldc);
-}
-
-template <>
-void gemm<platform::CPUDeviceContext, double>(
-    const platform::CPUDeviceContext& context, const CBLAS_TRANSPOSE transA,
-    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const double alpha, const double* A, const double* B, const double beta,
-    double* C) {
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  int ldc = N;
-  cblas_dgemm(CblasRowMajor, transA, transB, M, N, K, alpha, A, lda, B, ldb,
-              beta, C, ldc);
-}
-
-template <>
-void gemm<platform::CPUDeviceContext, float16>(
-    const platform::CPUDeviceContext& context, const bool transA,
-    const bool transB, const int M, const int N, const int K,
-    const float16 alpha, const float16* A, const int lda, const float16* B,
-    const int ldb, const float16 beta, float16* C, const int ldc) {
-  PADDLE_THROW("float16 GEMM not supported on CPU");
-}
-
-template <>
-void gemm<platform::CPUDeviceContext, float>(
-    const platform::CPUDeviceContext& context, const bool transA,
-    const bool transB, const int M, const int N, const int K,
-    const float alpha, const float* A, const int lda, const float* B,
-    const int ldb, const float beta, float* C, const int ldc) {
-  cblas_sgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
-              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
-              lda, B, ldb, beta, C, ldc);
-}
-
-template <>
-void gemm<platform::CPUDeviceContext, double>(
-    const platform::CPUDeviceContext& context, const bool transA,
-    const bool transB, const int M, const int N, const int K,
-    const double alpha, const double* A, const int lda, const double* B,
-    const int ldb, const double beta, double* C, const int ldc) {
-  cblas_dgemm(CblasRowMajor, transA == false ? CblasNoTrans : CblasTrans,
-              transB == false ? CblasNoTrans : CblasTrans, M, N, K, alpha, A,
-              lda, B, ldb, beta, C, ldc);
-}
-
 template <>
 void matmul<platform::CPUDeviceContext, float16>(
     const platform::CPUDeviceContext& context, ...

@@ -123,8 +57,8 @@ void matmul<platform::CPUDeviceContext, float>(
   CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
   CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;

-  gemm<platform::CPUDeviceContext, float>(
-      context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
+  Blas<platform::CPUDeviceContext>(context).GEMM(
+      transA, transB, M, N, K, alpha, matrix_a.data<float>(),
       matrix_b.data<float>(), beta, matrix_out->data<float>());
 }

@@ -152,8 +86,8 @@ void matmul<platform::CPUDeviceContext, double>(
   CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
   CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;

-  gemm<platform::CPUDeviceContext, double>(
-      context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
+  Blas<platform::CPUDeviceContext>(context).GEMM(
+      transA, transB, M, N, K, alpha, matrix_a.data<double>(),
       matrix_b.data<double>(), beta, matrix_out->data<double>());
 }

@@ -230,8 +164,8 @@ void batched_gemm<platform::CPUDeviceContext, float>(
     const float* Ak = &A[k * strideA];
     const float* Bk = &B[k * strideB];
     float* Ck = &C[k * M * N];
-    gemm<platform::CPUDeviceContext, float>(context, transA, transB, M, N, K,
-                                            alpha, Ak, Bk, beta, Ck);
+    Blas<platform::CPUDeviceContext>(context).GEMM(transA, transB, M, N, K,
+                                                   alpha, Ak, Bk, beta, Ck);
   }
 }

@@ -246,8 +180,8 @@ void batched_gemm<platform::CPUDeviceContext, double>(
     const double* Ak = &A[k * strideA];
     const double* Bk = &B[k * strideB];
     double* Ck = &C[k * M * N];
-    gemm<platform::CPUDeviceContext, double>(context, transA, transB, M, N, K,
-                                             alpha, Ak, Bk, beta, Ck);
+    Blas<platform::CPUDeviceContext>(context).GEMM(transA, transB, M, N, K,
+                                                   alpha, Ak, Bk, beta, Ck);
   }
 }
 #endif
paddle/fluid/operators/math/math_function.cu

@@ -25,157 +25,6 @@ namespace math {
 using float16 = paddle::platform::float16;

-template <>
-void gemm<platform::CUDADeviceContext, float16>(
-    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
-    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const float16 alpha, const float16* A, const float16* B,
-    const float16 beta, float16* C) {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  cublasOperation_t cuTransA =
-      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB =
-      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-
-  // TODO(kexinzhao): add processing code for compute capability < 53 case
-  PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
-                    "cublas fp16 gemm requires GPU compute capability >= 53");
-
-#if CUDA_VERSION >= 8000
-  float h_alpha = static_cast<float>(alpha);
-  float h_beta = static_cast<float>(beta);
-
-  cublasGemmAlgo_t algo = CUBLAS_GEMM_DFALT;
-#if CUDA_VERSION >= 9000
-  if (context.GetComputeCapability() >= 70) {
-    PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(
-        context.cublas_handle(), CUBLAS_TENSOR_OP_MATH));
-    algo = CUBLAS_GEMM_DFALT_TENSOR_OP;
-  } else {
-    PADDLE_ENFORCE(platform::dynload::cublasSetMathMode(
-        context.cublas_handle(), CUBLAS_DEFAULT_MATH));
-  }
-#endif  // CUDA_VERSION >= 9000
-
-  // cublasHgemm does true FP16 computation which is slow for non-Volta
-  // GPUs. So use cublasGemmEx instead which does pesudo FP16 computation:
-  // input/output in fp16, computation in fp32, which can also be accelerated
-  // using tensor cores in volta GPUs.
-  PADDLE_ENFORCE(platform::dynload::cublasGemmEx(
-      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, B,
-      CUDA_R_16F, ldb, A, CUDA_R_16F, lda, &h_beta, C, CUDA_R_16F, N,
-      CUDA_R_32F, algo));
-#else
-  // CUDA 7.5 does not support cublasGemmEx, hence we fall back to use hgemm
-  const half h_alpha = static_cast<const half>(alpha);
-  const half h_beta = static_cast<const half>(beta);
-  const half* h_A = reinterpret_cast<const half*>(A);
-  const half* h_B = reinterpret_cast<const half*>(B);
-  half* h_C = reinterpret_cast<half*>(C);
-
-  PADDLE_ENFORCE(platform::dynload::cublasHgemm(
-      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B,
-      ldb, h_A, lda, &h_beta, h_C, N));
-#endif  // CUDA_VERSION >= 8000
-}
-
-template <>
-void gemm<platform::CUDADeviceContext, float>(
-    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
-    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const float alpha, const float* A, const float* B, const float beta,
-    float* C) {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  cublasOperation_t cuTransA =
-      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB =
-      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-
-  PADDLE_ENFORCE(platform::dynload::cublasSgemm(
-      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
-      A, lda, &beta, C, N));
-}
-
-template <>
-void gemm<platform::CUDADeviceContext, double>(
-    const platform::CUDADeviceContext& context, const CBLAS_TRANSPOSE transA,
-    const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-    const double alpha, const double* A, const double* B, const double beta,
-    double* C) {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  int lda = (transA == CblasNoTrans) ? K : M;
-  int ldb = (transB == CblasNoTrans) ? N : K;
-  cublasOperation_t cuTransA =
-      (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB =
-      (transB == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
-
-  PADDLE_ENFORCE(platform::dynload::cublasDgemm(
-      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
-      A, lda, &beta, C, N));
-}
-
-template <>
-void gemm<platform::CUDADeviceContext, float16>(
-    const platform::CUDADeviceContext& context, const bool transA,
-    const bool transB, const int M, const int N, const int K,
-    const float16 alpha, const float16* A, const int lda, const float16* B,
-    const int ldb, const float16 beta, float16* C, const int ldc) {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
-
-  const half h_alpha = static_cast<const half>(alpha);
-  const half h_beta = static_cast<const half>(beta);
-  const half* h_A = reinterpret_cast<const half*>(A);
-  const half* h_B = reinterpret_cast<const half*>(B);
-  half* h_C = reinterpret_cast<half*>(C);
-
-  // TODO(kexinzhao): add processing code for compute capability < 53 case
-  PADDLE_ENFORCE_GE(context.GetComputeCapability(), 53,
-                    "cublas Hgemm requires GPU compute capability >= 53");
-  PADDLE_ENFORCE(platform::dynload::cublasHgemm(
-      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &h_alpha, h_B,
-      ldb, h_A, lda, &h_beta, h_C, ldc));
-}
-
-template <>
-void gemm<platform::CUDADeviceContext, float>(
-    const platform::CUDADeviceContext& context, const bool transA,
-    const bool transB, const int M, const int N, const int K,
-    const float alpha, const float* A, const int lda, const float* B,
-    const int ldb, const float beta, float* C, const int ldc) {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
-
-  PADDLE_ENFORCE(platform::dynload::cublasSgemm(
-      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
-      A, lda, &beta, C, ldc));
-}
-
-template <>
-void gemm<platform::CUDADeviceContext, double>(
-    const platform::CUDADeviceContext& context, const bool transA,
-    const bool transB, const int M, const int N, const int K,
-    const double alpha, const double* A, const int lda, const double* B,
-    const int ldb, const double beta, double* C, const int ldc) {
-  // Note that cublas follows fortran order, so the order is different from
-  // the cblas convention.
-  cublasOperation_t cuTransA = transA == false ? CUBLAS_OP_N : CUBLAS_OP_T;
-  cublasOperation_t cuTransB = transB == false ? CUBLAS_OP_N : CUBLAS_OP_T;
-
-  PADDLE_ENFORCE(platform::dynload::cublasDgemm(
-      context.cublas_handle(), cuTransB, cuTransA, N, M, K, &alpha, B, ldb,
-      A, lda, &beta, C, ldc));
-}
-
 template <>
 void matmul<platform::CUDADeviceContext, float16>(
     const platform::CUDADeviceContext& context, ...

@@ -200,8 +49,8 @@ void matmul<platform::CUDADeviceContext, float16>(
   CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
   CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;

-  gemm<platform::CUDADeviceContext, float16>(
-      context, transA, transB, M, N, K, alpha, matrix_a.data<float16>(),
+  Blas<platform::CUDADeviceContext>(context).GEMM(
+      transA, transB, M, N, K, alpha, matrix_a.data<float16>(),
       matrix_b.data<float16>(), beta, matrix_out->data<float16>());
 }

@@ -229,8 +78,8 @@ void matmul<platform::CUDADeviceContext, float>(
   CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
   CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;

-  gemm<platform::CUDADeviceContext, float>(
-      context, transA, transB, M, N, K, alpha, matrix_a.data<float>(),
+  Blas<platform::CUDADeviceContext>(context).GEMM(
+      transA, transB, M, N, K, alpha, matrix_a.data<float>(),
       matrix_b.data<float>(), beta, matrix_out->data<float>());
 }

@@ -258,8 +107,8 @@ void matmul<platform::CUDADeviceContext, double>(
   CBLAS_TRANSPOSE transA = (trans_a == false) ? CblasNoTrans : CblasTrans;
   CBLAS_TRANSPOSE transB = (trans_b == false) ? CblasNoTrans : CblasTrans;

-  gemm<platform::CUDADeviceContext, double>(
-      context, transA, transB, M, N, K, alpha, matrix_a.data<double>(),
+  Blas<platform::CUDADeviceContext>(context).GEMM(
+      transA, transB, M, N, K, alpha, matrix_a.data<double>(),
       matrix_b.data<double>(), beta, matrix_out->data<double>());
 }
paddle/fluid/operators/math/math_function.h

@@ -42,6 +42,7 @@ int LAPACKE_dgetri(int matrix_layout, int n, double* a, int lda,
 #include <vector>

 #include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/operator.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/platform/device_context.h"

@@ -56,17 +57,48 @@ namespace math {
 // Then matrixA: M * K, matrixB: K * N, matrixC : M * N
 // For more detailed info, please refer to
 // http://www.netlib.org/lapack/explore-html/d4/de2/sgemm_8f.html
-template <typename DeviceContext, typename T>
-void gemm(const DeviceContext& context, const CBLAS_TRANSPOSE transA,
-          const CBLAS_TRANSPOSE transB, const int M, const int N, const int K,
-          const T alpha, const T* A, const T* B, const T beta, T* C);
-
-// gemm wrapper with stride args for matrix uncontinuous in memory
-template <typename DeviceContext, typename T>
-void gemm(const DeviceContext& context, const bool transA, const bool transB,
-          const int M, const int N, const int K, const T alpha, const T* A,
-          const int lda, const T* B, const int ldb, const T beta, T* C,
-          const int ldc);
+template <typename DeviceContext>
+class Blas {
+ public:
+  explicit Blas(const DeviceContext& context) : context_(context) {}
+
+  template <typename T>
+  void GEMM(const CBLAS_TRANSPOSE transA, const CBLAS_TRANSPOSE transB,
+            const int M, const int N, const int K, const T alpha, const T* A,
+            const T* B, const T beta, T* C) const;
+
+  template <typename T>
+  void GEMM(const bool transA, const bool transB, const int M, const int N,
+            const int K, const T alpha, const T* A, const int lda, const T* B,
+            const int ldb, const T beta, T* C, const int ldc) const;
+
+ private:
+  const DeviceContext& context_;
+};
+
+template <typename DeviceContext, typename T>
+class BlasT : private Blas<DeviceContext> {
+ public:
+  using Blas<DeviceContext>::Blas;
+
+  template <typename... ARGS>
+  void GEMM(ARGS... args) const {
+    static_cast<const Blas<DeviceContext>*>(this)->template GEMM<T>(args...);
+  }
+};
+
+template <typename DeviceContext, typename T>
+inline BlasT<DeviceContext, T> GetBlas(
+    const framework::ExecutionContext& exe_ctx) {
+  return BlasT<DeviceContext, T>(
+      exe_ctx.template device_context<DeviceContext>());
+}
+
+template <typename DeviceContext, typename T>
+inline BlasT<DeviceContext, T> GetBlas(const DeviceContext& dev_ctx) {
+  return BlasT<DeviceContext, T>(dev_ctx);
+}

 // matrix multiply with continuous memory
 template <typename DeviceContext, typename T>

@@ -137,3 +169,8 @@ struct RowwiseMean {
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
+
+#include "paddle/fluid/operators/math/blas_impl.h"
+#ifdef PADDLE_WITH_CUDA
+#include "paddle/fluid/operators/math/blas_impl.cu.h"
+#endif
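Editor's note: math_function.h is where the new interface lives, so a minimal sketch of how an operator kernel is expected to use it after this commit may help. The function, tensor names, and shapes below are hypothetical and not taken from the diff; only GetBlas and the GEMM signature come from the commit:

    // Multiply x (M x K) by w (K x N) into out (M x N) using the new wrapper.
    template <typename DeviceContext, typename T>
    void MatMulExample(const framework::ExecutionContext& ctx,
                       const framework::Tensor& x, const framework::Tensor& w,
                       framework::Tensor* out, int M, int N, int K) {
      // BlasT remembers both the device context and the element type, so call
      // sites no longer repeat math::gemm<DeviceContext, T>(dev_ctx, ...).
      auto blas = math::GetBlas<DeviceContext, T>(ctx);
      blas.GEMM(CblasNoTrans, CblasNoTrans, M, N, K, static_cast<T>(1),
                x.data<T>(), w.data<T>(), static_cast<T>(0),
                out->mutable_data<T>(ctx.GetPlace()));
    }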
paddle/fluid/operators/math/matmul.h

@@ -131,8 +131,9 @@ class MatMulFunctor {
     if (!batchCount) {
       // regular matrix multiplication
-      gemm<DeviceContext, T>(context, transA, transB, M, N, kA, alpha,
-                             a.data<T>(), b.data<T>(), beta, out->data<T>());
+      Blas<DeviceContext>(context).GEMM(transA, transB, M, N, kA, alpha,
+                                        a.data<T>(), b.data<T>(), beta,
+                                        out->data<T>());
     } else {
       // batched matrix multiplication
       batched_gemm<DeviceContext, T>( ...