diff --git a/CMakeLists.txt b/CMakeLists.txt
index b174831109372cb014741d63032fa6a470e74042..c7d743e193e7d32dbc0b56f3bcb05b6c61f85f1d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -36,8 +36,8 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    ${AVX_FOUND})
-option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      ${AVX_FOUND})
+option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    OFF)
+option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      OFF)
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
diff --git a/paddle/operators/math/math_function.cu b/paddle/operators/math/math_function.cu
index 3e2aeea1da490d524db7b04cce947c57af97d8ef..2cc3c24fb34596ca127585e3a5543378db1efda7 100644
--- a/paddle/operators/math/math_function.cu
+++ b/paddle/operators/math/math_function.cu
@@ -13,7 +13,6 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include "paddle/operators/math/math_function.h"
-
 namespace paddle {
 namespace operators {
 namespace math {
@@ -26,6 +25,8 @@ void gemm<platform::GPUPlace, float>(
     platform::DeviceContext* context) {
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
+  int lda = (transA == CblasNoTrans) ? K : M;
+  int ldb = (transB == CblasNoTrans) ? N : K;
   cublasOperation_t cuTransA =
       (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
   cublasOperation_t cuTransB =
@@ -44,6 +45,8 @@ void gemm<platform::GPUPlace, double>(
     const int ldc, platform::DeviceContext* context) {
   // Note that cublas follows fortran order, so the order is different from
   // the cblas convention.
+  lda = (transA == CblasNoTrans) ? K : M;
+  ldb = (transB == CblasNoTrans) ? N : K;
   cublasOperation_t cuTransA =
       (transA == CblasNoTrans) ? CUBLAS_OP_N : CUBLAS_OP_T;
   cublasOperation_t cuTransB =
@@ -118,7 +121,6 @@ void matmul<platform::GPUPlace, double>(const framework::Tensor& in1,
                                    in1.data<double>(), K, in2.data<double>(), N,
                                    beta, out->data<double>(), N, context);
 }
-
 }  // namespace math
 }  // namespace operators
 }  // namespace paddle
diff --git a/paddle/operators/math/math_function.h b/paddle/operators/math/math_function.h
index f068f4a15eec7593c257e9a9e026413c95904398..1ecca604039f6cd8ad9c3f976398b258252d57bb 100644
--- a/paddle/operators/math/math_function.h
+++ b/paddle/operators/math/math_function.h
@@ -37,6 +37,20 @@ extern "C" {
 #include <lapacke.h>
 #endif
 
+#ifndef LAPACK_FOUND
+extern "C" {
+#include <cblas.h>
+int LAPACKE_sgetrf(
+    int matrix_layout, int m, int n, float* a, int lda, int* ipiv);
+int LAPACKE_dgetrf(
+    int matrix_layout, int m, int n, double* a, int lda, int* ipiv);
+int LAPACKE_sgetri(
+    int matrix_layout, int n, float* a, int lda, const int* ipiv);
+int LAPACKE_dgetri(
+    int matrix_layout, int n, double* a, int lda, const int* ipiv);
+}
+#endif
+
 #include <cmath>
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/device_context.h"
@@ -61,7 +75,7 @@ void gemm(const CBLAS_TRANSPOSE transA,
           const int ldc,
           platform::DeviceContext* context);
 
-// matrix multiply with continous memory
+// matrix multiply with continuous memory
 template <typename Place, typename T>
 void matmul(const framework::Tensor& in1,
             bool in1_T,
diff --git a/paddle/operators/mul_op.cu b/paddle/operators/mul_op.cu
index 7435b74bd87a392a27c1ae7f4a8d2416526a40d0..346a7e505d123b5e4e831daa39a1f6349b3dcccf 100644
--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu
@@ -15,4 +15,5 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/mul_op.h"
 
+namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
diff --git a/paddle/operators/mul_op.h b/paddle/operators/mul_op.h
index 2087e98901d69574e36adafe2a030abf2305c97b..98c54f1dfb8ef66e8ec51f9a16415a63b383214b 100644
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -31,9 +31,6 @@ template <typename Place, typename T>
 class MulKernel : public framework::OpKernel {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
-    Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair = {
-        {Eigen::IndexPair<Eigen::DenseIndex>(1, 0)}};
-
     auto input0 = context.Input<Tensor>("X");
     auto input1 = context.Input<Tensor>("Y");
     auto output = context.Output<Tensor>(0);