diff --git a/CMakeLists.txt b/CMakeLists.txt
index dcff6b54cafce35846627e78cfcdac65fae7e686..5e664d1415399965c345a06566708a2a95c4f80d 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -37,6 +37,7 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
 option(WITH_AVX "Compile PaddlePaddle with AVX intrinsics" ${AVX_FOUND})
+option(WITH_MKLDNN "Compile PaddlePaddle with mkl-dnn support." ON)
 option(WITH_DSO "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING "Compile PaddlePaddle with unit testing" ON)
 option(WITH_SWIG_PY "Compile PaddlePaddle with inference api" ON)
@@ -94,6 +95,7 @@ include(external/glog)     # download, build, install glog
 include(external/gtest)    # download, build, install gtest
 include(external/protobuf) # download, build, install protobuf
 include(external/python)   # download, build, install python
+include(external/mkldnn)   # download, build, install mkldnn
 include(external/openblas) # download, build, install openblas
 include(external/swig)     # download, build, install swig
 include(external/warpctc)  # download, build, install warpctc
@@ -136,6 +138,11 @@ if(WITH_GPU)
     endif(NOT WITH_DSO)
 endif(WITH_GPU)
 
+if(WITH_MKLDNN)
+    message(STATUS "MKLDNN_LIBRARY: ${MKLDNN_LIBRARY}")
+    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIBRARY} ${MKL_LITE_LIB_IOMP})
+endif()
+
 if(USE_NNPACK)
   include(external/nnpack)
   list(APPEND EXTERNAL_LIBS ${NNPACK_LIBS})
diff --git a/cmake/cblas.cmake b/cmake/cblas.cmake
index 913f711afff3b8f9f77b8da978a3b9e7165d0077..ee654e64bd0d5ce240e3c5db012e0b7e92142bdf 100644
--- a/cmake/cblas.cmake
+++ b/cmake/cblas.cmake
@@ -16,22 +16,42 @@ set(CBLAS_FOUND OFF)
 
 ## Find MKL First.
-set(INTEL_ROOT "/opt/intel" CACHE PATH "Folder contains intel libs")
-set(MKL_ROOT ${INTEL_ROOT}/mkl CACHE PATH "Folder contains MKL")
+set(INTEL_MKL_ROOT "/opt/intel/mkl" CACHE PATH "Folder contains intel mkl libs")
+set(MKL_ROOT $ENV{MKL_ROOT} CACHE PATH "Folder contains env MKL")
+
+set(MKL_INCLUDE_SEARCH_PATHS
+    ${MKL_ROOT}/include
+    ${INTEL_MKL_ROOT}/include)
+set(MKL_LIB_SEARCH_PATHS
+    ${MKL_ROOT}/lib
+    ${MKL_ROOT}/lib/intel64
+    ${INTEL_MKL_ROOT}/lib
+    ${INTEL_MKL_ROOT}/lib/intel64)
+
+if(MKL_LITE_INC_DIR AND MKL_LITE_LIB)
+  set(CBLAS_FOUND ON)
+  set(CBLAS_PROVIDER MKL_LITE)
+  set(CBLAS_INC_DIR ${MKL_LITE_INC_DIR})
+  set(CBLAS_LIBRARIES ${MKL_LITE_LIB})
+
+  add_definitions(-DPADDLE_USE_MKL_LITE)
+  add_definitions(-DLAPACK_FOUND)
+
+  message(STATUS "Found cblas and lapack in MKL Lite "
+    "(include: ${MKL_LITE_INC_DIR}, library: ${CBLAS_LIBRARIES})")
+  return()
+endif()
 
 find_path(MKL_INC_DIR mkl.h PATHS
-          ${MKL_ROOT}/include)
+          ${MKL_INCLUDE_SEARCH_PATHS})
 find_path(MKL_LAPACK_INC_DIR mkl_lapacke.h PATHS
-          ${MKL_ROOT}/include)
+          ${MKL_INCLUDE_SEARCH_PATHS})
 find_library(MKL_CORE_LIB NAMES mkl_core PATHS
-  ${MKL_ROOT}/lib
-  ${MKL_ROOT}/lib/intel64)
+  ${MKL_LIB_SEARCH_PATHS})
 find_library(MKL_SEQUENTIAL_LIB NAMES mkl_sequential PATHS
-  ${MKL_ROOT}/lib
-  ${MKL_ROOT}/lib/intel64)
+  ${MKL_LIB_SEARCH_PATHS})
 find_library(MKL_INTEL_LP64 NAMES mkl_intel_lp64 PATHS
-  ${MKL_ROOT}/lib
-  ${MKL_ROOT}/lib/intel64)
+  ${MKL_LIB_SEARCH_PATHS})
 
 if(MKL_LAPACK_INC_DIR AND MKL_INC_DIR AND MKL_CORE_LIB AND MKL_SEQUENTIAL_LIB AND MKL_INTEL_LP64)
   set(CBLAS_FOUND ON)
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 7afab5d5344b704a9329e313a81379032ba0cc97..87191976821dd06d51727ea7c116a08b30147b2b 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -67,6 +67,12 @@ else()
     include_directories(${CUDA_TOOLKIT_INCLUDE})
 endif(NOT WITH_GPU)
 
+if(WITH_MKLDNN)
+    add_definitions(-DPADDLE_USE_MKLDNN)
+    set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+endif(WITH_MKLDNN)
+
 set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SIMD_FLAG}")
 set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SIMD_FLAG}")
 
diff --git a/cmake/external/mkldnn.cmake b/cmake/external/mkldnn.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..834f5ae230c9b980a3b71f76b1058e0c7453fc74
--- /dev/null
+++ b/cmake/external/mkldnn.cmake
@@ -0,0 +1,78 @@
+# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+IF(NOT ${WITH_MKLDNN})
+  return()
+ENDIF(NOT ${WITH_MKLDNN})
+
+INCLUDE(ExternalProject)
+
+SET(MKLDNN_PROJECT     "extern_mkldnn")
+SET(MKLDNN_SOURCES_DIR ${THIRD_PARTY_PATH}/mkldnn)
+SET(MKLDNN_INSTALL_DIR ${THIRD_PARTY_PATH}/install/mkldnn)
+SET(MKLDNN_INCLUDE_DIR "${MKLDNN_INSTALL_DIR}/include" CACHE PATH "mkldnn include directory." FORCE)
+
+# The following version numbers should be updated regularly to track the latest releases.
+SET(MKLDNN_TAG     "v0.9")
+SET(MKLDNN_MKL_VER "mklml_lnx_2018.0.20170425")
+
+IF(WIN32)
+  MESSAGE(WARNING "Compiling PaddlePaddle with MKL-DNN is not supported on Windows yet. "
+    "Force WITH_MKLDNN=OFF")
+  SET(WITH_MKLDNN OFF)
+  return()
+ELSE(WIN32)
+  SET(MKLDNN_LIBRARY "${MKLDNN_INSTALL_DIR}/lib/libmkldnn.so" CACHE FILEPATH "mkldnn library." FORCE)
+  MESSAGE(STATUS "Set ${MKLDNN_INSTALL_DIR}/lib to runtime path")
+  SET(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE)
+  #SET(CMAKE_MACOSX_RPATH 1) # hold for MacOS
+  SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKLDNN_INSTALL_DIR}/lib")
+ENDIF(WIN32)
+
+INCLUDE_DIRECTORIES(${MKLDNN_INCLUDE_DIR})
+
+SET(MKLDNN_CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+SET(MKLDNN_CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+
+ExternalProject_Add(
+    ${MKLDNN_PROJECT}
+    ${EXTERNAL_PROJECT_LOG_ARGS}
+    GIT_REPOSITORY    "https://github.com/01org/mkl-dnn.git"
+    GIT_TAG           "${MKLDNN_TAG}"
+    PREFIX            ${MKLDNN_SOURCES_DIR}
+    PATCH_COMMAND     cd <SOURCE_DIR>/scripts && ./prepare_mkl.sh
+    UPDATE_COMMAND    ""
+    CMAKE_ARGS        -DCMAKE_CXX_COMPILER=${CMAKE_CXX_COMPILER}
+    CMAKE_ARGS        -DCMAKE_C_COMPILER=${CMAKE_C_COMPILER}
+    CMAKE_ARGS        -DCMAKE_CXX_FLAGS=${MKLDNN_CMAKE_CXX_FLAGS}
+    CMAKE_ARGS        -DCMAKE_C_FLAGS=${MKLDNN_CMAKE_C_FLAGS}
+    CMAKE_ARGS        -DCMAKE_INSTALL_PREFIX=${MKLDNN_INSTALL_DIR}
+    CMAKE_ARGS        -DCMAKE_INSTALL_LIBDIR=${MKLDNN_INSTALL_DIR}/lib
+    CMAKE_ARGS        -DCMAKE_BUILD_TYPE=Release
+    CMAKE_CACHE_ARGS  -DCMAKE_INSTALL_PREFIX:PATH=${MKLDNN_INSTALL_DIR}
+                      -DCMAKE_INSTALL_LIBDIR:PATH=${MKLDNN_INSTALL_DIR}/lib
+                      -DCMAKE_BUILD_TYPE:STRING=Release
+)
+
+SET(MKL_LITE_DIR ${MKLDNN_SOURCES_DIR}/src/${MKLDNN_PROJECT}/external/${MKLDNN_MKL_VER})
+SET(MKL_LITE_INC_DIR ${MKL_LITE_DIR}/include)
+SET(MKL_LITE_LIB ${MKL_LITE_DIR}/lib/libmklml_intel.so)
+SET(MKL_LITE_LIB_IOMP ${MKL_LITE_DIR}/lib/libiomp5.so)
+SET(CMAKE_INSTALL_RPATH "${CMAKE_INSTALL_RPATH}" "${MKL_LITE_DIR}/lib")
+
+ADD_LIBRARY(mkldnn STATIC IMPORTED GLOBAL)
+SET_PROPERTY(TARGET mkldnn PROPERTY IMPORTED_LOCATION ${MKLDNN_LIBRARY})
+ADD_DEPENDENCIES(mkldnn ${MKLDNN_PROJECT})
+
+LIST(APPEND external_project_dependencies mkldnn)
diff --git a/paddle/math/MathFunctions.cpp b/paddle/math/MathFunctions.cpp
index 7045562dd44f8f3e0be9181b32954c04f0865fa4..999b72cc1597b7539308c8cf1cde46d588c163f4 100644
--- a/paddle/math/MathFunctions.cpp
+++ b/paddle/math/MathFunctions.cpp
@@ -202,7 +202,7 @@ double dotProduct(const int n, const double* x, const double* y) {
   return cblas_ddot(n, x, 1, y, 1);
 }
 
-#ifdef PADDLE_USE_MKL
+#if defined(PADDLE_USE_MKL) || defined(PADDLE_USE_MKL_LITE)
 
 template <>
 void vExp(const int n, const float* a, float* r) {
@@ -243,7 +243,55 @@ template <>
 void vAdd(const int n, const double* a, const double* b, double* r) {
   vdAdd(n, a, b, r);
 }
+#else
+
+DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a));
+template <class T>
+void vExp(const int n, const T* a, T* r) {
+  hl_cpu_apply_binary_op<T, binary::vExp<T>, 0, 0>(
+      binary::vExp<T>(), const_cast<T*>(a), r, 1, n, n, n);
+}
+
+DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a));
+template <class T>
+void vLog(const int n, const T* a, T* r) {
+  hl_cpu_apply_binary_op<T, binary::vLog<T>, 0, 0>(
+      binary::vLog<T>(), const_cast<T*>(a), r, 1, n, n, n);
+}
+
+DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p));
+template <class T>
+void vPow(const int n, const T* a, const T b, T* r) {
+  hl_cpu_apply_binary_op<T, binary::vPow<T>, 0, 0>(
+      binary::vPow<T>(b), const_cast<T*>(a), r, 1, n, n, n);
+}
+
+DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b);
+template <class T>
+void vAdd(const int n, const T* a, const T* b, T* r) {
+  hl_cpu_apply_ternary_op<T, ternary::vAdd<T>, 0, 0>(ternary::vAdd<T>(),
+                                                     const_cast<T*>(a),
+                                                     const_cast<T*>(b),
+                                                     r,
+                                                     1,
+                                                     n,
+                                                     n,
+                                                     n,
+                                                     n);
+}
+
+template void vExp(const int n, const float* a, float* r);
+template void vExp(const int n, const double* a, double* r);
+template void vLog(const int n, const float* a, float* r);
+template void vLog(const int n, const double* a, double* r);
+template void vPow(const int n, const float* a, const float b, float* r);
+template void vPow(const int n, const double* a, const double b, double* r);
+template void vAdd(const int n, const float* a, const float* b, float* r);
+template void vAdd(const int n, const double* a, const double* b, double* r);
+#endif
+
+#ifdef PADDLE_USE_MKL
 
 template <>
 void vInvSqrt(const int n, const float* a, float* r) {
   vsInvSqrt(n, a, r);
@@ -275,20 +323,6 @@ void vTanh(const int n, const double* a, double* r) {
 }
 
 #else
 
-DEFINE_MATRIX_BINARY_OP(vExp, b = std::exp(a));
-template <class T>
-void vExp(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vExp<T>, 0, 0>(
-      binary::vExp<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_BINARY_OP(vLog, b = std::log(a));
-template <class T>
-void vLog(const int n, const T* a, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vLog<T>, 0, 0>(
-      binary::vLog<T>(), const_cast<T*>(a), r, 1, n, n, n);
-}
-
 DEFINE_MATRIX_BINARY_OP(vInvSqrt, b = 1.0f / std::sqrt(a));
 template <class T>
 void vInvSqrt(const int n, const T* a, T* r) {
@@ -312,41 +346,12 @@ void vTanh(const int n, const T* a, T* r) {
       binary::vTanh<T>(), const_cast<T*>(a), r, 1, n, n, n);
 }
 
-DEFINE_MATRIX_BINARY_PARAMETER_OP(vPow, ONE_PARAMETER, b = std::pow(a, p));
-template <class T>
-void vPow(const int n, const T* a, const T b, T* r) {
-  hl_cpu_apply_binary_op<T, binary::vPow<T>, 0, 0>(
-      binary::vPow<T>(b), const_cast<T*>(a), r, 1, n, n, n);
-}
-
-DEFINE_MATRIX_TERNARY_OP(vAdd, c = a + b);
-template <class T>
-void vAdd(const int n, const T* a, const T* b, T* r) {
-  hl_cpu_apply_ternary_op<T, ternary::vAdd<T>, 0, 0>(ternary::vAdd<T>(),
-                                                     const_cast<T*>(a),
-                                                     const_cast<T*>(b),
-                                                     r,
-                                                     1,
-                                                     n,
-                                                     n,
-                                                     n,
-                                                     n);
-}
-
-template void vExp(const int n, const float* a, float* r);
-template void vExp(const int n, const double* a, double* r);
-template void vLog(const int n, const float* a, float* r);
-template void vLog(const int n, const double* a, double* r);
 template void vInvSqrt(const int n, const double* a, double* r);
 template void vInvSqrt(const int n, const float* a, float* r);
 template void vLog1p(const int n, const float* a, float* r);
 template void vLog1p(const int n, const double* a, double* r);
 template void vTanh(const int n, const float* a, float* r);
 template void vTanh(const int n, const double* a, double* r);
-template void vPow(const int n, const float* a, const float b, float* r);
-template void vPow(const int n, const double* a, const double b, double* r);
-template void vAdd(const int n, const float* a, const float* b, float* r);
-template void vAdd(const int n, const double* a, const double* b, double* r);
 
 #endif
diff --git a/paddle/math/MathFunctions.h b/paddle/math/MathFunctions.h
index 8ada0d34c6733d13a45505492909124010c85a91..799948cf08b4ff14e9f6f41ba07d1749ba0d2ef4 100644
--- a/paddle/math/MathFunctions.h
+++ b/paddle/math/MathFunctions.h
@@ -15,6 +15,12 @@ limitations under the License. */
 #ifndef MATHFUNCTIONS_H_
 #define MATHFUNCTIONS_H_
 
+#ifdef PADDLE_USE_MKL_LITE
+#include <mkl_cblas.h>
+#include <mkl_lapacke.h>
+#include <mkl_vml_functions.h>
+#endif
+
 #ifdef PADDLE_USE_MKL
 #include <mkl.h>
 #include <mkl_lapacke.h>