diff --git a/CMakeLists.txt b/CMakeLists.txt
index b2481912232cbca95999994417d7f30e98cd4f26..ed3c390066dfac2322d802c6039bc7155a36e38a 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -49,6 +49,7 @@ option(COVERALLS_UPLOAD "Package code coverage data to coveralls"       OFF)
 option(ON_TRAVIS        "Exclude special unit test on Travis CI"        OFF)
 option(WITH_C_API       "Compile PaddlePaddle with C-API(Prediction)"   OFF)
 option(WITH_GOLANG      "Compile PaddlePaddle with GOLANG"              OFF)
+option(USE_NNPACK       "Compile PaddlePaddle with NNPACK library"      OFF)
 
 # CMAKE_BUILD_TYPE
 if(NOT CMAKE_BUILD_TYPE)
@@ -129,6 +130,10 @@ if(WITH_GPU)
     endif(NOT WITH_DSO)
 endif(WITH_GPU)
 
+if(USE_NNPACK)
+  list(APPEND EXTERNAL_LIBS ${NNPACK_LIB} ${PTHREADPOOL_LIB} "rt")
+endif(USE_NNPACK)
+
 add_subdirectory(proto)
 
 # "add_subdirectory(paddle)" and "add_subdirectory(python)" should be
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 1c39ced3c9e3da4079a66e29c00be9cc18411b68..1518a8a654cfb54376a49760dc5873733c916937 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -10,6 +10,14 @@ if(WITH_GPU)
     cuda_compile(cu_objs ${cu_files})
 endif()
 
+if(USE_NNPACK)
+  include(nnpack/nnpack.cmake)
+  list(APPEND cpp_files nnpack/NNPACKConvOp.cpp)
+  if(WITH_TESTING)
+    add_unittest(NNPACKConvOpTest nnpack/NNPACKConvOpTest.cpp)
+  endif()
+endif()
+
 add_library(paddle_function STATIC ${cpp_files} ${cu_objs})
 add_dependencies(paddle_function ${external_project_dependencies})
 add_dependencies(paddle_function paddle_proto)
diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e8080c3d714b324f072a380f738b9764477dfe04
--- /dev/null
+++ b/paddle/function/nnpack/NNPACKConvOp.cpp
@@ -0,0 +1,238 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "nnpack.h"
+#include "paddle/function/ConvOp.h"
+
+DEFINE_bool(nnpack_allocate_outside,
+            false,
+            "Allocate and free workspace memory outside the NNPACK interface.");
+DEFINE_int32(nnpack_num_threads,
+             0,
+             "The number of nnpack threads"
+             "default: 0; 0 to disable threadpool.");
+
+namespace paddle {
+
+nnp_convolution_algorithm get_nnp_convolution_algorithm(
+    const std::string& algorithm) {
+  if (algorithm == "auto") {
+    return nnp_convolution_algorithm_auto;
+  } else if (algorithm == "ft8x8") {
+    return nnp_convolution_algorithm_ft8x8;
+  } else if (algorithm == "ft16x16") {
+    return nnp_convolution_algorithm_ft16x16;
+  } else if (algorithm == "wt8x8") {
+    return nnp_convolution_algorithm_wt8x8;
+  } else if (algorithm == "implicit-gemm") {
+    return nnp_convolution_algorithm_implicit_gemm;
+  } else if (algorithm == "direct") {
+    return nnp_convolution_algorithm_direct;
+  } else {
+    return nnp_convolution_algorithm_auto;
+  }
+}
+
+template <DeviceType Device>
+class NNPACKConvFunction : public ConvFunctionBase {
+public:
+  void init(const FuncConfig& config) override {
+    ConvFunctionBase::init(config);
+    CHECK_EQ(groups_, (size_t)1);
+    algorithm_ = get_nnp_convolution_algorithm(config.get<std::string>("algo"));
+    // algorithm_ = nnp_convolution_algorithm_auto;
+    transform_strategy_ = nnp_convolution_transform_strategy_compute;
+    nnp_status status = nnp_initialize();
+    CHECK_EQ(status, nnp_status_success);
+    workspaceBuffer_ = nullptr;
+    workspaceSize_ = 0;
+
+    threadpool_ = nullptr;
+    if (FLAGS_nnpack_num_threads) {
+      threadpool_ = pthreadpool_create(FLAGS_nnpack_num_threads);
+      VLOG(3) << "Number of threads "
+              << pthreadpool_get_threads_count(threadpool_);
+    }
+  }
+
+  ~NNPACKConvFunction() {
+    if (threadpool_) {
+      pthreadpool_destroy(threadpool_);
+    }
+    if (workspaceBuffer_) {
+      free(workspaceBuffer_);
+    }
+  }
+
+  virtual void check(const BufferArgs& inputs,
+                     const BufferArgs& outputs) override {
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+    checkShape(input, filter, output);
+  }
+
+  void calc(const BufferArgs& inputs, const BufferArgs& outputs) override {
+    CHECK_EQ(numInputs_, inputs.size());
+    CHECK_EQ(numOutputs_, outputs.size());
+    CHECK_EQ(outputs[0].getArgType(), ASSIGN_TO);
+    check(inputs, outputs);
+    const TensorShape& input = inputs[0].shape();
+    const TensorShape& filter = inputs[1].shape();
+    const TensorShape& output = outputs[0].shape();
+
+    size_t batchSize = input[0];
+    size_t inputChannels = input[1];
+    size_t inputHeight = input[2];
+    size_t inputWidth = input[3];
+    size_t filterHeight = getFilterHeight(filter);
+    size_t filterWidth = getFilterWidth(filter);
+    size_t outputChannels = output[1];
+    // size_t outputHeight = output[2];
+    // size_t outputWidth = output[3];
+
+    nnp_size inputSize = {.width = inputWidth, .height = inputHeight};
+    nnp_padding padding = {.top = (size_t)paddingH(),
+                           .right = (size_t)paddingW(),
+                           .bottom = (size_t)paddingH(),
+                           .left = (size_t)paddingW()};
+    nnp_size kernelSize = {.width = filterWidth, .height = filterHeight};
+    nnp_size outputSubsampling = {.width = (size_t)strideW(),
+                                  .height = (size_t)strideH()};
+
+    float* inputData = inputs[0].data<float>();
+    float* filterData = inputs[1].data<float>();
+    float* outputData = outputs[0].data<float>();
+
+    void* bufferPtr = nullptr;
+    size_t* sizePtr = nullptr;
+    size_t needSize;
+    if (FLAGS_nnpack_allocate_outside) {
+      if (batchSize == 1) {
+        nnp_status status = nnp_convolution_inference(algorithm_,
+                                                      transform_strategy_,
+                                                      inputChannels,
+                                                      outputChannels,
+                                                      inputSize,
+                                                      padding,
+                                                      kernelSize,
+                                                      outputSubsampling,
+                                                      nullptr,
+                                                      nullptr,
+                                                      nullptr,
+                                                      nullptr,
+                                                      nullptr,
+                                                      &needSize,
+                                                      nnp_activation_identity,
+                                                      nullptr,
+                                                      nullptr,
+                                                      nullptr);
+        CHECK_EQ(status, nnp_status_success);
+      } else {
+        // only supports stride = 1
+        CHECK_EQ(strideH(), 1);
+        CHECK_EQ(strideW(), 1);
+        nnp_status status = nnp_convolution_output(algorithm_,
+                                                   batchSize,
+                                                   inputChannels,
+                                                   outputChannels,
+                                                   inputSize,
+                                                   padding,
+                                                   kernelSize,
+                                                   nullptr,
+                                                   nullptr,
+                                                   nullptr,
+                                                   nullptr,
+                                                   nullptr,
+                                                   &needSize,
+                                                   nnp_activation_identity,
+                                                   nullptr,
+                                                   nullptr,
+                                                   nullptr);
+        CHECK_EQ(status, nnp_status_success);
+      }
+
+      VLOG(3) << "workspace size is " << needSize;
+      if (needSize > workspaceSize_) {
+        workspaceSize_ = needSize;
+        if (workspaceBuffer_) {
+          free(workspaceBuffer_);
+        } else {
+          posix_memalign(&workspaceBuffer_, 64, needSize);
+        }
+      }
+
+      if (needSize) {
+        bufferPtr = workspaceBuffer_;
+        sizePtr = &needSize;
+      }
+    }
+
+    if (batchSize == 1) {
+      nnp_status status =
+          nnp_convolution_inference(algorithm_,
+                                    transform_strategy_,
+                                    inputChannels,
+                                    outputChannels,
+                                    inputSize,
+                                    padding,
+                                    kernelSize,
+                                    outputSubsampling,
+                                    inputData,
+                                    filterData,
+                                    nullptr, /* bias */
+                                    outputData,
+                                    bufferPtr,
+                                    sizePtr,
+                                    nnp_activation_identity,
+                                    nullptr,
+                                    threadpool_, /* threadpool */
+                                    nullptr);
+      CHECK_EQ(status, nnp_status_success);
+    } else {
+      // only supports stride = 1
+      CHECK_EQ(strideH(), 1);
+      CHECK_EQ(strideW(), 1);
+      nnp_status status = nnp_convolution_output(algorithm_,
+                                                 batchSize,
+                                                 inputChannels,
+                                                 outputChannels,
+                                                 inputSize,
+                                                 padding,
+                                                 kernelSize,
+                                                 inputData,
+                                                 filterData,
+                                                 nullptr, /* bias */
+                                                 outputData,
+                                                 bufferPtr,
+                                                 sizePtr,
+                                                 nnp_activation_identity,
+                                                 nullptr,
+                                                 threadpool_, /* threadpool */
+                                                 nullptr);
+      CHECK_EQ(status, nnp_status_success);
+    }
+  }
+
+private:
+  nnp_convolution_algorithm algorithm_;
+  nnp_convolution_transform_strategy transform_strategy_;
+  void* workspaceBuffer_;
+  size_t workspaceSize_;
+  pthreadpool_t threadpool_;
+};
+
+REGISTER_TYPED_FUNC(NNPACKConv, CPU, NNPACKConvFunction);
+
+}  // namespace paddle
diff --git a/paddle/function/nnpack/NNPACKConvOpTest.cpp b/paddle/function/nnpack/NNPACKConvOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..48180112111c67f36ddd425008187201655089c9
--- /dev/null
+++ b/paddle/function/nnpack/NNPACKConvOpTest.cpp
@@ -0,0 +1,99 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "paddle/function/Function.h"
+#include "paddle/function/FunctionTest.h"
+
+DEFINE_string(algo,
+              "auto",
+              "The algorithm (auto, ft8x8, ft16x16, wt8x8, "
+              "implicit-gemm, or direct) for computing convolution of NNPACK.");
+
+namespace paddle {
+
+#define IS_NNPACK_SUPPORT(algo, filterSize, stride)        \
+  if (algo == "direct" && filterSize != 1) continue;       \
+  if (algo == "direct" && batchSize != 1) continue;        \
+  if (algo == "wt8x8" && filterSize != 3) continue;        \
+  if (algo == "implicit-gemm" && batchSize != 1) continue; \
+  if (algo != "auto" && algo != "implicit-gemm" && stride > 1) continue;
+
+class ConvolutionTest {
+public:
+  ConvolutionTest(const std::string& conv1,
+                  const std::string& conv2,
+                  std::string algo = "auto") {
+    for (size_t batchSize : {1, 32}) {
+      for (size_t inputSize : {7, 14, 54}) {
+        for (size_t filterSize : {1, 3, 5}) {
+          for (size_t inputChannels : {3, 64}) {
+            for (size_t outputChannels : {3, 64, 128}) {
+              if (inputChannels < outputChannels) break;
+              for (size_t stride : {1, 2}) {
+                // if batchSize > 1 NNPACKConv only supports stride = 1
+                if (batchSize > 1 && stride > 1) break;
+                for (size_t padding : {0, 1}) {
+                  if (padding >= filterSize) break;
+                  size_t outputSize =
+                      (inputSize - filterSize + 2 * padding + stride) / stride;
+                  IS_NNPACK_SUPPORT(algo, filterSize, stride);
+                  LOG(INFO) << " batchSize=" << batchSize
+                            << " inputChannels=" << inputChannels
+                            << " inputHeight=" << inputSize
+                            << " inputWidth=" << inputSize
+                            << " outputChannels=" << outputChannels
+                            << " filterHeight=" << filterSize
+                            << " filterWidth=" << filterSize
+                            << " outputHeight=" << outputSize
+                            << " outputWidth=" << outputSize
+                            << " stride=" << stride << " padding=" << padding;
+
+                  std::vector<size_t> paddings = {padding, padding};
+                  std::vector<size_t> strides = {stride, stride};
+                  Compare2Function<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test(
+                      conv1,
+                      conv2,
+                      FuncConfig()
+                          .set("paddings", paddings)
+                          .set("strides", strides)
+                          .set("groups", (size_t)1)
+                          .set("algo", algo));
+
+                  TensorShape shape0{
+                      batchSize, inputChannels, inputSize, inputSize};
+                  TensorShape shape1{
+                      outputChannels, inputChannels, filterSize, filterSize};
+                  TensorShape shape2{
+                      batchSize, outputChannels, outputSize, outputSize};
+                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape0));
+                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape1));
+                  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape2));
+                  test.run();
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+};
+
+TEST(Convolution, NNPACK) {
+  // NNPACK only supports stride = 1
+  ConvolutionTest test("GemmConv-CPU", "NNPACKConv-CPU", FLAGS_algo);
+}
+
+}  // namespace paddle
diff --git a/paddle/function/nnpack/nnpack.cmake b/paddle/function/nnpack/nnpack.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..7182730ae8f133bdc4f73bfc46fa8acbe5f3b603
--- /dev/null
+++ b/paddle/function/nnpack/nnpack.cmake
@@ -0,0 +1,16 @@
+# Find the NNPACK library
+#  NNPACK_ROOT - where to find NNPACK include and library.
+#
+
+set(NNPACK_FOUND OFF)
+set(NNPACK_ROOT $ENV{NNPACK_ROOT} CACHE PATH "Folder contains NNPACK")
+find_path(NNPACK_INC_DIR nnpack.h PATHS ${NNPACK_ROOT}/include)
+find_library(NNPACK_LIB NAMES nnpack PATHS ${NNPACK_ROOT}/lib)
+find_library(PTHREADPOOL_LIB NAMES pthreadpool PATHS ${NNPACK_ROOT}/lib)
+
+if(NNPACK_INC_DIR AND NNPACK_LIB AND PTHREADPOOL_LIB)
+  set(NNPACK_FOUND ON)
+  INCLUDE_DIRECTORIES(${NNPACK_INC_DIR})
+else()
+  message(FATAL_ERROR "Cannot find NNPACK in (${NNPACK_ROOT})")
+endif()
diff --git a/paddle/gserver/layers/ExpandConvLayer.cpp b/paddle/gserver/layers/ExpandConvLayer.cpp
index 914689e66cdb8947e886e17e75829183c1af1a42..af79e65a7c09e5a1b55febf1df1e8f5bb61bdcb8 100644
--- a/paddle/gserver/layers/ExpandConvLayer.cpp
+++ b/paddle/gserver/layers/ExpandConvLayer.cpp
@@ -16,6 +16,10 @@ limitations under the License. */
 #include "paddle/utils/Logging.h"
 #include "paddle/utils/Stat.h"
 
+DEFINE_bool(use_nnpack,
+            false,
+            "Whether to use nnpack for convolution calculation.");
+
 namespace paddle {
 
 /*
@@ -37,26 +41,38 @@ bool ExpandConvLayer::init(const LayerMap &layerMap,
   for (int i = 0; i < config_.inputs_size(); i++) {
     std::vector<size_t> paddings = {(size_t)paddingY_[i], (size_t)padding_[i]};
     std::vector<size_t> strides = {(size_t)strideY_[i], (size_t)stride_[i]};
-    createFunction(forward_,
-                   !isDeconv_ ? "GemmConv" : "GemmConvGradInput",
-                   FuncConfig()
-                       .set("paddings", paddings)
-                       .set("strides", strides)
-                       .set("groups", (size_t)groups_[i]));
-
-    createFunction(backward_,
-                   !isDeconv_ ? "GemmConvGradInput" : "GemmConv",
-                   FuncConfig()
-                       .set("paddings", paddings)
-                       .set("strides", strides)
-                       .set("groups", (size_t)groups_[i]));
-
-    createFunction(backward_,
-                   "GemmConvGradFilter",
-                   FuncConfig()
-                       .set("paddings", paddings)
-                       .set("strides", strides)
-                       .set("groups", (size_t)groups_[i]));
+
+    if (FLAGS_use_nnpack) {
+      CHECK_EQ(isDeconv_, false);
+      createFunction(forward_,
+                     "NNPACKConv",
+                     FuncConfig()
+                         .set("paddings", paddings)
+                         .set("strides", strides)
+                         .set("groups", (size_t)groups_[i])
+                         .set("algo", std::string("auto")));
+    } else {
+      createFunction(forward_,
+                     !isDeconv_ ? "GemmConv" : "GemmConvGradInput",
+                     FuncConfig()
+                         .set("paddings", paddings)
+                         .set("strides", strides)
+                         .set("groups", (size_t)groups_[i]));
+
+      createFunction(backward_,
+                     !isDeconv_ ? "GemmConvGradInput" : "GemmConv",
+                     FuncConfig()
+                         .set("paddings", paddings)
+                         .set("strides", strides)
+                         .set("groups", (size_t)groups_[i]));
+
+      createFunction(backward_,
+                     "GemmConvGradFilter",
+                     FuncConfig()
+                         .set("paddings", paddings)
+                         .set("strides", strides)
+                         .set("groups", (size_t)groups_[i]));
+    }
   }
   return true;
 }