diff --git a/paddle/gserver/activations/MKLDNNActivation.h b/paddle/gserver/activations/MKLDNNActivation.h
index bda9bbebe5600dbe26d11ff32058f7b2647b763e..86ffe387366409d81a91740cc8cea886e618f7e2 100644
--- a/paddle/gserver/activations/MKLDNNActivation.h
+++ b/paddle/gserver/activations/MKLDNNActivation.h
@@ -131,8 +131,9 @@ public:
     fwdPD_.reset(new eltwise_fwd::primitive_desc(fwdDesc, eng));
     // use inplace for forward but save input value before submit
     inVal_ = val_;
-    if (act.grad) {
-      // only copy when need do backward
+    copyInVal_ = nullptr;
+    if (act.grad && algo == mkldnn::algorithm::eltwise_tanh) {
+      // tanh need save src input for backward
       inVal_ = MKLDNNMatrix::create(nullptr, val_->getPrimitiveDesc());
       copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
       CHECK(copyInVal_) << "should not be emptry";
diff --git a/paddle/gserver/layers/MKLDNNConvLayer.cpp b/paddle/gserver/layers/MKLDNNConvLayer.cpp
index 2647cb600653b4f43322016afb231a55f4db5642..88b047c89bd40aba1afc456c22a2870c62989c1c 100644
--- a/paddle/gserver/layers/MKLDNNConvLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNConvLayer.cpp
@@ -449,13 +449,14 @@ void MKLDNNConvLayer::resetOutGrad(
   cvtOutGrad_ = nullptr;
   if (!outputIsOnlyMKLDNN()) {
     const MatrixPtr& cpuOut = getOutput(CPU_DEVICE).grad;
+    outMat->setData(cpuOut->getData());
     // same PrimitiveDesc with cpuInVal_
     CHECK(cpuOutVal_);
     cpuOutGrad_ = MKLDNNMatrix::create(cpuOut, cpuOutVal_->getPrimitiveDesc());
     if (cpuOutGrad_->getPrimitiveDesc() == out->getPrimitiveDesc()) {
-      outMat->setData(cpuOut->getData());
       out = cpuOutGrad_;
     } else {
+      out = MKLDNNMatrix::create(nullptr, wgtPD->diff_dst_primitive_desc());
       cvtOutGrad_ = MKLDNNMatrix::createReorder(cpuOutGrad_, out);
       CHECK(cvtOutGrad_);
     }
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
index 66b358bcea53f61ddcc15323704fa9f154fb2a73..afd092666bf8b8a3389b36aa1f0edb256a9968e6 100644
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -232,6 +232,7 @@ void MKLDNNFcLayer::resetBwdBuffers(MKLDNNMatrixPtr& in,
 void MKLDNNFcLayer::resetOutGrad(MKLDNNMatrixPtr& out) {
   // TODO(TJ): merge outgrad
   int device = outputIsOnlyMKLDNN() ? MKLDNN_DEVICE : CPU_DEVICE;
+  output_.grad->setData(getOutput(device).grad->getData());
   // for MKLDNN device:
   // can not directly cast outputgrad to mkldnnmatrix,
   // since each layer can not write the inputgrad to mkldnn inputgrad.
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
index c4e4a6874e6fdb491c344c70dfea422dc0924cd9..d8555a833187ddf64b096135e920e5be2b3a8c2f 100644
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -141,18 +141,16 @@ public:
   }
 
   void backward(const UpdateCallback& callback) override {
-    /* Do derivation */ {
+    if (needResetBwd_) {
+      resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
+      needResetBwd_ = false;
+    }
+    {
       REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
       backwardActivation();
     }
-
     {
       REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
-      if (needResetBwd_) {
-        resetBwd(pipelineBwd_, inGrad_, wgtGrad_, biasGrad_, outGrad_);
-        needResetBwd_ = false;
-      }
-
       stream_->submit(pipelineBwd_);
     }
 
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
index 406181370faf90d29167b62173ce4c8af44d243e..1bfbbde4246a10eaf86693a6a2f237f390966db3 100644
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -26,17 +26,26 @@ DECLARE_bool(thread_local_rand_use_global_seed);
 DECLARE_bool(use_gpu);
 DECLARE_bool(use_mkldnn);
 
-struct testFCDesc {
+#define RUN_MKLDNN_TEST(DNN_CONFIG, REF_CONFIG, DESC)          \
+  MKLDNNTester tester;                                         \
+  for (auto bs : {DESC.bs, 1}) {                               \
+    tester.run(DNN_CONFIG, REF_CONFIG, bs, DESC.ih, DESC.iw);  \
+  }
+
+#define RUN_MKLDNN_TEST_LAYER(DNN_CONFIG, REF_TYPE, DESC) \
+  TestConfig ref = DNN_CONFIG;                            \
+  ref.layerConfig.set_type(REF_TYPE);                     \
+  RUN_MKLDNN_TEST(DNN_CONFIG, ref, DESC)
+
+struct testFcDesc {
   int bs;
   int ic;
   int oc;
   int ih, iw;  // oh == ow == 1
 };
 
-void testFcLayer(const testFCDesc& pm) {
-  const std::string compareTypes[] = {"mkldnn_fc", "fc"};
-  TestConfig cfg;
-  cfg.layerConfig.set_type(compareTypes[0]);
+static void getMKLDNNFcConfig(TestConfig& cfg, const testFcDesc& pm) {
+  cfg.layerConfig.set_type("mkldnn_fc");
   cfg.layerConfig.set_size(pm.oc);
   cfg.inputDefs.push_back(
       {INPUT_DATA,
@@ -44,25 +53,25 @@ void testFcLayer(const testFCDesc& pm) {
        /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
        /* size of weight= */ size_t(pm.oc * pm.ic * pm.ih * pm.iw)});
   cfg.layerConfig.add_inputs();
+}
 
-  MKLDNNTester tester;
+void testFcLayer(const testFcDesc& pm) {
+  TestConfig dnnConfig;
+  getMKLDNNFcConfig(dnnConfig, pm);
   for (auto biasSize : {pm.oc, 0}) {
-    cfg.biasSize = biasSize;
-    TestConfig ref = cfg;
-    ref.layerConfig.set_type(compareTypes[1]);
-    for (auto bs : {pm.bs, 1}) {
-      tester.run(cfg, ref, bs, pm.ih, pm.iw);
-    }
+    dnnConfig.biasSize = biasSize;
+    RUN_MKLDNN_TEST_LAYER(dnnConfig, "fc", pm)
   }
 }
 
 TEST(MKLDNNLayer, FcLayer) {
-  testFcLayer({/*bs*/ 2, /*ic*/ 2, /*oc*/ 3, /*ih*/ 1, /*iw*/ 1});
-  testFcLayer({/*bs*/ 3, /*ic*/ 7, /*oc*/ 19, /*ih*/ 1, /*iw*/ 1});
-  testFcLayer({/*bs*/ 8, /*ic*/ 16, /*oc*/ 32, /*ih*/ 13, /*iw*/ 13});
-  testFcLayer({/*bs*/ 4, /*ic*/ 12, /*oc*/ 18, /*ih*/ 13, /*iw*/ 11});
-  testFcLayer({/*bs*/ 2, /*ic*/ 64, /*oc*/ 32, /*ih*/ 16, /*iw*/ 16});
-  testFcLayer({/*bs*/ 15, /*ic*/ 3, /*oc*/ 6, /*ih*/ 16, /*iw*/ 16});
+  /* bs, ic, oc, ih, iw */
+  testFcLayer({2, 2, 3, 1, 1});
+  testFcLayer({3, 7, 19, 1, 1});
+  testFcLayer({8, 16, 32, 13, 13});
+  testFcLayer({4, 12, 18, 13, 11});
+  testFcLayer({2, 64, 32, 16, 16});
+  testFcLayer({15, 3, 6, 16, 16});
 }
 
 struct testConvDesc {
@@ -69,19 +78,16 @@ struct testConvDesc {
   int bs, gp;
   int ic, ih, iw;
   int oc, oh, ow;
   int fh, fw;
   int ph, pw;
   int sh, sw;
   int dh, dw;
 };
 
-void testConvLayer(const testConvDesc& pm) {
-  const std::string compareTypes[] = {"mkldnn_conv", "exconv"};
-  TestConfig cfg;
-  cfg.layerConfig.set_type(compareTypes[0]);
+static void getMKLDNNConvConfig(TestConfig& cfg, const testConvDesc& pm) {
+  cfg.layerConfig.set_type("mkldnn_conv");
   cfg.layerConfig.set_num_filters(pm.oc);
   cfg.layerConfig.set_size(pm.oc * pm.oh * pm.ow);
-  // cfg.layerConfig.set_partial_sum(1); // TODO: check it
   cfg.layerConfig.set_shared_biases(true);
   cfg.inputDefs.push_back(
       {INPUT_DATA,
@@ -115,15 +121,14 @@ void testConvLayer(const testConvDesc& pm) {
   int oh = outputSize(pm.ih, fh, pm.ph, pm.sh, true);
   CHECK_EQ(ow, pm.ow) << "output size check failed";
   CHECK_EQ(oh, pm.oh) << "output size check failed";
+}
 
-  MKLDNNTester tester;
+void testConvLayer(const testConvDesc& pm) {
+  TestConfig dnnConfig;
+  getMKLDNNConvConfig(dnnConfig, pm);
   for (auto biasSize : {pm.oc, 0}) {
-    cfg.biasSize = biasSize;
-    TestConfig ref = cfg;
-    ref.layerConfig.set_type(compareTypes[1]);
-    for (auto bs : {pm.bs, 1}) {
-      tester.run(cfg, ref, bs, pm.ih, pm.iw);
-    }
+    dnnConfig.biasSize = biasSize;
+    RUN_MKLDNN_TEST_LAYER(dnnConfig, "exconv", pm)
   }
 }
 
@@ -143,7 +148,7 @@ TEST(MKLDNNLayer, ConvLayer) {
 }
 
 struct testPoolDesc {
-  int bs, ch;  // input channel and output channel are the same
+  int bs, ic;  // input channel and output channel are the same
   int ih, iw;
   int oh, ow;
   int fh, fw;
@@ -151,19 +156,18 @@ struct testPoolDesc {
   int sh, sw;
 };
 
-void testPoolLayer(const testPoolDesc& pm) {
-  const std::string compareTypes[] = {"mkldnn_pool", "pool"};
-  TestConfig cfg;
-  cfg.layerConfig.set_type(compareTypes[0]);
-  cfg.layerConfig.set_size(pm.ch * pm.oh * pm.ow);
+static void getMKLDNNPoolConfig(TestConfig& cfg, const testPoolDesc& pm) {
+  cfg.layerConfig.set_type("mkldnn_pool");
+  cfg.layerConfig.set_size(pm.ic * pm.oh * pm.ow);
   cfg.inputDefs.push_back(
       {INPUT_DATA,
        "layer_0",
-       /* size of input layer= */ size_t(pm.ch * pm.ih * pm.iw),
+       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
        0});
   LayerInputConfig* input = cfg.layerConfig.add_inputs();
   PoolConfig* pool = input->mutable_pool_conf();
-  pool->set_channels(pm.ch);
+  pool->set_pool_type("avg-projection");
+  pool->set_channels(pm.ic);
   pool->set_img_size(pm.iw);
   pool->set_img_size_y(pm.ih);
   pool->set_output_x(pm.ow);
@@ -179,20 +183,21 @@ void testPoolLayer(const testPoolDesc& pm) {
   int ow = outputSize(pm.iw, pm.fw, pm.pw, pm.sw, false);
   CHECK_EQ(ow, pm.ow) << "output size check failed";
   CHECK_EQ(oh, pm.oh) << "output size check failed";
+}
 
-  MKLDNNTester tester;
+void testPoolLayer(const testPoolDesc& pm) {
+  TestConfig dnnConfig;
+  getMKLDNNPoolConfig(dnnConfig, pm);
+  LayerInputConfig* input = dnnConfig.layerConfig.mutable_inputs(0);
+  PoolConfig* pool = input->mutable_pool_conf();
   for (auto type : {"max-projection", "avg-projection"}) {
     pool->set_pool_type(type);
-    TestConfig ref = cfg;
-    ref.layerConfig.set_type(compareTypes[1]);
-    for (auto bs : {pm.bs, 1}) {
-      tester.run(cfg, ref, bs, pm.ih, pm.iw);
-    }
+    RUN_MKLDNN_TEST_LAYER(dnnConfig, "pool", pm)
   }
 }
 
 TEST(MKLDNNLayer, PoolLayer) {
-  /* bs, ch, ih, iw, oh, ow, fh, fw, ph, pw, sh, sw*/
+  /* bs, ch, ih, iw, oh, ow, fh, fw, ph, pw, sh, sw */
   testPoolLayer({2, 1, 4, 4, 2, 2, 3, 3, 0, 0, 2, 2});
   testPoolLayer({10, 8, 16, 16, 8, 8, 2, 2, 0, 0, 2, 2});
   testPoolLayer({4, 2, 5, 5, 3, 3, 3, 3, 1, 1, 2, 2});
@@ -204,44 +209,36 @@ TEST(MKLDNNLayer, PoolLayer) {
 }
 
 struct testActDesc {
-  int bs, ch;
-  int ih, iw;
+  int bs, ic, ih, iw;
 };
 
 static void getAddtoConfig(TestConfig& cfg, const testActDesc& pm) {
   cfg.biasSize = 0;
   cfg.layerConfig.set_type("addto");
-  cfg.layerConfig.set_size(pm.ch * pm.ih * pm.iw);
-  cfg.inputDefs.push_back(
-      {INPUT_DATA,
-       "layer_0",
-       /* size of input layer= */ size_t(pm.ch * pm.ih * pm.iw),
-       0});
+  size_t layerSize = pm.ic * pm.ih * pm.iw;
+  cfg.layerConfig.set_size(layerSize);
+  cfg.inputDefs.push_back({INPUT_DATA, "layer_0", layerSize, 0});
   cfg.layerConfig.add_inputs();
 }
 
-void testActivation(std::string& type, const testActDesc& pm) {
-  const std::string compareTypes[] = {type, type.erase(0, 7)};
+void testActivation(std::string& actType, const testActDesc& pm) {
+  // TODO(TJ): mkldnn_softmax not implemented, paddle do not have elu activation
+  if (actType == "mkldnn_softmax" || actType == "mkldnn_elu") {
+    return;
+  }
+  const std::string compareTypes[] = {actType, actType.erase(0, 7)};
   TestConfig cfg;
   getAddtoConfig(cfg, pm);
-
   TestConfig ref = cfg;
   cfg.layerConfig.set_active_type(compareTypes[0]);
   ref.layerConfig.set_active_type(compareTypes[1]);
-  MKLDNNTester tester;
-  for (auto bs : {pm.bs, 1}) {
-    tester.run(cfg, ref, bs, pm.ih, pm.iw);
-  }
+  RUN_MKLDNN_TEST(cfg, ref, pm)
 }
 
 TEST(MKLDNNActivation, Activations) {
   auto types = MKLDNNActivation::getAllRegisteredTypes();
-  // TODO(TJ): mkldnn_softmax not implemented, paddle do not have elu activation
-  std::set<std::string> excluded{"mkldnn_softmax", "mkldnn_elu"};
   for (auto type : types) {
-    if (excluded.count(type)) {
-      continue;
-    }
+    /* bs, c, h, w*/
     testActivation(type, {16, 64, 32, 32});
   }
 }
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index 7c32eb0069f4075d72cd4c3654c83e3d5c98fb1c..0f57b81966647ca5c6f5cd2e5518d2d34942a549 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1565,6 +1565,10 @@ class LayerBase(object):
         self.config = g_config.model_config.layers.add()
         assert isinstance(self.config, LayerConfig)
 
+        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+        mkldnn_acts = ['relu', 'tanh']
+        if use_mkldnn and active_type in mkldnn_acts:
+            active_type = "mkldnn_" + active_type
         self.config.name = name
         self.config.type = type
         self.config.active_type = active_type