diff --git a/paddle/gserver/activations/MKLDNNActivation.cpp b/paddle/gserver/activations/MKLDNNActivation.cpp
index 5b111f406257adc96aac215b521896bf6d258480..3b32d7e2d83f4c17725d4bbcb3303ed9721b0527 100644
--- a/paddle/gserver/activations/MKLDNNActivation.cpp
+++ b/paddle/gserver/activations/MKLDNNActivation.cpp
@@ -27,49 +27,52 @@ static ClassRegistrar<ActivationFunction> gMKLDNNActivationRegistrar;
 #define MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) mkldnn_##ACT_TYPE##Activation
 
 /**
- * @def DEFINE_MKLDNN_ACTIVATION
+ * @def BEGIN_MKLDNN_ACTIVATION
  */
-#define DEFINE_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS)               \
-  class MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) : public BASE_CLASS { \
-  private:                                                           \
-    static const std::string name;                                   \
-                                                                     \
-  public:                                                            \
-    const std::string& getName() const { return name; }              \
-  };                                                                 \
-  const std::string MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::name =   \
-      "mkldnn_" #ACT_TYPE;                                           \
-  static InitFunction __reg_activation__mkldnn_##ACT_TYPE([] {       \
-    gMKLDNNActivationRegistrar                                       \
-        .registerClass<MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)>(      \
-            "mkldnn_" #ACT_TYPE);                                    \
+#define BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \
+  class MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE) : public BASE_CLASS {
+/**
+ * @def END_MKLDNN_ACTIVATION
+ * Adds name/getName(), closes the class, registers "mkldnn_" #ACT_TYPE. */
+#define END_MKLDNN_ACTIVATION(ACT_TYPE)                            \
+private:                                                           \
+  static const std::string name;                                   \
+                                                                   \
+public:                                                            \
+  const std::string& getName() const { return name; }              \
+  }                                                                \
+  ;                                                                \
+  const std::string MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::name = \
+      "mkldnn_" #ACT_TYPE;                                         \
+  static InitFunction __reg_activation__mkldnn_##ACT_TYPE([] {     \
+    gMKLDNNActivationRegistrar                                     \
+        .registerClass<MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)>(    \
+            "mkldnn_" #ACT_TYPE);                                  \
   });
 
+/**
+ * @def DEFINE_MKLDNN_ACTIVATION
+ * BEGIN_ immediately followed by END_: a class with no extra members. */
+#define DEFINE_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS) \
+  BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS)        \
+  END_MKLDNN_ACTIVATION(ACT_TYPE)
+
 /**
  * @def DEFINE_MKLDNN_ELTWISE_ACTIVATION
  */
-#define DEFINE_MKLDNN_ELTWISE_ACTIVATION(ACT_TYPE, ALPHA, BWD_ALPHA)        \
-  class MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)                              \
-      : public MKLDNNEltwiseActivation {                                    \
-  private:                                                                  \
-    static const std::string name;                                          \
-    static const float alpha;                                               \
-    static const float bwdAlpha;                                            \
-                                                                            \
-  public:                                                                   \
-    const std::string& getName() const { return name; }                     \
-    float getAlpha() const { return alpha; }                                \
-    float getBwdAlpha() const { return bwdAlpha; }                          \
-  };                                                                        \
-  const std::string MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::name =          \
-      "mkldnn_" #ACT_TYPE;                                                  \
-  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::alpha = ALPHA;        \
-  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::bwdAlpha = BWD_ALPHA; \
-  static InitFunction __reg_activation__mkldnn_##ACT_TYPE([] {              \
-    gMKLDNNActivationRegistrar                                              \
-        .registerClass<MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)>(             \
-            "mkldnn_" #ACT_TYPE);                                           \
-  });
+#define DEFINE_MKLDNN_ELTWISE_ACTIVATION(                            \
+    ACT_TYPE, BASE_CLASS, ALPHA, BWD_ALPHA)                          \
+  BEGIN_MKLDNN_ACTIVATION(ACT_TYPE, BASE_CLASS)                      \
+private:                                                             \
+  static const float alpha;                                          \
+  static const float bwdAlpha;                                       \
+                                                                     \
+public:                                                              \
+  float getAlpha() const { return alpha; }                           \
+  float getBwdAlpha() const { return bwdAlpha; }                     \
+  END_MKLDNN_ACTIVATION(ACT_TYPE)                                    \
+  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::alpha = ALPHA; \
+  const float MKLDNN_ACTIVATION_CLASS_NAME(ACT_TYPE)::bwdAlpha = BWD_ALPHA;
 
 /**
  * @brief MKLDNN Relu Activation.
@@ -78,25 +81,138 @@ static ClassRegistrar<ActivationFunction> gMKLDNNActivationRegistrar;
  *  f(x) = negative_slope * x  (x <  0)
  * @note the negative_slope should be -0.f in forward
  */
-DEFINE_MKLDNN_ELTWISE_ACTIVATION(relu, -0.f, 0.f)
+DEFINE_MKLDNN_ELTWISE_ACTIVATION(relu, MKLDNNEltwiseActivation, -0.f, 0.f)
 
 /**
  * @brief MKLDNN Tanh Activation.
  */
-DEFINE_MKLDNN_ELTWISE_ACTIVATION(tanh, 0.f, 0.f)
+DEFINE_MKLDNN_ELTWISE_ACTIVATION(tanh, MKLDNNEltwiseActivation, 0.f, 0.f)
 
 /**
  * @brief MKLDNN ELU(Exponential Linear Unit) Activation.
  *  f(x) = x                              (x >= 0)
  *  f(x) = negative_slope * (exp(x) - 1)  (x <  0)
  */
-DEFINE_MKLDNN_ELTWISE_ACTIVATION(elu, 0.f, 0.f)
+DEFINE_MKLDNN_ELTWISE_ACTIVATION(elu, MKLDNNEltwiseActivation, 0.f, 0.f)
+
+mkldnn::algorithm MKLDNNEltwiseActivation::getAlgo(std::string type) const {
+  // Name -> algorithm table, built once; an unknown name is fatal (CHECK).
+  static const std::map<std::string, mkldnn::algorithm> algoMap = {
+      {"mkldnn_relu", algorithm::eltwise_relu},
+      {"mkldnn_tanh", algorithm::eltwise_tanh},
+      {"mkldnn_elu", algorithm::eltwise_elu}};
+  const auto it = algoMap.find(type);
+  CHECK(it != algoMap.end()) << "Unknown eltwise activation type: " << type;
+  return it->second;
+}
+
+void MKLDNNEltwiseActivation::resetFwd(Argument& act) {
+  if (cnt_ == act.value->getElementCnt()) {  // same element count: keep as is
+    return;
+  }
+  MKLDNNActivation::resetFwd(act);  // refreshes cnt_, engine_, stream_, val_
+  // note: alpha represents the NegativeSlope when used in relu.
+  float alpha = getAlpha();
+  float beta = getBeta();
+  algorithm algo = getAlgo(this->getName());
+  auto fwdDesc = eltwise_fwd::desc(mkldnn::prop_kind::forward_training,
+                                   algo,
+                                   val_->getMemoryDesc(),
+                                   alpha,
+                                   beta);
+  fwdPD_.reset(new eltwise_fwd::primitive_desc(fwdDesc, *engine_));
+  // The forward primitive below gets val_ as both source and destination.
+  inVal_ = val_;
+  copyInVal_ = nullptr;
+  if (act.grad && algo == algorithm::eltwise_tanh) {
+    // tanh + grad: save the input in inVal_ before fwd overwrites val_
+    inVal_ = MKLDNNMatrix::create(nullptr, val_->getPrimitiveDesc());
+    copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
+    CHECK(copyInVal_) << "should not be emptry";
+    pipelineFwd_.push_back(*copyInVal_);  // queued ahead of the forward op
+  }
+  fwd_.reset(new eltwise_fwd(*fwdPD_, *val_, *val_));
+  pipelineFwd_.push_back(*fwd_);
+  needResetBwd_ = true;  // makes the next resetBwd() rebuild its primitive
+}
+
+void MKLDNNEltwiseActivation::resetBwd(Argument& act) {
+  if (!needResetBwd_) {  // only rebuild after resetFwd() set the flag
+    return;
+  }
+  VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
+  needResetBwd_ = false;
+  algorithm algo = getAlgo(this->getName());
+  float alpha = getBwdAlpha();  // backward uses its own alpha, not getAlpha()
+  float beta = getBeta();
+  grad_ = MKLDNNMatrix::create(act.grad, val_->getPrimitiveDesc());
+  auto eng = CPUEngine::Instance().getEngine();  // NOTE: fwd uses *engine_
+  auto bwdDesc = eltwise_bwd::desc(
+      algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta);
+  auto bwdPD = eltwise_bwd::primitive_desc(bwdDesc, eng, *fwdPD_);
+  CHECK(inVal_);  // set by resetFwd(): val_ itself or the saved tanh input
+  bwd_.reset(new eltwise_bwd(bwdPD, *inVal_, *grad_, *grad_));
+  pipelineBwd_.clear();  // the backward pipeline holds only bwd_
+  pipelineBwd_.push_back(*bwd_);
+}
 
 /**
  * @brief MKLDNN Softmax Activation
  */
 DEFINE_MKLDNN_ACTIVATION(softmax, MKLDNNSoftmaxActivation)
 
+void MKLDNNSoftmaxActivation::resetFwd(Argument& act) {
+  if (cnt_ == act.value->getElementCnt()) {  // same element count: keep as is
+    return;
+  }
+  MKLDNNActivation::resetFwd(act);  // refreshes cnt_, engine_, stream_, val_
+  int axis = 1;  // presumably the channel dim of val_ - TODO confirm
+  auto fwdDesc = softmax_fwd::desc(
+      mkldnn::prop_kind::forward_scoring, val_->getMemoryDesc(), axis);
+  auto fwdPD = softmax_fwd::primitive_desc(fwdDesc, *engine_);
+  fwd_.reset(new softmax_fwd(fwdPD, *val_, *val_));  // val_ is src and dst
+  pipelineFwd_.push_back(*fwd_);
+}
+
+Error __must_check MKLDNNSoftmaxActivation::forward(Argument& act) {
+  resetFwd(act);
+  stream_->submit(pipelineFwd_);
+  const real floorVal = exp(-64);  // lower bound applied to every output
+  real* out = act.value->getData();
+#pragma omp parallel for
+  for (size_t idx = 0; idx < act.value->getElementCnt(); ++idx) {
+    if (out[idx] < floorVal) out[idx] = floorVal;
+  }
+  return Error();
+}
+
+Error __must_check MKLDNNSoftmaxActivation::backward(Argument& act) {
+  // Backward goes through the Matrix API; no mkldnn pipeline is submitted.
+  MatrixPtr value = act.value;
+  MatrixPtr grad = act.grad;
+  if (grad->useGpu()) {
+    grad->softmaxBackward(*value);
+    return Error();
+  }
+  SetDevice device(act.deviceId);
+  // Scratch buffers: one shaped like grad, one with a single column.
+  Matrix::resizeOrCreate(sftMaxDot_,
+                         grad->getHeight(),
+                         grad->getWidth(),
+                         /* trans */ false,
+                         useGpu(act.deviceId));
+  Matrix::resizeOrCreate(sftMaxSum_,
+                         grad->getHeight(),
+                         1,
+                         /* trans */ false,
+                         useGpu(act.deviceId));
+
+  sftMaxDot_->dotMul(*grad, *value);
+  sftMaxSum_->colMerge(*sftMaxDot_);
+  grad->softmaxDerivative(*value, *sftMaxSum_);
+  return Error();
+}
+
 ActivationFunction* MKLDNNActivation::create(const std::string& type) {
   return gMKLDNNActivationRegistrar.createByType(type);
 }
@@ -108,4 +224,34 @@ std::vector<std::string> MKLDNNActivation::getAllRegisteredTypes() {
   return types;
 }
 
+void MKLDNNActivation::resetFwd(Argument& act) {
+  VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
+  cnt_ = act.value->getElementCnt();  // subclasses compare this to skip resets
+  pipelineFwd_.clear();
+  stream_.reset(new MKLDNNStream());
+  engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0));
+  val_ = std::dynamic_pointer_cast<MKLDNNMatrix>(act.value);
+  if (val_ == nullptr) {  // not an MKLDNNMatrix: wrap act.value as nchw
+    int bs = act.getBatchSize();  // NOTE(review): bs == 0 would divide by zero
+    int ih = act.getFrameHeight() > 0 ? act.getFrameHeight() : 1;
+    int iw = act.getFrameWidth() > 0 ? act.getFrameWidth() : 1;
+    int ic = cnt_ / bs / ih / iw;  // channels: what remains of cnt_
+    CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw);  // cnt_ must factor exactly
+    val_ = MKLDNNMatrix::create(
+        act.value, {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_);
+    CHECK(val_);
+    val_->downSpatial();
+  }
+}
+
+Error __must_check MKLDNNActivation::forward(Argument& act) {
+  resetFwd(act);  // virtual: subclass overrides fill pipelineFwd_
+  stream_->submit(pipelineFwd_);
+  return Error();  // default-constructed Error
+}
+Error __must_check MKLDNNActivation::backward(Argument& act) {
+  resetBwd(act);  // virtual; the base-class version does nothing
+  stream_->submit(pipelineBwd_);  // stream_ is created by resetFwd()
+  return Error();  // default-constructed Error
+}
 }  // namespace paddle
diff --git a/paddle/gserver/activations/MKLDNNActivation.h b/paddle/gserver/activations/MKLDNNActivation.h
index ed0dd891a0f0a7e8d5d53a203aa5977b70809fdd..dd16421fd6e93b49c30b1d3b601f95980afec57b 100644
--- a/paddle/gserver/activations/MKLDNNActivation.h
+++ b/paddle/gserver/activations/MKLDNNActivation.h
@@ -52,41 +52,15 @@ public:
   /**
    * reset the forward primitives
    */
-  virtual void resetFwd(Argument& act) {
-    VLOG(MKLDNN_BASE) << getName() << " reset mkldnn forward";
-    cnt_ = act.value->getElementCnt();
-    pipelineFwd_.clear();
-    stream_.reset(new MKLDNNStream());
-    engine_.reset(new mkldnn::engine(mkldnn::engine::cpu, 0));
-    val_ = std::dynamic_pointer_cast<MKLDNNMatrix>(act.value);
-    if (val_ == nullptr) {
-      int bs = act.getBatchSize();
-      int ih = act.getFrameHeight() > 0 ? act.getFrameHeight() : 1;
-      int iw = act.getFrameWidth() > 0 ? act.getFrameWidth() : 1;
-      int ic = cnt_ / bs / ih / iw;
-      CHECK_EQ(cnt_, (size_t)bs * ic * ih * iw);
-      val_ = MKLDNNMatrix::create(
-          act.value, {bs, ic, ih, iw}, mkldnn::memory::format::nchw, *engine_);
-      CHECK(val_);
-      val_->downSpatial();
-    }
-  }
+  virtual void resetFwd(Argument& act);
   /**
    * reset the backward primitives,
    * can not merge this functions into resetFwd as the grad data
    * would be changing before backward.
    */
   virtual void resetBwd(Argument& act) {}
-  virtual Error __must_check forward(Argument& act) {
-    resetFwd(act);
-    stream_->submit(pipelineFwd_);
-    return Error();
-  }
-  virtual Error __must_check backward(Argument& act) {
-    resetBwd(act);
-    stream_->submit(pipelineBwd_);
-    return Error();
-  }
+  virtual Error __must_check forward(Argument& act);
+  virtual Error __must_check backward(Argument& act);
 };
 
 /**
@@ -96,6 +70,7 @@ public:
 class MKLDNNEltwiseActivation : public MKLDNNActivation {
   typedef mkldnn::eltwise_forward eltwise_fwd;
   typedef mkldnn::eltwise_backward eltwise_bwd;
+  typedef mkldnn::algorithm algorithm;
 
 protected:
   // save the forward primitive desc, which can be used backward
@@ -115,68 +90,9 @@ public:
   virtual float getAlpha() const = 0;
   virtual float getBwdAlpha() const = 0;
   virtual float getBeta() const { return 0.f; }
-  virtual mkldnn::algorithm getAlgo(const std::string& type) const {
-    if (type == "mkldnn_relu") {
-      return mkldnn::algorithm::eltwise_relu;
-    } else if (type == "mkldnn_tanh") {
-      return mkldnn::algorithm::eltwise_tanh;
-    } else if (type == "mkldnn_elu") {
-      return mkldnn::algorithm::eltwise_elu;
-    } else {
-      LOG(FATAL) << "Unkown eltwise activation type: " << type;
-    }
-    return (mkldnn::algorithm)0;
-  }
-
-  void resetFwd(Argument& act) override {
-    if (cnt_ == act.value->getElementCnt()) {
-      return;
-    }
-    MKLDNNActivation::resetFwd(act);
-    // note: alpha represents the NegativeSlope when used in relu.
-    float alpha = getAlpha();
-    float beta = getBeta();
-    mkldnn::algorithm algo = getAlgo(this->getName());
-    auto fwdDesc = eltwise_fwd::desc(mkldnn::prop_kind::forward_training,
-                                     algo,
-                                     val_->getMemoryDesc(),
-                                     alpha,
-                                     beta);
-    fwdPD_.reset(new eltwise_fwd::primitive_desc(fwdDesc, *engine_));
-    // use inplace for forward but save input value before submit
-    inVal_ = val_;
-    copyInVal_ = nullptr;
-    if (act.grad && algo == mkldnn::algorithm::eltwise_tanh) {
-      // tanh need save src input for backward
-      inVal_ = MKLDNNMatrix::create(nullptr, val_->getPrimitiveDesc());
-      copyInVal_ = std::make_shared<mkldnn::reorder>(*val_, *inVal_);
-      CHECK(copyInVal_) << "should not be emptry";
-      pipelineFwd_.push_back(*copyInVal_);
-    }
-    fwd_.reset(new eltwise_fwd(*fwdPD_, *val_, *val_));
-    pipelineFwd_.push_back(*fwd_);
-    needResetBwd_ = true;
-  }
-
-  void resetBwd(Argument& act) override {
-    if (!needResetBwd_) {
-      return;
-    }
-    VLOG(MKLDNN_BASE) << getName() << " reset mkldnn backward";
-    needResetBwd_ = false;
-    mkldnn::algorithm algo = getAlgo(this->getName());
-    float alpha = getBwdAlpha();
-    float beta = getBeta();
-    grad_ = MKLDNNMatrix::create(act.grad, val_->getPrimitiveDesc());
-    auto eng = CPUEngine::Instance().getEngine();
-    auto bwdDesc = eltwise_bwd::desc(
-        algo, grad_->getMemoryDesc(), val_->getMemoryDesc(), alpha, beta);
-    auto bwdPD = eltwise_bwd::primitive_desc(bwdDesc, eng, *fwdPD_);
-    CHECK(inVal_);
-    bwd_.reset(new eltwise_bwd(bwdPD, *inVal_, *grad_, *grad_));
-    pipelineBwd_.clear();
-    pipelineBwd_.push_back(*bwd_);
-  }
+  virtual algorithm getAlgo(std::string type) const;
+  void resetFwd(Argument& act) override;
+  void resetBwd(Argument& act) override;
 };
 
 /**
@@ -195,45 +111,9 @@ public:
   MKLDNNSoftmaxActivation() {}
   ~MKLDNNSoftmaxActivation() {}
   virtual const std::string& getName() const = 0;
-  void resetFwd(Argument& act) override {
-    if (cnt_ == act.value->getElementCnt()) {
-      return;
-    }
-    MKLDNNActivation::resetFwd(act);
-    int axis = 1;
-    auto fwdDesc = softmax_fwd::desc(
-        mkldnn::prop_kind::forward_scoring, val_->getMemoryDesc(), axis);
-    auto fwdPD = softmax_fwd::primitive_desc(fwdDesc, *engine_);
-    fwd_.reset(new softmax_fwd(fwdPD, *val_, *val_));
-    pipelineFwd_.push_back(*fwd_);
-  }
-
-  Error __must_check backward(Argument& act) override {
-    MatrixPtr outputV = act.value;
-    MatrixPtr outputG = act.grad;
-
-    if (outputG->useGpu()) {
-      outputG->softmaxBackward(*outputV);
-    } else {
-      SetDevice device(act.deviceId);
-      Matrix::resizeOrCreate(sftMaxDot_,
-                             outputG->getHeight(),
-                             outputG->getWidth(),
-                             /* trans */ false,
-                             useGpu(act.deviceId));
-      Matrix::resizeOrCreate(sftMaxSum_,
-                             outputG->getHeight(),
-                             1,
-                             /* trans */ false,
-                             useGpu(act.deviceId));
-
-      sftMaxDot_->dotMul(*outputG, *outputV);
-      sftMaxSum_->colMerge(*sftMaxDot_);
-
-      act.grad->softmaxDerivative(*act.value, *sftMaxSum_);
-    }
-    return Error();
-  }
+  void resetFwd(Argument& act) override;
+  Error __must_check forward(Argument& act) override;
+  Error __must_check backward(Argument& act) override;
 };
 
 }  // namespace paddle
diff --git a/paddle/math/Matrix.cpp b/paddle/math/Matrix.cpp
index 588310fbeecdf297f2eb24a6aadb26a87d25b1ff..0023b4d0f5da500f380ecb836b7c54e050b13d67 100644
--- a/paddle/math/Matrix.cpp
+++ b/paddle/math/Matrix.cpp
@@ -3637,7 +3637,7 @@ void CpuMatrix::oneHotCrossEntropy(Matrix& output, IVector& label) {
   for (size_t i = 0; i < numSamples; ++i, out += dim) {
     CHECK_GE(lbl[i], 0);
     CHECK_LT((size_t)lbl[i], dim);
-    cost[i] = -std::log(std::max(out[lbl[i]], real(FLT_MIN)));
+    cost[i] = -std::log(out[lbl[i]]);
   }
 }
 
@@ -3652,7 +3652,7 @@ void CpuMatrix::oneHotCrossEntropyBp(Matrix& output, IVector& label) {
   real* grad = getData();
   int* lbl = label.getData();
   for (size_t i = 0; i < numSamples; ++i, out += dim, grad += dim) {
-    grad[lbl[i]] -= 1 / std::max(out[lbl[i]], real(FLT_MIN));
+    grad[lbl[i]] -= 1 / out[lbl[i]];
   }
 }