diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h
index 80b9775fccf10c57bb48145ef56165ec7c86d8b8..7dbf3cfb0d5433c1b44947fe7e24c7ab1f9ec183 100644
--- a/paddle/math/Vector.h
+++ b/paddle/math/Vector.h
@@ -92,6 +92,28 @@ public:
   const T* getData() const { return this->data_; }
   T* getData() { return this->data_; }
 
+#ifdef PADDLE_USE_MKLDNN
+  /**
+   * SGD momentum update, parallelized with OpenMP for speed.
+   */
+  void sgdUpdateWithOMP(VectorT& gradVec,
+                        VectorT& momVec,
+                        T learningRate,
+                        T momentum,
+                        T decayRate) {
+    T* const param = this->getData();   // this vector's values, updated in place
+    T* const grad = gradVec.getData();  // only read by the loop below
+    T* const vel = momVec.getData();    // momentum buffer, updated in place
+    const size_t count = this->getSize();
+    const T decay = decayRate * learningRate;  // decay pre-scaled by the rate
+#pragma omp parallel for
+    for (size_t k = 0; k < count; ++k) {
+      vel[k] = momentum * vel[k] - learningRate * grad[k] - decay * param[k];
+      param[k] += vel[k];
+    }
+  }
+#endif
+
   virtual void zeroMem() = 0;
   // set all elements to value
   virtual void reset(const T& value) = 0;
diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h
index caa78acd98ea4b35fc69643689cfce23026275e0..73e09aee2366bed095be532ab11f3c0d40f6d01f 100644
--- a/paddle/parameter/FirstOrderOptimizer.h
+++ b/paddle/parameter/FirstOrderOptimizer.h
@@ -37,6 +37,15 @@ public:
     real torch_learningRate = optConfig_.learning_method() == "torch_momentum"
                                   ? 1.0 - paraConfig.momentum()
                                   : 1.0;
+#ifdef PADDLE_USE_MKLDNN
+    vecs[PARAMETER_VALUE]->sgdUpdateWithOMP(
+        *vecs[PARAMETER_GRADIENT],
+        *vecs[PARAMETER_MOMENTUM],
+        learningRate_ * paraConfig.learning_rate() *
+            (firstTime_ ? 1.0 : torch_learningRate),
+        paraConfig.momentum(),
+        applyDecay_ ? paraConfig.decay_rate() : 0);
+#else
     vecs[PARAMETER_VALUE]->sgdUpdate(
         *vecs[PARAMETER_GRADIENT],
         *vecs[PARAMETER_MOMENTUM],
@@ -44,6 +53,7 @@ public:
             (firstTime_ ? 1.0 : torch_learningRate),
         paraConfig.momentum(),
         applyDecay_ ? paraConfig.decay_rate() : 0);
+#endif
   }
   virtual void finishBatch() { firstTime_ = false; }
 };
diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/parameter/ParameterUpdateFunctions.cpp
index c8af7105c78dcbf9f625a348b7f38efcf278469e..8b3be062b654a52e667626199be8c8bb4a2a96d7 100644
--- a/paddle/parameter/ParameterUpdateFunctions.cpp
+++ b/paddle/parameter/ParameterUpdateFunctions.cpp
@@ -30,6 +30,9 @@ void sgdUpdateCpu(real learningRate,
                   const real* grad,
                   real* momentumVec) {
   decayRate *= learningRate;
+#ifdef PADDLE_USE_MKLDNN
+#pragma omp parallel for
+#endif
   for (size_t i = 0; i < size; ++i) {
     momentumVec[i] = momentum * momentumVec[i] - learningRate * grad[i] -
                      decayRate * value[i];