From d6a27ade5469dbaf832983fabaa32ec70ab4c2f5 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Mon, 25 Sep 2017 17:13:05 +0800
Subject: [PATCH] add OMP SGD to speedup with CPUs

---
 paddle/math/Vector.h                          | 22 +++++++++++++++++++
 paddle/parameter/FirstOrderOptimizer.h        | 10 +++++++++
 paddle/parameter/ParameterUpdateFunctions.cpp |  3 +++
 3 files changed, 35 insertions(+)

diff --git a/paddle/math/Vector.h b/paddle/math/Vector.h
index 80b9775fccf..7dbf3cfb0d5 100644
--- a/paddle/math/Vector.h
+++ b/paddle/math/Vector.h
@@ -92,6 +92,28 @@ public:
   const T* getData() const { return this->data_; }
   T* getData() { return this->data_; }
 
+#ifdef PADDLE_USE_MKLDNN
+  /**
+   * sgd update with openmp to speedup
+   */
+  void sgdUpdateWithOMP(VectorT& gradVec,
+                        VectorT& momVec,
+                        T learningRate,
+                        T momentum,
+                        T decayRate) {
+    size_t size = this->getSize();
+    T* val = this->getData();
+    T* grd = gradVec.getData();
+    T* mom = momVec.getData();
+    decayRate *= learningRate;
+#pragma omp parallel for
+    for (size_t i = 0; i < size; ++i) {
+      mom[i] = momentum * mom[i] - learningRate * grd[i] - decayRate * val[i];
+      val[i] += mom[i];
+    }
+  }
+#endif
+
   virtual void zeroMem() = 0;
   // set all elements to value
   virtual void reset(const T& value) = 0;
diff --git a/paddle/parameter/FirstOrderOptimizer.h b/paddle/parameter/FirstOrderOptimizer.h
index caa78acd98e..73e09aee236 100644
--- a/paddle/parameter/FirstOrderOptimizer.h
+++ b/paddle/parameter/FirstOrderOptimizer.h
@@ -37,6 +37,15 @@ public:
     real torch_learningRate = optConfig_.learning_method() == "torch_momentum"
                                   ? 1.0 - paraConfig.momentum()
                                   : 1.0;
+#ifdef PADDLE_USE_MKLDNN
+    vecs[PARAMETER_VALUE]->sgdUpdateWithOMP(
+        *vecs[PARAMETER_GRADIENT],
+        *vecs[PARAMETER_MOMENTUM],
+        learningRate_ * paraConfig.learning_rate() *
+            (firstTime_ ? 1.0 : torch_learningRate),
+        paraConfig.momentum(),
+        applyDecay_ ? paraConfig.decay_rate() : 0);
+#else
     vecs[PARAMETER_VALUE]->sgdUpdate(
         *vecs[PARAMETER_GRADIENT],
         *vecs[PARAMETER_MOMENTUM],
@@ -44,6 +53,7 @@
         (firstTime_ ? 1.0 : torch_learningRate),
         paraConfig.momentum(),
         applyDecay_ ? paraConfig.decay_rate() : 0);
+#endif
   }
   virtual void finishBatch() { firstTime_ = false; }
 };
diff --git a/paddle/parameter/ParameterUpdateFunctions.cpp b/paddle/parameter/ParameterUpdateFunctions.cpp
index c8af7105c78..8b3be062b65 100644
--- a/paddle/parameter/ParameterUpdateFunctions.cpp
+++ b/paddle/parameter/ParameterUpdateFunctions.cpp
@@ -30,6 +30,9 @@ void sgdUpdateCpu(real learningRate,
                   const real* grad,
                   real* momentumVec) {
   decayRate *= learningRate;
+#ifdef PADDLE_USE_MKLDNN
+#pragma omp parallel for
+#endif
   for (size_t i = 0; i < size; ++i) {
     momentumVec[i] = momentum * momentumVec[i] - learningRate * grad[i] -
                      decayRate * value[i];
-- 
GitLab