add benchmark and mkl sgd implement

test=develop

add benchmark and mkl sgd implement
test=develop
8e041337 · tensor-tang · 07efdb51 · 8e041337 · 8e041337 · 8e041337
4 changed file
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
@@ -332,6 +332,45 @@ void BenchEmbSeqPoolKernel() {
  }
 }

+template <jit::KernelType KT, typename T, typename PlaceType>
+void BenchSgdKernel() {
+  const T lr = 0.1;
+  auto UnDuplicatedRandomVec = [](int n, const int64_t lower,
+                                  const int64_t upper) -> std::vector<int64_t> {
+    PADDLE_ENFORCE_LE(static_cast<size_t>(upper - lower), n - 1);
+    PADDLE_ENFORCE_GT(n, 0);
+    std::vector<int64_t> all, out;
+    for (int i = 0; i < n; ++i) {
+      all.push_back(i);
+    }
+    std::random_shuffle(all.begin(), all.end());
+    out.insert(out.begin(), all.begin(), all.begin() + n);
+    return out;
+  };
+  for (int param_h : {1, 1000}) {
+    for (int grad_w : {1, 2, 8, 16, 30, 256}) {
+      // only benchmark inplace
+      Tensor param;
+      param.Resize({param_h, grad_w});
+      T* param_data = param.mutable_data<T>(PlaceType());
+      RandomVec<T>(param_h * grad_w, param_data, -2.f, 2.f);
+      for (int rows_size = 1; rows_size <= std::min(param_h, 10); ++rows_size) {
+        Tensor grad;
+        grad.Resize({rows_size, grad_w});
+        std::vector<int64_t> rows =
+            UnDuplicatedRandomVec(rows_size, 0, rows_size - 1);
+        RandomVec<T>(rows_size * grad_w, grad.mutable_data<T>(PlaceType()),
+                     -2.f, 2.f);
+        const T* grad_data = grad.data<T>();
+        const int64_t* rows_data = rows.data();
+        jit::sgd_attr_t attr(param_h, grad_w, rows_size, grad_w, rows_size);
+        BenchAllImpls<KT, jit::SgdTuples<T>, PlaceType>(
+            attr, &lr, param_data, grad_data, rows_data, param_data, &attr);
+      }
+    }
+  }
+}
+
 template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchMatMulKernel() {
  for (int m : {1, 2, 3, 4}) {
@@ -477,6 +516,9 @@ BENCH_FP32_CPU(kEmbSeqPool) {
  BenchEmbSeqPoolKernel<jit::kEmbSeqPool, T, CPUPlace>();
 }

+// sgd function
+BENCH_FP32_CPU(kSgd) { BenchSgdKernel<jit::kSgd, T, CPUPlace>(); }
+
 // matmul
 BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, CPUPlace>(); }


--- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
@@ -14,3 +14,4 @@ USE_JITKERNEL_MORE(kVTanh, mkl)
 USE_JITKERNEL_MORE(kSeqPool, mkl)
 USE_JITKERNEL_MORE(kSoftmax, mkl)
 USE_JITKERNEL_MORE(kEmbSeqPool, mkl)
+USE_JITKERNEL_MORE(kSgd, mkl)
--- a/paddle/fluid/operators/jit/more/mkl/mkl.cc
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc
@@ -184,6 +184,16 @@ bool EmbSeqPoolKernel<double>::UseMe(const emb_seq_pool_attr_t& attr) const {
  return true;
 }

+template <>
+bool SgdKernel<float>::UseMe(const sgd_attr_t& attr) const {
+  return true;
+}
+
+template <>
+bool SgdKernel<double>::UseMe(const sgd_attr_t& attr) const {
+  return true;
+}
+
 template <>
 bool MatMulKernel<float>::UseMe(const matmul_attr_t& attr) const {
  return platform::MayIUse(platform::avx);
@@ -239,5 +249,6 @@ REGISTER_MKL_KERNEL(kVTanh, VTanh);
 REGISTER_MKL_KERNEL(kSeqPool, SeqPool);
 REGISTER_MKL_KERNEL(kEmbSeqPool, EmbSeqPool);
 REGISTER_MKL_KERNEL(kSoftmax, Softmax);
+REGISTER_MKL_KERNEL(kSgd, Sgd);

 #undef REGISTER_MKL_KERNEL
--- a/paddle/fluid/operators/jit/more/mkl/mkl.h
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.h
@@ -142,6 +142,32 @@ void Softmax(const T* x, T* y, int n, int bs) {
  }
 }

+template <typename T>
+void Sgd(const T* lr, const T* param, const T* grad, const int64_t* rows,
+         T* out, const sgd_attr_t* attr) {
+  PADDLE_ENFORCE_EQ(attr->param_width, attr->grad_width);
+  PADDLE_ENFORCE_LE(attr->selected_rows_size, attr->grad_height);
+  T scalar = -lr[0];
+  int width = attr->grad_width;
+  if (out == param) {
+    for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
+      auto h_idx = rows[i];
+      PADDLE_ENFORCE_LT(h_idx, attr->param_height);
+      PADDLE_ENFORCE_GE(h_idx, 0);
+      VAXPY(scalar, grad + i * width, out + h_idx * width, width);
+    }
+  } else {
+    for (int64_t i = 0; i < attr->selected_rows_size; ++i) {
+      auto h_idx = rows[i];
+      PADDLE_ENFORCE_LT(h_idx, attr->param_height);
+      PADDLE_ENFORCE_GE(h_idx, 0);
+      VScal(&scalar, grad + i * width, out + h_idx * width, width);
+      VAdd(param + h_idx * width, out + h_idx * width, out + h_idx * width,
+           width);
+    }
+  }
+}
+
 #define DECLARE_MKL_KERNEL(name, tuples)                             \
  template <typename T>                                              \
  class name##Kernel : public KernelMore<tuples<T>> {                \
@@ -173,6 +199,8 @@ DECLARE_MKL_KERNEL(EmbSeqPool, EmbSeqPoolTuples);

 DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples);

+DECLARE_MKL_KERNEL(Sgd, SgdTuples);
+
 #undef DECLARE_MKL_KERNEL

 }  // namespace mkl