add jit test. develop=test

f45aced5 · dengkaipeng · 51536f7f · f45aced5 · f45aced5 · f45aced5
6 changed file
--- a/paddle/fluid/operators/jit/more/mix/mix.cc
+++ b/paddle/fluid/operators/jit/more/mix/mix.cc
@@ -50,7 +50,7 @@ void VTanh(const T* x, T* y, int n) {
  compute_addbias(&b, y, y, n);
 }
-void Softmax(const T* x, T* y, int n, int bs, int m) {
+void Softmax(const T* x, T* y, int n, int bs, int remain) {
  auto compute_hmax = KernelFuncs<HMaxTuple<T>, CPUPlace>::Cache().At(n);
  auto compute_hsum = KernelFuncs<HSumTuple<T>, CPUPlace>::Cache().At(n);
  auto compute_vscal = KernelFuncs<VScalTuple<T>, CPUPlace>::Cache().At(n);
@@ -66,15 +66,15 @@ void Softmax(const T* x, T* y, int n, int bs, int m) {
    scalar = static_cast<T>(0) - scalar;
    compute_vaddbias(&scalar, x, y, n);  // x - max
    compute_vexp(y, y, n);
-    if (m == 1) {
+    if (remain == 1) {
      compute_hsum(y, &scalar, n);
      scalar = static_cast<T>(1) / scalar;
      compute_vscal(&scalar, y, y, n);
    } else {
-      for (int j = 0; j < m; ++j) {
+      for (int j = 0; j < remain; ++j) {
-        compute_stridesum(&y[j], &scalar, n, m);
+        compute_stridesum(&y[j], &scalar, n, remain);
        scalar = static_cast<T>(1) / scalar;
-        compute_stridescal(&scalar, &y[j], &y[j], n, m);
+        compute_stridescal(&scalar, &y[j], &y[j], n, remain);
      }
    }
    x += n;

--- a/paddle/fluid/operators/jit/more/mix/mix.h
+++ b/paddle/fluid/operators/jit/more/mix/mix.h
@@ -26,7 +26,7 @@ using T = float;
 void VSigmoid(const T* x, T* y, int n);
 void VTanh(const T* x, T* y, int n);
-void Softmax(const T* x, T* y, int n, int bs, int m);
+void Softmax(const T* x, T* y, int n, int bs, int remain);
 void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr);
 void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr);

--- a/paddle/fluid/operators/jit/more/mkl/mkl.cc
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc
@@ -81,7 +81,7 @@ void VScal<double>(const double* a, const double* x, double* y, int n) {
 template <>
 void StrideScal<float>(const float* a, const float* x, float* y, int n, int stride) {
  if (x == y) {
-    platform::dynload::cblas_sscal(n, *a, y, stride);
+    platform::dynload::cblas_sscal(n/stride, *a, y, stride);
  } else {
    refer::StrideScal<float>(a, x, y, n, stride);
  }
@@ -90,7 +90,7 @@ void StrideScal<float>(const float* a, const float* x, float* y, int n, int stri
 template <>
 void StrideScal<double>(const double* a, const double* x, double* y, int n, int stride) {
  if (x == y) {
-    platform::dynload::cblas_dscal(n, *a, y, stride);
+    platform::dynload::cblas_dscal(n/stride, *a, y, stride);
  } else {
    refer::StrideScal<double>(a, x, y, n, stride);
  }
@@ -148,12 +148,12 @@ void ASum<double>(const double* x, double* res, int n) {
 template <>
 void StrideASum<float>(const float* x, float* res, int n, int stride) {
-  res[0] = platform::dynload::cblas_sasum(n, x, stride);
+  res[0] = platform::dynload::cblas_sasum(n/stride, x, stride);
 }
 template <>
 void StrideASum<double>(const double* x, double* res, int n, int stride) {
-  res[0] = platform::dynload::cblas_dasum(n, x, stride);
+  res[0] = platform::dynload::cblas_dasum(n/stride, x, stride);
 }
 // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512

--- a/paddle/fluid/operators/jit/more/mkl/mkl.h
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.h
@@ -135,7 +135,7 @@ template <typename T>
 void StrideScal(const T* a, const T* x, T* y, int n, int stride);
 template <typename T>
-void Softmax(const T* x, T* y, int n, int bs, int m=1) {
+void Softmax(const T* x, T* y, int n, int bs, int remain=1) {
  std::vector<T> entities(bs);
  for (int i = 0; i < bs; ++i) {
    entities[i] = x[i * n];
@@ -149,15 +149,15 @@ void Softmax(const T* x, T* y, int n, int bs, int m=1) {
  VExp(y, y, n * bs);
  for (int i = 0; i < bs; ++i) {
    T sum;
-    if (m == 1) {
+    if (remain == 1) {
      ASum(&y[i * n], &sum, n);
      sum = static_cast<T>(1) / sum;
      VScal(&sum, &y[i * n], &y[i * n], n);
    } else {
-      for (int j = 0; j < m; ++j) {
+      for (int j = 0; j < remain; ++j) {
-        StrideASum(&y[i * n + j], &sum, n/m, m);
+        StrideASum(&y[i * n + j], &sum, n, remain);
        sum = static_cast<T>(1) / sum;
-        StrideScal(&sum, &y[i * n + j], &y[i * n + j], n/m, m);
+        StrideScal(&sum, &y[i * n + j], &y[i * n + j], n, remain);
      }
    }
  }

--- a/paddle/fluid/operators/jit/refer/refer.h
+++ b/paddle/fluid/operators/jit/refer/refer.h
@@ -421,30 +421,34 @@ void StrideASum(const T* x, T* res, int n, int stride) {
 template <typename T>
 void StrideScal(const T* a, const T* x, T* y, int n , int stride) {
-  for (int i = 0; i < n; i+=stride) {
+  for (int i = 0; i < n; ++i) {
-    y[i] = x[i] * a[0];
+    if (i % stride == 0) {
+      y[i] = x[i] * a[0];
+    } else {
+      y[i] = x[i];
+    }
  }
 }
 // y = e^(x - max(x))
 // y = y / sum(y)
 template <typename T>
-void Softmax(const T* x, T* y, int n, int bs = 1, int m = 1) {
+void Softmax(const T* x, T* y, int n, int bs = 1, int remain = 1) {
  for (int i = 0; i < bs; ++i) {
    T scalar;
    HMax(x, &scalar, n);
    scalar = static_cast<T>(0) - scalar;
    VAddBias(&scalar, x, y, n);  // x - max
    VExp(y, y, n);
-    if (m == 1) {
+    if (remain == 1) {
      HSum(y, &scalar, n);
      scalar = static_cast<T>(1) / scalar;
      VScal(&scalar, y, y, n);
    } else {
-      for (int j = 0; j < m; j++) {
+      for (int j = 0; j < remain; j++) {
-        StrideASum(&y[j], &scalar, n, m);
+        StrideASum(&y[j], &scalar, n, remain);
        scalar = static_cast<T>(1) / scalar;
-        StrideScal(&scalar, &y[j], &y[j], n, m);
+        StrideScal(&scalar, &y[j], &y[j], n, remain);
      }
    }
    x += n;

--- a/paddle/fluid/operators/jit/test.cc
+++ b/paddle/fluid/operators/jit/test.cc
@@ -723,11 +723,10 @@ void TestKernelSoftmax() {
  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
  for (int bs : {1, 2, 10}) {
    for (int n : TestSizes()) {
-      for (int m : {1, 2}) {
+      for (int m : {1, 2, 3}) { // remain
        if (m > n || n % m != 0) {
          continue;
        }
-        VLOG(10) << "Softmax: " << bs <<  ", " << n << ", " << m;
        auto ref = jit::GetReferFunc<KernelTuple>();
        EXPECT_TRUE(ref != nullptr);
        std::vector<T> x(bs * n), y(bs * n);
@@ -766,6 +765,86 @@ void TestKernelSoftmax() {
  }
 }
+template <typename KernelTuple, typename PlaceType>
+void TestKernelStrideASum() {
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
+  for (int d : TestSizes()) {
+    for (int m : {1, 2, 3}) { // stride
+      if (m > d || d % m != 0) {
+        continue;
+      }
+      auto ref = jit::GetReferFunc<KernelTuple>();
+      EXPECT_TRUE(ref != nullptr);
+      std::vector<T> x(d);
+      RandomVec<T>(d, x.data());
+      T ref_res;
+      ref(x.data(), &ref_res, d, m);
+      auto verifier = [](const typename KernelTuple::func_type tgt,
+                         const std::vector<T>& x, const T ref_res, 
+                         const int m) {
+        EXPECT_TRUE(tgt != nullptr);
+        T tgt_res;
+        tgt(x.data(), &tgt_res, x.size(), m);
+        ExpectEQ<T>(&tgt_res, &ref_res, 1);
+      };
+      TestAllImpls<KernelTuple, PlaceType>(d, verifier, x, ref_res, m);
+    }
+  }
+}
+template <typename KernelTuple, typename PlaceType>
+void TestKernelStrideScal() {
+  using T = typename KernelTuple::data_type;
+  VLOG(10) << "Test JITKernel: " << jit::to_string(KernelTuple::kernel_type);
+  // for (int d : TestSizes()) {
+  //   for (int m : {1, 2, 3}) { // stride
+  for (int d : {4}) {
+    for (int m : {2}) { // stride
+      if (m > d || d % m != 0) {
+        continue;
+      }
+      auto ref = jit::GetReferFunc<KernelTuple>();
+      EXPECT_TRUE(ref != nullptr);
+      const T a = static_cast<T>(3);
+      std::vector<T> x(d), yref(d);
+      std::vector<T> xinp(d);  // inplace test
+      RandomVec<T>(d, x.data());
+      std::copy(x.begin(), x.end(), xinp.begin());
+      const T* x_data = x.data();
+      T* yref_data = yref.data();
+      T* xinp_data = xinp.data();
+      // test refer code inplace
+      ref(&a, x_data, yref_data, d, m);
+      ref(&a, xinp_data, xinp_data, d, m);
+      ExpectEQ<T>(xinp_data, yref_data, d);
+      auto verifier = [](const typename KernelTuple::func_type tgt, const T a,
+                         const std::vector<T>& x, const std::vector<T>& yref,
+                         const int m) {
+        EXPECT_TRUE(tgt != nullptr);
+        EXPECT_EQ(yref.size(), x.size());
+        const T* x_data = x.data();
+        const T* yref_data = yref.data();
+        const int d = yref.size();
+        std::vector<T> ytgt(d);
+        T* ytgt_data = ytgt.data();
+        // test normal
+        tgt(&a, x_data, ytgt_data, d, m);
+        ExpectEQ<T>(ytgt_data, yref_data, d);
+        // test inplace x
+        std::copy(x.begin(), x.end(), ytgt.begin());
+        tgt(&a, ytgt_data, ytgt_data, d, m);
+        ExpectEQ<T>(ytgt_data, yref_data, d);
+      };
+      TestAllImpls<KernelTuple, PlaceType>(d, verifier, a, x, yref, m);
+    }
+  }
+}
 template <typename KernelTuple, typename PlaceType>
 void TestKernelSgd() {
  using T = typename KernelTuple::data_type;
@@ -918,7 +997,7 @@ TEST(JITKernel_pool, more) {
  EXPECT_EQ(kers.size(), 10UL);
 #else
 #ifdef PADDLE_WITH_MKLML
-  EXPECT_EQ(kers.size(), 21UL);
+  EXPECT_EQ(kers.size(), 22UL);
 #else
  EXPECT_EQ(kers.size(), 8UL);
 #endif
@@ -927,7 +1006,7 @@ TEST(JITKernel_pool, more) {
 TEST(JITKernel_pool, refer) {
  const auto& kers = jit::ReferKernelPool::Instance().AllKernels();
-  EXPECT_EQ(kers.size(), 29UL);
+  EXPECT_EQ(kers.size(), 31UL);
 }
 // test helper
@@ -1298,3 +1377,6 @@ TEST_CPU_KERNEL(MatMul);
 TEST_CPU_KERNEL(Softmax);
 TEST_CPU_KERNEL(Sgd);
 TEST_CPU_KERNEL(VBroadcast);
+TEST_CPU_KERNEL(StrideASum);
+TEST_CPU_KERNEL(StrideScal);