Commit e9216e82 authored by tensor-tang

add refer vscal, vaddbias and their test and benchmark

Parent a3703888
......@@ -37,10 +37,12 @@ PaddlePaddle/Paddle/paddle/fluid/
## Tests
- Logic tests
All implementations must be compared against the refer code and must satisfy the precision requirement
All implementations must be compared against the refer code and must satisfy the precision requirement, for both float and double data types
- Performance tests
Compare the performance of all implementations against each other and against the final `jit::Get` method; the performance obtained from that method must be the best.
# How to add a new operator
- Add `your_key` to `KernelType`
- Implement the Reference logic. A Reference implementation is mandatory for every jitkernel and must not depend on any third-party library. Also add `USE_JITKERNEL_REFER(your_key)` in `refer/CMakeLists.txt`
- Add `your_key` to `KernelType`.
- Implement the Reference logic. A Reference implementation is mandatory for every jitkernel and must not depend on any third-party library. Also add `USE_JITKERNEL_REFER(your_key)` in `refer/CMakeLists.txt`.
- When necessary, add a new `KernelTuples`, using `XYZNTuples` as a reference; a sketch of these steps is shown below.
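A minimal sketch of these steps, using `vaddbias` (the kernel this commit adds) as the example. The file names and the `DECLARE_REFER_KERNEL`/`REGISTER_REFER_KERNEL` macros are the ones appearing in the hunks below:

```cpp
// 1. kernel_base.h: add the new key to the KernelType enum.
typedef enum { vmul = 0, vadd = 1, vaddrelu, vsub, vscal, vaddbias, vexp } KernelType;

// 2. refer/refer.h: the mandatory reference implementation -- plain C++,
//    no third-party libraries -- plus its kernel declaration.
template <typename T>
void VAddBias(const T* a, const T* x, T* y, int n) {
  for (int i = 0; i < n; ++i) {
    y[i] = a[0] + x[i];
  }
}
DECLARE_REFER_KERNEL(VAddBias, AXYNTuples);

// 3. refer/refer.cc registers the kernel, and refer/CMakeLists.txt enables it:
REGISTER_REFER_KERNEL(vaddbias, VAddBias);
// (in refer/CMakeLists.txt) USE_JITKERNEL_REFER(vaddbias)
```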
......@@ -53,7 +53,7 @@ std::vector<int> TestSizes() {
// return this function avg time
template <typename T, typename KernelTuples>
double BenchTartgetFunc(const typename KernelTuples::func_type tgt,
double BenchXYZNFunc(const typename KernelTuples::func_type tgt,
const std::vector<T>& x, const std::vector<T>& y,
std::vector<T>& z) { // NOLINT
const T* x_data = x.data();
......@@ -83,14 +83,14 @@ void BenchXYZNKernel() {
// refer
auto refer = jit::GetRefer<KT, jit::XYZNTuples<T>>();
if (refer) {
auto res = BenchTartgetFunc<T, jit::XYZNTuples<T>>(refer, x, y, z);
auto res = BenchXYZNFunc<T, jit::XYZNTuples<T>>(refer, x, y, z);
infos.push_back(std::make_pair("Refer", res));
}
// test jitcode
auto jitcode = jit::GetJitCode<KT, jit::XYZNTuples<T>, PlaceType>(d);
if (jitcode) {
auto res = BenchTartgetFunc<T, jit::XYZNTuples<T>>(jitcode, x, y, z);
auto res = BenchXYZNFunc<T, jit::XYZNTuples<T>>(jitcode, x, y, z);
infos.push_back(std::make_pair("JitCode", res));
}
......@@ -105,7 +105,7 @@ void BenchXYZNKernel() {
impl.get());
if (i && i->UseMe(d)) {
auto more = i->GetFunc();
auto res = BenchTartgetFunc<T, jit::XYZNTuples<T>>(more, x, y, z);
auto res = BenchXYZNFunc<T, jit::XYZNTuples<T>>(more, x, y, z);
infos.push_back(std::make_pair("More", res));
}
}
......@@ -116,7 +116,7 @@ void BenchXYZNKernel() {
if (!tgt) {
LOG(ERROR) << "Target can not be empty!";
}
auto res = BenchTartgetFunc<T, jit::XYZNTuples<T>>(tgt, x, y, z);
auto res = BenchXYZNFunc<T, jit::XYZNTuples<T>>(tgt, x, y, z);
infos.push_back(std::make_pair("Target", res));
// print
......@@ -129,6 +129,78 @@ void BenchXYZNKernel() {
}
}
// return this function avg time
template <typename T, typename KernelTuples>
double BenchAXYNFunc(const typename KernelTuples::func_type tgt, const T a,
const std::vector<T>& x,
std::vector<T>& y) { // NOLINT
const T* x_data = x.data();
T* y_data = y.data();
const int d = y.size();
for (int i = 0; i < FLAGS_burning; ++i) {
tgt(&a, x_data, y_data, d);
}
auto start = GetCurrentUS();
for (int i = 0; i < FLAGS_repeat; ++i) {
tgt(&a, x_data, y_data, d);
}
auto end = GetCurrentUS();
return (end - start) / FLAGS_repeat;
}
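The warm-up-then-average pattern above can be read in isolation. A minimal sketch, assuming `std::chrono` in place of the harness's `GetCurrentUS` helper and plain constants in place of the `FLAGS_burning`/`FLAGS_repeat` gflags:

```cpp
#include <chrono>

// Hedged sketch of the timing scheme in BenchAXYNFunc: run the function a few
// times to warm caches, then report the mean wall time over many repeats.
template <typename F>
double AvgTimeUS(F&& f, int burning = 10, int repeat = 100) {
  for (int i = 0; i < burning; ++i) f();  // warm-up runs, not measured
  auto start = std::chrono::high_resolution_clock::now();
  for (int i = 0; i < repeat; ++i) f();
  auto end = std::chrono::high_resolution_clock::now();
  std::chrono::duration<double, std::micro> us = end - start;
  return us.count() / repeat;  // average microseconds per call
}
```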
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void BenchAXYNKernel() {
namespace jit = paddle::operators::jit;
for (int d : TestSizes()) {
std::vector<std::pair<std::string, double>> infos;
const T a = static_cast<T>(3);
std::vector<T> x(d), y(d);
RandomVec<T>(d, x.data());
// test refer
auto refer = jit::GetRefer<KT, jit::AXYNTuples<T>>();
if (refer) {
auto res = BenchAXYNFunc<T, jit::AXYNTuples<T>>(refer, a, x, y);
infos.push_back(std::make_pair("Refer", res));
}
// test jitcode
auto jitcode = jit::GetJitCode<KT, jit::AXYNTuples<T>, PlaceType>(d);
if (jitcode) {
auto res = BenchAXYNFunc<T, jit::AXYNTuples<T>>(jitcode, a, x, y);
infos.push_back(std::make_pair("JitCode", res));
}
// test all impls in more
jit::KernelKey kkey(KT, PlaceType());
auto& pool = jit::KernelPool().Instance().AllKernels();
auto iter = pool.find(kkey);
if (iter != pool.end()) {
auto& impls = iter->second;
for (auto& impl : impls) {
auto i = dynamic_cast<const jit::KernelImpl<jit::AXYNTuples<T>>*>(
impl.get());
if (i && i->UseMe(d)) {
auto more = i->GetFunc();
auto res = BenchAXYNFunc<T, jit::AXYNTuples<T>>(more, a, x, y);
infos.push_back(std::make_pair("More", res));
}
}
}
// Test result from Get function
auto tgt = jit::Get<KT, jit::AXYNTuples<T>, PlaceType>(d);
if (!tgt) {
LOG(ERROR) << "Target can not be empty!";
}
auto res = BenchAXYNFunc<T, jit::AXYNTuples<T>>(tgt, a, x, y);
infos.push_back(std::make_pair("Target", res));
// print
std::ostringstream loginfos;
loginfos << "Kernel Type: " << jit::to_string(KT) << ", size " << d << ": ";
for (auto pair : infos) {
loginfos << pair.first << " takes " << pair.second << " us; ";
}
LOG(INFO) << loginfos.str();
}
}
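The pool lookup in the middle of `BenchAXYNKernel` (and in its XYZN twin) is the generic way to enumerate the "More" implementations. A condensed sketch of just that lookup, reusing only the calls shown above:

```cpp
namespace jit = paddle::operators::jit;

// Hedged sketch: return the first registered "More" implementation for
// (KT, PlaceType) whose UseMe(d) accepts this problem size, else nullptr.
template <jit::KernelType KT, typename T, typename PlaceType>
typename jit::AXYNTuples<T>::func_type FindMoreImpl(int d) {
  jit::KernelKey kkey(KT, PlaceType());
  auto& pool = jit::KernelPool().Instance().AllKernels();
  auto iter = pool.find(kkey);
  if (iter == pool.end()) return nullptr;
  for (auto& impl : iter->second) {
    auto i =
        dynamic_cast<const jit::KernelImpl<jit::AXYNTuples<T>>*>(impl.get());
    if (i && i->UseMe(d)) return i->GetFunc();
  }
  return nullptr;
}
```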
// Benchmark all jit kernels including jitcode, mkl and refer.
// To use this tool, run command: ./benchmark [options...]
// Options:
......@@ -147,4 +219,7 @@ int main(int argc, char* argv[]) {
BenchXYZNKernel<jit::vadd, T, PlaceType>();
BenchXYZNKernel<jit::vaddrelu, T, PlaceType>();
BenchXYZNKernel<jit::vsub, T, PlaceType>();
BenchAXYNKernel<jit::vscal, T, PlaceType>();
BenchAXYNKernel<jit::vaddbias, T, PlaceType>();
}
......@@ -13,6 +13,7 @@
* limitations under the License. */
#include "paddle/fluid/operators/jit/helper.h"
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace operators {
......@@ -32,7 +33,10 @@ const char* to_string(KernelType kt) {
return "vscal";
case vexp:
return "vexp";
case vaddbias:
return "vaddbias";
default:
PADDLE_THROW("Not support type: %d", kt);
return "NOT JITKernel";
}
return nullptr;
......
......@@ -19,7 +19,15 @@ namespace paddle {
namespace operators {
namespace jit {
typedef enum { vmul = 0, vadd = 1, vaddrelu, vsub, vscal, vexp } KernelType;
typedef enum {
vmul = 0,
vadd = 1,
vaddrelu,
vsub,
vscal,
vaddbias,
vexp
} KernelType;
template <typename T>
struct XYZNTuples {
......@@ -28,6 +36,9 @@ struct XYZNTuples {
typedef void (*func_type)(const T*, const T*, T*, int);
};
template <typename T>
struct AXYNTuples : public XYZNTuples<T> {};
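`AXYNTuples` deliberately reuses the `XYZNTuples` signature `void(const T*, const T*, T*, int)`, reinterpreting the arguments as (scalar `a`, input `x`, output `y`, length `n`). A hedged illustration using the refer `VScal` (y = a * x); the include path is an assumption based on this commit's file layout:

```cpp
#include "paddle/fluid/operators/jit/refer/refer.h"

void AXYNExample() {
  float a = 0.5f;
  float x[3] = {2.f, 4.f, 6.f};
  float y[3];
  // Same func_type as the XYZN kernels, but the first pointer is a scalar.
  paddle::operators::jit::refer::VScal<float>(&a, x, y, 3);  // y -> {1, 2, 3}
}
```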
// Just for adding to kernel pool without template
class Kernel {
public:
......
......@@ -8,3 +8,8 @@ endfunction()
# use refer kernel by name
USE_JITKERNEL_REFER(vmul)
USE_JITKERNEL_REFER(vadd)
USE_JITKERNEL_REFER(vaddrelu)
USE_JITKERNEL_REFER(vsub)
USE_JITKERNEL_REFER(vscal)
USE_JITKERNEL_REFER(vaddbias)
......@@ -26,4 +26,7 @@ REGISTER_REFER_KERNEL(vadd, VAdd);
REGISTER_REFER_KERNEL(vaddrelu, VAddRelu);
REGISTER_REFER_KERNEL(vsub, VSub);
REGISTER_REFER_KERNEL(vscal, VScal);
REGISTER_REFER_KERNEL(vaddbias, VAddBias);
#undef REGISTER_REFER_KERNEL
......@@ -59,6 +59,13 @@ void VScal(const T* a, const T* x, T* y, int n) {
}
}
template <typename T>
void VAddBias(const T* a, const T* x, T* y, int n) {
for (int i = 0; i < n; ++i) {
y[i] = a[0] + x[i];
}
}
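A quick worked example of the new refer kernel's semantics; the scalar is passed by pointer, matching how the tests below call it:

```cpp
// VAddBias adds the scalar a[0] to every element: y[i] = a[0] + x[i].
void VAddBiasExample() {
  float a = 3.f;
  float x[4] = {1.f, 2.f, 3.f, 4.f};
  float y[4];
  VAddBias<float>(&a, x, y, 4);  // y becomes {4, 5, 6, 7}
}
```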
#define DECLARE_REFER_KERNEL(name, tuples) \
template <typename T> \
class name##Kernel : public ReferKernel<tuples<T>> { \
......@@ -66,11 +73,16 @@ void VScal(const T* a, const T* x, T* y, int n) {
name##Kernel() { this->func = name<T>; } \
}
// const T* x, const T* y, T* z, int n
DECLARE_REFER_KERNEL(VMul, XYZNTuples);
DECLARE_REFER_KERNEL(VAdd, XYZNTuples);
DECLARE_REFER_KERNEL(VAddRelu, XYZNTuples);
DECLARE_REFER_KERNEL(VSub, XYZNTuples);
// const T* a, const T* x, T* y, int n
DECLARE_REFER_KERNEL(VScal, AXYNTuples);
DECLARE_REFER_KERNEL(VAddBias, AXYNTuples);
#undef DECLARE_REFER_KERNEL
} // namespace refer
......
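For readability, one of those declarations expands to roughly the class below. The `public:` access specifier sits in the portion of the macro elided by the hunk above, so this expansion is a best-effort sketch:

```cpp
// DECLARE_REFER_KERNEL(VAddBias, AXYNTuples) expands approximately to:
template <typename T>
class VAddBiasKernel : public ReferKernel<AXYNTuples<T>> {
 public:
  VAddBiasKernel() { this->func = VAddBias<T>; }
};
```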
......@@ -12,7 +12,6 @@
* See the License for the specific language governing permissions and
* limitations under the License. */
#include <cstring> // for memcpy
#include <random>
#include <string>
#include <vector>
......@@ -59,7 +58,7 @@ std::vector<int> TestSizes() {
}
template <typename T, typename KernelTuples>
void TestTartgetFunc(const typename KernelTuples::func_type tgt,
void TestXYZNFunc(const typename KernelTuples::func_type tgt,
const std::vector<T>& x, const std::vector<T>& y,
const std::vector<T>& zref) {
EXPECT_TRUE(tgt != nullptr);
......@@ -88,9 +87,8 @@ void TestTartgetFunc(const typename KernelTuples::func_type tgt,
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void TestXYZNKernel() {
namespace jit = paddle::operators::jit;
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
for (int d : TestSizes()) {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT)
<< ", size: " << d;
auto ref = jit::GetRefer<KT, jit::XYZNTuples<T>>();
EXPECT_TRUE(ref != nullptr);
......@@ -119,7 +117,7 @@ void TestXYZNKernel() {
auto jitcode = jit::GetJitCode<KT, jit::XYZNTuples<T>, PlaceType>(d);
if (jitcode) {
VLOG(10) << "Test Jitcode Kernel, size: " << d;
TestTartgetFunc<T, jit::XYZNTuples<T>>(jitcode, x, y, zref);
TestXYZNFunc<T, jit::XYZNTuples<T>>(jitcode, x, y, zref);
}
// test all impls in more
......@@ -134,14 +132,14 @@ void TestXYZNKernel() {
if (i && i->UseMe(d)) {
auto more = i->GetFunc();
VLOG(10) << "Test More Kernel, size: " << d;
TestTartgetFunc<T, jit::XYZNTuples<T>>(more, x, y, zref);
TestXYZNFunc<T, jit::XYZNTuples<T>>(more, x, y, zref);
}
}
}
// Test result from Get function
VLOG(10) << "Test Get function, size: " << d;
auto tgt = jit::Get<KT, jit::XYZNTuples<T>, PlaceType>(d);
TestTartgetFunc<T, jit::XYZNTuples<T>>(tgt, x, y, zref);
TestXYZNFunc<T, jit::XYZNTuples<T>>(tgt, x, y, zref);
}
}
......@@ -169,4 +167,89 @@ TEST(JITKernel, vsub) {
TestXYZNKernel<jit::vsub, double, paddle::platform::CPUPlace>();
}
TEST(JITKernel, pool) {}
template <typename T, typename KernelTuples>
void TestAXYNFunc(const typename KernelTuples::func_type tgt, const T a,
const std::vector<T>& x, const std::vector<T>& yref) {
EXPECT_TRUE(tgt != nullptr);
EXPECT_EQ(yref.size(), x.size());
const T* x_data = x.data();
const T* yref_data = yref.data();
const int d = yref.size();
std::vector<T> ytgt(d);
T* ytgt_data = ytgt.data();
// test normal
tgt(&a, x_data, ytgt_data, d);
ExpectEQ<T>(ytgt_data, yref_data, d);
// test inplace x
std::copy(x.begin(), x.end(), ytgt.begin());
tgt(&a, ytgt_data, ytgt_data, d);
ExpectEQ<T>(ytgt_data, yref_data, d);
}
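`TestAXYNFunc` checks each kernel twice: once out-of-place against the reference output, and once with the input and output buffers aliased, which catches kernels that read `x` after writing `y`. A condensed sketch of that aliasing property, assuming the refer `VAddBias` above:

```cpp
void InplaceCheckSketch() {
  const float a = 3.f;
  const int n = 3;
  float x[n] = {1.f, 2.f, 3.f};
  float out[n];
  VAddBias<float>(&a, x, out, n);  // out-of-place reference result
  VAddBias<float>(&a, x, x, n);    // in-place: y aliases x
  // x must now equal out element-wise ({4, 5, 6}).
}
```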
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType>
void TestAXYNKernel() {
namespace jit = paddle::operators::jit;
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
for (int d : TestSizes()) {
auto ref = jit::GetRefer<KT, jit::AXYNTuples<T>>();
EXPECT_TRUE(ref != nullptr);
const T a = static_cast<T>(3);
std::vector<T> x(d), yref(d);
std::vector<T> xinp(d); // inplace test
RandomVec<T>(d, x.data());
std::copy(x.begin(), x.end(), xinp.begin());
const T* x_data = x.data();
T* yref_data = yref.data();
T* xinp_data = xinp.data();
// test refer code inplace
ref(&a, x_data, yref_data, d);
ref(&a, xinp_data, xinp_data, d);
ExpectEQ<T>(xinp_data, yref_data, d);
// test jitcode
auto jitcode = jit::GetJitCode<KT, jit::AXYNTuples<T>, PlaceType>(d);
if (jitcode) {
VLOG(10) << "Test Jitcode Kernel, size: " << d;
TestAXYNFunc<T, jit::AXYNTuples<T>>(jitcode, a, x, yref);
}
// test all impls in more
jit::KernelKey kkey(KT, PlaceType());
auto& pool = jit::KernelPool().Instance().AllKernels();
auto iter = pool.find(kkey);
if (iter != pool.end()) {
auto& impls = iter->second;
for (auto& impl : impls) {
auto i = dynamic_cast<const jit::KernelImpl<jit::AXYNTuples<T>>*>(
impl.get());
if (i && i->UseMe(d)) {
auto more = i->GetFunc();
VLOG(10) << "Test More Kernel, size: " << d;
TestAXYNFunc<T, jit::AXYNTuples<T>>(more, a, x, yref);
}
}
}
// Test result from Get function
VLOG(10) << "Test Get function, size: " << d;
auto tgt = jit::Get<KT, jit::AXYNTuples<T>, PlaceType>(d);
TestAXYNFunc<T, jit::AXYNTuples<T>>(tgt, a, x, yref);
}
}
TEST(JITKernel, vscal) {
namespace jit = paddle::operators::jit;
TestAXYNKernel<jit::vscal, float, paddle::platform::CPUPlace>();
TestAXYNKernel<jit::vscal, double, paddle::platform::CPUPlace>();
}
TEST(JITKernel, vaddbias) {
namespace jit = paddle::operators::jit;
TestAXYNKernel<jit::vaddbias, float, paddle::platform::CPUPlace>();
TestAXYNKernel<jit::vaddbias, double, paddle::platform::CPUPlace>();
}
TEST(JITKernel, pool) {
// TODO(TJ): add some test
}
......@@ -24,13 +24,6 @@ namespace math {
namespace jitkernel {
namespace refer {
template <typename T>
void VAddBias(const T* a, const T* x, T* y, int n) {
for (int i = 0; i < n; ++i) {
y[i] = a[0] + x[i];
}
}
template <typename T>
void VRelu(const T* x, T* y, int n) {
for (int i = 0; i < n; ++i) {
......