Commit 41a12708

Author: tensor-tang
Authored: Mar 01, 2019
Parent: 867e93b2
Project: 机器未来 / Paddle (forked from PaddlePaddle / Paddle)

add vbroadcast jitkernel refer code and use it

test=develop

Showing 9 changed files with 103 additions and 13 deletions (+103 −13):
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h   +10  −13
paddle/fluid/operators/jit/benchmark.cc                       +23  −0
paddle/fluid/operators/jit/helper.cc                          +1   −0
paddle/fluid/operators/jit/kernel_base.h                      +8   −0
paddle/fluid/operators/jit/kernel_key.cc                      +5   −0
paddle/fluid/operators/jit/refer/CMakeLists.txt               +1   −0
paddle/fluid/operators/jit/refer/refer.cc                     +2   −0
paddle/fluid/operators/jit/refer/refer.h                      +11  −0
paddle/fluid/operators/jit/test.cc                            +42  −0
paddle/fluid/operators/fused/fused_embedding_seq_pool_op.h

@@ -22,7 +22,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/selected_rows.h"
 #include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/operators/math/blas.h"
 
 namespace paddle {
 namespace operators {
@@ -47,7 +46,7 @@ struct EmbeddingVSumFunctor {
     auto *output = output_t->mutable_data<T>(context.GetPlace());
 
     PADDLE_ENFORCE_LE(table_width * idx_width, out_width);
-    PADDLE_ENFORCE_GT(ids_lod.size(), 1UL);
+    PADDLE_ENFORCE_GT(ids_lod.size(), 1UL, "The LoD[0] could NOT be empty");
 
     jit::emb_seq_pool_attr_t attr(table_height, table_width, 0, idx_width,
                                   out_width, jit::SeqPoolType::kSum);
@@ -83,11 +82,11 @@ class FusedEmbeddingSeqPoolKernel : public framework::OpKernel<T> {
         FusedEmbeddingSeqPoolLastDim(table_var->dims(), ids_t->dims());
     const auto &ids_lod = ids_t->lod();
     // in run time, the LoD of ids must be 1
-    PADDLE_ENFORCE(ids_lod.size(), 1u, "The LoD level of Input(Ids) must be 1");
-    PADDLE_ENFORCE_GE(ids_lod[0].size(), 1u, "The LoD could NOT be empty");
+    PADDLE_ENFORCE(ids_lod.size(), 1UL,
+                   "The LoD level of Input(Ids) must be 1");
     int64_t batch_size = ids_lod[0].size() - 1;
     // in run time, the shape from Ids -> output
-    // should be [seq_length, 1] -> [batch_size, embedding_size]
+    // should be [seq_length, 1] -> [batch_size, last_dim]
     output_t->Resize({batch_size, last_dim});
 
     if (combiner_type == "sum") {
@@ -125,7 +124,7 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
       auto *ids_data = ids->data<int64_t>();
       int64_t ids_num = ids->numel();
      auto lod = ids->lod()[0];
-      int64_t row_width = d_output->dims()[1];
+      int64_t out_width = d_output->dims()[1];
 
       framework::Vector<int64_t> *new_rows = d_table->mutable_rows();
       new_rows->resize(ids_num);
@@ -136,15 +135,13 @@ class FusedEmbeddingSeqPoolGradKernel : public framework::OpKernel<T> {
 
       T *d_table_data = d_table_value->mutable_data<T>(context.GetPlace());
       const T *d_output_data = d_output->data<T>();
-      auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
+      auto vbroadcast = jit::Get<jit::kVBroadcast, jit::VBroadcastTuples<T>,
+                                 platform::CPUPlace>(out_width);
      for (int i = 0; i < static_cast<int>(lod.size()) - 1; ++i) {
        int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
-        int64_t in_offset = lod[i] * row_width;
-        const T *out_pos = d_output_data + i * row_width;
-        T *in_pos = d_table_data + in_offset;
-        for (int r = 0; r != h; ++r) {
-          blas.VCOPY(row_width, out_pos, in_pos + r * row_width);
-        }
+        const T *src = d_output_data + i * out_width;
+        T *dst = d_table_data + lod[i] * out_width;
+        vbroadcast(src, dst, h, out_width);
      }
    } else {
      LOG(ERROR) << "Dense is not supported in fused_embedding_seq_pool_op now";
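For illustration, a minimal standalone sketch of the rewritten gradient path: a hand-written reference broadcast stands in for the jit::Get-dispatched kernel, and the LoD, width, and values are made up. Each LoD segment's single output-gradient row is copied into all of that segment's table-gradient rows, which is exactly what the removed per-row VCOPY loop did.

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Reference broadcast with the same call shape as the new jit kernel:
// copy x (length w) into each of h consecutive rows of y.
void vbroadcast(const float* x, float* y, int64_t h, int64_t w) {
  for (int64_t i = 0; i < h; ++i) std::memcpy(y + i * w, x, w * sizeof(float));
}

int main() {
  // Illustrative LoD with two segments of lengths 2 and 3, and out_width = 4.
  std::vector<size_t> lod = {0, 2, 5};
  const int64_t out_width = 4;
  std::vector<float> d_output = {1, 1, 1, 1,    // gradient row for segment 0
                                 2, 2, 2, 2};   // gradient row for segment 1
  std::vector<float> d_table(lod.back() * out_width, 0.f);

  // Same loop structure as the rewritten FusedEmbeddingSeqPoolGradKernel:
  // segment i's row is broadcast to its h = lod[i+1] - lod[i] table rows.
  for (size_t i = 0; i + 1 < lod.size(); ++i) {
    int64_t h = static_cast<int64_t>(lod[i + 1] - lod[i]);
    const float* src = d_output.data() + i * out_width;
    float* dst = d_table.data() + lod[i] * out_width;
    vbroadcast(src, dst, h, out_width);
  }

  for (size_t r = 0; r < lod.back(); ++r) {
    for (int64_t c = 0; c < out_width; ++c) std::cout << d_table[r * out_width + c] << ' ';
    std::cout << '\n';  // prints two rows of 1s, then three rows of 2s
  }
  return 0;
}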
paddle/fluid/operators/jit/benchmark.cc

@@ -474,6 +474,24 @@ void BenchCRFDecodingKernel() {
   }
 }
 
+template <jit::KernelType KT, typename T, typename PlaceType>
+void BenchVBroadcastKernel() {
+  for (int w : TestSizes()) {
+    Tensor x;
+    x.Resize({w});
+    RandomVec<T>(w, x.mutable_data<T>(PlaceType()));
+    const T* x_data = x.data<T>();
+    for (int64_t h : {1, 3, 6}) {
+      Tensor y;
+      y.Resize({h * w});
+      T* y_data = y.mutable_data<T>(PlaceType());
+      BenchAllImpls<KT, jit::VBroadcastTuples<T>, PlaceType>(
+          static_cast<int64_t>(w), x_data, y_data, h,
+          static_cast<int64_t>(w));
+    }
+  }
+}
+
 using T = float;
 using CPUPlace = paddle::platform::CPUPlace;
 
@@ -536,6 +554,11 @@ BENCH_FP32_CPU(kCRFDecoding) {
   BenchCRFDecodingKernel<jit::kCRFDecoding, T, CPUPlace>();
 }
 
+// vbroadcast function
+BENCH_FP32_CPU(kVBroadcast) {
+  BenchVBroadcastKernel<jit::kVBroadcast, T, CPUPlace>();
+}
+
 // Benchmark all jit kernels including jitcode, mkl and refer.
 // To use this tool, run command: ./benchmark [options...]
 // Options:
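The added benchmark goes through Paddle's BenchAllImpls harness, which handles warm-up and repeats across the registered implementations. As a rough standalone analogue, a sketch like the one below times the same broadcast pattern with std::chrono; the sizes, repeat count, and function name are illustrative, not Paddle APIs.

#include <chrono>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Same semantics as the refer VBroadcast: copy x (length w) into each of h rows of y.
void vbroadcast_ref(const float* x, float* y, int64_t h, int64_t w) {
  for (int64_t i = 0; i < h; ++i) std::memcpy(y + i * w, x, w * sizeof(float));
}

int main() {
  const int64_t w = 2048, h = 6, repeats = 10000;  // illustrative sizes
  std::vector<float> x(w, 1.0f), y(h * w);
  auto t0 = std::chrono::steady_clock::now();
  for (int64_t r = 0; r < repeats; ++r) {
    vbroadcast_ref(x.data(), y.data(), h, w);
  }
  auto t1 = std::chrono::steady_clock::now();
  std::chrono::duration<double, std::milli> ms = t1 - t0;
  std::cout << "avg " << ms.count() / repeats << " ms per call\n";
  return 0;
}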
paddle/fluid/operators/jit/helper.cc

@@ -36,6 +36,7 @@ const char* to_string(KernelType kt) {
     ONE_CASE(kVScal);
     ONE_CASE(kVAddBias);
     ONE_CASE(kVRelu);
+    ONE_CASE(kVBroadcast);
     ONE_CASE(kVCopy);
     ONE_CASE(kVIdentity);
     ONE_CASE(kVExp);
paddle/fluid/operators/jit/kernel_base.h

@@ -41,6 +41,7 @@ typedef enum {
   kVAdd,
   kVAddBias,
   kVAddRelu,
+  kVBroadcast,
   kVCopy,
   kVExp,
   kVIdentity,
@@ -134,6 +135,13 @@ struct GRUTuples {
   typedef void (*func_type)(gru_t*, const gru_attr_t*);
 };
 
+template <typename T>
+struct VBroadcastTuples {
+  typedef T data_type;
+  typedef int64_t attr_type;
+  typedef void (*func_type)(const T*, T*, int64_t, int64_t);
+};
+
 typedef struct seq_pool_attr_s {
   int h, w;  // h should always be the first one
   SeqPoolType type;
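VBroadcastTuples follows the same pattern as the other kernel tuples: data_type names the element type, attr_type (here the row width, an int64_t) keys the kernel lookup, and func_type fixes the call signature every implementation must match. A minimal sketch of consuming such a signature through a plain function pointer; the names here (vbroadcast_func, my_vbroadcast) are illustrative, not Paddle symbols.

#include <cassert>
#include <cstdint>
#include <cstring>
#include <vector>

// Stand-in for the tuple: element type float, attr int64_t (the row width),
// and a function pointer type mirroring VBroadcastTuples<float>::func_type.
using vbroadcast_func = void (*)(const float*, float*, int64_t, int64_t);

void my_vbroadcast(const float* x, float* y, int64_t h, int64_t w) {
  for (int64_t i = 0; i < h; ++i) std::memcpy(y + i * w, x, w * sizeof(float));
}

int main() {
  vbroadcast_func f = my_vbroadcast;  // any implementation with this signature fits the slot
  std::vector<float> x = {1, 2, 3};
  std::vector<float> y(2 * x.size());
  f(x.data(), y.data(), 2, static_cast<int64_t>(x.size()));
  assert(y[3] == 1 && y[5] == 3);     // second row is a copy of x
  return 0;
}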
paddle/fluid/operators/jit/kernel_key.cc

@@ -24,6 +24,11 @@ size_t JitCodeKey<int>(const int& d) {
   return d;
 }
 
+template <>
+size_t JitCodeKey<int64_t>(const int64_t& d) {
+  return d;
+}
+
 // TODO(TJ): refine and benchmark JitCodeKey generatation
 
 constexpr int act_type_shift = 3;  // suppot 2^3 act types
 
 static inline int act_type_convert(KernelType type) {
paddle/fluid/operators/jit/refer/CMakeLists.txt

@@ -35,3 +35,4 @@ USE_JITKERNEL_REFER(kHMax)
 USE_JITKERNEL_REFER(kSoftmax)
 USE_JITKERNEL_REFER(kEmbSeqPool)
 USE_JITKERNEL_REFER(kSgd)
+USE_JITKERNEL_REFER(kVBroadcast)
paddle/fluid/operators/jit/refer/refer.cc

@@ -62,4 +62,6 @@ REGISTER_REFER_KERNEL(kEmbSeqPool, EmbSeqPool);
 REGISTER_REFER_KERNEL(kSgd, Sgd);
 
+REGISTER_REFER_KERNEL(kVBroadcast, VBroadcast);
+
 #undef REGISTER_REFER_KERNEL
paddle/fluid/operators/jit/refer/refer.h

@@ -75,6 +75,15 @@ void VCopy(const T* x, T* y, int n) {
   std::memcpy(y, x, n * sizeof(T));
 }
 
+// x shape: (x_len)
+// y shape: (h, x_len)
+template <typename T>
+void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) {
+  for (int64_t h = 0; h < y_h; ++h) {
+    VCopy(x, y + h * x_len, x_len);
+  }
+}
+
 template <typename T>
 void VRelu(const T* x, T* y, int n) {
   for (int i = 0; i < n; ++i) {
@@ -534,6 +543,8 @@ DECLARE_REFER_KERNEL(EmbSeqPool, EmbSeqPoolTuples);
 DECLARE_REFER_KERNEL(Sgd, SgdTuples);
 
+DECLARE_REFER_KERNEL(VBroadcast, VBroadcastTuples);
+
 #undef DECLARE_REFER_KERNEL
 
 }  // namespace refer
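A worked example of the shape contract in the comment above (x has shape (x_len), y has shape (y_h, x_len)); this standalone sketch inlines memcpy where the refer code calls VCopy, and the values are illustrative.

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Mirrors the reference kernel above: y is treated as a (y_h, x_len) row-major
// matrix and every row receives a copy of x.
template <typename T>
void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) {
  for (int64_t h = 0; h < y_h; ++h) {
    std::memcpy(y + h * x_len, x, x_len * sizeof(T));
  }
}

int main() {
  std::vector<float> x = {1.f, 2.f, 3.f};    // x shape: (3)
  const int64_t y_h = 2;
  std::vector<float> y(y_h * x.size());      // y shape: (2, 3)
  VBroadcast(x.data(), y.data(), y_h, static_cast<int64_t>(x.size()));
  for (int64_t h = 0; h < y_h; ++h) {
    for (size_t i = 0; i < x.size(); ++i) std::cout << y[h * x.size() + i] << ' ';
    std::cout << '\n';                       // prints "1 2 3" twice
  }
  return 0;
}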
paddle/fluid/operators/jit/test.cc

@@ -157,6 +157,26 @@ struct TestFuncWithRefer<jit::XRNTuples<T>, std::vector<T>, T> {
   }
 };
 
+template <typename T>
+struct TestFuncWithRefer<jit::VBroadcastTuples<T>, std::vector<T>,
+                         std::vector<T>, int64_t,
+                         typename jit::VBroadcastTuples<T>::attr_type> {
+  void operator()(const typename jit::VBroadcastTuples<T>::func_type tgt,
+                  const std::vector<T>& x, const std::vector<T>& yref,
+                  int64_t h,
+                  const typename jit::VBroadcastTuples<T>::attr_type& attr) {
+    EXPECT_TRUE(tgt != nullptr);
+    EXPECT_EQ(x.size(), static_cast<size_t>(attr));
+    EXPECT_EQ(yref.size(), x.size() * h);
+    std::vector<T> y(yref.size());
+    const T* x_data = x.data();
+    const T* yref_data = yref.data();
+    T* y_data = y.data();
+    tgt(x_data, y_data, h, attr);
+    ExpectEQ<T>(y_data, yref_data, yref.size());
+  }
+};
+
 template <typename T>
 struct TestFuncWithRefer<jit::XYNTuples<T>, std::vector<T>, std::vector<T>> {
   void operator()(const typename jit::XYNTuples<T>::func_type tgt,
@@ -926,6 +946,27 @@ void TestKernelCRFDecodingTuples() {
   }
 }
 
+template <jit::KernelType KT, typename T, typename PlaceType>
+void TestKernelVBroadcastTuples() {
+  VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
+  for (int w : TestSizes()) {
+    std::vector<T> x(w);
+    RandomVec<T>(w, x.data());
+    const T* x_data = x.data();
+    for (int64_t h : {1, 2, 6}) {
+      auto ref = jit::GetRefer<KT, jit::VBroadcastTuples<T>>();
+      EXPECT_TRUE(ref != nullptr);
+      std::vector<T> y(w * h);
+      T* y_data = y.data();
+      ref(x_data, y_data, h, w);
+
+      TestAllImpls<KT, jit::VBroadcastTuples<T>, PlaceType, std::vector<T>,
+                   std::vector<T>, int64_t>(static_cast<int64_t>(w), x, y, h,
+                                            static_cast<int64_t>(w));
+    }
+  }
+}
+
 #define TEST_CPU_KERNEL(test_tuple, kernel_type)                 \
   TEST(JITKernel, kernel_type) {                                 \
     TestKernel##test_tuple<jit::kernel_type, float, CPUPlace>(); \
@@ -967,6 +1008,7 @@ TEST_CPU_KERNEL(EmbSeqPoolTuples, kEmbSeqPool);
 TEST_CPU_KERNEL(SgdTuples, kSgd);
 TEST_CPU_KERNEL(LayerNormTuples, kLayerNorm);
 TEST_CPU_KERNEL(CRFDecodingTuples, kCRFDecoding);
+TEST_CPU_KERNEL(VBroadcastTuples, kVBroadcast);
 
 TEST(JITKernel_key, lstm) {
   jit::lstm_attr_t attr1(8, jit::kVIdentity, jit::kVSigmoid, jit::kVTanh);