From 6010361c7af1f24b84ac906c71cf8a500e706726 Mon Sep 17 00:00:00 2001
From: tensor-tang <tangjian03@baidu.com>
Date: Fri, 1 Mar 2019 06:32:51 +0000
Subject: [PATCH] add vbroadcast mkl code and jitcode

test=develop
---
 paddle/fluid/operators/jit/benchmark.cc       |  7 +-
 paddle/fluid/operators/jit/gen/CMakeLists.txt |  1 +
 paddle/fluid/operators/jit/gen/vbroadcast.cc  | 94 +++++++++++++++++++
 paddle/fluid/operators/jit/gen/vbroadcast.h   | 53 +++++++++++
 .../operators/jit/more/mkl/CMakeLists.txt     |  1 +
 paddle/fluid/operators/jit/more/mkl/mkl.cc    | 11 +++
 paddle/fluid/operators/jit/more/mkl/mkl.h     |  9 ++
 7 files changed, 172 insertions(+), 4 deletions(-)
 create mode 100644 paddle/fluid/operators/jit/gen/vbroadcast.cc
 create mode 100644 paddle/fluid/operators/jit/gen/vbroadcast.h
diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc
index 93ebb1faa7..3088280bb9 100644
--- a/paddle/fluid/operators/jit/benchmark.cc
+++ b/paddle/fluid/operators/jit/benchmark.cc
@@ -476,18 +476,17 @@ void BenchCRFDecodingKernel() {
 
 template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchVBroadcastKernel() {
-  for (int w : TestSizes()) {
+  for (int64_t w : {1, 16, 64, 100, 256}) {  // lengths of x
     Tensor x;
     x.Resize({w});
     RandomVec<T>(w, x.mutable_data<T>(PlaceType()));
     const T* x_data = x.data<T>();
-    for (int64_t h : {1, 3, 6}) {
+    for (int h : TestSizes()) {  // y holds h * w elements
       Tensor y;
       y.Resize({h * w});
       T* y_data = y.mutable_data<T>(PlaceType());
-
       BenchAllImpls<KT, jit::VBroadcastTuples<T>, PlaceType>(
-          static_cast<int64_t>(w), x_data, y_data, h, static_cast<int64_t>(w));
+          w, x_data, y_data, static_cast<int64_t>(h), w);
     }
   }
 }
diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt
index eb0c03568d..99244ea9bd 100644
--- a/paddle/fluid/operators/jit/gen/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt
@@ -33,3 +33,4 @@ USE_JITKERNEL_GEN(kHMax)
 USE_JITKERNEL_GEN(kHSum)
 USE_JITKERNEL_GEN(kEmbSeqPool)
 USE_JITKERNEL_GEN(kSgd)
+USE_JITKERNEL_GEN(kVBroadcast)
diff --git a/paddle/fluid/operators/jit/gen/vbroadcast.cc b/paddle/fluid/operators/jit/gen/vbroadcast.cc
new file mode 100644
index 0000000000..31deb16430
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/vbroadcast.cc
@@ -0,0 +1,94 @@
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#include "paddle/fluid/operators/jit/gen/vbroadcast.h"
+#include <memory>
+#include <vector>
+#include "paddle/fluid/operators/jit/registry.h"
+#include "paddle/fluid/platform/cpu_info.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+void VBroadcastJitCode::genCode() {  // emits the kVBroadcast kernel body
+  preCode();
+  constexpr int block = YMM_FLOAT_BLOCK;  // floats covered by one ymm move
+  constexpr int max_num_regs = 16;  // ymm registers used per group, at most
+  const int num_block = w_ / block;  // a w_ % block tail is not emitted
+  const int num_groups = num_block / max_num_regs;  // full groups
+  const size_t block_size = sizeof(float) * block;  // bytes per block
+  std::vector<int> groups(num_groups, max_num_regs);  // regs per group
+  int rest_num_regs = num_block % max_num_regs;  // trailing partial group
+  if (rest_num_regs > 0) {
+    groups.push_back(rest_num_regs);
+  }
+  // Each group is loaded from src once and then stored to dst row by row.
+  // width_in_byte is the dst row stride; reg_height keeps a copy of param_h.
+  const size_t width_in_byte = sizeof(float) * w_;
+  mov(reg_height, param_h);
+  int acc_num_regs = 0;  // blocks already covered by earlier groups
+  for (int num_regs : groups) {
+    mov(reg_ptr_src_i, param_src);
+    add(reg_ptr_src_i, acc_num_regs * block_size);  // skip earlier groups
+    size_t w_offset = 0;
+    for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
+      vmovups(ymm_t(reg_i), ptr[reg_ptr_src_i + w_offset]);  // load block
+      w_offset += block_size;
+    }
+    // The row loop is do-while shaped: a row is stored before the first cmp.
+    Label l_next_h;
+    xor_(reg_h_i, reg_h_i);  // row counter = 0
+    mov(reg_ptr_dst_i, param_dst);
+    add(reg_ptr_dst_i, acc_num_regs * block_size);  // same offset as src
+    L(l_next_h);
+    {
+      w_offset = 0;
+      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
+        vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i));  // store block
+        w_offset += block_size;
+      }
+      add(reg_ptr_dst_i, width_in_byte);  // next dst row
+      inc(reg_h_i);
+      cmp(reg_h_i, reg_height);
+      jl(l_next_h, T_NEAR);  // NOTE(review): assumes height >= 1; confirm
+    }  // end of l_next_h
+    acc_num_regs += num_regs;
+  }  // end of groups
+  postCode();
+}
+
+class VBroadcastCreator : public JitCodeCreator<int64_t> {  // attr: width w
+ public:
+  bool UseMe(const int64_t& w) const override {  // AVX and whole blocks only
+    return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0;
+  }
+  size_t CodeSize(const int64_t& w) const override {
+    return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8;  // 96 + 128 per block
+  }
+  std::unique_ptr<GenBase> CreateJitCode(const int64_t& w) const override {
+    PADDLE_ENFORCE_GT(w, 0);  // w == 0 would pass UseMe
+    return make_unique<VBroadcastJitCode>(w, CodeSize(w));  // code_size
+  }
+};
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
+
+namespace gen = paddle::operators::jit::gen;
+
+REGISTER_JITKERNEL_GEN(kVBroadcast, gen::VBroadcastCreator);
diff --git a/paddle/fluid/operators/jit/gen/vbroadcast.h b/paddle/fluid/operators/jit/gen/vbroadcast.h
new file mode 100644
index 0000000000..27c75f6f71
--- /dev/null
+++ b/paddle/fluid/operators/jit/gen/vbroadcast.h
@@ -0,0 +1,53 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include "glog/logging.h"
+#include "paddle/fluid/operators/jit/gen/jitcode.h"
+
+namespace paddle {
+namespace operators {
+namespace jit {
+namespace gen {
+
+class VBroadcastJitCode : public JitCode {  // JIT code for kVBroadcast
+ public:
+  explicit VBroadcastJitCode(const int64_t& w, size_t code_size = 256 * 1024,
+                             void* code_ptr = nullptr)
+      : JitCode(code_size, code_ptr), w_(w) {
+    this->genCode();  // the code is emitted during construction
+  }
+
+  DECLARE_JIT_CODE(VBroadcastJitCode);
+  void genCode() override;
+
+ private:
+  int w_;  // width; narrowed from the int64_t constructor argument
+  reg64_t param_src{abi_param1};  // source pointer read by the loads
+  reg64_t param_dst{abi_param2};  // destination pointer written by the stores
+  reg64_t param_h{abi_param3};    // height, copied into reg_height
+  reg64_t param_w{abi_param4};    // not referenced by genCode
+
+  reg64_t reg_height{r9};       // loop bound compared against reg_h_i
+  reg64_t reg_h_i{r10};         // row counter
+  reg64_t reg_ptr_src_i{r11};   // load cursor into src
+  reg64_t reg_ptr_dst_i{r12};   // store cursor into dst
+};
+
+}  // namespace gen
+}  // namespace jit
+}  // namespace operators
+}  // namespace paddle
diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
index d4459449a3..f69417c370 100644
--- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
+++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt
@@ -16,3 +16,4 @@ USE_JITKERNEL_MORE(kSeqPool, mkl)
 USE_JITKERNEL_MORE(kSoftmax, mkl)
 USE_JITKERNEL_MORE(kEmbSeqPool, mkl)
 USE_JITKERNEL_MORE(kSgd, mkl)
+USE_JITKERNEL_MORE(kVBroadcast, mkl)
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc
index 6a90be3ede..4f51353bce 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.cc
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc
@@ -159,6 +159,16 @@ bool VCopyKernel<float>::UseMe(const int& d) const {
   return d > 15;
 }
 
+template <>
+bool VBroadcastKernel<float>::UseMe(const int64_t& d) const {
+  return d > 127;  // d >= 128; NOTE(review): cutoff rationale not shown
+}
+
+template <>
+bool VBroadcastKernel<double>::UseMe(const int64_t& attr) const {
+  return true;  // attr is ignored: always eligible for double
+}
+
 template <>
 bool VSigmoidKernel<float>::UseMe(const int& d) const {
   return d > 7;
@@ -251,6 +261,7 @@ REGISTER_MKL_KERNEL(kVScal, VScal);
 REGISTER_MKL_KERNEL(kVExp, VExp);
 REGISTER_MKL_KERNEL(kVSquare, VSquare);
 REGISTER_MKL_KERNEL(kVCopy, VCopy);
+REGISTER_MKL_KERNEL(kVBroadcast, VBroadcast);
 REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid);
 REGISTER_MKL_KERNEL(kVTanh, VTanh);
 REGISTER_MKL_KERNEL(kSeqPool, SeqPool);
diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h
index a58d300ece..db2d6faed4 100644
--- a/paddle/fluid/operators/jit/more/mkl/mkl.h
+++ b/paddle/fluid/operators/jit/more/mkl/mkl.h
@@ -50,6 +50,13 @@ void VCopy(const T* x, T* y, int n);
 template <typename T>
 void VAXPY(T a, const T* x, T* y, int n);
 
+template <typename T>
+void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) {
+  for (int64_t h = 0; h < y_h; ++h) {  // fill y with y_h copies of x
+    VCopy(x, y + h * x_len, x_len);  // row h; x_len narrows to VCopy's int n
+  }
+}
+
 template <typename T>
 void VSigmoid(const T* x, T* y, int n) {
   const T min = SIGMOID_THRESHOLD_MIN;
@@ -202,6 +209,8 @@ DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples);
 
 DECLARE_MKL_KERNEL(Sgd, SgdTuples);
 
+DECLARE_MKL_KERNEL(VBroadcast, VBroadcastTuples);
+
 #undef DECLARE_MKL_KERNEL
 
 }  // namespace mkl
-- 
GitLab