From 191948c933a15c41315228ffbdc70eb59fdb8f55 Mon Sep 17 00:00:00 2001
From: tensor-tang
Date: Wed, 5 Dec 2018 07:15:08 +0000
Subject: [PATCH] enable jitcode

---
 .../fluid/operators/jitkernels/CMakeLists.txt |   2 +-
 .../jitkernels/jitcode/CMakeLists.txt         |   4 +-
 .../operators/jitkernels/jitcode/blas.cc      | 118 ++++++++++++++++++
 .../fluid/operators/jitkernels/jitcode/blas.h |  88 +++++++++++++
 .../operators/jitkernels/jitcode/jitcode.h    |  95 ++++++++++++--
 .../operators/jitkernels/jitcode_base.cc      |   5 +-
 .../fluid/operators/jitkernels/jitcode_base.h |  19 +--
 paddle/fluid/operators/jitkernels/kernels.h   |  59 ++++-----
 paddle/fluid/platform/cpu_info.h              |   2 +-
 9 files changed, 342 insertions(+), 50 deletions(-)
 create mode 100644 paddle/fluid/operators/jitkernels/jitcode/blas.cc
 create mode 100644 paddle/fluid/operators/jitkernels/jitcode/blas.h

diff --git a/paddle/fluid/operators/jitkernels/CMakeLists.txt b/paddle/fluid/operators/jitkernels/CMakeLists.txt
index 6392d82e1..e82e6c302 100644
--- a/paddle/fluid/operators/jitkernels/CMakeLists.txt
+++ b/paddle/fluid/operators/jitkernels/CMakeLists.txt
@@ -7,7 +7,7 @@ set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place)
 
-cc_library(jit_kernel_base SRCS kernels.cc DEPS ${JIT_KERNEL_DEPS})
+cc_library(jit_kernel_base SRCS kernels.cc jitcode_base.cc DEPS ${JIT_KERNEL_DEPS})
 
 add_subdirectory(refer)
 add_subdirectory(more)
diff --git a/paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt b/paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt
index 1a5e45730..c678ea33b 100644
--- a/paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt
+++ b/paddle/fluid/operators/jitkernels/jitcode/CMakeLists.txt
@@ -1,3 +1,5 @@
-cc_library(jit_kernel_jitcode SRCS jitcode.cc DEPS jit_kernel_base xbyak)
+file(GLOB jitcode_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
+
+cc_library(jit_kernel_jitcode SRCS ${jitcode_cc_srcs} DEPS jit_kernel_base xbyak)
 
 set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE)
diff --git a/paddle/fluid/operators/jitkernels/jitcode/blas.cc b/paddle/fluid/operators/jitkernels/jitcode/blas.cc
new file mode 100644
index 000000000..2691bee0f
--- /dev/null
+++ b/paddle/fluid/operators/jitkernels/jitcode/blas.cc
@@ -0,0 +1,118 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License. */
*/ +#include "paddle/fluid/operators/jitkernels/jitcode/blas.h" +#include "paddle/fluid/operators/jitkernels/registry.h" + +namespace paddle { +namespace operators { +namespace jitkernels { +namespace jitcode { + +void VXXJitCode::genCode() { + // do not need push stack, and do not need save avx512reg if do not use avx512 + int offset = 0; + if (with_relu_) { + vxorps(ymm_zero, ymm_zero, ymm_zero); + } + if (scalar_index_ == 1) { + vbroadcastss(ymm_src1, ptr[param1]); + } else if (scalar_index_ == 2) { + vbroadcastss(ymm_src2, ptr[param2]); + } + for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) { + if (scalar_index_ != 1) { + vmovups(ymm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovups(ymm_src2, ptr[param2 + offset]); + } + if (type_ == operand_type::mul) { + vmulps(ymm_dst, ymm_src1, ymm_src2); + } else if (type_ == operand_type::add) { + vaddps(ymm_dst, ymm_src1, ymm_src2); + } + if (with_relu_) { + vmaxps(ymm_dst, ymm_zero, ymm_dst); + } + vmovups(ptr[param3 + offset], ymm_dst); + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } + int rest = num_ % YMM_FLOAT_BLOCK; + while (rest > 0) { + int block = XMM_FLOAT_BLOCK; + if (rest >= 4) { + block = 4; + if (scalar_index_ != 1) { + vmovups(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovups(xmm_src2, ptr[param2 + offset]); + } + } else if (rest >= 2) { + block = 2; + if (scalar_index_ != 1) { + vmovq(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovq(xmm_src2, ptr[param2 + offset]); + } + } else { + block = 1; + if (scalar_index_ != 1) { + vmovss(xmm_src1, ptr[param1 + offset]); + } + if (scalar_index_ != 2) { + vmovss(xmm_src2, ptr[param2 + offset]); + } + } + switch (type_) { + case operand_type::mul: + vmulps(xmm_dst, xmm_src1, xmm_src2); + break; + case operand_type::add: + vaddps(xmm_dst, xmm_src1, xmm_src2); + break; + default: + break; + } + if (with_relu_) { + vmaxps(xmm_dst, xmm_zero, xmm_dst); + } + if (rest >= 4) { + vmovups(ptr[param3 + offset], xmm_dst); + } else if (rest >= 2) { + vmovq(ptr[param3 + offset], xmm_dst); + } else { + vmovss(ptr[param3 + offset], xmm_dst); + } + offset += sizeof(float) * block; + rest -= block; + } + ret(); +} + +} // namespace jitcode + +template <> +std::unique_ptr CreateJitCode(int attr) { + if (UseJitCode(attr)) { + return make_unique( + attr, CodeSize(attr)); + } + return nullptr; +} + +} // namespace jitkernels +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jitkernels/jitcode/blas.h b/paddle/fluid/operators/jitkernels/jitcode/blas.h new file mode 100644 index 000000000..a1aca9772 --- /dev/null +++ b/paddle/fluid/operators/jitkernels/jitcode/blas.h @@ -0,0 +1,88 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+
+#pragma once
+
+#include 
+#include "paddle/fluid/operators/jitkernels/jitcode/jitcode.h"
+
+namespace paddle {
+namespace operators {
+namespace jitkernels {
+namespace jitcode {
+
+// function: vec = Operand(vec(or scalar), vec(or scalar)) (maybe with relu)
+class VXXJitCode : public JitCode {
+ public:
+  const char* name() const override {
+    std::string base = "VXXJitCode";
+    if (scalar_index_ == 1) {
+      base += "_Scalar";
+    } else {
+      base += "_Vec";
+    }
+    if (type_ == operand_type::mul) {
+      base += "_Mul";
+    } else if (type_ == operand_type::add) {
+      base += "_Add";
+    }
+    if (scalar_index_ == 2) {
+      base += "_Scalar";
+    } else {
+      base += "_Vec";
+    }
+    base += (with_relu_ ? "_Relu" : "");
+    return base.c_str();
+  }
+  explicit VXXJitCode(int d, operand_type type, int scalar_index,
+                      bool with_relu, size_t code_size = 256 * 1024,
+                      void* code_ptr = nullptr)
+      : JitCode(code_size, code_ptr),
+        num_(d),
+        type_(type),
+        scalar_index_(scalar_index),
+        with_relu_(with_relu) {}
+  // static bool init(int d, int scalar_index = 0);
+  void genCode() override;
+
+ private:
+  int num_;
+  operand_type type_;
+  int scalar_index_;
+  bool with_relu_;
+  reg64_t param1{abi_param1};
+  reg64_t param2{abi_param2};
+  reg64_t param3{abi_param3};
+
+  xmm_t xmm_src1 = xmm_t(0);
+  xmm_t xmm_src2 = xmm_t(1);
+  xmm_t xmm_dst = xmm_t(2);
+  xmm_t xmm_zero = xmm_t(3);
+
+  ymm_t ymm_src1 = ymm_t(0);
+  ymm_t ymm_src2 = ymm_t(1);
+  ymm_t ymm_dst = ymm_t(2);
+  ymm_t ymm_zero = ymm_t(3);
+};
+
+class VMulJitCode : public VXXJitCode {
+ public:
+  explicit VMulJitCode(int d, size_t code_size, void* code_ptr = nullptr)
+      : VXXJitCode(d, operand_type::mul, 0, false, code_size, code_ptr) {}
+};
+
+} // namespace jitcode
+} // namespace jitkernels
+} // namespace operators
+} // namespace paddle
diff --git a/paddle/fluid/operators/jitkernels/jitcode/jitcode.h b/paddle/fluid/operators/jitkernels/jitcode/jitcode.h
index 7e0b6442e..a3582e528 100644
--- a/paddle/fluid/operators/jitkernels/jitcode/jitcode.h
+++ b/paddle/fluid/operators/jitkernels/jitcode/jitcode.h
@@ -16,7 +16,7 @@
 
 #include 
 #include "paddle/fluid/operators/jitkernels/jitcode_base.h"
-#include "paddle/fluid/operators/jitkernels/kernels.h"
+#include "paddle/fluid/platform/cpu_info.h"
 
 #define XBYAK_USE_MMAP_ALLOCATOR
 #include "xbyak/xbyak.h"
@@ -30,23 +30,102 @@ namespace jitcode {
 // Application Binary Interface
 constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI),
     abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX),
-    abi_param4(Xbyak::Operand::RCX), abi_not_param1(Xbyak::Operand::RCX);
+    abi_param4(Xbyak::Operand::RCX);
 
-template 
-class VMulJitCode : public JitBase, public Xbyak::CodeGenerator {
+constexpr Xbyak::Operand::Code g_abi_regs[] = {
+    Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12,
+    Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15};
+
+constexpr int num_g_abi_regs = sizeof(g_abi_regs) / sizeof(g_abi_regs[0]);
+
+using reg64_t = const Xbyak::Reg64;
+using reg32_t = const Xbyak::Reg32;
+using xmm_t = const Xbyak::Xmm;
+using ymm_t = const Xbyak::Ymm;
+using zmm_t = const Xbyak::Zmm;
+using Label = Xbyak::Label;
+
+typedef enum {
+  mul = 0,
+  add,
+  sub,
+  relu,
+  exp,
+  sigmoid,
+  tanh,
+  identity
+} operand_type;
+
+#define XMM_FLOAT_BLOCK 4
+#define YMM_FLOAT_BLOCK 8
+#define ZMM_FLOAT_BLOCK 16
+
+#define SIGMOID_THRESHOLD_MIN -40.0
+#define SIGMOID_THRESHOLD_MAX 13.0
+#define EXP_MAX_INPUT 40.0
+
+#define DECLARE_JIT_CODE(codename) \
+  const char* name() const override { return #codename; }
+
+class JitCode : public JitBase, public Xbyak::CodeGenerator {
  public:
-  VMulJitCode(Attr attr, size_t code_size, void* code_ptr = nullptr)
+  explicit JitCode(size_t code_size, void* code_ptr = nullptr)
       : Xbyak::CodeGenerator(code_size, code_ptr) {
     this->genCode();
   }
 
-  virtual const char* name() const = 0;
-  virtual void genCode() = 0;
-
+  size_t getSize() const override { return CodeGenerator::getSize(); }
   const unsigned char* getCodeInternal() override {
     const Xbyak::uint8* code = CodeGenerator::getCode();
     return code;
   }
+
+  virtual const char* name() const = 0;
+  virtual void genCode() = 0;
+
+ protected:
+  Xbyak::Reg64 param1{abi_param1};
+  const int EVEX_max_8b_offt = 0x200;
+  const Xbyak::Reg64 reg_EVEX_max_8b_offt = rbp;
+
+  virtual void preCode() {
+    for (int i = 0; i < num_g_abi_regs; ++i) {
+      push(Xbyak::Reg64(g_abi_regs[i]));
+    }
+    if (platform::jit::MayIUse(platform::jit::avx512f)) {
+      mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt);
+    }
+  }
+  virtual void postCode() {
+    for (int i = 0; i < num_g_abi_regs; ++i) {
+      pop(Xbyak::Reg64(g_abi_regs[num_g_abi_regs - 1 - i]));
+    }
+    ret();
+  }
+  void L(const char* label) { Xbyak::CodeGenerator::L(label); }
+  void L(const Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); }
+  // Enhanced vector extension
+  Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, int offt,
+                                    bool bcast = false) {
+    int scale = 0;
+    // Learn from https://github.com/intel/mkl-dnn
+    if (EVEX_max_8b_offt <= offt && offt < 3 * EVEX_max_8b_offt) {
+      offt = offt - 2 * EVEX_max_8b_offt;
+      scale = 1;
+    } else if (3 * EVEX_max_8b_offt <= offt && offt < 5 * EVEX_max_8b_offt) {
+      offt = offt - 4 * EVEX_max_8b_offt;
+      scale = 2;
+    }
+    auto re = Xbyak::RegExp() + base + offt;
+    if (scale) {
+      re = re + reg_EVEX_max_8b_offt * scale;
+    }
+    if (bcast) {
+      return zword_b[re];
+    } else {
+      return zword[re];
+    }
+  }
 };
 
 } // namespace jitcode
diff --git a/paddle/fluid/operators/jitkernels/jitcode_base.cc b/paddle/fluid/operators/jitkernels/jitcode_base.cc
index 417c4d4b9..1da2af51f 100644
--- a/paddle/fluid/operators/jitkernels/jitcode_base.cc
+++ b/paddle/fluid/operators/jitkernels/jitcode_base.cc
@@ -13,6 +13,9 @@
  * limitations under the License. */
*/ #include "paddle/fluid/operators/jitkernels/jitcode_base.h" +#include +#include +#include DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file"); @@ -29,7 +32,7 @@ void JitBase::dumpCode(const unsigned char* code) const { counter++; std::ofstream fout(filename.str(), std::ios::out); if (fout.is_open()) { - fout.write(reinterpret_cast(code), getSize()); + fout.write(reinterpret_cast(code), this->getSize()); fout.close(); } } diff --git a/paddle/fluid/operators/jitkernels/jitcode_base.h b/paddle/fluid/operators/jitkernels/jitcode_base.h index a16474656..ffec62163 100644 --- a/paddle/fluid/operators/jitkernels/jitcode_base.h +++ b/paddle/fluid/operators/jitkernels/jitcode_base.h @@ -28,7 +28,7 @@ namespace jitkernels { // TODO(TJ): make these functions as virtual of a class // Every JitCode should estimate the code size itself -template +template size_t CodeSize(Attr attr) { return 4096; } @@ -43,13 +43,11 @@ bool UseJitCode(Attr attr) { template size_t GetKey(Attr attr); -class JitBase { +class JitBase : public Kernel { public: - JitBase() = default; - virtual ~JitBase() = default; virtual const char* name() const = 0; virtual const unsigned char* getCodeInternal() = 0; - + virtual size_t getSize() const = 0; template const FUNC getCode() { const unsigned char* code = this->getCodeInternal(); @@ -58,14 +56,17 @@ class JitBase { } return reinterpret_cast(code); } - DISABLE_COPY_AND_ASSIGN(JitBase); protected: - void dumpCode(const unsigned char* code); + void dumpCode(const unsigned char* code) const; }; -template -std::shared_ptr CreateJitCode(Attr attr); +template +std::unique_ptr CreateJitCode(Attr attr); //{ +// if (UseJitCode) { +// return make_unique(attr, CodeSize()); +// } +// } } // namespace jitkernels } // namespace operators diff --git a/paddle/fluid/operators/jitkernels/kernels.h b/paddle/fluid/operators/jitkernels/kernels.h index 866f72cce..f398093df 100644 --- a/paddle/fluid/operators/jitkernels/kernels.h +++ b/paddle/fluid/operators/jitkernels/kernels.h @@ -31,6 +31,9 @@ namespace jitkernels { template class JitCodePool { + typedef std::unique_ptr JitBasePtr; + typedef std::unordered_map JitBaseMap; + public: JitCodePool() = default; static JitCodePool& Instance() { @@ -38,29 +41,26 @@ class JitCodePool { return g_jit_codes; } - std::shared_ptr Get(size_t key) const { - if (codes_.find(key) == codes_.end()) { - return nullptr; - } - return codes_.at(key); - } + const JitBaseMap& AllKernels() { return codes_; } + + bool Has(size_t key) const { return codes_.find(key) != codes_.end(); } - void Insert(size_t key, const std::shared_ptr& value) { - codes_.insert({key, value}); + void Insert(size_t key, JitBasePtr value) { + codes_.emplace(key, std::move(value)); } private: - std::unordered_map> codes_; + JitBaseMap codes_; DISABLE_COPY_AND_ASSIGN(JitCodePool); }; // TODO(TJ): std::tuple -template -struct KernelAttr { - typedef T data_type; - typedef Func return_type; - typedef Attr attr_type; -}; +// template +// struct KernelAttr { +// typedef T data_type; +// typedef Func return_type; +// typedef Attr attr_type; +// }; typedef std::unique_ptr KernelPtr; typedef std::unordered_map, KernelKey::Hash> @@ -123,20 +123,21 @@ inline Func GetRefer() { // TODO(TJ): make tuple? 
 // TODO(TJ): make tuple? named KernelAttr
 template 
-Func Get(Attr attr) {
-  // size_t key = GetKey(attr);
-  // auto jitcode = JitCodePool().Instance().Get(key);
-  // if (jitcode) {
-  //   return jitcode->template getCode();
-  // }
-
-  if (std::is_same::value &&
-      std::is_same::value) {  // TODO(TJ): float move to create
-    // auto p = CreateJitCode(attr);
-    // if (p) {
-    //   JitCodePool().Instance().Insert(key, p);
-    //   return p->template getCode();
-    // }
+const Func Get(Attr attr) {
+  size_t key = GetKey(attr);
+  auto& codes = JitCodePool().Instance();
+  if (codes.Has(key)) {
+    return codes.AllKernels().at(key)->template getCode();
+  }
+
+  if (std::is_same::value) {  // TODO(TJ): float
+    // move to create
+    auto p = CreateJitCode(attr);
+    if (p) {
+      auto f = p->template getCode();
+      codes.Insert(key, std::move(p));
+      return f;
+    }
   }
 
   // pool: (KernelKey(type, place), vector)
diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h
index 663a9fbf4..fd31ef77b 100644
--- a/paddle/fluid/platform/cpu_info.h
+++ b/paddle/fluid/platform/cpu_info.h
@@ -39,7 +39,7 @@ size_t CUDAPinnedMinChunkSize();
 //! Get the maximum chunk size for buddy allocator.
 size_t CUDAPinnedMaxChunkSize();
 
-namespace jit { // remove this namespace
+namespace jit {
 typedef enum {
   isa_any,
   sse42,
--
GitLab
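
Note on the new dispatch path: the reworked Get() in kernels.h now looks up a per-kernel-type cache of generated code first, tries to JIT-generate on a miss, and otherwise falls back to a non-JIT kernel. The self-contained C++ sketch below summarizes that order under assumed names — VMulFunc, GeneratedKernel, GenerateVMul and ReferVMul do not appear in this patch, and the real Get() additionally consults the (KernelKey(type, place) -> kernels) pool before falling back to the refer kernel.

// Simplified model of the lookup order implemented in kernels.h
// (all identifiers here are illustrative stand-ins).
#include <cstddef>
#include <memory>
#include <unordered_map>

using VMulFunc = void (*)(const float* x, const float* y, float* z, int n);

// Stand-in for a JitBase-like owner of generated code.
struct GeneratedKernel {
  VMulFunc fn;
};

// Hypothetical generator: returns nullptr when JIT code cannot be built
// (unsupported ISA, unsuitable size), mirroring CreateJitCode().
std::unique_ptr<GeneratedKernel> GenerateVMul(int d) {
  // A real implementation would emit machine code with Xbyak here.
  return nullptr;
}

// Always-available reference implementation, like the "refer" kernels.
void ReferVMul(const float* x, const float* y, float* z, int n) {
  for (int i = 0; i < n; ++i) z[i] = x[i] * y[i];
}

VMulFunc GetVMul(int d) {
  // Analogue of JitCodePool: generated code cached by the attribute key.
  static std::unordered_map<size_t, std::unique_ptr<GeneratedKernel>> cache;
  const size_t key = static_cast<size_t>(d);  // GetKey(attr) analogue
  auto it = cache.find(key);
  if (it != cache.end()) return it->second->fn;  // cache hit
  if (auto p = GenerateVMul(d)) {                // try to JIT on a miss
    VMulFunc f = p->fn;
    cache.emplace(key, std::move(p));
    return f;
  }
  return ReferVMul;  // fall back to the reference kernel
}

Unlike this sketch, the pool in the patch is a shared singleton per kernel type and stores JitBase pointers, so the same generated code serves every caller requesting that kernel and attribute.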
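The blocking scheme that VXXJitCode::genCode() emits — whole 8-float YMM blocks first, then a 4/2/1-float tail handled through XMM moves, with an optional ReLU on the result — is easier to check against a scalar model. The function below is only a behavioral sketch of the generated kernel (the name VMulAddModel is made up for illustration), not code from the patch.

// Scalar model of the element-wise kernel produced by VXXJitCode::genCode().
#include <algorithm>

constexpr int kYmmFloatBlock = 8;  // YMM_FLOAT_BLOCK: floats per YMM register

void VMulAddModel(const float* x, const float* y, float* z, int n,
                  bool is_mul, bool with_relu) {
  int i = 0;
  // Main loop: the JIT processes one full YMM register (8 floats) per step.
  for (; i + kYmmFloatBlock <= n; i += kYmmFloatBlock) {
    for (int j = 0; j < kYmmFloatBlock; ++j) {
      const float v = is_mul ? x[i + j] * y[i + j] : x[i + j] + y[i + j];
      z[i + j] = with_relu ? std::max(v, 0.0f) : v;
    }
  }
  // Tail: the JIT finishes with XMM loads/stores of 4, then 2, then 1 float.
  for (; i < n; ++i) {
    const float v = is_mul ? x[i] * y[i] : x[i] + y[i];
    z[i] = with_relu ? std::max(v, 0.0f) : v;
  }
}

When scalar_index_ is 1 or 2, the generated code broadcasts that operand once with vbroadcastss instead of loading a new block each iteration, which is how the same generator covers vector-vector and vector-scalar variants.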