提交 191948c9 编写于 作者: T tensor-tang

enable jitcode

上级 45bfa70c
......@@ -7,7 +7,7 @@
set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place)
cc_library(jit_kernel_base SRCS kernels.cc DEPS ${JIT_KERNEL_DEPS})
cc_library(jit_kernel_base SRCS kernels.cc jitcode_base.cc DEPS ${JIT_KERNEL_DEPS})
add_subdirectory(refer)
add_subdirectory(more)
......
cc_library(jit_kernel_jitcode SRCS jitcode.cc DEPS jit_kernel_base xbyak)
file(GLOB jitcode_cc_srcs RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "*.cc")
cc_library(jit_kernel_jitcode SRCS ${jitcode_cc_srcs} DEPS jit_kernel_base xbyak)
set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} xbyak jit_kernel_jitcode PARENT_SCOPE)
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jitkernels/jitcode/blas.h"
#include "paddle/fluid/operators/jitkernels/registry.h"
namespace paddle {
namespace operators {
namespace jitkernels {
namespace jitcode {
void VXXJitCode::genCode() {
// do not need push stack, and do not need save avx512reg if do not use avx512
int offset = 0;
if (with_relu_) {
vxorps(ymm_zero, ymm_zero, ymm_zero);
}
if (scalar_index_ == 1) {
vbroadcastss(ymm_src1, ptr[param1]);
} else if (scalar_index_ == 2) {
vbroadcastss(ymm_src2, ptr[param2]);
}
for (int i = 0; i < num_ / YMM_FLOAT_BLOCK; ++i) {
if (scalar_index_ != 1) {
vmovups(ymm_src1, ptr[param1 + offset]);
}
if (scalar_index_ != 2) {
vmovups(ymm_src2, ptr[param2 + offset]);
}
if (type_ == operand_type::mul) {
vmulps(ymm_dst, ymm_src1, ymm_src2);
} else if (type_ == operand_type::add) {
vaddps(ymm_dst, ymm_src1, ymm_src2);
}
if (with_relu_) {
vmaxps(ymm_dst, ymm_zero, ymm_dst);
}
vmovups(ptr[param3 + offset], ymm_dst);
offset += sizeof(float) * YMM_FLOAT_BLOCK;
}
int rest = num_ % YMM_FLOAT_BLOCK;
while (rest > 0) {
int block = XMM_FLOAT_BLOCK;
if (rest >= 4) {
block = 4;
if (scalar_index_ != 1) {
vmovups(xmm_src1, ptr[param1 + offset]);
}
if (scalar_index_ != 2) {
vmovups(xmm_src2, ptr[param2 + offset]);
}
} else if (rest >= 2) {
block = 2;
if (scalar_index_ != 1) {
vmovq(xmm_src1, ptr[param1 + offset]);
}
if (scalar_index_ != 2) {
vmovq(xmm_src2, ptr[param2 + offset]);
}
} else {
block = 1;
if (scalar_index_ != 1) {
vmovss(xmm_src1, ptr[param1 + offset]);
}
if (scalar_index_ != 2) {
vmovss(xmm_src2, ptr[param2 + offset]);
}
}
switch (type_) {
case operand_type::mul:
vmulps(xmm_dst, xmm_src1, xmm_src2);
break;
case operand_type::add:
vaddps(xmm_dst, xmm_src1, xmm_src2);
break;
default:
break;
}
if (with_relu_) {
vmaxps(xmm_dst, xmm_zero, xmm_dst);
}
if (rest >= 4) {
vmovups(ptr[param3 + offset], xmm_dst);
} else if (rest >= 2) {
vmovq(ptr[param3 + offset], xmm_dst);
} else {
vmovss(ptr[param3 + offset], xmm_dst);
}
offset += sizeof(float) * block;
rest -= block;
}
ret();
}
} // namespace jitcode
// Factory specialization for the float vmul kernel: returns a freshly
// generated VMulJitCode for `attr`, or nullptr when UseJitCode reports that
// JIT code should not be used for this attribute.
template <>
std::unique_ptr<JitBase> CreateJitCode<KernelType::vmul, float, int>(int attr) {
  if (!UseJitCode<KernelType::vmul, float, int>(attr)) {
    return nullptr;
  }
  const auto code_size = CodeSize<KernelType::vmul, float, int>(attr);
  return make_unique<jitcode::VMulJitCode>(attr, code_size);
}
} // namespace jitkernels
} // namespace operators
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <string>
#include "paddle/fluid/operators/jitkernels/jitcode/jitcode.h"
namespace paddle {
namespace operators {
namespace jitkernels {
namespace jitcode {
// Code generator for  vec = Operand(vec or scalar, vec or scalar), optionally
// followed by relu.
//
//   d            number of float elements processed by the generated code
//   type         arithmetic operation (genCode handles mul and add)
//   scalar_index 1 or 2 if that source is a single broadcast scalar,
//                any other value if both sources are vectors
//   with_relu    clamp every result to be >= 0
//   code_size / code_ptr are forwarded unchanged to JitCode
class VXXJitCode : public JitCode {
 public:
  // Returns a label of the form
  //   VXXJitCode_<Vec|Scalar>[_Mul|_Add]_<Vec|Scalar>[_Relu]
  // describing this generator's configuration. The string is owned by this
  // object, so the pointer stays valid for the object's lifetime. The previous
  // implementation returned c_str() of a function-local std::string, which
  // dangled as soon as name() returned.
  const char* name() const override { return name_.c_str(); }

  explicit VXXJitCode(int d, operand_type type, int scalar_index,
                      bool with_relu, size_t code_size = 256 * 1024,
                      void* code_ptr = nullptr)
      : JitCode(code_size, code_ptr),
        num_(d),
        type_(type),
        scalar_index_(scalar_index),
        with_relu_(with_relu),
        name_(makeName(type, scalar_index, with_relu)) {}

  // static bool init(int d, int scalar_index = 0);
  void genCode() override;

 private:
  // Builds the label returned by name() from the generator configuration.
  static std::string makeName(operand_type type, int scalar_index,
                              bool with_relu) {
    std::string label = "VXXJitCode";
    label += (scalar_index == 1) ? "_Scalar" : "_Vec";
    if (type == operand_type::mul) {
      label += "_Mul";
    } else if (type == operand_type::add) {
      label += "_Add";
    }
    label += (scalar_index == 2) ? "_Scalar" : "_Vec";
    label += (with_relu ? "_Relu" : "");
    return label;
  }

  int num_;
  operand_type type_;
  int scalar_index_;
  bool with_relu_;
  std::string name_;  // backing storage for name()

  // Pointer arguments of the generated function: src1, src2, dst.
  reg64_t param1{abi_param1};
  reg64_t param2{abi_param2};
  reg64_t param3{abi_param3};

  // The xmm and ymm views use the same register indices, so each xmm_* is the
  // low 128 bits of the matching ymm_*.
  xmm_t xmm_src1 = xmm_t(0);
  xmm_t xmm_src2 = xmm_t(1);
  xmm_t xmm_dst = xmm_t(2);
  xmm_t xmm_zero = xmm_t(3);
  ymm_t ymm_src1 = ymm_t(0);
  ymm_t ymm_src2 = ymm_t(1);
  ymm_t ymm_dst = ymm_t(2);
  ymm_t ymm_zero = ymm_t(3);
};
// Convenience generator for the element-wise product of two float vectors:
// a VXXJitCode fixed to operand_type::mul, with no scalar operand
// (scalar_index = 0) and without relu. `d` is the element count; code_size
// and code_ptr are forwarded to the base class unchanged.
class VMulJitCode : public VXXJitCode {
public:
explicit VMulJitCode(int d, size_t code_size, void* code_ptr = nullptr)
: VXXJitCode(d, operand_type::mul, 0, false, code_size, code_ptr) {}
};
} // namespace jitcode
} // namespace jitkernels
} // namespace operators
} // namespace paddle
......@@ -16,7 +16,7 @@
#include <type_traits>
#include "paddle/fluid/operators/jitkernels/jitcode_base.h"
#include "paddle/fluid/operators/jitkernels/kernels.h"
#include "paddle/fluid/platform/cpu_info.h"
#define XBYAK_USE_MMAP_ALLOCATOR
#include "xbyak/xbyak.h"
......@@ -30,23 +30,102 @@ namespace jitcode {
// Application Binary Interface
constexpr Xbyak::Operand::Code abi_param1(Xbyak::Operand::RDI),
abi_param2(Xbyak::Operand::RSI), abi_param3(Xbyak::Operand::RDX),
abi_param4(Xbyak::Operand::RCX), abi_not_param1(Xbyak::Operand::RCX);
abi_param4(Xbyak::Operand::RCX);
template <typename Attr>
class VMulJitCode : public JitBase, public Xbyak::CodeGenerator {
// General-purpose registers that JitCode::preCode() pushes and
// JitCode::postCode() pops (in reverse order) around generated code.
// NOTE(review): these look like the callee-saved registers of the System V
// x86-64 ABI; confirm before targeting another calling convention.
constexpr Xbyak::Operand::Code g_abi_regs[] = {
Xbyak::Operand::RBX, Xbyak::Operand::RBP, Xbyak::Operand::R12,
Xbyak::Operand::R13, Xbyak::Operand::R14, Xbyak::Operand::R15};
// Number of entries in g_abi_regs.
constexpr int num_g_abi_regs = sizeof(g_abi_regs) / sizeof(g_abi_regs[0]);
using reg64_t = const Xbyak::Reg64;
using reg32_t = const Xbyak::Reg32;
using xmm_t = const Xbyak::Xmm;
using ymm_t = const Xbyak::Ymm;
using zmm_t = const Xbyak::Zmm;
using Label = Xbyak::Label;
// Selects the operation a code generator should emit. The enumerators are
// unscoped, and values are consecutive starting at mul = 0.
// NOTE(review): the VXXJitCode generator in this change only emits code for
// mul and add; presumably the remaining values are reserved for other
// generators, so confirm before relying on them.
typedef enum {
mul = 0,
add,
sub,
relu,
exp,
sigmoid,
tanh,
identity
} operand_type;
#define XMM_FLOAT_BLOCK 4
#define YMM_FLOAT_BLOCK 8
#define ZMM_FLOAT_BLOCK 16
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
// Defines the name() override inside a JitCode subclass; the returned string
// is the stringified macro argument, e.g. DECLARE_JIT_CODE(Foo) yields "Foo".
#define DECLARE_JIT_CODE(codename) \
const char* name() const override { return #codename; }
class JitCode : public JitBase, public Xbyak::CodeGenerator {
public:
VMulJitCode(Attr attr, size_t code_size, void* code_ptr = nullptr)
// Forwards code_size and code_ptr to Xbyak::CodeGenerator, then calls
// genCode().
// NOTE(review): genCode() is declared pure virtual in this class. A virtual
// call made from a base-class constructor does not dispatch to the derived
// override, and the derived members are not yet initialized at this point.
// Confirm this call behaves as intended; it may need to move into the
// most-derived constructor or the factory.
explicit JitCode(size_t code_size, void* code_ptr = nullptr)
: Xbyak::CodeGenerator(code_size, code_ptr) {
this->genCode();
}
virtual const char* name() const = 0;
virtual void genCode() = 0;
size_t getSize() const override { return CodeGenerator::getSize(); }
const unsigned char* getCodeInternal() override {
const Xbyak::uint8* code = CodeGenerator::getCode();
return code;
}
virtual const char* name() const = 0;
virtual void genCode() = 0;
protected:
Xbyak::Reg64 param1{abi_param1};
const int EVEX_max_8b_offt = 0x200;
const Xbyak::Reg64 reg_EVEX_max_8b_offt = rbp;
// Prologue for generated code: pushes every register listed in g_abi_regs, in
// array order. If AVX-512F is reported as usable when the code is generated,
// it also loads 2 * EVEX_max_8b_offt into reg_EVEX_max_8b_offt, the value
// that EVEX_compress_addr() relies on.
virtual void preCode() {
  for (const auto reg_code : g_abi_regs) {
    push(Xbyak::Reg64(reg_code));
  }
  const bool has_avx512f = platform::jit::MayIUse(platform::jit::avx512f);
  if (has_avx512f) {
    mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt);
  }
}
// Epilogue for generated code: pops the registers listed in g_abi_regs from
// last to first (the reverse of preCode()'s push order), then emits ret.
virtual void postCode() {
  for (int idx = num_g_abi_regs - 1; idx >= 0; --idx) {
    pop(Xbyak::Reg64(g_abi_regs[idx]));
  }
  ret();
}
void L(const char* label) { Xbyak::CodeGenerator::L(label); }
void L(const Xbyak::Label& label) { Xbyak::CodeGenerator::L(label); }
// Builds a zword (or zword broadcast, when bcast is true) address for
// base + offt. Technique learned from https://github.com/intel/mkl-dnn.
//
// With U = EVEX_max_8b_offt, offsets are rewritten as follows:
//   offt in [U, 3U)   ->  base + (offt - 2U) + reg_EVEX_max_8b_offt * 1
//   offt in [3U, 5U)  ->  base + (offt - 4U) + reg_EVEX_max_8b_offt * 2
//   anything else     ->  base + offt
// The rewrite keeps the immediate displacement in [-U, U), presumably so it
// fits the EVEX compressed 8-bit displacement encoding (TODO confirm).
// It is only correct if reg_EVEX_max_8b_offt holds 2U at run time, and
// preCode() loads that value only when AVX-512F is available.
Xbyak::Address EVEX_compress_addr(Xbyak::Reg64 base, int offt,
                                  bool bcast = false) {
  const int unit = EVEX_max_8b_offt;
  int multiplier = 0;
  if (offt >= unit && offt < 3 * unit) {
    multiplier = 1;
    offt -= 2 * unit;
  } else if (offt >= 3 * unit && offt < 5 * unit) {
    multiplier = 2;
    offt -= 4 * unit;
  }

  auto addr_expr = Xbyak::RegExp() + base + offt;
  if (multiplier != 0) {
    addr_expr = addr_expr + reg_EVEX_max_8b_offt * multiplier;
  }
  return bcast ? zword_b[addr_expr] : zword[addr_expr];
}
};
} // namespace jitcode
......
......@@ -13,6 +13,9 @@
* limitations under the License. */
#include "paddle/fluid/operators/jitkernels/jitcode_base.h"
#include <fstream>
#include <iostream>
#include <sstream>
DEFINE_bool(dump_jitcode, false, "Whether to dump the jitcode to file");
......@@ -29,7 +32,7 @@ void JitBase::dumpCode(const unsigned char* code) const {
counter++;
std::ofstream fout(filename.str(), std::ios::out);
if (fout.is_open()) {
fout.write(reinterpret_cast<const char*>(code), getSize());
fout.write(reinterpret_cast<const char*>(code), this->getSize());
fout.close();
}
}
......
......@@ -28,7 +28,7 @@ namespace jitkernels {
// TODO(TJ): make these functions as virtual of a class
// Every JitCode should estimate the code size itself
template <KernelType KT, typename Attr>
// Generic fallback for the code-buffer size of a JIT kernel; `attr` is not
// consulted. The result is handed to the code generator as its code_size
// (presumably bytes; confirm against Xbyak::CodeGenerator).
template <KernelType KT, typename T, typename Attr>
size_t CodeSize(Attr attr) {
  constexpr size_t kDefaultCodeSize = 4096;
  return kDefaultCodeSize;
}
......@@ -43,13 +43,11 @@ bool UseJitCode(Attr attr) {
template <typename Attr>
size_t GetKey(Attr attr);
class JitBase {
class JitBase : public Kernel {
public:
JitBase() = default;
virtual ~JitBase() = default;
virtual const char* name() const = 0;
virtual const unsigned char* getCodeInternal() = 0;
virtual size_t getSize() const = 0;
template <typename FUNC>
const FUNC getCode() {
const unsigned char* code = this->getCodeInternal();
......@@ -58,14 +56,17 @@ class JitBase {
}
return reinterpret_cast<const FUNC>(code);
}
DISABLE_COPY_AND_ASSIGN(JitBase);
protected:
void dumpCode(const unsigned char* code);
void dumpCode(const unsigned char* code) const;
};
template <KernelType KT, typename Attr>
std::shared_ptr<const JitBase> CreateJitCode(Attr attr);
// Factory for the JIT code of kernel type KT on element type T. Only the
// declaration lives here; each supported kernel supplies an explicit
// specialization, typically of this shape:
//   if (UseJitCode<KT, T, Attr>(attr)) {
//     return make_unique<XxxJitCode>(attr, CodeSize<KT, T, Attr>(attr));
//   }
//   return nullptr;
template <KernelType KT, typename T, typename Attr>
std::unique_ptr<JitBase> CreateJitCode(Attr attr);
} // namespace jitkernels
} // namespace operators
......
......@@ -31,6 +31,9 @@ namespace jitkernels {
template <KernelType KT>
class JitCodePool {
typedef std::unique_ptr<JitBase> JitBasePtr;
typedef std::unordered_map<size_t, JitBasePtr> JitBaseMap;
public:
JitCodePool() = default;
static JitCodePool& Instance() {
......@@ -38,29 +41,26 @@ class JitCodePool {
return g_jit_codes;
}
std::shared_ptr<const JitBase> Get(size_t key) const {
if (codes_.find(key) == codes_.end()) {
return nullptr;
}
return codes_.at(key);
}
const JitBaseMap& AllKernels() { return codes_; }
bool Has(size_t key) const { return codes_.find(key) != codes_.end(); }
void Insert(size_t key, const std::shared_ptr<const JitBase>& value) {
codes_.insert({key, value});
void Insert(size_t key, JitBasePtr value) {
codes_.emplace(key, std::move(value));
}
private:
std::unordered_map<size_t, std::shared_ptr<const JitBase>> codes_;
JitBaseMap codes_;
DISABLE_COPY_AND_ASSIGN(JitCodePool);
};
// TODO(TJ): std::tuple<T, Func, Attr>
template <typename T, typename Func, typename Attr>
struct KernelAttr {
typedef T data_type;
typedef Func return_type;
typedef Attr attr_type;
};
// template <typename T, typename Func, typename Attr>
// struct KernelAttr {
// typedef T data_type;
// typedef Func return_type;
// typedef Attr attr_type;
// };
typedef std::unique_ptr<const Kernel> KernelPtr;
typedef std::unordered_map<KernelKey, std::vector<KernelPtr>, KernelKey::Hash>
......@@ -123,20 +123,21 @@ inline Func GetRefer() {
// TODO(TJ): make tuple? named KernelAttr
template <KernelType KT, typename T, typename Func, typename Attr,
typename PlaceType = platform::CPUPlace>
Func Get(Attr attr) {
// size_t key = GetKey<Attr>(attr);
// auto jitcode = JitCodePool<KT>().Instance().Get(key);
// if (jitcode) {
// return jitcode->template getCode<Func>();
// }
if (std::is_same<PlaceType, platform::CPUPlace>::value &&
std::is_same<T, float>::value) { // TODO(TJ): float move to create
// auto p = CreateJitCode<KT, Attr>(attr);
// if (p) {
// JitCodePool<KT>().Instance().Insert(key, p);
// return p->template getCode<Func>();
// }
const Func Get(Attr attr) {
size_t key = GetKey<Attr>(attr);
auto& codes = JitCodePool<KT>().Instance();
if (codes.Has(key)) {
return codes.AllKernels().at(key)->template getCode<Func>();
}
if (std::is_same<PlaceType, platform::CPUPlace>::value) { // TODO(TJ): float
// move to create
auto p = CreateJitCode<KT, T, Attr>(attr);
if (p) {
auto f = p->template getCode<Func>();
codes.Insert(key, std::move(p));
return f;
}
}
// pool: (KernelKey(type, place), vector<Kernel>)
......
......@@ -39,7 +39,7 @@ size_t CUDAPinnedMinChunkSize();
//! Get the maximum chunk size for buddy allocator.
size_t CUDAPinnedMaxChunkSize();
namespace jit { // remove this namespace
namespace jit {
typedef enum {
isa_any,
sse42,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册