Commit 641b3ccc authored by: T tensor-tang

add vbroadcast mkl code and jitcode

test=develop
Parent: 41a12708
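In brief: VBroadcast tiles a length-w vector x into h consecutive rows of an h*w output y. This commit wires that operation through the layers touched below: the benchmark, an AVX JIT code generator (gen/vbroadcast.{h,cc}), and an MKL-backed wrapper, each registered under kVBroadcast. A minimal scalar sketch of the operation (illustrative standalone form, not Paddle code):

```cpp
#include <cstdint>

// What every kVBroadcast implementation computes:
// y[i*w .. i*w+w) = x[0..w) for each row i in [0, h).
template <typename T>
void VBroadcastRef(const T* x, T* y, int64_t h, int64_t w) {
  for (int64_t i = 0; i < h; ++i) {
    for (int64_t j = 0; j < w; ++j) {
      y[i * w + j] = x[j];
    }
  }
}
```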
@@ -476,18 +476,17 @@ void BenchCRFDecodingKernel() {
 template <jit::KernelType KT, typename T, typename PlaceType>
 void BenchVBroadcastKernel() {
-  for (int w : TestSizes()) {
+  for (int64_t w : {1, 16, 64, 100, 256}) {
     Tensor x;
     x.Resize({w});
     RandomVec<T>(w, x.mutable_data<T>(PlaceType()));
     const T* x_data = x.data<T>();
-    for (int64_t h : {1, 3, 6}) {
+    for (int h : TestSizes()) {
       Tensor y;
       y.Resize({h * w});
       T* y_data = y.mutable_data<T>(PlaceType());
       BenchAllImpls<KT, jit::VBroadcastTuples<T>, PlaceType>(
-          static_cast<int64_t>(w), x_data, y_data, h, static_cast<int64_t>(w));
+          w, x_data, y_data, static_cast<int64_t>(h), w);
     }
   }
 }
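Note the swap: the attribute that keys kernel lookup (and JIT code generation) is w, so the new loop pins w to a few representative widths and sweeps the broadcast height h over TestSizes() instead, which likely keeps the benchmark from generating a separate JIT code blob per size. Outside the harness, fetching and calling the kernel looked roughly like this at this revision (hedged sketch; jit::Get was the lookup helper of this era and its exact spelling may since have changed):

```cpp
// Hedged sketch: look up the best kVBroadcast implementation for width w.
// Per VBroadcastTuples<T>, the function signature is
//   void(const T* x, T* y, int64_t h, int64_t w).
auto f = paddle::operators::jit::Get<
    paddle::operators::jit::kVBroadcast,
    paddle::operators::jit::VBroadcastTuples<float>,
    paddle::platform::CPUPlace>(w);
f(x_data, y_data, h, w);
```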
@@ -33,3 +33,4 @@ USE_JITKERNEL_GEN(kHMax)
 USE_JITKERNEL_GEN(kHSum)
 USE_JITKERNEL_GEN(kEmbSeqPool)
 USE_JITKERNEL_GEN(kSgd)
+USE_JITKERNEL_GEN(kVBroadcast)
New file: paddle/fluid/operators/jit/gen/vbroadcast.cc

/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#include "paddle/fluid/operators/jit/gen/vbroadcast.h"
#include <memory>
#include <vector>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"

namespace paddle {
namespace operators {
namespace jit {
namespace gen {

void VBroadcastJitCode::genCode() {
  preCode();
  constexpr int block = YMM_FLOAT_BLOCK;
  constexpr int max_num_regs = 16;
  // Split the w_ floats into 8-float YMM blocks, processed in groups of
  // at most 16 blocks (one group fills all 16 YMM registers).
  const int num_block = w_ / block;
  const int num_groups = num_block / max_num_regs;
  const size_t block_size = sizeof(float) * block;
  std::vector<int> groups(num_groups, max_num_regs);
  int rest_num_regs = num_block % max_num_regs;
  if (rest_num_regs > 0) {
    groups.push_back(rest_num_regs);
  }

  // protect param_h: copy it into a scratch register before the loops
  const size_t width_in_byte = sizeof(float) * w_;
  mov(reg_height, param_h);
  int acc_num_regs = 0;
  for (int num_regs : groups) {
    // Load this group's blocks of x once...
    mov(reg_ptr_src_i, param_src);
    add(reg_ptr_src_i, acc_num_regs * block_size);
    size_t w_offset = 0;
    for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
      vmovups(ymm_t(reg_i), ptr[reg_ptr_src_i + w_offset]);
      w_offset += block_size;
    }

    // ...then store them into each of the h rows of y.
    Label l_next_h;
    xor_(reg_h_i, reg_h_i);
    mov(reg_ptr_dst_i, param_dst);
    add(reg_ptr_dst_i, acc_num_regs * block_size);
    L(l_next_h);
    {
      w_offset = 0;
      for (int reg_i = 0; reg_i < num_regs; ++reg_i) {
        vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i));
        w_offset += block_size;
      }
      add(reg_ptr_dst_i, width_in_byte);
      inc(reg_h_i);
      cmp(reg_h_i, reg_height);
      jl(l_next_h, T_NEAR);
    }  // end of l_next_h
    acc_num_regs += num_regs;
  }  // end of groups
  postCode();
}

class VBroadcastCreator : public JitCodeCreator<int64_t> {
 public:
  bool UseMe(const int64_t& w) const override {
    // JIT code requires AVX and a width that is a whole number of YMM blocks.
    return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0;
  }
  size_t CodeSize(const int64_t& w) const override {
    return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8;
  }
  std::unique_ptr<GenBase> CreateJitCode(const int64_t& w) const override {
    PADDLE_ENFORCE_GT(w, 0);
    return make_unique<VBroadcastJitCode>(w, CodeSize(w));
  }
};

}  // namespace gen
}  // namespace jit
}  // namespace operators
}  // namespace paddle

namespace gen = paddle::operators::jit::gen;

REGISTER_JITKERNEL_GEN(kVBroadcast, gen::VBroadcastCreator);
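To make the generated loop structure concrete, here is the same grouping arithmetic in plain C++ (MakeGroups is a hypothetical helper mirroring the lines above; the constants are the ones used in genCode()). Each group of blocks is loaded from x once and then stored h times:

```cpp
#include <cstdio>
#include <vector>

// Mirror of genCode()'s grouping arithmetic: w floats are split into
// 8-float YMM blocks, and blocks are processed in batches of at most
// 16 (the number of YMM registers available).
std::vector<int> MakeGroups(int w) {
  constexpr int block = 8;          // YMM_FLOAT_BLOCK
  constexpr int max_num_regs = 16;
  const int num_block = w / block;  // UseMe() guarantees w % 8 == 0
  std::vector<int> groups(num_block / max_num_regs, max_num_regs);
  if (int rest = num_block % max_num_regs) groups.push_back(rest);
  return groups;
}

int main() {
  for (int g : MakeGroups(264)) printf("%d ", g);  // 264/8 = 33 -> 16 16 1
  printf("\n");
  return 0;
}
```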
New file: paddle/fluid/operators/jit/gen/vbroadcast.h

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 * http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License. */

#pragma once

#include <string>
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"

namespace paddle {
namespace operators {
namespace jit {
namespace gen {

class VBroadcastJitCode : public JitCode {
 public:
  explicit VBroadcastJitCode(const int64_t& w, size_t code_size = 256 * 1024,
                             void* code_ptr = nullptr)
      : JitCode(code_size, code_ptr), w_(w) {
    this->genCode();
  }

  DECLARE_JIT_CODE(VBroadcastJitCode);
  void genCode() override;

 private:
  int w_;
  reg64_t param_src{abi_param1};
  reg64_t param_dst{abi_param2};
  reg64_t param_h{abi_param3};
  reg64_t param_w{abi_param4};

  reg64_t reg_height{r9};
  reg64_t reg_h_i{r10};
  reg64_t reg_ptr_src_i{r11};
  reg64_t reg_ptr_dst_i{r12};
};

}  // namespace gen
}  // namespace jit
}  // namespace operators
}  // namespace paddle
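The abi_param aliases tie the generated code to a plain C-style calling convention. A hedged sketch of the mapping (VBroadcastFunc is a hypothetical alias, assuming the SysV x86-64 ABI that Paddle's jitcode.h uses on Linux):

```cpp
#include <cstdint>

// The argument registers line up with the abi_param aliases above:
//   param_src (abi_param1) -> x,  param_dst (abi_param2) -> y,
//   param_h   (abi_param3) -> h,  param_w   (abi_param4) -> w.
// Note genCode() never reads param_w: the width is baked into the
// generated code via w_, so only h matters at run time.
using VBroadcastFunc = void (*)(const float* x, float* y, int64_t h,
                                int64_t w);
```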
@@ -16,3 +16,4 @@ USE_JITKERNEL_MORE(kSeqPool, mkl)
 USE_JITKERNEL_MORE(kSoftmax, mkl)
 USE_JITKERNEL_MORE(kEmbSeqPool, mkl)
 USE_JITKERNEL_MORE(kSgd, mkl)
+USE_JITKERNEL_MORE(kVBroadcast, mkl)
@@ -159,6 +159,16 @@ bool VCopyKernel<float>::UseMe(const int& d) const {
   return d > 15;
 }

+template <>
+bool VBroadcastKernel<float>::UseMe(const int64_t& d) const {
+  return d > 127;
+}
+
+template <>
+bool VBroadcastKernel<double>::UseMe(const int64_t& attr) const {
+  return true;
+}
+
 template <>
 bool VSigmoidKernel<float>::UseMe(const int& d) const {
   return d > 7;

@@ -251,6 +261,7 @@ REGISTER_MKL_KERNEL(kVScal, VScal);
 REGISTER_MKL_KERNEL(kVExp, VExp);
 REGISTER_MKL_KERNEL(kVSquare, VSquare);
 REGISTER_MKL_KERNEL(kVCopy, VCopy);
+REGISTER_MKL_KERNEL(kVBroadcast, VBroadcast);
 REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid);
 REGISTER_MKL_KERNEL(kVTanh, VTanh);
 REGISTER_MKL_KERNEL(kSeqPool, SeqPool);
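The two UseMe specializations are size heuristics: for float, the MKL path only pays off for widths above 127, while for double it is always acceptable. Assuming the jit framework's usual preference order (generated code first, then vendor kernels whose UseMe() accepts the attribute, then the reference loop), the effective float dispatch looks roughly like this illustrative sketch, not Paddle's actual dispatcher:

```cpp
#include <cstdint>

enum class Impl { kJitCode, kMklFloat, kRefer };

// Illustrative only: which float implementation would be picked for a
// given width w on a machine with or without AVX.
Impl PickVBroadcastImplForFloat(int64_t w, bool has_avx) {
  if (has_avx && w % 8 == 0) return Impl::kJitCode;  // gen::VBroadcastCreator
  if (w > 127) return Impl::kMklFloat;               // mkl UseMe<float>
  return Impl::kRefer;
}
```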
@@ -50,6 +50,13 @@ void VCopy(const T* x, T* y, int n);
 template <typename T>
 void VAXPY(T a, const T* x, T* y, int n);

+template <typename T>
+void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) {
+  for (int64_t h = 0; h < y_h; ++h) {
+    VCopy(x, y + h * x_len, x_len);
+  }
+}
+
 template <typename T>
 void VSigmoid(const T* x, T* y, int n) {
   const T min = SIGMOID_THRESHOLD_MIN;

@@ -202,6 +209,8 @@ DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples);
 DECLARE_MKL_KERNEL(Sgd, SgdTuples);

+DECLARE_MKL_KERNEL(VBroadcast, VBroadcastTuples);
+
 #undef DECLARE_MKL_KERNEL

 }  // namespace mkl
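The MKL wrapper above simply issues one VCopy (MKL's cblas copy) per output row. A quick usage sketch, assuming the header path paddle/fluid/operators/jit/more/mkl/mkl.h and its more::mkl namespace:

```cpp
#include <vector>
#include "paddle/fluid/operators/jit/more/mkl/mkl.h"

int main() {
  namespace mkl = paddle::operators::jit::more::mkl;
  std::vector<float> x = {1.f, 2.f, 3.f, 4.f};
  std::vector<float> y(3 * 4);
  // Tile the 4-element row x into 3 consecutive rows of y.
  mkl::VBroadcast(x.data(), y.data(), /*y_h=*/3, /*x_len=*/4);
  // y == {1,2,3,4, 1,2,3,4, 1,2,3,4}
  return 0;
}
```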