From 641b3cccce2eab3afc7b81fea6276ee2d5aad1f7 Mon Sep 17 00:00:00 2001 From: tensor-tang Date: Fri, 1 Mar 2019 06:32:51 +0000 Subject: [PATCH] add vbroadcast mkl code and jitcode test=develop --- paddle/fluid/operators/jit/benchmark.cc | 7 +- paddle/fluid/operators/jit/gen/CMakeLists.txt | 1 + paddle/fluid/operators/jit/gen/vbroadcast.cc | 94 +++++++++++++++++++ paddle/fluid/operators/jit/gen/vbroadcast.h | 53 +++++++++++ .../operators/jit/more/mkl/CMakeLists.txt | 1 + paddle/fluid/operators/jit/more/mkl/mkl.cc | 11 +++ paddle/fluid/operators/jit/more/mkl/mkl.h | 9 ++ 7 files changed, 172 insertions(+), 4 deletions(-) create mode 100644 paddle/fluid/operators/jit/gen/vbroadcast.cc create mode 100644 paddle/fluid/operators/jit/gen/vbroadcast.h diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 93ebb1faa..3088280bb 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -476,18 +476,17 @@ void BenchCRFDecodingKernel() { template void BenchVBroadcastKernel() { - for (int w : TestSizes()) { + for (int64_t w : {1, 16, 64, 100, 256}) { Tensor x; x.Resize({w}); RandomVec(w, x.mutable_data(PlaceType())); const T* x_data = x.data(); - for (int64_t h : {1, 3, 6}) { + for (int h : TestSizes()) { Tensor y; y.Resize({h * w}); T* y_data = y.mutable_data(PlaceType()); - BenchAllImpls, PlaceType>( - static_cast(w), x_data, y_data, h, static_cast(w)); + w, x_data, y_data, static_cast(h), w); } } } diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index eb0c03568..99244ea9b 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -33,3 +33,4 @@ USE_JITKERNEL_GEN(kHMax) USE_JITKERNEL_GEN(kHSum) USE_JITKERNEL_GEN(kEmbSeqPool) USE_JITKERNEL_GEN(kSgd) +USE_JITKERNEL_GEN(kVBroadcast) diff --git a/paddle/fluid/operators/jit/gen/vbroadcast.cc b/paddle/fluid/operators/jit/gen/vbroadcast.cc new file mode 100644 index 000000000..31deb1643 --- /dev/null +++ b/paddle/fluid/operators/jit/gen/vbroadcast.cc @@ -0,0 +1,94 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jit/gen/vbroadcast.h" +#include +#include +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +void VBroadcastJitCode::genCode() { + preCode(); + constexpr int block = YMM_FLOAT_BLOCK; + constexpr int max_num_regs = 16; + const int num_block = w_ / block; + const int num_groups = num_block / max_num_regs; + const size_t block_size = sizeof(float) * block; + std::vector groups(num_groups, max_num_regs); + int rest_num_regs = num_block % max_num_regs; + if (rest_num_regs > 0) { + groups.push_back(rest_num_regs); + } + + // protect param_h + const size_t width_in_byte = sizeof(float) * w_; + mov(reg_height, param_h); + int acc_num_regs = 0; + for (int num_regs : groups) { + mov(reg_ptr_src_i, param_src); + add(reg_ptr_src_i, acc_num_regs * block_size); + size_t w_offset = 0; + for (int reg_i = 0; reg_i < num_regs; ++reg_i) { + vmovups(ymm_t(reg_i), ptr[reg_ptr_src_i + w_offset]); + w_offset += block_size; + } + + Label l_next_h; + xor_(reg_h_i, reg_h_i); + mov(reg_ptr_dst_i, param_dst); + add(reg_ptr_dst_i, acc_num_regs * block_size); + L(l_next_h); + { + w_offset = 0; + for (int reg_i = 0; reg_i < num_regs; ++reg_i) { + vmovups(ptr[reg_ptr_dst_i + w_offset], ymm_t(reg_i)); + w_offset += block_size; + } + add(reg_ptr_dst_i, width_in_byte); + inc(reg_h_i); + cmp(reg_h_i, reg_height); + jl(l_next_h, T_NEAR); + } // end of l_next_h + acc_num_regs += num_regs; + } // end of groups + postCode(); +} + +class VBroadcastCreator : public JitCodeCreator { + public: + bool UseMe(const int64_t& w) const override { + return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0; + } + size_t CodeSize(const int64_t& w) const override { + return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8; + } + std::unique_ptr CreateJitCode(const int64_t& w) const override { + PADDLE_ENFORCE_GT(w, 0); + return make_unique(w, CodeSize(w)); + } +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(kVBroadcast, gen::VBroadcastCreator); diff --git a/paddle/fluid/operators/jit/gen/vbroadcast.h b/paddle/fluid/operators/jit/gen/vbroadcast.h new file mode 100644 index 000000000..27c75f6f7 --- /dev/null +++ b/paddle/fluid/operators/jit/gen/vbroadcast.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +class VBroadcastJitCode : public JitCode { + public: + explicit VBroadcastJitCode(const int64_t& w, size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), w_(w) { + this->genCode(); + } + + DECLARE_JIT_CODE(VBroadcastJitCode); + void genCode() override; + + private: + int w_; + reg64_t param_src{abi_param1}; + reg64_t param_dst{abi_param2}; + reg64_t param_h{abi_param3}; + reg64_t param_w{abi_param4}; + + reg64_t reg_height{r9}; + reg64_t reg_h_i{r10}; + reg64_t reg_ptr_src_i{r11}; + reg64_t reg_ptr_dst_i{r12}; +}; + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt index d4459449a..f69417c37 100644 --- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt @@ -16,3 +16,4 @@ USE_JITKERNEL_MORE(kSeqPool, mkl) USE_JITKERNEL_MORE(kSoftmax, mkl) USE_JITKERNEL_MORE(kEmbSeqPool, mkl) USE_JITKERNEL_MORE(kSgd, mkl) +USE_JITKERNEL_MORE(kVBroadcast, mkl) diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index 6a90be3ed..4f51353bc 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -159,6 +159,16 @@ bool VCopyKernel::UseMe(const int& d) const { return d > 15; } +template <> +bool VBroadcastKernel::UseMe(const int64_t& d) const { + return d > 127; +} + +template <> +bool VBroadcastKernel::UseMe(const int64_t& attr) const { + return true; +} + template <> bool VSigmoidKernel::UseMe(const int& d) const { return d > 7; @@ -251,6 +261,7 @@ REGISTER_MKL_KERNEL(kVScal, VScal); REGISTER_MKL_KERNEL(kVExp, VExp); REGISTER_MKL_KERNEL(kVSquare, VSquare); REGISTER_MKL_KERNEL(kVCopy, VCopy); +REGISTER_MKL_KERNEL(kVBroadcast, VBroadcast); REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid); REGISTER_MKL_KERNEL(kVTanh, VTanh); REGISTER_MKL_KERNEL(kSeqPool, SeqPool); diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index a58d300ec..db2d6faed 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -50,6 +50,13 @@ void VCopy(const T* x, T* y, int n); template void VAXPY(T a, const T* x, T* y, int n); +template +void VBroadcast(const T* x, T* y, int64_t y_h, int64_t x_len) { + for (int64_t h = 0; h < y_h; ++h) { + VCopy(x, y + h * x_len, x_len); + } +} + template void VSigmoid(const T* x, T* y, int n) { const T min = SIGMOID_THRESHOLD_MIN; @@ -202,6 +209,8 @@ DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples); DECLARE_MKL_KERNEL(Sgd, SgdTuples); +DECLARE_MKL_KERNEL(VBroadcast, VBroadcastTuples); + #undef DECLARE_MKL_KERNEL } // namespace mkl -- GitLab