diff --git a/paddle/fluid/operators/jit/benchmark.cc b/paddle/fluid/operators/jit/benchmark.cc index 186c37c56ec9410ac9a31503e33e7e334d0afc40..5c5a61f64093802697eb21452267471129c7fcf3 100644 --- a/paddle/fluid/operators/jit/benchmark.cc +++ b/paddle/fluid/operators/jit/benchmark.cc @@ -158,7 +158,7 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { using Tensor = paddle::framework::Tensor; -template +template void BenchXYZNKernel() { for (int d : TestSizes()) { Tensor x, y, z; @@ -175,7 +175,7 @@ void BenchXYZNKernel() { } } -template +template void BenchAXYNKernel() { for (int d : TestSizes()) { const T a = static_cast(3); @@ -187,10 +187,23 @@ void BenchAXYNKernel() { RandomVec(d, x_data); BenchAllImpls, PlaceType>(d, &a, x.data(), y_data, d); + // test inplace + BenchAllImpls, PlaceType>(d, &a, x.data(), x_data, + d); + } +} + +template +void BenchXRNKernel() { + for (int d : TestSizes()) { + Tensor x; + RandomVec(d, x.mutable_data({d}, PlaceType())); + T res; + BenchAllImpls, PlaceType>(d, x.data(), &res, d); } } -template +template void BenchXYNKernel() { for (int d : TestSizes()) { Tensor x, y; @@ -203,7 +216,7 @@ void BenchXYNKernel() { } } -template +template void BenchLSTMKernel() { for (bool use_peephole : {true, false}) { for (int d : TestSizes()) { @@ -240,7 +253,7 @@ void BenchLSTMKernel() { } } -template +template void BenchGRUKernel() { for (int d : TestSizes()) { const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh); @@ -262,7 +275,7 @@ void BenchGRUKernel() { } } -template +template void BenchSeqPoolKernel() { std::vector pool_types = { jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; @@ -284,7 +297,7 @@ void BenchSeqPoolKernel() { } } -template +template void BenchMatMulKernel() { for (int m : {1, 2, 3, 4}) { for (int n : TestSizes()) { @@ -305,57 +318,64 @@ void BenchMatMulKernel() { } } +template +void BenchSoftmaxKernel() { + for (int bs : {1, 2, 10}) { + for (int n : TestSizes()) { + Tensor x, y; + x.Resize({bs, n}); + y.Resize({bs, n}); + RandomVec(bs * n, x.mutable_data(PlaceType()), -2.f, 2.f); + const T* x_data = x.data(); + T* y_data = y.mutable_data(PlaceType()); + BenchAllImpls, PlaceType>(n, x_data, y_data, n, + bs); + } + } +} + using T = float; -using PlaceType = paddle::platform::CPUPlace; +using CPUPlace = paddle::platform::CPUPlace; // xyzn -BENCH_FP32_CPU(kVMul) { BenchXYZNKernel(); } - -BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel(); } - -BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel(); } - -BENCH_FP32_CPU(kVSub) { BenchXYZNKernel(); } +BENCH_FP32_CPU(kVMul) { BenchXYZNKernel(); } +BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel(); } +BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel(); } +BENCH_FP32_CPU(kVSub) { BenchXYZNKernel(); } // axyn -BENCH_FP32_CPU(kVScal) { BenchAXYNKernel(); } +BENCH_FP32_CPU(kVScal) { BenchAXYNKernel(); } +BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel(); } -BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel(); } +// xrn +BENCH_FP32_CPU(kHSum) { BenchXRNKernel(); } +BENCH_FP32_CPU(kHMax) { BenchXRNKernel(); } // xyn -BENCH_FP32_CPU(kVRelu) { BenchXYNKernel(); } - -BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel(); } - -BENCH_FP32_CPU(kVSquare) { BenchXYNKernel(); } - -BENCH_FP32_CPU(kVExp) { BenchXYNKernel(); } - -BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel(); } - -BENCH_FP32_CPU(kVTanh) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVRelu) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVSquare) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVExp) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel(); } +BENCH_FP32_CPU(kVTanh) { BenchXYNKernel(); } // lstm and peephole -BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel(); } - -BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel(); } +BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel(); } +BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel(); } // gru functions -BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel(); } - -BENCH_FP32_CPU(kGRUHtPart1) { - BenchGRUKernel(); -} - -BENCH_FP32_CPU(kGRUHtPart2) { - BenchGRUKernel(); -} +BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel(); } +BENCH_FP32_CPU(kGRUHtPart1) { BenchGRUKernel(); } +BENCH_FP32_CPU(kGRUHtPart2) { BenchGRUKernel(); } // seq pool function -BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel(); } +BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel(); } // matmul -BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel(); } +BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel(); } + +// softmax +BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel(); } // Benchmark all jit kernels including jitcode, mkl and refer. // To use this tool, run command: ./benchmark [options...] diff --git a/paddle/fluid/operators/jit/gen/CMakeLists.txt b/paddle/fluid/operators/jit/gen/CMakeLists.txt index 40310c2d2b372a414054f75348e8e1b4471bf3d2..2ea8f927e1a13867fa2065841fac05e766735237 100644 --- a/paddle/fluid/operators/jit/gen/CMakeLists.txt +++ b/paddle/fluid/operators/jit/gen/CMakeLists.txt @@ -28,3 +28,5 @@ USE_JITKERNEL_GEN(kGRUHtPart1) USE_JITKERNEL_GEN(kGRUHtPart2) USE_JITKERNEL_GEN(kNCHW16CMulNC) USE_JITKERNEL_GEN(kSeqPool) +USE_JITKERNEL_GEN(kHMax) +USE_JITKERNEL_GEN(kHSum) diff --git a/paddle/fluid/operators/jit/gen/act.cc b/paddle/fluid/operators/jit/gen/act.cc index a2a5661b93ad3d885983c502566860aa313d110f..e7a7375879064eb27c94315fe7b93eece7866b92 100644 --- a/paddle/fluid/operators/jit/gen/act.cc +++ b/paddle/fluid/operators/jit/gen/act.cc @@ -81,9 +81,7 @@ void VActJitCode::genCode() { #define DECLARE_ACT_CREATOR(name) \ class name##Creator : public JitCodeCreator { \ public: \ - bool UseMe(const int& attr) const override { \ - return platform::MayIUse(platform::avx); \ - } \ + bool UseMe(const int& attr) const override; \ size_t CodeSize(const int& d) const override; \ std::unique_ptr CreateJitCode(const int& attr) const override { \ return make_unique(attr, CodeSize(attr)); \ @@ -98,6 +96,30 @@ DECLARE_ACT_CREATOR(VSigmoid); DECLARE_ACT_CREATOR(VTanh); // TODO(TJ): tuning use me +bool VReluCreator::UseMe(const int& d) const { + return platform::MayIUse(platform::avx); +} + +bool VSquareCreator::UseMe(const int& d) const { + return platform::MayIUse(platform::avx); +} + +bool VIdentityCreator::UseMe(const int& d) const { + return platform::MayIUse(platform::avx); +} + +bool VExpCreator::UseMe(const int& d) const { + return platform::MayIUse(platform::avx) && d < 32; +} + +bool VSigmoidCreator::UseMe(const int& d) const { + return platform::MayIUse(platform::avx); +} + +bool VTanhCreator::UseMe(const int& d) const { + return platform::MayIUse(platform::avx); +} + size_t VReluCreator::CodeSize(const int& d) const { return 96 /* init size */ + (d / YMM_FLOAT_BLOCK + 3) * 4 /* instructions */ * diff --git a/paddle/fluid/operators/jit/gen/hopv.cc b/paddle/fluid/operators/jit/gen/hopv.cc new file mode 100644 index 0000000000000000000000000000000000000000..e7884017198623d996fe98a55691da6e342d656a --- /dev/null +++ b/paddle/fluid/operators/jit/gen/hopv.cc @@ -0,0 +1,103 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#include "paddle/fluid/operators/jit/gen/hopv.h" +#include "paddle/fluid/operators/jit/registry.h" +#include "paddle/fluid/platform/cpu_info.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +void HOPVJitCode::genCode() { + const int num_blocks = num_ / YMM_FLOAT_BLOCK; + int offset = 0; + + if (num_blocks > 0) { + // load one firstly + vmovups(ymm_tmp, ptr[param_src]); + offset += sizeof(float) * YMM_FLOAT_BLOCK; + for (int i = 1; i < num_blocks; ++i) { + vmovups(ymm_src, ptr[param_src + offset]); + process(ymm_tmp, ymm_src, ymm_tmp); + offset += sizeof(float) * YMM_FLOAT_BLOCK; + } + vextractf128(xmm_dst, ymm_tmp, 1); + process(xmm_dst, xmm_dst, xmm_tmp); + } else { + if (type_ == operand_type::MAX) { + vbroadcastss(ymm_dst, ptr[param_src]); + } else if (type_ == operand_type::ADD) { + vxorps(ymm_dst, ymm_dst, ymm_dst); + } + } + + int rest = num_ % YMM_FLOAT_BLOCK; + if (rest >= 4) { + vmovups(xmm_src, ptr[param_src + offset]); + offset += sizeof(float) * 4; + rest -= 4; + process(xmm_dst, xmm_dst, xmm_src); + } + + vpermilps(xmm_tmp, xmm_dst, 16 + 8 + 3); + process(xmm_dst, xmm_dst, xmm_tmp); + + if (rest >= 2) { + vmovq(xmm_src, ptr[param_src + offset]); + offset += sizeof(float) * 2; + rest -= 2; + process(xmm_dst, xmm_dst, xmm_src); + } + + vpermilps(xmm_tmp, xmm_dst, 1); + process(xmm_dst, xmm_dst, xmm_tmp); + + if (rest >= 1) { + vmovss(xmm_src, ptr[param_src + offset]); + process(xmm_dst, xmm_dst, xmm_src); + } + vmovss(ptr[param_dst], xmm_dst); + ret(); +} + +#define DECLARE_HOP_CREATOR(name) \ + class name##Creator : public JitCodeCreator { \ + public: \ + bool UseMe(const int& attr) const override { \ + return platform::MayIUse(platform::avx); \ + } \ + size_t CodeSize(const int& d) const override { \ + return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \ + } \ + std::unique_ptr CreateJitCode(const int& attr) const override { \ + return make_unique(attr, CodeSize(attr)); \ + } \ + } + +DECLARE_HOP_CREATOR(HMax); +DECLARE_HOP_CREATOR(HSum); + +#undef DECLARE_HOP_CREATOR + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle + +namespace gen = paddle::operators::jit::gen; + +REGISTER_JITKERNEL_GEN(kHMax, gen::HMaxCreator); +REGISTER_JITKERNEL_GEN(kHSum, gen::HSumCreator); diff --git a/paddle/fluid/operators/jit/gen/hopv.h b/paddle/fluid/operators/jit/gen/hopv.h new file mode 100644 index 0000000000000000000000000000000000000000..d3bc94b63d3f962cd655367a2afe1a08582b06fa --- /dev/null +++ b/paddle/fluid/operators/jit/gen/hopv.h @@ -0,0 +1,90 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. */ + +#pragma once + +#include +#include "glog/logging.h" +#include "paddle/fluid/operators/jit/gen/jitcode.h" + +namespace paddle { +namespace operators { +namespace jit { +namespace gen { + +// horizontal operand vector +class HOPVJitCode : public JitCode { + public: + explicit HOPVJitCode(int d, operand_type type, size_t code_size = 256 * 1024, + void* code_ptr = nullptr) + : JitCode(code_size, code_ptr), num_(d), type_(type) { + if (!(type_ == operand_type::MAX || type_ == operand_type::ADD)) { + LOG(FATAL) << "Do not support this operand type: " << type_; + } + this->genCode(); + } + + virtual const char* name() const { + std::string base = "VXXJitCode"; + if (type_ == operand_type::MAX) { + base += "_MAX"; + } else { + base += "_SUM"; + } + return base.c_str(); + } + void genCode() override; + + protected: + template + void process(JMM& dst, JMM& src1, JMM& src2) { // NOLINT + if (type_ == operand_type::MAX) { + vmaxps(dst, src1, src2); + } else if (type_ == operand_type::ADD) { + vaddps(dst, src1, src2); + } + } + + private: + int num_; + operand_type type_; + reg64_t param_src{abi_param1}; + reg64_t param_dst{abi_param2}; + reg64_t param_attr{abi_param3}; + + ymm_t ymm_tmp = ymm_t(0); + ymm_t ymm_src = ymm_t(1); + ymm_t ymm_dst = ymm_t(2); + + xmm_t xmm_tmp = xmm_t(0); + xmm_t xmm_src = xmm_t(1); + xmm_t xmm_dst = xmm_t(2); +}; + +#define DECLARE_HOP_JITCODE(name, op_type) \ + class name##JitCode : public HOPVJitCode { \ + public: \ + explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr) \ + : HOPVJitCode(d, op_type, code_size, code_ptr) {} \ + }; + +DECLARE_HOP_JITCODE(HMax, operand_type::MAX); +DECLARE_HOP_JITCODE(HSum, operand_type::ADD); + +#undef DECLARE_HOP_JITCODE + +} // namespace gen +} // namespace jit +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/jit/gen/jitcode.h b/paddle/fluid/operators/jit/gen/jitcode.h index f63d40ad5a559ab87a9b3735406671cfd936d9e4..c388109604bc57e8475e79a6c57eecb5bfebfb52 100644 --- a/paddle/fluid/operators/jit/gen/jitcode.h +++ b/paddle/fluid/operators/jit/gen/jitcode.h @@ -47,6 +47,7 @@ using Label = Xbyak::Label; typedef enum { MUL = 0, + MAX, ADD, SUB, RELU, diff --git a/paddle/fluid/operators/jit/helper.cc b/paddle/fluid/operators/jit/helper.cc index 5dbe22a81b4866bdf60a03710d8ffd0b7bcb597b..4dac2f2460f72c7da63f48c82549b948cc253153 100644 --- a/paddle/fluid/operators/jit/helper.cc +++ b/paddle/fluid/operators/jit/helper.cc @@ -49,6 +49,9 @@ const char* to_string(KernelType kt) { ONE_CASE(kNCHW16CMulNC); ONE_CASE(kSeqPool); ONE_CASE(kMatMul); + ONE_CASE(kHMax); + ONE_CASE(kHSum); + ONE_CASE(kSoftmax); default: PADDLE_THROW("Not support type: %d, or forget to add it.", kt); return "NOT JITKernel"; diff --git a/paddle/fluid/operators/jit/helper.h b/paddle/fluid/operators/jit/helper.h index fbf34fc4b3db49596b6be0360c00e77c12fab9b8..7bdc45779b7d39d36db0d52ca9361943cdcdef3e 100644 --- a/paddle/fluid/operators/jit/helper.h +++ b/paddle/fluid/operators/jit/helper.h @@ -118,6 +118,28 @@ typename KernelTuples::func_type Get( return GetRefer(); } +template +class KernelFuncsCache { + public: + KernelFuncsCache() = default; + static KernelFuncsCache& Instance() { + static thread_local KernelFuncsCache g_func_cache; + return g_func_cache; + } + + bool Has(int key) const { return funcs_.find(key) != funcs_.end(); } + + typename KernelTuples::func_type At(int key) { return funcs_.at(key); } + + void Insert(int key, typename KernelTuples::func_type func) { + funcs_.emplace(key, func); + } + + private: + std::unordered_map funcs_; + DISABLE_COPY_AND_ASSIGN(KernelFuncsCache); +}; + const char* to_string(KernelType kt); const char* to_string(SeqPoolType kt); diff --git a/paddle/fluid/operators/jit/kernel_base.h b/paddle/fluid/operators/jit/kernel_base.h index adb101bd5cdf231ac330dbf44beb4c24c1fcf29e..42a58580f7b1e0832af57398ba9c29882b6cc6fb 100644 --- a/paddle/fluid/operators/jit/kernel_base.h +++ b/paddle/fluid/operators/jit/kernel_base.h @@ -20,6 +20,7 @@ namespace paddle { namespace operators { namespace jit { +// TODO(TJ): reorder by alphabet typedef enum { kNone = 0, kVMul = 1, @@ -44,6 +45,9 @@ typedef enum { kNCHW16CMulNC, kSeqPool, kMatMul, + kHSum, // horizontal max + kHMax, // horizontal sum + kSoftmax, } KernelType; typedef enum { @@ -70,6 +74,10 @@ struct XYNTuples { typedef void (*func_type)(const T*, T*, int); }; +// x, return and int +template +struct XRNTuples : public XYNTuples {}; + typedef struct { void* gates; // gates: x_ch, x_ih, x_fh, x_oh const void* ct_1; @@ -159,6 +167,13 @@ struct LayerNormTuples { const float, int); }; +template +struct SoftmaxTuples { + typedef T data_type; + typedef int attr_type; + typedef void (*func_type)(const T*, T*, int, int); +}; + // nChw16c = nChw16c .* NC template struct NCHW16CMulNCTuples { diff --git a/paddle/fluid/operators/jit/more/mix/CMakeLists.txt b/paddle/fluid/operators/jit/more/mix/CMakeLists.txt index e05f204b1eebd03c7a00157d96d0482f4a44a7fb..dd039d29152961210958470a48f086a133ab640c 100644 --- a/paddle/fluid/operators/jit/more/mix/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/mix/CMakeLists.txt @@ -12,3 +12,4 @@ USE_JITKERNEL_MORE(kLSTMC1H1, mix) USE_JITKERNEL_MORE(kGRUH1, mix) USE_JITKERNEL_MORE(kGRUHtPart1, mix) USE_JITKERNEL_MORE(kGRUHtPart2, mix) +USE_JITKERNEL_MORE(kSoftmax, mix) diff --git a/paddle/fluid/operators/jit/more/mix/mix.cc b/paddle/fluid/operators/jit/more/mix/mix.cc index df0a85256b1f546d5f64be73925cf58b87a25bd7..0f42ac158ca7926981df55936cb903d5f4ae4806 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.cc +++ b/paddle/fluid/operators/jit/more/mix/mix.cc @@ -48,6 +48,65 @@ void VTanh(const T* x, T* y, int n) { compute_addbias(&b, y, y, n); } +void Softmax(const T* x, T* y, int n, int bs) { + typename XRNTuples::func_type compute_hmax{nullptr}; + typename XRNTuples::func_type compute_hsum{nullptr}; + typename AXYNTuples::func_type compute_vscal{nullptr}; + typename AXYNTuples::func_type compute_vaddbias{nullptr}; + typename XYNTuples::func_type compute_vexp{nullptr}; + + if (!KernelFuncsCache>::Instance().Has(n)) { + compute_hmax = Get, platform::CPUPlace>(n); + KernelFuncsCache>::Instance().Insert(n, compute_hmax); + } else { + compute_hmax = KernelFuncsCache>::Instance().At(n); + } + + if (!KernelFuncsCache>::Instance().Has(n)) { + compute_hsum = Get, platform::CPUPlace>(n); + KernelFuncsCache>::Instance().Insert(n, compute_hsum); + } else { + compute_hsum = KernelFuncsCache>::Instance().At(n); + } + + if (!KernelFuncsCache>::Instance().Has(n)) { + compute_vscal = Get, platform::CPUPlace>(n); + KernelFuncsCache>::Instance().Insert(n, + compute_vscal); + } else { + compute_vscal = KernelFuncsCache>::Instance().At(n); + } + + if (!KernelFuncsCache>::Instance().Has(n)) { + compute_vaddbias = Get, platform::CPUPlace>(n); + KernelFuncsCache>::Instance().Insert( + n, compute_vaddbias); + } else { + compute_vaddbias = + KernelFuncsCache>::Instance().At(n); + } + + if (!KernelFuncsCache>::Instance().Has(n)) { + compute_vexp = Get, platform::CPUPlace>(n); + KernelFuncsCache>::Instance().Insert(n, compute_vexp); + } else { + compute_vexp = KernelFuncsCache>::Instance().At(n); + } + + for (int i = 0; i < bs; ++i) { + T scalar; + compute_hmax(x, &scalar, n); + scalar = static_cast(0) - scalar; + compute_vaddbias(&scalar, x, y, n); // x - max + compute_vexp(y, y, n); + compute_hsum(y, &scalar, n); + scalar = static_cast(1) / scalar; + compute_vscal(&scalar, y, y, n); + x += n; + y += n; + } +} + void (*getActFunc(KernelType type, int d))(const T*, T*, int) { // NOLINT if (type == kVSigmoid) { return Get, platform::CPUPlace>(d); @@ -184,6 +243,8 @@ bool VSigmoidKernel::UseMe(const int& d) const { return true; } bool VTanhKernel::UseMe(const int& d) const { return true; } +bool SoftmaxKernel::UseMe(const int& d) const { return true; } + bool LSTMCtHtKernel::UseMe(const lstm_attr_t& attr) const { return true; } bool LSTMC1H1Kernel::UseMe(const lstm_attr_t& attr) const { return true; } @@ -207,6 +268,7 @@ namespace mix = paddle::operators::jit::more::mix; REGISTER_MORE_KERNEL(kVSigmoid, VSigmoid); REGISTER_MORE_KERNEL(kVTanh, VTanh); +REGISTER_MORE_KERNEL(kSoftmax, Softmax); REGISTER_MORE_KERNEL(kLSTMCtHt, LSTMCtHt); REGISTER_MORE_KERNEL(kLSTMC1H1, LSTMC1H1); REGISTER_MORE_KERNEL(kGRUH1, GRUH1); diff --git a/paddle/fluid/operators/jit/more/mix/mix.h b/paddle/fluid/operators/jit/more/mix/mix.h index a70ecdf9348f511311307b4c27bb4506222a7439..d64af192197a0b339a39a1862c028875da2f3900 100644 --- a/paddle/fluid/operators/jit/more/mix/mix.h +++ b/paddle/fluid/operators/jit/more/mix/mix.h @@ -26,6 +26,7 @@ using T = float; void VSigmoid(const T* x, T* y, int n); void VTanh(const T* x, T* y, int n); +void Softmax(const T* x, T* y, int n, int bs); void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr); void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr); @@ -45,6 +46,9 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr); DECLARE_MORE_KERNEL(VSigmoid, XYNTuples); DECLARE_MORE_KERNEL(VTanh, XYNTuples); +// XRN +DECLARE_MORE_KERNEL(Softmax, SoftmaxTuples); + DECLARE_MORE_KERNEL(LSTMCtHt, LSTMTuples); DECLARE_MORE_KERNEL(LSTMC1H1, LSTMTuples); diff --git a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt index 667c6dfad6676d00ab994564bff57c90caa0cb41..f9e5aea32e7cd48e9b39c4c3ee0e30f4a5c84f6f 100644 --- a/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt +++ b/paddle/fluid/operators/jit/more/mkl/CMakeLists.txt @@ -12,3 +12,4 @@ USE_JITKERNEL_MORE(kVSquare, mkl) USE_JITKERNEL_MORE(kVSigmoid, mkl) USE_JITKERNEL_MORE(kVTanh, mkl) USE_JITKERNEL_MORE(kSeqPool, mkl) +USE_JITKERNEL_MORE(kSoftmax, mkl) diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.cc b/paddle/fluid/operators/jit/more/mkl/mkl.cc index fccdc68f5efa34bac6f5a34a41569d2f77416284..28a37198dae19a57509934ec784746bc23436e7a 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.cc +++ b/paddle/fluid/operators/jit/more/mkl/mkl.cc @@ -116,6 +116,16 @@ void VAXPY(double a, const double* x, double* y, int n) { platform::dynload::cblas_daxpy(n, a, x, 1, y, 1); } +template <> +void ASum(const float* x, float* res, int n) { + res[0] = platform::dynload::cblas_sasum(n, x, 1); +} + +template <> +void ASum(const double* x, double* res, int n) { + res[0] = platform::dynload::cblas_dasum(n, x, 1); +} + // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 template <> bool MatMulKernel::UseMe(const int& d) const { @@ -167,6 +177,12 @@ bool SeqPoolKernel::UseMe(const seq_pool_attr_t& attr) const { return true; } +template <> +bool SoftmaxKernel::UseMe(const int& d) const { + // tuned on avx2 + return platform::MayIUse(platform::avx) && d < 60; +} + #define AWALYS_USE_ME_WITH_DOUBLE(func) \ template <> \ bool func##Kernel::UseMe(const int& d) const { \ @@ -181,6 +197,7 @@ AWALYS_USE_ME_WITH_DOUBLE(VExp); AWALYS_USE_ME_WITH_DOUBLE(VSigmoid); AWALYS_USE_ME_WITH_DOUBLE(VTanh); AWALYS_USE_ME_WITH_DOUBLE(VSquare); +AWALYS_USE_ME_WITH_DOUBLE(Softmax); #undef AWALYS_USE_ME_WITH_DOUBLE } // namespace mkl @@ -204,5 +221,6 @@ REGISTER_MKL_KERNEL(kVSquare, VSquare); REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid); REGISTER_MKL_KERNEL(kVTanh, VTanh); REGISTER_MKL_KERNEL(kSeqPool, SeqPool); +REGISTER_MKL_KERNEL(kSoftmax, Softmax); #undef REGISTER_MKL_KERNEL diff --git a/paddle/fluid/operators/jit/more/mkl/mkl.h b/paddle/fluid/operators/jit/more/mkl/mkl.h index a27196fa19f1d3e9aa6c414b6b9f99a21ef49025..6b95b9c872dc12cccaef0b0737edd760447a47d0 100644 --- a/paddle/fluid/operators/jit/more/mkl/mkl.h +++ b/paddle/fluid/operators/jit/more/mkl/mkl.h @@ -16,6 +16,7 @@ #include #include +#include #include "paddle/fluid/operators/jit/kernel_base.h" namespace paddle { @@ -90,6 +91,30 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { } } +template +void ASum(const T* x, T* res, int n); + +template +void Softmax(const T* x, T* y, int n, int bs) { + std::vector entities(bs); + for (int i = 0; i < bs; ++i) { + entities[i] = x[i * n]; + for (int c = 1; c < n; ++c) { + entities[i] = x[i * n + c] > entities[i] ? x[i * n + c] : entities[i]; + } + for (int c = 0; c < n; ++c) { + y[i * n + c] = x[i * n + c] - entities[i]; + } + } + VExp(y, y, n * bs); + for (int i = 0; i < bs; ++i) { + T sum; + ASum(&y[i * n], &sum, n); + sum = static_cast(1) / sum; + VScal(&sum, &y[i * n], &y[i * n], n); + } +} + #define DECLARE_MKL_KERNEL(name, tuples) \ template \ class name##Kernel : public KernelMore> { \ @@ -117,6 +142,8 @@ DECLARE_MKL_KERNEL(VSquare, XYNTuples); DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples); +DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples); + #undef DECLARE_MKL_KERNEL } // namespace mkl diff --git a/paddle/fluid/operators/jit/refer/CMakeLists.txt b/paddle/fluid/operators/jit/refer/CMakeLists.txt index 4b9bc5e8d49c62404d5d4ef99b7c50987fcb415a..9f2935828ca300dbdb71b0fefb6b9883cb45e4b0 100644 --- a/paddle/fluid/operators/jit/refer/CMakeLists.txt +++ b/paddle/fluid/operators/jit/refer/CMakeLists.txt @@ -29,3 +29,6 @@ USE_JITKERNEL_REFER(kNCHW16CMulNC) USE_JITKERNEL_REFER(kSeqPool) USE_JITKERNEL_REFER(kMatMul) USE_JITKERNEL_REFER(kVSquare) +USE_JITKERNEL_REFER(kHSum) +USE_JITKERNEL_REFER(kHMax) +USE_JITKERNEL_REFER(kSoftmax) diff --git a/paddle/fluid/operators/jit/refer/refer.cc b/paddle/fluid/operators/jit/refer/refer.cc index 3512ad7fe7921381afb6152330fff6be34de5ad7..b8adb40ec7e1b64df2b04a3201292db235af7b19 100644 --- a/paddle/fluid/operators/jit/refer/refer.cc +++ b/paddle/fluid/operators/jit/refer/refer.cc @@ -52,4 +52,9 @@ REGISTER_REFER_KERNEL(kSeqPool, SeqPool); REGISTER_REFER_KERNEL(kMatMul, MatMul); +REGISTER_REFER_KERNEL(kHMax, HMax); +REGISTER_REFER_KERNEL(kHSum, HSum); + +REGISTER_REFER_KERNEL(kSoftmax, Softmax); + #undef REGISTER_REFER_KERNEL diff --git a/paddle/fluid/operators/jit/refer/refer.h b/paddle/fluid/operators/jit/refer/refer.h index 97d029358594d757f0e1874e9c87ecb8f97c9d50..5a074db7e0e8ab49dc281e1809edef23e6a25c42 100644 --- a/paddle/fluid/operators/jit/refer/refer.h +++ b/paddle/fluid/operators/jit/refer/refer.h @@ -378,6 +378,40 @@ void MatMul(const T* A, const T* B, T* C, int M, int N, int K) { } } +template +void HMax(const T* x, T* res, int n) { + res[0] = x[0]; + for (int i = 1; i < n; ++i) { + res[0] = res[0] < x[i] ? x[i] : res[0]; + } +} + +template +void HSum(const T* x, T* res, int n) { + res[0] = x[0]; + for (int i = 1; i < n; ++i) { + res[0] += x[i]; + } +} + +// y = e^(x - max(x)) +// y = y / sum(y) +template +void Softmax(const T* x, T* y, int n, int bs = 1) { + for (int i = 0; i < bs; ++i) { + T scalar; + HMax(x, &scalar, n); + scalar = static_cast(0) - scalar; + VAddBias(&scalar, x, y, n); // x - max + VExp(y, y, n); + HSum(y, &scalar, n); + scalar = static_cast(1) / scalar; + VScal(&scalar, y, y, n); + x += n; + y += n; + } +} + #define DECLARE_REFER_KERNEL(name, tuples) \ template \ class name##Kernel : public ReferKernel> { \ @@ -421,6 +455,11 @@ DECLARE_REFER_KERNEL(SeqPool, SeqPoolTuples); DECLARE_REFER_KERNEL(MatMul, MatMulTuples); +DECLARE_REFER_KERNEL(HMax, XRNTuples); +DECLARE_REFER_KERNEL(HSum, XRNTuples); + +DECLARE_REFER_KERNEL(Softmax, SoftmaxTuples); + #undef DECLARE_REFER_KERNEL } // namespace refer diff --git a/paddle/fluid/operators/jit/test.cc b/paddle/fluid/operators/jit/test.cc index 68a79b6314e4cf86f5b715b9c6694924126b12da..cc461552898fc68661ce548a520d65215d3572b4 100644 --- a/paddle/fluid/operators/jit/test.cc +++ b/paddle/fluid/operators/jit/test.cc @@ -61,6 +61,7 @@ std::vector TestSizes() { } namespace jit = paddle::operators::jit; +using CPUPlace = paddle::platform::CPUPlace; template struct TestFuncWithRefer { @@ -121,6 +122,40 @@ struct TestFuncWithRefer, T, std::vector, } }; +template +struct TestFuncWithRefer, std::vector, std::vector, + int, int> { + void operator()(const typename jit::SoftmaxTuples::func_type tgt, + const std::vector& x, const std::vector& yref, int n, + int bs) { + EXPECT_TRUE(tgt != nullptr); + EXPECT_EQ(yref.size(), x.size()); + EXPECT_EQ(x.size(), static_cast(n * bs)); + const T* x_data = x.data(); + const T* yref_data = yref.data(); + std::vector ytgt(n * bs); + T* ytgt_data = ytgt.data(); + // test normal + tgt(x_data, ytgt_data, n, bs); + ExpectEQ(ytgt_data, yref_data, n * bs); + // test inplace x + std::copy(x.begin(), x.end(), ytgt.begin()); + tgt(ytgt_data, ytgt_data, n, bs); + ExpectEQ(ytgt_data, yref_data, n * bs); + } +}; + +template +struct TestFuncWithRefer, std::vector, T> { + void operator()(const typename jit::XRNTuples::func_type tgt, + const std::vector& x, const T ref_res) { + EXPECT_TRUE(tgt != nullptr); + T tgt_res; + tgt(x.data(), &tgt_res, x.size()); + ExpectEQ(&tgt_res, &ref_res, 1); + } +}; + template struct TestFuncWithRefer, std::vector, std::vector> { void operator()(const typename jit::XYNTuples::func_type tgt, @@ -172,7 +207,7 @@ struct TestFuncWithRefer, std::vector, std::vector, T* ht_data = ht.data(); T* checked_data = checked.data(); - paddle::operators::jit::lstm_t step; + jit::lstm_t step; step.gates = x_data; step.ct_1 = ct_1_data; step.ct = ct_data; @@ -208,7 +243,7 @@ struct TestFuncWithRefer, std::vector, std::vector, const T* ht_ref_data = ht_ref.data(); T* x_data = x.data(); T* ht_data = ht.data(); - paddle::operators::jit::gru_t step; + jit::gru_t step; step.gates = x_data; step.ht_1 = ht_1_data; step.ht = ht_data; @@ -255,8 +290,8 @@ struct TestFuncWithRefer, std::vector, std::vector, } }; -template +template void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { TestFuncWithRefer test; // test jitcode @@ -286,9 +321,8 @@ void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { test(tgt, args...); } -template +template void TestXYZNKernel() { - namespace jit = paddle::operators::jit; VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); for (int d : TestSizes()) { auto ref = jit::GetRefer>(); @@ -320,9 +354,8 @@ void TestXYZNKernel() { } } -template +template void TestAXYNKernel() { - namespace jit = paddle::operators::jit; VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); for (int d : TestSizes()) { auto ref = jit::GetRefer>(); @@ -347,9 +380,26 @@ void TestAXYNKernel() { } } -template +template +void TestXRNKernel() { + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + auto last_acc = acc; + acc = 1e-4; + for (int d : TestSizes()) { + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + std::vector x(d); + RandomVec(d, x.data(), -2.f, 2.f); + T ref_res; + ref(x.data(), &ref_res, d); + TestAllImpls, PlaceType, std::vector, T>(d, x, + ref_res); + } + acc = last_acc; +} + +template void TestXYNKernel() { - namespace jit = paddle::operators::jit; VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); for (int d : TestSizes()) { auto ref = jit::GetRefer>(); @@ -373,9 +423,8 @@ void TestXYNKernel() { } } -template +template void TestLSTMKernel() { - namespace jit = paddle::operators::jit; VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; for (int d : TestSizes()) { @@ -424,9 +473,8 @@ void TestLSTMKernel() { } } -template +template void TestGRUKernel() { - namespace jit = paddle::operators::jit; VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); std::vector all_acts = {"sigmoid", "tanh", "relu", "identity"}; for (int d : TestSizes()) { @@ -459,7 +507,7 @@ void TestGRUKernel() { } } -template +template void TestSeqPoolKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); std::vector pool_types = { @@ -484,7 +532,7 @@ void TestSeqPoolKernel() { } } -template +template void TestMatMulKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); auto last_acc = acc; @@ -510,7 +558,32 @@ void TestMatMulKernel() { acc = last_acc; } -template +template +void TestSoftmaxKernel() { + VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); + for (int bs : {1, 2, 10}) { + for (int n : TestSizes()) { + auto ref = jit::GetRefer>(); + EXPECT_TRUE(ref != nullptr); + std::vector x(bs * n), y(bs * n); + RandomVec(bs * n, x.data(), -2.f, 2.f); + const T* x_data = x.data(); + T* y_data = y.data(); + + std::vector xinp(x.size()); // inplace test + std::copy(x.begin(), x.end(), xinp.begin()); + ref(x_data, y_data, n, bs); + T* xinp_data = xinp.data(); + ref(xinp_data, xinp_data, n, bs); + ExpectEQ(xinp_data, y_data, n * bs); + + TestAllImpls, PlaceType, std::vector, + std::vector>(n, x, y, n, bs); + } + } +} + +template void TestNCHW16CMulNCKernel() { VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); const int n = 3, c = 16 * 4, h = 10, w = 10; @@ -565,129 +638,123 @@ void TestNCHW16CMulNCKernel() { // XYZNTuple TEST(JITKernel, kVMul) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); + TestXYZNKernel(); + TestXYZNKernel(); } TEST(JITKernel, kVAdd) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); + TestXYZNKernel(); + TestXYZNKernel(); } TEST(JITKernel, kVAddRelu) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); + TestXYZNKernel(); + TestXYZNKernel(); } TEST(JITKernel, kVSub) { - namespace jit = paddle::operators::jit; - TestXYZNKernel(); - TestXYZNKernel(); + TestXYZNKernel(); + TestXYZNKernel(); } // AXYNTuples TEST(JITKernel, kVScal) { - namespace jit = paddle::operators::jit; - TestAXYNKernel(); - TestAXYNKernel(); + TestAXYNKernel(); + TestAXYNKernel(); } TEST(JITKernel, kVAddBias) { - namespace jit = paddle::operators::jit; - TestAXYNKernel(); - TestAXYNKernel(); + TestAXYNKernel(); + TestAXYNKernel(); +} + +// XRNTuples +TEST(JITKernel, kHMax) { + TestXRNKernel(); + TestXRNKernel(); +} + +TEST(JITKernel, kHSum) { + TestXRNKernel(); + TestXRNKernel(); } // XYNTuples TEST(JITKernel, kVRelu) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } TEST(JITKernel, kVIdentity) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } TEST(JITKernel, kVSquare) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } TEST(JITKernel, kVExp) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } TEST(JITKernel, kVSigmoid) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } TEST(JITKernel, kVTanh) { - namespace jit = paddle::operators::jit; - TestXYNKernel(); - TestXYNKernel(); + TestXYNKernel(); + TestXYNKernel(); } // LSTM TEST(JITKernel, kLSTMCtHt) { - namespace jit = paddle::operators::jit; - TestLSTMKernel(); - TestLSTMKernel(); + TestLSTMKernel(); + TestLSTMKernel(); } TEST(JITKernel, kLSTMC1H1) { - namespace jit = paddle::operators::jit; - TestLSTMKernel(); - TestLSTMKernel(); + TestLSTMKernel(); + TestLSTMKernel(); } // GRU TEST(JITKernel, kGRUH1) { - namespace jit = paddle::operators::jit; - TestGRUKernel(); - TestGRUKernel(); + TestGRUKernel(); + TestGRUKernel(); } TEST(JITKernel, kGRUHtPart1) { - namespace jit = paddle::operators::jit; - TestGRUKernel(); - TestGRUKernel(); + TestGRUKernel(); + TestGRUKernel(); } TEST(JITKernel, kGRUHtPart2) { - namespace jit = paddle::operators::jit; - TestGRUKernel(); - TestGRUKernel(); + TestGRUKernel(); + TestGRUKernel(); } TEST(JITKernel, kSeqPool) { - namespace jit = paddle::operators::jit; - TestSeqPoolKernel(); - TestSeqPoolKernel(); + TestSeqPoolKernel(); + TestSeqPoolKernel(); } TEST(JITKernel, kMatMul) { - namespace jit = paddle::operators::jit; - TestMatMulKernel(); - TestMatMulKernel(); + TestMatMulKernel(); + TestMatMulKernel(); +} + +TEST(JITKernel, kSoftmax) { + TestSoftmaxKernel(); + TestSoftmaxKernel(); } TEST(JITKernel, kNCHW16CMulNC) { - namespace jit = paddle::operators::jit; - TestNCHW16CMulNCKernel(); - TestNCHW16CMulNCKernel(); + TestNCHW16CMulNCKernel(); + TestNCHW16CMulNCKernel(); } // TODO(yihua/TJ): add crf decoding and layer norm unit tests diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 6bbb7155dda9b2c844f793a63adb861c2ed956e8..e20524012a5839fd250b7426a5efc42b7e87fe87 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -53,7 +53,7 @@ math_library(sequence2batch) math_library(sequence_padding) math_library(sequence_pooling DEPS math_function jit_kernel_helper) math_library(sequence_scale) -math_library(softmax DEPS math_function) +math_library(softmax DEPS math_function jit_kernel_helper) math_library(beam_search DEPS math_function) math_library(matrix_bit_code) diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 1d9d98b10646af9e199f6c481740d30745888707..1ff9ff684fc8001afb0f768a033b4c5bd1592702 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -16,8 +16,8 @@ limitations under the License. */ #include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/jit/kernels.h" -#include "paddle/fluid/operators/math/blas.h" namespace paddle { namespace operators { namespace math { @@ -81,28 +81,10 @@ class SoftmaxFunctor> { const int kBatchDim = 0; const int kClassDim = 1; // 2D data. Batch x C - const int batch_size = in_dims[kBatchDim]; - const int num_classes = in_dims[kClassDim]; - std::vector entities(batch_size); - auto blas = math::GetBlas(context); - for (int n = 0; n < batch_size; ++n) { - entities[n] = in_data[n * num_classes]; - for (int c = 1; c < num_classes; ++c) { - entities[n] = in_data[n * num_classes + c] > entities[n] - ? in_data[n * num_classes + c] - : entities[n]; - } - for (int c = 0; c < num_classes; ++c) { - out_data[n * num_classes + c] = - in_data[n * num_classes + c] - entities[n]; - } - } - - blas.VEXP(num_classes * batch_size, out_data, out_data); - for (int n = 0; n < batch_size; ++n) { - auto sum = blas.ASUM(num_classes, &out_data[n * num_classes], 1); - blas.SCAL(num_classes, 1.0f / sum, &out_data[n * num_classes]); - } + auto compute_softmax = + jit::Get, platform::CPUPlace>( + in_dims[kClassDim]); + compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]); } }; diff --git a/paddle/fluid/platform/dynload/mklml.h b/paddle/fluid/platform/dynload/mklml.h index d0619293acf2d2df0d925e969bdeb8e45cda6e2b..a260cda49138580b209e647af459e9392d9f18f1 100644 --- a/paddle/fluid/platform/dynload/mklml.h +++ b/paddle/fluid/platform/dynload/mklml.h @@ -70,6 +70,8 @@ extern void* mklml_dso_handle; __macro(cblas_ddot); \ __macro(cblas_sasum); \ __macro(cblas_dasum); \ + __macro(cblas_isamax); \ + __macro(cblas_idamax); \ __macro(cblas_sscal); \ __macro(cblas_dscal); \ __macro(vsAdd); \