未验证 提交 c7449227 编写于 作者: T tensor-tang 提交者: GitHub

Merge pull request #15563 from tensor-tang/jit/softmax

refine softmax kernel
...@@ -158,7 +158,7 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { ...@@ -158,7 +158,7 @@ void BenchAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
using Tensor = paddle::framework::Tensor; using Tensor = paddle::framework::Tensor;
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> template <jit::KernelType KT, typename T, typename PlaceType>
void BenchXYZNKernel() { void BenchXYZNKernel() {
for (int d : TestSizes()) { for (int d : TestSizes()) {
Tensor x, y, z; Tensor x, y, z;
...@@ -175,7 +175,7 @@ void BenchXYZNKernel() { ...@@ -175,7 +175,7 @@ void BenchXYZNKernel() {
} }
} }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> template <jit::KernelType KT, typename T, typename PlaceType>
void BenchAXYNKernel() { void BenchAXYNKernel() {
for (int d : TestSizes()) { for (int d : TestSizes()) {
const T a = static_cast<T>(3); const T a = static_cast<T>(3);
...@@ -187,10 +187,23 @@ void BenchAXYNKernel() { ...@@ -187,10 +187,23 @@ void BenchAXYNKernel() {
RandomVec<T>(d, x_data); RandomVec<T>(d, x_data);
BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data<T>(), y_data, BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data<T>(), y_data,
d); d);
// test inplace
BenchAllImpls<KT, jit::AXYNTuples<T>, PlaceType>(d, &a, x.data<T>(), x_data,
d);
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchXRNKernel() {
for (int d : TestSizes()) {
Tensor x;
RandomVec<T>(d, x.mutable_data<T>({d}, PlaceType()));
T res;
BenchAllImpls<KT, jit::XRNTuples<T>, PlaceType>(d, x.data<T>(), &res, d);
} }
} }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> template <jit::KernelType KT, typename T, typename PlaceType>
void BenchXYNKernel() { void BenchXYNKernel() {
for (int d : TestSizes()) { for (int d : TestSizes()) {
Tensor x, y; Tensor x, y;
...@@ -203,7 +216,7 @@ void BenchXYNKernel() { ...@@ -203,7 +216,7 @@ void BenchXYNKernel() {
} }
} }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> template <jit::KernelType KT, typename T, typename PlaceType>
void BenchLSTMKernel() { void BenchLSTMKernel() {
for (bool use_peephole : {true, false}) { for (bool use_peephole : {true, false}) {
for (int d : TestSizes()) { for (int d : TestSizes()) {
...@@ -240,7 +253,7 @@ void BenchLSTMKernel() { ...@@ -240,7 +253,7 @@ void BenchLSTMKernel() {
} }
} }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> template <jit::KernelType KT, typename T, typename PlaceType>
void BenchGRUKernel() { void BenchGRUKernel() {
for (int d : TestSizes()) { for (int d : TestSizes()) {
const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh); const jit::gru_attr_t attr(d, jit::kVSigmoid, jit::kVTanh);
...@@ -262,7 +275,7 @@ void BenchGRUKernel() { ...@@ -262,7 +275,7 @@ void BenchGRUKernel() {
} }
} }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> template <jit::KernelType KT, typename T, typename PlaceType>
void BenchSeqPoolKernel() { void BenchSeqPoolKernel() {
std::vector<jit::SeqPoolType> pool_types = { std::vector<jit::SeqPoolType> pool_types = {
jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt}; jit::SeqPoolType::kSum, jit::SeqPoolType::kAvg, jit::SeqPoolType::kSqrt};
...@@ -284,7 +297,7 @@ void BenchSeqPoolKernel() { ...@@ -284,7 +297,7 @@ void BenchSeqPoolKernel() {
} }
} }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> template <jit::KernelType KT, typename T, typename PlaceType>
void BenchMatMulKernel() { void BenchMatMulKernel() {
for (int m : {1, 2, 3, 4}) { for (int m : {1, 2, 3, 4}) {
for (int n : TestSizes()) { for (int n : TestSizes()) {
...@@ -305,57 +318,64 @@ void BenchMatMulKernel() { ...@@ -305,57 +318,64 @@ void BenchMatMulKernel() {
} }
} }
template <jit::KernelType KT, typename T, typename PlaceType>
void BenchSoftmaxKernel() {
for (int bs : {1, 2, 10}) {
for (int n : TestSizes()) {
Tensor x, y;
x.Resize({bs, n});
y.Resize({bs, n});
RandomVec<T>(bs * n, x.mutable_data<T>(PlaceType()), -2.f, 2.f);
const T* x_data = x.data<T>();
T* y_data = y.mutable_data<T>(PlaceType());
BenchAllImpls<KT, jit::SoftmaxTuples<T>, PlaceType>(n, x_data, y_data, n,
bs);
}
}
}
using T = float; using T = float;
using PlaceType = paddle::platform::CPUPlace; using CPUPlace = paddle::platform::CPUPlace;
// xyzn // xyzn
BENCH_FP32_CPU(kVMul) { BenchXYZNKernel<jit::kVMul, T, PlaceType>(); } BENCH_FP32_CPU(kVMul) { BenchXYZNKernel<jit::kVMul, T, CPUPlace>(); }
BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel<jit::kVAdd, T, CPUPlace>(); }
BENCH_FP32_CPU(kVAdd) { BenchXYZNKernel<jit::kVAdd, T, PlaceType>(); } BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel<jit::kVAddRelu, T, CPUPlace>(); }
BENCH_FP32_CPU(kVSub) { BenchXYZNKernel<jit::kVSub, T, CPUPlace>(); }
BENCH_FP32_CPU(kVAddRelu) { BenchXYZNKernel<jit::kVAddRelu, T, PlaceType>(); }
BENCH_FP32_CPU(kVSub) { BenchXYZNKernel<jit::kVSub, T, PlaceType>(); }
// axyn // axyn
BENCH_FP32_CPU(kVScal) { BenchAXYNKernel<jit::kVScal, T, PlaceType>(); } BENCH_FP32_CPU(kVScal) { BenchAXYNKernel<jit::kVScal, T, CPUPlace>(); }
BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel<jit::kVAddBias, T, CPUPlace>(); }
BENCH_FP32_CPU(kVAddBias) { BenchAXYNKernel<jit::kVAddBias, T, PlaceType>(); } // xrn
BENCH_FP32_CPU(kHSum) { BenchXRNKernel<jit::kHSum, T, CPUPlace>(); }
BENCH_FP32_CPU(kHMax) { BenchXRNKernel<jit::kHMax, T, CPUPlace>(); }
// xyn // xyn
BENCH_FP32_CPU(kVRelu) { BenchXYNKernel<jit::kVRelu, T, PlaceType>(); } BENCH_FP32_CPU(kVRelu) { BenchXYNKernel<jit::kVRelu, T, CPUPlace>(); }
BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel<jit::kVIdentity, T, CPUPlace>(); }
BENCH_FP32_CPU(kVIdentity) { BenchXYNKernel<jit::kVIdentity, T, PlaceType>(); } BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, CPUPlace>(); }
BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, CPUPlace>(); }
BENCH_FP32_CPU(kVSquare) { BenchXYNKernel<jit::kVSquare, T, PlaceType>(); } BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, CPUPlace>(); }
BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, CPUPlace>(); }
BENCH_FP32_CPU(kVExp) { BenchXYNKernel<jit::kVExp, T, PlaceType>(); }
BENCH_FP32_CPU(kVSigmoid) { BenchXYNKernel<jit::kVSigmoid, T, PlaceType>(); }
BENCH_FP32_CPU(kVTanh) { BenchXYNKernel<jit::kVTanh, T, PlaceType>(); }
// lstm and peephole // lstm and peephole
BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, PlaceType>(); } BENCH_FP32_CPU(kLSTMCtHt) { BenchLSTMKernel<jit::kLSTMCtHt, T, CPUPlace>(); }
BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel<jit::kLSTMC1H1, T, CPUPlace>(); }
BENCH_FP32_CPU(kLSTMC1H1) { BenchLSTMKernel<jit::kLSTMC1H1, T, PlaceType>(); }
// gru functions // gru functions
BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel<jit::kGRUH1, T, PlaceType>(); } BENCH_FP32_CPU(kGRUH1) { BenchGRUKernel<jit::kGRUH1, T, CPUPlace>(); }
BENCH_FP32_CPU(kGRUHtPart1) { BenchGRUKernel<jit::kGRUHtPart1, T, CPUPlace>(); }
BENCH_FP32_CPU(kGRUHtPart1) { BENCH_FP32_CPU(kGRUHtPart2) { BenchGRUKernel<jit::kGRUHtPart2, T, CPUPlace>(); }
BenchGRUKernel<jit::kGRUHtPart1, T, PlaceType>();
}
BENCH_FP32_CPU(kGRUHtPart2) {
BenchGRUKernel<jit::kGRUHtPart2, T, PlaceType>();
}
// seq pool function // seq pool function
BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel<jit::kSeqPool, T, PlaceType>(); } BENCH_FP32_CPU(kSeqPool) { BenchSeqPoolKernel<jit::kSeqPool, T, CPUPlace>(); }
// matmul // matmul
BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, PlaceType>(); } BENCH_FP32_CPU(kMatMul) { BenchMatMulKernel<jit::kMatMul, T, CPUPlace>(); }
// softmax
BENCH_FP32_CPU(kSoftmax) { BenchSoftmaxKernel<jit::kSoftmax, T, CPUPlace>(); }
// Benchmark all jit kernels including jitcode, mkl and refer. // Benchmark all jit kernels including jitcode, mkl and refer.
// To use this tool, run command: ./benchmark [options...] // To use this tool, run command: ./benchmark [options...]
......
...@@ -28,3 +28,5 @@ USE_JITKERNEL_GEN(kGRUHtPart1) ...@@ -28,3 +28,5 @@ USE_JITKERNEL_GEN(kGRUHtPart1)
USE_JITKERNEL_GEN(kGRUHtPart2) USE_JITKERNEL_GEN(kGRUHtPart2)
USE_JITKERNEL_GEN(kNCHW16CMulNC) USE_JITKERNEL_GEN(kNCHW16CMulNC)
USE_JITKERNEL_GEN(kSeqPool) USE_JITKERNEL_GEN(kSeqPool)
USE_JITKERNEL_GEN(kHMax)
USE_JITKERNEL_GEN(kHSum)
...@@ -81,9 +81,7 @@ void VActJitCode::genCode() { ...@@ -81,9 +81,7 @@ void VActJitCode::genCode() {
#define DECLARE_ACT_CREATOR(name) \ #define DECLARE_ACT_CREATOR(name) \
class name##Creator : public JitCodeCreator<int> { \ class name##Creator : public JitCodeCreator<int> { \
public: \ public: \
bool UseMe(const int& attr) const override { \ bool UseMe(const int& attr) const override; \
return platform::MayIUse(platform::avx); \
} \
size_t CodeSize(const int& d) const override; \ size_t CodeSize(const int& d) const override; \
std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \ std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \ return make_unique<name##JitCode>(attr, CodeSize(attr)); \
...@@ -98,6 +96,30 @@ DECLARE_ACT_CREATOR(VSigmoid); ...@@ -98,6 +96,30 @@ DECLARE_ACT_CREATOR(VSigmoid);
DECLARE_ACT_CREATOR(VTanh); DECLARE_ACT_CREATOR(VTanh);
// TODO(TJ): tuning use me // TODO(TJ): tuning use me
// UseMe() decides whether the generated (JIT) activation kernel may be picked
// for a vector of length d. All creators below require that
// platform::MayIUse reports AVX; only VExp additionally looks at d.

// VRelu: used whenever AVX is reported; d is not consulted.
bool VReluCreator::UseMe(const int& d) const {
return platform::MayIUse(platform::avx);
}
// VSquare: used whenever AVX is reported; d is not consulted.
bool VSquareCreator::UseMe(const int& d) const {
return platform::MayIUse(platform::avx);
}
// VIdentity: used whenever AVX is reported; d is not consulted.
bool VIdentityCreator::UseMe(const int& d) const {
return platform::MayIUse(platform::avx);
}
// VExp: used only when AVX is reported and d < 32.
// NOTE(review): the 32 threshold looks like a tuning choice (see the TODO
// above) - confirm against benchmark results before changing it.
bool VExpCreator::UseMe(const int& d) const {
return platform::MayIUse(platform::avx) && d < 32;
}
// VSigmoid: used whenever AVX is reported; d is not consulted.
bool VSigmoidCreator::UseMe(const int& d) const {
return platform::MayIUse(platform::avx);
}
// VTanh: used whenever AVX is reported; d is not consulted.
bool VTanhCreator::UseMe(const int& d) const {
return platform::MayIUse(platform::avx);
}
size_t VReluCreator::CodeSize(const int& d) const { size_t VReluCreator::CodeSize(const int& d) const {
return 96 /* init size */ + return 96 /* init size */ +
(d / YMM_FLOAT_BLOCK + 3) * 4 /* instructions */ * (d / YMM_FLOAT_BLOCK + 3) * 4 /* instructions */ *
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#include "paddle/fluid/operators/jit/gen/hopv.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
// Emits code that reduces num_ floats at param_src to a single float stored
// at param_dst, combining elements with process() (vmaxps for MAX, vaddps for
// ADD). The reduction is fully unrolled at generation time: full
// YMM_FLOAT_BLOCK-sized groups first, then a tail of 4, 2 and 1 elements,
// folding the accumulator in half between the tail steps.
// NOTE(review): ptr[param_src] is read even when num_ is 0 in the MAX path -
// presumably callers never pass 0; confirm.
void HOPVJitCode::genCode() {
const int num_blocks = num_ / YMM_FLOAT_BLOCK;
int offset = 0;
if (num_blocks > 0) {
// Seed the ymm accumulator with the first full block, then fold every
// remaining full block into it.
vmovups(ymm_tmp, ptr[param_src]);
offset += sizeof(float) * YMM_FLOAT_BLOCK;
for (int i = 1; i < num_blocks; ++i) {
vmovups(ymm_src, ptr[param_src + offset]);
process(ymm_tmp, ymm_src, ymm_tmp);
offset += sizeof(float) * YMM_FLOAT_BLOCK;
}
// Combine the upper 128 bits of the accumulator with its lower 128 bits.
// xmm_tmp shares register index 0 with ymm_tmp (see hopv.h), so it names the
// low half of the accumulator.
vextractf128(xmm_dst, ymm_tmp, 1);
process(xmm_dst, xmm_dst, xmm_tmp);
} else {
// No full block: seed the accumulator so the tail steps below are valid.
// MAX uses the first input element in every lane; ADD uses zero.
if (type_ == operand_type::MAX) {
vbroadcastss(ymm_dst, ptr[param_src]);
} else if (type_ == operand_type::ADD) {
vxorps(ymm_dst, ymm_dst, ymm_dst);
}
}
int rest = num_ % YMM_FLOAT_BLOCK;
if (rest >= 4) {
// Fold four tail elements into the four accumulator lanes.
vmovups(xmm_src, ptr[param_src + offset]);
offset += sizeof(float) * 4;
rest -= 4;
process(xmm_dst, xmm_dst, xmm_src);
}
// Immediate 27 (0b00011011) reverses the four lanes, so after process()
// lanes 0 and 1 hold the combination of all four lanes.
vpermilps(xmm_tmp, xmm_dst, 16 + 8 + 3);
process(xmm_dst, xmm_dst, xmm_tmp);
if (rest >= 2) {
// vmovq loads two floats into lanes 0 and 1; only those lanes are still
// consumed by the steps below.
vmovq(xmm_src, ptr[param_src + offset]);
offset += sizeof(float) * 2;
rest -= 2;
process(xmm_dst, xmm_dst, xmm_src);
}
// Immediate 1 moves lane 1 into lane 0, so process() leaves the combination
// of lanes 0 and 1 in lane 0.
vpermilps(xmm_tmp, xmm_dst, 1);
process(xmm_dst, xmm_dst, xmm_tmp);
if (rest >= 1) {
// Fold the final single element into lane 0.
vmovss(xmm_src, ptr[param_src + offset]);
process(xmm_dst, xmm_dst, xmm_src);
}
// Lane 0 now holds the result; write it to the output scalar.
vmovss(ptr[param_dst], xmm_dst);
ret();
}
// Defines name##Creator, a JitCodeCreator<int> for a horizontal-reduction
// kernel:
//   - UseMe: true when platform::MayIUse reports AVX (the attribute is not
//     consulted);
//   - CodeSize: 96 bytes plus 4 * 8 bytes per full YMM_FLOAT_BLOCK of d;
//   - CreateJitCode: builds name##JitCode with that code size.
// NOTE(review): the size formula is an estimate of the emitted code length -
// confirm it stays an upper bound if genCode() changes.
#define DECLARE_HOP_CREATOR(name) \
class name##Creator : public JitCodeCreator<int> { \
public: \
bool UseMe(const int& attr) const override { \
return platform::MayIUse(platform::avx); \
} \
size_t CodeSize(const int& d) const override { \
return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \
} \
std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \
} \
}
DECLARE_HOP_CREATOR(HMax);
DECLARE_HOP_CREATOR(HSum);
#undef DECLARE_HOP_CREATOR
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
namespace gen = paddle::operators::jit::gen;
REGISTER_JITKERNEL_GEN(kHMax, gen::HMaxCreator);
REGISTER_JITKERNEL_GEN(kHSum, gen::HSumCreator);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License. */
#pragma once
#include <string>
#include "glog/logging.h"
#include "paddle/fluid/operators/jit/gen/jitcode.h"
namespace paddle {
namespace operators {
namespace jit {
namespace gen {
// horizontal operand vector
// JIT code generator for horizontal reductions over a float vector of fixed
// length d: operand_type::MAX yields the maximum, operand_type::ADD the sum.
// Any other operand type is rejected with LOG(FATAL) in the constructor.
// The code is generated in the constructor via genCode().
class HOPVJitCode : public JitCode {
 public:
  // d         - number of floats the generated kernel reduces.
  // type      - reduction operator (MAX or ADD).
  // code_size - size passed to the JitCode base for the code buffer.
  // code_ptr  - optional buffer pointer forwarded to the JitCode base.
  explicit HOPVJitCode(int d, operand_type type, size_t code_size = 256 * 1024,
                       void* code_ptr = nullptr)
      : JitCode(code_size, code_ptr), num_(d), type_(type) {
    if (!(type_ == operand_type::MAX || type_ == operand_type::ADD)) {
      LOG(FATAL) << "Do not support this operand type: " << type_;
    }
    this->genCode();
  }

  // Returns the kernel name for the configured operator.
  // The result is a string literal (static storage duration). The previous
  // implementation returned c_str() of a function-local std::string, which
  // left callers with a dangling pointer once the function returned.
  virtual const char* name() const {
    return type_ == operand_type::MAX ? "VXXJitCode_MAX" : "VXXJitCode_SUM";
  }

  void genCode() override;

 protected:
  // Emits dst = op(src1, src2) for the configured operator: vmaxps for MAX,
  // vaddps for ADD. JMM is the register type (xmm or ymm).
  template <typename JMM>
  void process(JMM& dst, JMM& src1, JMM& src2) {  // NOLINT
    if (type_ == operand_type::MAX) {
      vmaxps(dst, src1, src2);
    } else if (type_ == operand_type::ADD) {
      vaddps(dst, src1, src2);
    }
  }

 private:
  int num_;            // number of floats to reduce
  operand_type type_;  // reduction operator

  // Kernel arguments, taken from the first three ABI parameter registers.
  reg64_t param_src{abi_param1};
  reg64_t param_dst{abi_param2};
  reg64_t param_attr{abi_param3};

  // The xmm_* registers use the same indices as the ymm_* ones, so each xmm
  // name refers to the low 128 bits of the matching ymm register.
  ymm_t ymm_tmp = ymm_t(0);
  ymm_t ymm_src = ymm_t(1);
  ymm_t ymm_dst = ymm_t(2);
  xmm_t xmm_tmp = xmm_t(0);
  xmm_t xmm_src = xmm_t(1);
  xmm_t xmm_dst = xmm_t(2);
};
// Defines name##JitCode, a HOPVJitCode subclass whose constructor fixes the
// operand type to op_type and forwards d, code_size and code_ptr unchanged.
#define DECLARE_HOP_JITCODE(name, op_type) \
class name##JitCode : public HOPVJitCode { \
public: \
explicit name##JitCode(int d, size_t code_size, void* code_ptr = nullptr) \
: HOPVJitCode(d, op_type, code_size, code_ptr) {} \
};
DECLARE_HOP_JITCODE(HMax, operand_type::MAX);
DECLARE_HOP_JITCODE(HSum, operand_type::ADD);
#undef DECLARE_HOP_JITCODE
} // namespace gen
} // namespace jit
} // namespace operators
} // namespace paddle
...@@ -47,6 +47,7 @@ using Label = Xbyak::Label; ...@@ -47,6 +47,7 @@ using Label = Xbyak::Label;
typedef enum { typedef enum {
MUL = 0, MUL = 0,
MAX,
ADD, ADD,
SUB, SUB,
RELU, RELU,
......
...@@ -49,6 +49,9 @@ const char* to_string(KernelType kt) { ...@@ -49,6 +49,9 @@ const char* to_string(KernelType kt) {
ONE_CASE(kNCHW16CMulNC); ONE_CASE(kNCHW16CMulNC);
ONE_CASE(kSeqPool); ONE_CASE(kSeqPool);
ONE_CASE(kMatMul); ONE_CASE(kMatMul);
ONE_CASE(kHMax);
ONE_CASE(kHSum);
ONE_CASE(kSoftmax);
default: default:
PADDLE_THROW("Not support type: %d, or forget to add it.", kt); PADDLE_THROW("Not support type: %d, or forget to add it.", kt);
return "NOT JITKernel"; return "NOT JITKernel";
......
...@@ -118,6 +118,28 @@ typename KernelTuples::func_type Get( ...@@ -118,6 +118,28 @@ typename KernelTuples::func_type Get(
return GetRefer<KT, KernelTuples>(); return GetRefer<KT, KernelTuples>();
} }
// Cache from an int key to a kernel function pointer. Each
// (KernelType, KernelTuples) instantiation has its own cache, and Instance()
// hands out a separate thread_local object per thread.
template <KernelType KT, typename KernelTuples>
class KernelFuncsCache {
 public:
  KernelFuncsCache() = default;

  // Returns the calling thread's cache for this instantiation.
  static KernelFuncsCache& Instance() {
    static thread_local KernelFuncsCache<KT, KernelTuples> cache;
    return cache;
  }

  // True when a function is stored under key.
  bool Has(int key) const { return funcs_.count(key) != 0; }

  // Returns the function stored under key; throws std::out_of_range (from
  // unordered_map::at) when the key is absent.
  typename KernelTuples::func_type At(int key) { return funcs_.at(key); }

  // Stores func under key; an existing entry for key is left unchanged.
  void Insert(int key, typename KernelTuples::func_type func) {
    funcs_.insert({key, func});
  }

 private:
  std::unordered_map<int, typename KernelTuples::func_type> funcs_;
  DISABLE_COPY_AND_ASSIGN(KernelFuncsCache);
};
const char* to_string(KernelType kt); const char* to_string(KernelType kt);
const char* to_string(SeqPoolType kt); const char* to_string(SeqPoolType kt);
......
...@@ -20,6 +20,7 @@ namespace paddle { ...@@ -20,6 +20,7 @@ namespace paddle {
namespace operators { namespace operators {
namespace jit { namespace jit {
// TODO(TJ): reorder by alphabet
typedef enum { typedef enum {
kNone = 0, kNone = 0,
kVMul = 1, kVMul = 1,
...@@ -44,6 +45,9 @@ typedef enum { ...@@ -44,6 +45,9 @@ typedef enum {
kNCHW16CMulNC, kNCHW16CMulNC,
kSeqPool, kSeqPool,
kMatMul, kMatMul,
kHSum, // horizontal sum
kHMax, // horizontal max
kSoftmax,
} KernelType; } KernelType;
typedef enum { typedef enum {
...@@ -70,6 +74,10 @@ struct XYNTuples { ...@@ -70,6 +74,10 @@ struct XYNTuples {
typedef void (*func_type)(const T*, T*, int); typedef void (*func_type)(const T*, T*, int);
}; };
// x, return and int
template <typename T>
struct XRNTuples : public XYNTuples<T> {};
typedef struct { typedef struct {
void* gates; // gates: x_ch, x_ih, x_fh, x_oh void* gates; // gates: x_ch, x_ih, x_fh, x_oh
const void* ct_1; const void* ct_1;
...@@ -159,6 +167,13 @@ struct LayerNormTuples { ...@@ -159,6 +167,13 @@ struct LayerNormTuples {
const float, int); const float, int);
}; };
// Type bundle for softmax kernels: element type T, an int attribute, and a
// function pointer taking (const T*, T*, int, int).
template <typename T>
struct SoftmaxTuples {
  using data_type = T;
  using attr_type = int;
  using func_type = void (*)(const T*, T*, int, int);
};
// nChw16c = nChw16c .* NC // nChw16c = nChw16c .* NC
template <typename T> template <typename T>
struct NCHW16CMulNCTuples { struct NCHW16CMulNCTuples {
......
...@@ -12,3 +12,4 @@ USE_JITKERNEL_MORE(kLSTMC1H1, mix) ...@@ -12,3 +12,4 @@ USE_JITKERNEL_MORE(kLSTMC1H1, mix)
USE_JITKERNEL_MORE(kGRUH1, mix) USE_JITKERNEL_MORE(kGRUH1, mix)
USE_JITKERNEL_MORE(kGRUHtPart1, mix) USE_JITKERNEL_MORE(kGRUHtPart1, mix)
USE_JITKERNEL_MORE(kGRUHtPart2, mix) USE_JITKERNEL_MORE(kGRUHtPart2, mix)
USE_JITKERNEL_MORE(kSoftmax, mix)
...@@ -48,6 +48,65 @@ void VTanh(const T* x, T* y, int n) { ...@@ -48,6 +48,65 @@ void VTanh(const T* x, T* y, int n) {
compute_addbias(&b, y, y, n); compute_addbias(&b, y, y, n);
} }
void Softmax(const T* x, T* y, int n, int bs) {
typename XRNTuples<T>::func_type compute_hmax{nullptr};
typename XRNTuples<T>::func_type compute_hsum{nullptr};
typename AXYNTuples<T>::func_type compute_vscal{nullptr};
typename AXYNTuples<T>::func_type compute_vaddbias{nullptr};
typename XYNTuples<T>::func_type compute_vexp{nullptr};
if (!KernelFuncsCache<kHMax, XRNTuples<T>>::Instance().Has(n)) {
compute_hmax = Get<kHMax, XRNTuples<T>, platform::CPUPlace>(n);
KernelFuncsCache<kHMax, XRNTuples<T>>::Instance().Insert(n, compute_hmax);
} else {
compute_hmax = KernelFuncsCache<kHMax, XRNTuples<T>>::Instance().At(n);
}
if (!KernelFuncsCache<kHSum, XRNTuples<T>>::Instance().Has(n)) {
compute_hsum = Get<kHSum, XRNTuples<T>, platform::CPUPlace>(n);
KernelFuncsCache<kHSum, XRNTuples<T>>::Instance().Insert(n, compute_hsum);
} else {
compute_hsum = KernelFuncsCache<kHSum, XRNTuples<T>>::Instance().At(n);
}
if (!KernelFuncsCache<kVScal, AXYNTuples<T>>::Instance().Has(n)) {
compute_vscal = Get<kVScal, AXYNTuples<T>, platform::CPUPlace>(n);
KernelFuncsCache<kVScal, AXYNTuples<T>>::Instance().Insert(n,
compute_vscal);
} else {
compute_vscal = KernelFuncsCache<kVScal, AXYNTuples<T>>::Instance().At(n);
}
if (!KernelFuncsCache<kVAddBias, AXYNTuples<T>>::Instance().Has(n)) {
compute_vaddbias = Get<kVAddBias, AXYNTuples<T>, platform::CPUPlace>(n);
KernelFuncsCache<kVAddBias, AXYNTuples<T>>::Instance().Insert(
n, compute_vaddbias);
} else {
compute_vaddbias =
KernelFuncsCache<kVAddBias, AXYNTuples<T>>::Instance().At(n);
}
if (!KernelFuncsCache<kVExp, XYNTuples<T>>::Instance().Has(n)) {
compute_vexp = Get<KernelType::kVExp, XYNTuples<T>, platform::CPUPlace>(n);
KernelFuncsCache<kVExp, XYNTuples<T>>::Instance().Insert(n, compute_vexp);
} else {
compute_vexp = KernelFuncsCache<kVExp, XYNTuples<T>>::Instance().At(n);
}
for (int i = 0; i < bs; ++i) {
T scalar;
compute_hmax(x, &scalar, n);
scalar = static_cast<T>(0) - scalar;
compute_vaddbias(&scalar, x, y, n); // x - max
compute_vexp(y, y, n);
compute_hsum(y, &scalar, n);
scalar = static_cast<T>(1) / scalar;
compute_vscal(&scalar, y, y, n);
x += n;
y += n;
}
}
void (*getActFunc(KernelType type, int d))(const T*, T*, int) { // NOLINT void (*getActFunc(KernelType type, int d))(const T*, T*, int) { // NOLINT
if (type == kVSigmoid) { if (type == kVSigmoid) {
return Get<kVSigmoid, XYNTuples<T>, platform::CPUPlace>(d); return Get<kVSigmoid, XYNTuples<T>, platform::CPUPlace>(d);
...@@ -184,6 +243,8 @@ bool VSigmoidKernel::UseMe(const int& d) const { return true; } ...@@ -184,6 +243,8 @@ bool VSigmoidKernel::UseMe(const int& d) const { return true; }
bool VTanhKernel::UseMe(const int& d) const { return true; } bool VTanhKernel::UseMe(const int& d) const { return true; }
bool SoftmaxKernel::UseMe(const int& d) const { return true; }
bool LSTMCtHtKernel::UseMe(const lstm_attr_t& attr) const { return true; } bool LSTMCtHtKernel::UseMe(const lstm_attr_t& attr) const { return true; }
bool LSTMC1H1Kernel::UseMe(const lstm_attr_t& attr) const { return true; } bool LSTMC1H1Kernel::UseMe(const lstm_attr_t& attr) const { return true; }
...@@ -207,6 +268,7 @@ namespace mix = paddle::operators::jit::more::mix; ...@@ -207,6 +268,7 @@ namespace mix = paddle::operators::jit::more::mix;
REGISTER_MORE_KERNEL(kVSigmoid, VSigmoid); REGISTER_MORE_KERNEL(kVSigmoid, VSigmoid);
REGISTER_MORE_KERNEL(kVTanh, VTanh); REGISTER_MORE_KERNEL(kVTanh, VTanh);
REGISTER_MORE_KERNEL(kSoftmax, Softmax);
REGISTER_MORE_KERNEL(kLSTMCtHt, LSTMCtHt); REGISTER_MORE_KERNEL(kLSTMCtHt, LSTMCtHt);
REGISTER_MORE_KERNEL(kLSTMC1H1, LSTMC1H1); REGISTER_MORE_KERNEL(kLSTMC1H1, LSTMC1H1);
REGISTER_MORE_KERNEL(kGRUH1, GRUH1); REGISTER_MORE_KERNEL(kGRUH1, GRUH1);
......
...@@ -26,6 +26,7 @@ using T = float; ...@@ -26,6 +26,7 @@ using T = float;
void VSigmoid(const T* x, T* y, int n); void VSigmoid(const T* x, T* y, int n);
void VTanh(const T* x, T* y, int n); void VTanh(const T* x, T* y, int n);
void Softmax(const T* x, T* y, int n, int bs);
void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr); void LSTMCtHt(lstm_t* step, const lstm_attr_t* attr);
void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr); void LSTMC1H1(lstm_t* step, const lstm_attr_t* attr);
...@@ -45,6 +46,9 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr); ...@@ -45,6 +46,9 @@ void GRUHtPart2(gru_t* step, const gru_attr_t* attr);
DECLARE_MORE_KERNEL(VSigmoid, XYNTuples); DECLARE_MORE_KERNEL(VSigmoid, XYNTuples);
DECLARE_MORE_KERNEL(VTanh, XYNTuples); DECLARE_MORE_KERNEL(VTanh, XYNTuples);
// XRN
DECLARE_MORE_KERNEL(Softmax, SoftmaxTuples);
DECLARE_MORE_KERNEL(LSTMCtHt, LSTMTuples); DECLARE_MORE_KERNEL(LSTMCtHt, LSTMTuples);
DECLARE_MORE_KERNEL(LSTMC1H1, LSTMTuples); DECLARE_MORE_KERNEL(LSTMC1H1, LSTMTuples);
......
...@@ -12,3 +12,4 @@ USE_JITKERNEL_MORE(kVSquare, mkl) ...@@ -12,3 +12,4 @@ USE_JITKERNEL_MORE(kVSquare, mkl)
USE_JITKERNEL_MORE(kVSigmoid, mkl) USE_JITKERNEL_MORE(kVSigmoid, mkl)
USE_JITKERNEL_MORE(kVTanh, mkl) USE_JITKERNEL_MORE(kVTanh, mkl)
USE_JITKERNEL_MORE(kSeqPool, mkl) USE_JITKERNEL_MORE(kSeqPool, mkl)
USE_JITKERNEL_MORE(kSoftmax, mkl)
...@@ -116,6 +116,16 @@ void VAXPY<double>(double a, const double* x, double* y, int n) { ...@@ -116,6 +116,16 @@ void VAXPY<double>(double a, const double* x, double* y, int n) {
platform::dynload::cblas_daxpy(n, a, x, 1, y, 1); platform::dynload::cblas_daxpy(n, a, x, 1, y, 1);
} }
// float specialization: stores the value returned by cblas_sasum over the n
// contiguous (stride 1) elements of x into res[0].
template <>
void ASum<float>(const float* x, float* res, int n) {
  *res = platform::dynload::cblas_sasum(n, x, 1);
}
// double specialization: stores the value returned by cblas_dasum over the n
// contiguous (stride 1) elements of x into res[0].
template <>
void ASum<double>(const double* x, double* res, int n) {
  *res = platform::dynload::cblas_dasum(n, x, 1);
}
// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512 // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
template <> template <>
bool MatMulKernel<float>::UseMe(const int& d) const { bool MatMulKernel<float>::UseMe(const int& d) const {
...@@ -167,6 +177,12 @@ bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const { ...@@ -167,6 +177,12 @@ bool SeqPoolKernel<double>::UseMe(const seq_pool_attr_t& attr) const {
return true; return true;
} }
template <>
bool SoftmaxKernel<float>::UseMe(const int& d) const {
// tuned on avx2
return platform::MayIUse(platform::avx) && d < 60;
}
#define AWALYS_USE_ME_WITH_DOUBLE(func) \ #define AWALYS_USE_ME_WITH_DOUBLE(func) \
template <> \ template <> \
bool func##Kernel<double>::UseMe(const int& d) const { \ bool func##Kernel<double>::UseMe(const int& d) const { \
...@@ -181,6 +197,7 @@ AWALYS_USE_ME_WITH_DOUBLE(VExp); ...@@ -181,6 +197,7 @@ AWALYS_USE_ME_WITH_DOUBLE(VExp);
AWALYS_USE_ME_WITH_DOUBLE(VSigmoid); AWALYS_USE_ME_WITH_DOUBLE(VSigmoid);
AWALYS_USE_ME_WITH_DOUBLE(VTanh); AWALYS_USE_ME_WITH_DOUBLE(VTanh);
AWALYS_USE_ME_WITH_DOUBLE(VSquare); AWALYS_USE_ME_WITH_DOUBLE(VSquare);
AWALYS_USE_ME_WITH_DOUBLE(Softmax);
#undef AWALYS_USE_ME_WITH_DOUBLE #undef AWALYS_USE_ME_WITH_DOUBLE
} // namespace mkl } // namespace mkl
...@@ -204,5 +221,6 @@ REGISTER_MKL_KERNEL(kVSquare, VSquare); ...@@ -204,5 +221,6 @@ REGISTER_MKL_KERNEL(kVSquare, VSquare);
REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid); REGISTER_MKL_KERNEL(kVSigmoid, VSigmoid);
REGISTER_MKL_KERNEL(kVTanh, VTanh); REGISTER_MKL_KERNEL(kVTanh, VTanh);
REGISTER_MKL_KERNEL(kSeqPool, SeqPool); REGISTER_MKL_KERNEL(kSeqPool, SeqPool);
REGISTER_MKL_KERNEL(kSoftmax, Softmax);
#undef REGISTER_MKL_KERNEL #undef REGISTER_MKL_KERNEL
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <cmath> #include <cmath>
#include <type_traits> #include <type_traits>
#include <vector>
#include "paddle/fluid/operators/jit/kernel_base.h" #include "paddle/fluid/operators/jit/kernel_base.h"
namespace paddle { namespace paddle {
...@@ -90,6 +91,30 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) { ...@@ -90,6 +91,30 @@ void SeqPool(const T* x, T* y, const seq_pool_attr_t* attr) {
} }
} }
// Reduces the n elements of x into res[0]. Only declared here; the
// specializations are defined elsewhere.
// NOTE(review): presumably backed by BLAS ?asum (sum of absolute values) -
// confirm against the specializations in the .cc file.
template <typename T>
void ASum(const T* x, T* res, int n);
// Softmax over bs consecutive rows of n elements each:
//   y = exp(x - max(x));  y = y / sum(y)
// x - input, bs * n elements.
// y - output, bs * n elements.
// n - row length.
// bs - number of rows.
// The row maximum is read from x before the row of y is written.
template <typename T>
void Softmax(const T* x, T* y, int n, int bs) {
  // Pass 1: subtract each row's maximum. The maximum is only needed inside
  // its own iteration, so a local scalar replaces the former per-call
  // std::vector<T>(bs) and its heap allocation.
  for (int i = 0; i < bs; ++i) {
    const T* x_row = x + i * n;
    T* y_row = y + i * n;
    T row_max = x_row[0];
    for (int c = 1; c < n; ++c) {
      row_max = x_row[c] > row_max ? x_row[c] : row_max;
    }
    for (int c = 0; c < n; ++c) {
      y_row[c] = x_row[c] - row_max;
    }
  }
  // Pass 2: exponentiate the whole buffer with one call.
  VExp(y, y, n * bs);
  // Pass 3: scale each row by the reciprocal of its ASum result.
  for (int i = 0; i < bs; ++i) {
    T sum;
    ASum(&y[i * n], &sum, n);
    sum = static_cast<T>(1) / sum;
    VScal(&sum, &y[i * n], &y[i * n], n);
  }
}
#define DECLARE_MKL_KERNEL(name, tuples) \ #define DECLARE_MKL_KERNEL(name, tuples) \
template <typename T> \ template <typename T> \
class name##Kernel : public KernelMore<tuples<T>> { \ class name##Kernel : public KernelMore<tuples<T>> { \
...@@ -117,6 +142,8 @@ DECLARE_MKL_KERNEL(VSquare, XYNTuples); ...@@ -117,6 +142,8 @@ DECLARE_MKL_KERNEL(VSquare, XYNTuples);
DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples); DECLARE_MKL_KERNEL(SeqPool, SeqPoolTuples);
DECLARE_MKL_KERNEL(Softmax, SoftmaxTuples);
#undef DECLARE_MKL_KERNEL #undef DECLARE_MKL_KERNEL
} // namespace mkl } // namespace mkl
......
...@@ -29,3 +29,6 @@ USE_JITKERNEL_REFER(kNCHW16CMulNC) ...@@ -29,3 +29,6 @@ USE_JITKERNEL_REFER(kNCHW16CMulNC)
USE_JITKERNEL_REFER(kSeqPool) USE_JITKERNEL_REFER(kSeqPool)
USE_JITKERNEL_REFER(kMatMul) USE_JITKERNEL_REFER(kMatMul)
USE_JITKERNEL_REFER(kVSquare) USE_JITKERNEL_REFER(kVSquare)
USE_JITKERNEL_REFER(kHSum)
USE_JITKERNEL_REFER(kHMax)
USE_JITKERNEL_REFER(kSoftmax)
...@@ -52,4 +52,9 @@ REGISTER_REFER_KERNEL(kSeqPool, SeqPool); ...@@ -52,4 +52,9 @@ REGISTER_REFER_KERNEL(kSeqPool, SeqPool);
REGISTER_REFER_KERNEL(kMatMul, MatMul); REGISTER_REFER_KERNEL(kMatMul, MatMul);
REGISTER_REFER_KERNEL(kHMax, HMax);
REGISTER_REFER_KERNEL(kHSum, HSum);
REGISTER_REFER_KERNEL(kSoftmax, Softmax);
#undef REGISTER_REFER_KERNEL #undef REGISTER_REFER_KERNEL
...@@ -378,6 +378,40 @@ void MatMul(const T* A, const T* B, T* C, int M, int N, int K) { ...@@ -378,6 +378,40 @@ void MatMul(const T* A, const T* B, T* C, int M, int N, int K) {
} }
} }
// Horizontal max: writes the largest of x[0..n-1] to res[0].
// x[0] is read unconditionally, so n is expected to be at least 1.
template <typename T>
void HMax(const T* x, T* res, int n) {
  T& best = res[0];
  best = x[0];
  int i = 1;
  while (i < n) {
    if (best < x[i]) {
      best = x[i];
    }
    ++i;
  }
}
// Horizontal sum: writes x[0] + x[1] + ... + x[n-1], accumulated in index
// order, to res[0]. x[0] is read unconditionally, so n is expected to be at
// least 1.
template <typename T>
void HSum(const T* x, T* res, int n) {
  T& total = res[0];
  total = x[0];
  int i = 1;
  while (i < n) {
    total += x[i];
    ++i;
  }
}
// Reference softmax over bs consecutive rows of n elements each:
//   y = e^(x - max(x))
//   y = y / sum(y)
// x and y are advanced by n after every row.
template <typename T>
void Softmax(const T* x, T* y, int n, int bs = 1) {
  for (int row = 0; row < bs; ++row, x += n, y += n) {
    T row_max;
    HMax(x, &row_max, n);
    T neg_max = static_cast<T>(0) - row_max;
    VAddBias(&neg_max, x, y, n);  // y = x - max
    VExp(y, y, n);
    T row_sum;
    HSum(y, &row_sum, n);
    T inv_sum = static_cast<T>(1) / row_sum;
    VScal(&inv_sum, y, y, n);
  }
}
#define DECLARE_REFER_KERNEL(name, tuples) \ #define DECLARE_REFER_KERNEL(name, tuples) \
template <typename T> \ template <typename T> \
class name##Kernel : public ReferKernel<tuples<T>> { \ class name##Kernel : public ReferKernel<tuples<T>> { \
...@@ -421,6 +455,11 @@ DECLARE_REFER_KERNEL(SeqPool, SeqPoolTuples); ...@@ -421,6 +455,11 @@ DECLARE_REFER_KERNEL(SeqPool, SeqPoolTuples);
DECLARE_REFER_KERNEL(MatMul, MatMulTuples); DECLARE_REFER_KERNEL(MatMul, MatMulTuples);
DECLARE_REFER_KERNEL(HMax, XRNTuples);
DECLARE_REFER_KERNEL(HSum, XRNTuples);
DECLARE_REFER_KERNEL(Softmax, SoftmaxTuples);
#undef DECLARE_REFER_KERNEL #undef DECLARE_REFER_KERNEL
} // namespace refer } // namespace refer
......
...@@ -61,6 +61,7 @@ std::vector<int> TestSizes() { ...@@ -61,6 +61,7 @@ std::vector<int> TestSizes() {
} }
namespace jit = paddle::operators::jit; namespace jit = paddle::operators::jit;
using CPUPlace = paddle::platform::CPUPlace;
template <typename KernelTuples, typename... Args> template <typename KernelTuples, typename... Args>
struct TestFuncWithRefer { struct TestFuncWithRefer {
...@@ -121,6 +122,40 @@ struct TestFuncWithRefer<jit::AXYNTuples<T>, T, std::vector<T>, ...@@ -121,6 +122,40 @@ struct TestFuncWithRefer<jit::AXYNTuples<T>, T, std::vector<T>,
} }
}; };
// Checks a softmax implementation against reference output yref, first with
// a separate output buffer and then with input and output sharing one buffer.
template <typename T>
struct TestFuncWithRefer<jit::SoftmaxTuples<T>, std::vector<T>, std::vector<T>,
                         int, int> {
  void operator()(const typename jit::SoftmaxTuples<T>::func_type tgt,
                  const std::vector<T>& x, const std::vector<T>& yref, int n,
                  int bs) {
    EXPECT_TRUE(tgt != nullptr);
    EXPECT_EQ(yref.size(), x.size());
    EXPECT_EQ(x.size(), static_cast<size_t>(n * bs));

    const int total = n * bs;
    std::vector<T> out(total);

    // Out-of-place run.
    tgt(x.data(), out.data(), n, bs);
    ExpectEQ<T>(out.data(), yref.data(), total);

    // In-place run: the output buffer starts as a copy of the input.
    std::copy(x.begin(), x.end(), out.begin());
    tgt(out.data(), out.data(), n, bs);
    ExpectEQ<T>(out.data(), yref.data(), total);
  }
};
// Checks a horizontal-reduction implementation: runs tgt over all of x and
// compares the single result with ref_res.
template <typename T>
struct TestFuncWithRefer<jit::XRNTuples<T>, std::vector<T>, T> {
  void operator()(const typename jit::XRNTuples<T>::func_type tgt,
                  const std::vector<T>& x, const T ref_res) {
    EXPECT_TRUE(tgt != nullptr);
    T actual;
    const T* input = x.data();
    tgt(input, &actual, x.size());
    ExpectEQ<T>(&actual, &ref_res, 1);
  }
};
template <typename T> template <typename T>
struct TestFuncWithRefer<jit::XYNTuples<T>, std::vector<T>, std::vector<T>> { struct TestFuncWithRefer<jit::XYNTuples<T>, std::vector<T>, std::vector<T>> {
void operator()(const typename jit::XYNTuples<T>::func_type tgt, void operator()(const typename jit::XYNTuples<T>::func_type tgt,
...@@ -172,7 +207,7 @@ struct TestFuncWithRefer<jit::LSTMTuples<T>, std::vector<T>, std::vector<T>, ...@@ -172,7 +207,7 @@ struct TestFuncWithRefer<jit::LSTMTuples<T>, std::vector<T>, std::vector<T>,
T* ht_data = ht.data(); T* ht_data = ht.data();
T* checked_data = checked.data(); T* checked_data = checked.data();
paddle::operators::jit::lstm_t step; jit::lstm_t step;
step.gates = x_data; step.gates = x_data;
step.ct_1 = ct_1_data; step.ct_1 = ct_1_data;
step.ct = ct_data; step.ct = ct_data;
...@@ -208,7 +243,7 @@ struct TestFuncWithRefer<jit::GRUTuples<T>, std::vector<T>, std::vector<T>, ...@@ -208,7 +243,7 @@ struct TestFuncWithRefer<jit::GRUTuples<T>, std::vector<T>, std::vector<T>,
const T* ht_ref_data = ht_ref.data(); const T* ht_ref_data = ht_ref.data();
T* x_data = x.data(); T* x_data = x.data();
T* ht_data = ht.data(); T* ht_data = ht.data();
paddle::operators::jit::gru_t step; jit::gru_t step;
step.gates = x_data; step.gates = x_data;
step.ht_1 = ht_1_data; step.ht_1 = ht_1_data;
step.ht = ht_data; step.ht = ht_data;
...@@ -255,8 +290,8 @@ struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>, ...@@ -255,8 +290,8 @@ struct TestFuncWithRefer<jit::MatMulTuples<T>, std::vector<T>, std::vector<T>,
} }
}; };
template <paddle::operators::jit::KernelType KT, typename KernelTuples, template <jit::KernelType KT, typename KernelTuples, typename PlaceType,
typename PlaceType, typename... Args> typename... Args>
void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
TestFuncWithRefer<KernelTuples, Args...> test; TestFuncWithRefer<KernelTuples, Args...> test;
// test jitcode // test jitcode
...@@ -286,9 +321,8 @@ void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) { ...@@ -286,9 +321,8 @@ void TestAllImpls(const typename KernelTuples::attr_type& attr, Args... args) {
test(tgt, args...); test(tgt, args...);
} }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> template <jit::KernelType KT, typename T, typename PlaceType>
void TestXYZNKernel() { void TestXYZNKernel() {
namespace jit = paddle::operators::jit;
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
for (int d : TestSizes()) { for (int d : TestSizes()) {
auto ref = jit::GetRefer<KT, jit::XYZNTuples<T>>(); auto ref = jit::GetRefer<KT, jit::XYZNTuples<T>>();
...@@ -320,9 +354,8 @@ void TestXYZNKernel() { ...@@ -320,9 +354,8 @@ void TestXYZNKernel() {
} }
} }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> template <jit::KernelType KT, typename T, typename PlaceType>
void TestAXYNKernel() { void TestAXYNKernel() {
namespace jit = paddle::operators::jit;
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
for (int d : TestSizes()) { for (int d : TestSizes()) {
auto ref = jit::GetRefer<KT, jit::AXYNTuples<T>>(); auto ref = jit::GetRefer<KT, jit::AXYNTuples<T>>();
...@@ -347,9 +380,26 @@ void TestAXYNKernel() { ...@@ -347,9 +380,26 @@ void TestAXYNKernel() {
} }
} }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> template <jit::KernelType KT, typename T, typename PlaceType>
void TestXRNKernel() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
auto last_acc = acc;
acc = 1e-4;
for (int d : TestSizes()) {
auto ref = jit::GetRefer<KT, jit::XRNTuples<T>>();
EXPECT_TRUE(ref != nullptr);
std::vector<T> x(d);
RandomVec<T>(d, x.data(), -2.f, 2.f);
T ref_res;
ref(x.data(), &ref_res, d);
TestAllImpls<KT, jit::XRNTuples<T>, PlaceType, std::vector<T>, T>(d, x,
ref_res);
}
acc = last_acc;
}
template <jit::KernelType KT, typename T, typename PlaceType>
void TestXYNKernel() { void TestXYNKernel() {
namespace jit = paddle::operators::jit;
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
for (int d : TestSizes()) { for (int d : TestSizes()) {
auto ref = jit::GetRefer<KT, jit::XYNTuples<T>>(); auto ref = jit::GetRefer<KT, jit::XYNTuples<T>>();
...@@ -373,9 +423,8 @@ void TestXYNKernel() { ...@@ -373,9 +423,8 @@ void TestXYNKernel() {
} }
} }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> template <jit::KernelType KT, typename T, typename PlaceType>
void TestLSTMKernel() { void TestLSTMKernel() {
namespace jit = paddle::operators::jit;
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"}; std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
for (int d : TestSizes()) { for (int d : TestSizes()) {
...@@ -424,9 +473,8 @@ void TestLSTMKernel() { ...@@ -424,9 +473,8 @@ void TestLSTMKernel() {
} }
} }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> template <jit::KernelType KT, typename T, typename PlaceType>
void TestGRUKernel() { void TestGRUKernel() {
namespace jit = paddle::operators::jit;
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"}; std::vector<std::string> all_acts = {"sigmoid", "tanh", "relu", "identity"};
for (int d : TestSizes()) { for (int d : TestSizes()) {
...@@ -459,7 +507,7 @@ void TestGRUKernel() { ...@@ -459,7 +507,7 @@ void TestGRUKernel() {
} }
} }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> template <jit::KernelType KT, typename T, typename PlaceType>
void TestSeqPoolKernel() { void TestSeqPoolKernel() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
std::vector<jit::SeqPoolType> pool_types = { std::vector<jit::SeqPoolType> pool_types = {
...@@ -484,7 +532,7 @@ void TestSeqPoolKernel() { ...@@ -484,7 +532,7 @@ void TestSeqPoolKernel() {
} }
} }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> template <jit::KernelType KT, typename T, typename PlaceType>
void TestMatMulKernel() { void TestMatMulKernel() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
auto last_acc = acc; auto last_acc = acc;
...@@ -510,7 +558,32 @@ void TestMatMulKernel() { ...@@ -510,7 +558,32 @@ void TestMatMulKernel() {
acc = last_acc; acc = last_acc;
} }
template <paddle::operators::jit::KernelType KT, typename T, typename PlaceType> template <jit::KernelType KT, typename T, typename PlaceType>
void TestSoftmaxKernel() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
for (int bs : {1, 2, 10}) {
for (int n : TestSizes()) {
auto ref = jit::GetRefer<KT, jit::SoftmaxTuples<T>>();
EXPECT_TRUE(ref != nullptr);
std::vector<T> x(bs * n), y(bs * n);
RandomVec<T>(bs * n, x.data(), -2.f, 2.f);
const T* x_data = x.data();
T* y_data = y.data();
std::vector<T> xinp(x.size()); // inplace test
std::copy(x.begin(), x.end(), xinp.begin());
ref(x_data, y_data, n, bs);
T* xinp_data = xinp.data();
ref(xinp_data, xinp_data, n, bs);
ExpectEQ<T>(xinp_data, y_data, n * bs);
TestAllImpls<KT, jit::SoftmaxTuples<T>, PlaceType, std::vector<T>,
std::vector<T>>(n, x, y, n, bs);
}
}
}
template <jit::KernelType KT, typename T, typename PlaceType>
void TestNCHW16CMulNCKernel() { void TestNCHW16CMulNCKernel() {
VLOG(10) << "===== Test JITKernel " << jit::to_string(KT); VLOG(10) << "===== Test JITKernel " << jit::to_string(KT);
const int n = 3, c = 16 * 4, h = 10, w = 10; const int n = 3, c = 16 * 4, h = 10, w = 10;
...@@ -565,129 +638,123 @@ void TestNCHW16CMulNCKernel() { ...@@ -565,129 +638,123 @@ void TestNCHW16CMulNCKernel() {
// XYZNTuple // XYZNTuple
TEST(JITKernel, kVMul) { TEST(JITKernel, kVMul) {
namespace jit = paddle::operators::jit; TestXYZNKernel<jit::kVMul, float, CPUPlace>();
TestXYZNKernel<jit::kVMul, float, paddle::platform::CPUPlace>(); TestXYZNKernel<jit::kVMul, double, CPUPlace>();
TestXYZNKernel<jit::kVMul, double, paddle::platform::CPUPlace>();
} }
TEST(JITKernel, kVAdd) { TEST(JITKernel, kVAdd) {
namespace jit = paddle::operators::jit; TestXYZNKernel<jit::kVAdd, float, CPUPlace>();
TestXYZNKernel<jit::kVAdd, float, paddle::platform::CPUPlace>(); TestXYZNKernel<jit::kVAdd, double, CPUPlace>();
TestXYZNKernel<jit::kVAdd, double, paddle::platform::CPUPlace>();
} }
TEST(JITKernel, kVAddRelu) { TEST(JITKernel, kVAddRelu) {
namespace jit = paddle::operators::jit; TestXYZNKernel<jit::kVAddRelu, float, CPUPlace>();
TestXYZNKernel<jit::kVAddRelu, float, paddle::platform::CPUPlace>(); TestXYZNKernel<jit::kVAddRelu, double, CPUPlace>();
TestXYZNKernel<jit::kVAddRelu, double, paddle::platform::CPUPlace>();
} }
TEST(JITKernel, kVSub) { TEST(JITKernel, kVSub) {
namespace jit = paddle::operators::jit; TestXYZNKernel<jit::kVSub, float, CPUPlace>();
TestXYZNKernel<jit::kVSub, float, paddle::platform::CPUPlace>(); TestXYZNKernel<jit::kVSub, double, CPUPlace>();
TestXYZNKernel<jit::kVSub, double, paddle::platform::CPUPlace>();
} }
// AXYNTuples // AXYNTuples
TEST(JITKernel, kVScal) { TEST(JITKernel, kVScal) {
namespace jit = paddle::operators::jit; TestAXYNKernel<jit::kVScal, float, CPUPlace>();
TestAXYNKernel<jit::kVScal, float, paddle::platform::CPUPlace>(); TestAXYNKernel<jit::kVScal, double, CPUPlace>();
TestAXYNKernel<jit::kVScal, double, paddle::platform::CPUPlace>();
} }
TEST(JITKernel, kVAddBias) { TEST(JITKernel, kVAddBias) {
namespace jit = paddle::operators::jit; TestAXYNKernel<jit::kVAddBias, float, CPUPlace>();
TestAXYNKernel<jit::kVAddBias, float, paddle::platform::CPUPlace>(); TestAXYNKernel<jit::kVAddBias, double, CPUPlace>();
TestAXYNKernel<jit::kVAddBias, double, paddle::platform::CPUPlace>(); }
// XRNTuples
// Runs TestXRNKernel for jit::kHMax with float and double element types on
// CPUPlace.
TEST(JITKernel, kHMax) {
TestXRNKernel<jit::kHMax, float, CPUPlace>();
TestXRNKernel<jit::kHMax, double, CPUPlace>();
}
TEST(JITKernel, kHSum) {
TestXRNKernel<jit::kHSum, float, CPUPlace>();
TestXRNKernel<jit::kHSum, double, CPUPlace>();
} }
// XYNTuples // XYNTuples
TEST(JITKernel, kVRelu) { TEST(JITKernel, kVRelu) {
namespace jit = paddle::operators::jit; TestXYNKernel<jit::kVRelu, float, CPUPlace>();
TestXYNKernel<jit::kVRelu, float, paddle::platform::CPUPlace>(); TestXYNKernel<jit::kVRelu, double, CPUPlace>();
TestXYNKernel<jit::kVRelu, double, paddle::platform::CPUPlace>();
} }
TEST(JITKernel, kVIdentity) { TEST(JITKernel, kVIdentity) {
namespace jit = paddle::operators::jit; TestXYNKernel<jit::kVIdentity, float, CPUPlace>();
TestXYNKernel<jit::kVIdentity, float, paddle::platform::CPUPlace>(); TestXYNKernel<jit::kVIdentity, double, CPUPlace>();
TestXYNKernel<jit::kVIdentity, double, paddle::platform::CPUPlace>();
} }
TEST(JITKernel, kVSquare) { TEST(JITKernel, kVSquare) {
namespace jit = paddle::operators::jit; TestXYNKernel<jit::kVSquare, float, CPUPlace>();
TestXYNKernel<jit::kVSquare, float, paddle::platform::CPUPlace>(); TestXYNKernel<jit::kVSquare, double, CPUPlace>();
TestXYNKernel<jit::kVSquare, double, paddle::platform::CPUPlace>();
} }
TEST(JITKernel, kVExp) { TEST(JITKernel, kVExp) {
namespace jit = paddle::operators::jit; TestXYNKernel<jit::kVExp, float, CPUPlace>();
TestXYNKernel<jit::kVExp, float, paddle::platform::CPUPlace>(); TestXYNKernel<jit::kVExp, double, CPUPlace>();
TestXYNKernel<jit::kVExp, double, paddle::platform::CPUPlace>();
} }
TEST(JITKernel, kVSigmoid) { TEST(JITKernel, kVSigmoid) {
namespace jit = paddle::operators::jit; TestXYNKernel<jit::kVSigmoid, float, CPUPlace>();
TestXYNKernel<jit::kVSigmoid, float, paddle::platform::CPUPlace>(); TestXYNKernel<jit::kVSigmoid, double, CPUPlace>();
TestXYNKernel<jit::kVSigmoid, double, paddle::platform::CPUPlace>();
} }
TEST(JITKernel, kVTanh) { TEST(JITKernel, kVTanh) {
namespace jit = paddle::operators::jit; TestXYNKernel<jit::kVTanh, float, CPUPlace>();
TestXYNKernel<jit::kVTanh, float, paddle::platform::CPUPlace>(); TestXYNKernel<jit::kVTanh, double, CPUPlace>();
TestXYNKernel<jit::kVTanh, double, paddle::platform::CPUPlace>();
} }
// LSTM // LSTM
TEST(JITKernel, kLSTMCtHt) { TEST(JITKernel, kLSTMCtHt) {
namespace jit = paddle::operators::jit; TestLSTMKernel<jit::kLSTMCtHt, float, CPUPlace>();
TestLSTMKernel<jit::kLSTMCtHt, float, paddle::platform::CPUPlace>(); TestLSTMKernel<jit::kLSTMCtHt, double, CPUPlace>();
TestLSTMKernel<jit::kLSTMCtHt, double, paddle::platform::CPUPlace>();
} }
TEST(JITKernel, kLSTMC1H1) { TEST(JITKernel, kLSTMC1H1) {
namespace jit = paddle::operators::jit; TestLSTMKernel<jit::kLSTMC1H1, float, CPUPlace>();
TestLSTMKernel<jit::kLSTMC1H1, float, paddle::platform::CPUPlace>(); TestLSTMKernel<jit::kLSTMC1H1, double, CPUPlace>();
TestLSTMKernel<jit::kLSTMC1H1, double, paddle::platform::CPUPlace>();
} }
// GRU // GRU
TEST(JITKernel, kGRUH1) { TEST(JITKernel, kGRUH1) {
namespace jit = paddle::operators::jit; TestGRUKernel<jit::kGRUH1, float, CPUPlace>();
TestGRUKernel<jit::kGRUH1, float, paddle::platform::CPUPlace>(); TestGRUKernel<jit::kGRUH1, double, CPUPlace>();
TestGRUKernel<jit::kGRUH1, double, paddle::platform::CPUPlace>();
} }
TEST(JITKernel, kGRUHtPart1) { TEST(JITKernel, kGRUHtPart1) {
namespace jit = paddle::operators::jit; TestGRUKernel<jit::kGRUHtPart1, float, CPUPlace>();
TestGRUKernel<jit::kGRUHtPart1, float, paddle::platform::CPUPlace>(); TestGRUKernel<jit::kGRUHtPart1, double, CPUPlace>();
TestGRUKernel<jit::kGRUHtPart1, double, paddle::platform::CPUPlace>();
} }
TEST(JITKernel, kGRUHtPart2) { TEST(JITKernel, kGRUHtPart2) {
namespace jit = paddle::operators::jit; TestGRUKernel<jit::kGRUHtPart2, float, CPUPlace>();
TestGRUKernel<jit::kGRUHtPart2, float, paddle::platform::CPUPlace>(); TestGRUKernel<jit::kGRUHtPart2, double, CPUPlace>();
TestGRUKernel<jit::kGRUHtPart2, double, paddle::platform::CPUPlace>();
} }
TEST(JITKernel, kSeqPool) { TEST(JITKernel, kSeqPool) {
namespace jit = paddle::operators::jit; TestSeqPoolKernel<jit::kSeqPool, float, CPUPlace>();
TestSeqPoolKernel<jit::kSeqPool, float, paddle::platform::CPUPlace>(); TestSeqPoolKernel<jit::kSeqPool, double, CPUPlace>();
TestSeqPoolKernel<jit::kSeqPool, double, paddle::platform::CPUPlace>();
} }
TEST(JITKernel, kMatMul) { TEST(JITKernel, kMatMul) {
namespace jit = paddle::operators::jit; TestMatMulKernel<jit::kMatMul, float, CPUPlace>();
TestMatMulKernel<jit::kMatMul, float, paddle::platform::CPUPlace>(); TestMatMulKernel<jit::kMatMul, double, CPUPlace>();
TestMatMulKernel<jit::kMatMul, double, paddle::platform::CPUPlace>(); }
TEST(JITKernel, kSoftmax) {
TestSoftmaxKernel<jit::kSoftmax, float, CPUPlace>();
TestSoftmaxKernel<jit::kSoftmax, double, CPUPlace>();
} }
TEST(JITKernel, kNCHW16CMulNC) { TEST(JITKernel, kNCHW16CMulNC) {
namespace jit = paddle::operators::jit; TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, float, CPUPlace>();
TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, float, TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, double, CPUPlace>();
paddle::platform::CPUPlace>();
TestNCHW16CMulNCKernel<jit::kNCHW16CMulNC, double,
paddle::platform::CPUPlace>();
} }
// TODO(yihua/TJ): add crf decoding and layer norm unit tests // TODO(yihua/TJ): add crf decoding and layer norm unit tests
......
...@@ -53,7 +53,7 @@ math_library(sequence2batch) ...@@ -53,7 +53,7 @@ math_library(sequence2batch)
math_library(sequence_padding) math_library(sequence_padding)
math_library(sequence_pooling DEPS math_function jit_kernel_helper) math_library(sequence_pooling DEPS math_function jit_kernel_helper)
math_library(sequence_scale) math_library(sequence_scale)
math_library(softmax DEPS math_function) math_library(softmax DEPS math_function jit_kernel_helper)
math_library(beam_search DEPS math_function) math_library(beam_search DEPS math_function)
math_library(matrix_bit_code) math_library(matrix_bit_code)
......
...@@ -16,8 +16,8 @@ limitations under the License. */ ...@@ -16,8 +16,8 @@ limitations under the License. */
#include <vector> #include <vector>
#include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/blas.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
namespace math { namespace math {
...@@ -81,28 +81,10 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> { ...@@ -81,28 +81,10 @@ class SoftmaxFunctor<DeviceContext, float, true, enable_if_CPU<DeviceContext>> {
const int kBatchDim = 0; const int kBatchDim = 0;
const int kClassDim = 1; const int kClassDim = 1;
// 2D data. Batch x C // 2D data. Batch x C
const int batch_size = in_dims[kBatchDim]; auto compute_softmax =
const int num_classes = in_dims[kClassDim]; jit::Get<jit::kSoftmax, jit::SoftmaxTuples<float>, platform::CPUPlace>(
std::vector<float> entities(batch_size); in_dims[kClassDim]);
auto blas = math::GetBlas<DeviceContext, float>(context); compute_softmax(in_data, out_data, in_dims[kClassDim], in_dims[kBatchDim]);
for (int n = 0; n < batch_size; ++n) {
entities[n] = in_data[n * num_classes];
for (int c = 1; c < num_classes; ++c) {
entities[n] = in_data[n * num_classes + c] > entities[n]
? in_data[n * num_classes + c]
: entities[n];
}
for (int c = 0; c < num_classes; ++c) {
out_data[n * num_classes + c] =
in_data[n * num_classes + c] - entities[n];
}
}
blas.VEXP(num_classes * batch_size, out_data, out_data);
for (int n = 0; n < batch_size; ++n) {
auto sum = blas.ASUM(num_classes, &out_data[n * num_classes], 1);
blas.SCAL(num_classes, 1.0f / sum, &out_data[n * num_classes]);
}
} }
}; };
......
...@@ -70,6 +70,8 @@ extern void* mklml_dso_handle; ...@@ -70,6 +70,8 @@ extern void* mklml_dso_handle;
__macro(cblas_ddot); \ __macro(cblas_ddot); \
__macro(cblas_sasum); \ __macro(cblas_sasum); \
__macro(cblas_dasum); \ __macro(cblas_dasum); \
__macro(cblas_isamax); \
__macro(cblas_idamax); \
__macro(cblas_sscal); \ __macro(cblas_sscal); \
__macro(cblas_dscal); \ __macro(cblas_dscal); \
__macro(vsAdd); \ __macro(vsAdd); \
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册