Commit d10a9df7 authored by tensor-tang

add vaddbias and unit test

Parent 3c8b6511
@@ -82,6 +82,12 @@ class VScalKernel : public Kernel {
   virtual void Compute(const int n, const T a, T *x) const = 0;
 };
 
+template <typename T>
+class VAddBiasKernel : public Kernel {
+ public:
+  virtual void Compute(const int n, const T a, const T *x, T *y) const = 0;
+};
+
 template <typename T>
 class VExpKernel : public Kernel {
  public:
...
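For context, a caller fetches the new kernel from the KernelPool exactly as the unit test below does. A minimal usage sketch follows; the include path and the namespace alias are assumptions based on this repository's layout, not part of the diff:

#include <vector>

#include "paddle/fluid/operators/math/jit_kernel.h"  // assumed header location

namespace jit = paddle::operators::math::jitkernel;

// Hypothetical helper: computes y[i] = x[i] + a via the pooled kernel.
void AddBiasExample() {
  const int n = 8;  // n == 8 selects the kEQ8 AVX specialization when available
  std::vector<float> x(n, 1.f), y(n);
  const auto& ker =
      jit::KernelPool::Instance().Get<jit::VAddBiasKernel<float>>(n);
  ker->Compute(n, /*a=*/2.f, x.data(), y.data());
}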
@@ -216,9 +216,61 @@ INTRI8_INPLACE_FLOAT(jit::avx512f);
 #undef MKL_FLOAT
 #undef MKL_DOUBLE
 
+/* VAddBias JitKernel */
+template <typename T, platform::jit::cpu_isa_t isa, jit_block>
+class VAddBiasKernelImpl : public VAddBiasKernel<T> {
+ public:
+  void Compute(const int n, const T a, const T* x, T* y) const override {
+    for (int i = 0; i < n; ++i) {
+      y[i] = x[i] + a;
+    }
+  }
+};
+
+#define INTRI8_FLOAT(isa)                                            \
+  template <>                                                        \
+  void VAddBiasKernelImpl<float, isa, kEQ8>::Compute(                \
+      const int n, const float a, const float* x, float* y) const {  \
+    __m256 tmp = _mm256_loadu_ps(x);                                 \
+    tmp = _mm256_add_ps(tmp, _mm256_set1_ps(a));                     \
+    _mm256_storeu_ps(y, tmp);                                        \
+  }
+
+#define INTRI16_FLOAT(isa)                                           \
+  template <>                                                        \
+  void VAddBiasKernelImpl<float, isa, kEQ16>::Compute(               \
+      const int n, const float a, const float* x, float* y) const {  \
+    __m256 tmp0 = _mm256_loadu_ps(x);                                \
+    __m256 tmp1 = _mm256_loadu_ps(x + 8);                            \
+    tmp0 = _mm256_add_ps(tmp0, _mm256_set1_ps(a));                   \
+    tmp1 = _mm256_add_ps(tmp1, _mm256_set1_ps(a));                   \
+    _mm256_storeu_ps(y, tmp0);                                       \
+    _mm256_storeu_ps(y + 8, tmp1);                                   \
+  }
+
+#ifdef __AVX__
+INTRI8_FLOAT(jit::avx);
+INTRI16_FLOAT(jit::avx);
+#endif
+#ifdef __AVX2__
+INTRI8_FLOAT(jit::avx2);
+INTRI16_FLOAT(jit::avx2);
+#endif
+#ifdef __AVX512F__
+INTRI8_FLOAT(jit::avx512f);
+INTRI16_FLOAT(jit::avx512f);
+#endif
+// TODO(TJ): eq16 test and complete avx512
+
+#undef INTRI8_FLOAT
+#undef INTRI16_FLOAT
+#undef INTRI_GT8LT16_FLOAT
+#undef INTRI_GT16_FLOAT
+
 REGISTER_JITKERNEL(vmul, VMulKernel);
 REGISTER_JITKERNEL(vadd, VAddKernel);
 REGISTER_JITKERNEL(vscal, VScalKernel);
+REGISTER_JITKERNEL(vaddb, VAddBiasKernel);
 
 }  // namespace jitkernel
 }  // namespace math
...
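Both specializations broadcast the bias with _mm256_set1_ps and add it to one or two unaligned 256-bit loads. The arithmetic can be sanity-checked in isolation with plain AVX intrinsics; a self-contained sketch under that assumption (illustrative names, compile with -mavx, independent of the kernel registry):

#include <immintrin.h>

#include <cassert>
#include <cmath>

// Mirrors the kEQ16 body above: y = x + a for 16 floats, in two 256-bit halves.
static void addbias16_avx(const float a, const float* x, float* y) {
  __m256 tmp0 = _mm256_loadu_ps(x);
  __m256 tmp1 = _mm256_loadu_ps(x + 8);
  tmp0 = _mm256_add_ps(tmp0, _mm256_set1_ps(a));
  tmp1 = _mm256_add_ps(tmp1, _mm256_set1_ps(a));
  _mm256_storeu_ps(y, tmp0);
  _mm256_storeu_ps(y + 8, tmp1);
}

int main() {
  float x[16], y[16];
  for (int i = 0; i < 16; ++i) x[i] = 0.5f * i;
  addbias16_avx(2.f, x, y);
  for (int i = 0; i < 16; ++i) assert(std::fabs(y[i] - (x[i] + 2.f)) < 1e-6f);
  return 0;
}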
@@ -221,16 +221,15 @@ INTRI_GT16_FLOAT(jit::avx);
 #ifdef __AVX2__
 INTRI8_FLOAT(jit::avx2);
 INTRI16_FLOAT(jit::avx2);
-INTRI_GT8LT16_FLOAT(jit::avx2);
-INTRI_GT16_FLOAT(jit::avx2);
+// INTRI_GT8LT16_FLOAT(jit::avx2);
+// INTRI_GT16_FLOAT(jit::avx2);
 #endif
 #ifdef __AVX512F__
 INTRI8_FLOAT(jit::avx512f);
 INTRI16_FLOAT(jit::avx512f);
-INTRI_GT8LT16_FLOAT(jit::avx512f);
-INTRI_GT16_FLOAT(jit::avx512f);
+// INTRI_GT8LT16_FLOAT(jit::avx512f);
+// INTRI_GT16_FLOAT(jit::avx512f);
 #endif
-// TODO(TJ): eq16 test and complete avx512
 
 #undef INTRI8_FLOAT
 #undef INTRI16_FLOAT
...
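(With the GT8LT16 and GT16 specializations commented out, float vectors of size 9 through 15, or larger than 16, now fall back to the generic implementation on avx2/avx512f builds; together with the TODO removed here, this reads as parking those paths until the eq16 case is tested and the avx512 variants are completed.)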
@@ -48,6 +48,43 @@ void RandomVec(const int n, T* a, const T lower = static_cast<T>(-20.f),
   }
 }
 
+void vaddbias_ref(const int n, const float a, const float* x, float* y) {
+  for (int i = 0; i < n; ++i) {
+    y[i] = x[i] + a;
+  }
+}
+
+TEST(JitKernel, vaddbias) {
+  namespace jit = paddle::operators::math::jitkernel;
+  for (int d : {7, 8, 15, 16, 30, 64, 100, 128, 256}) {
+    std::vector<float> x(d);
+    std::vector<float> zref(d), ztgt(d);
+    RandomVec<float>(d, x.data(), -2.f, 2.f);
+    const auto& ker =
+        jit::KernelPool::Instance().template Get<jit::VAddBiasKernel<float>>(d);
+    const float a = 2.f;
+    const float* x_data = x.data();
+    float* ztgt_data = ztgt.data();
+    float* zref_data = zref.data();
+    auto trefs = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      vaddbias_ref(d, a, x_data, zref_data);
+    }
+    auto trefe = GetCurrentUS();
+    auto ttgts = GetCurrentUS();
+    for (int i = 0; i < repeat; ++i) {
+      ker->Compute(d, a, x_data, ztgt_data);
+    }
+    auto ttgte = GetCurrentUS();
+    VLOG(3) << "Vec size " << d << ": refer takes: " << (trefe - trefs) / repeat
+            << " us, tgt takes: " << (ttgte - ttgts) / repeat;
+    for (int i = 0; i < d; ++i) {
+      EXPECT_NEAR(ztgt_data[i], zref_data[i], 1e-3);
+    }
+  }
+}
+
 void vexp_ref(const int n, const float* x, float* y) {
   for (int i = 0; i < n; ++i) {
     y[i] = std::exp(x[i]);
...
@@ -135,7 +172,7 @@ void vsigmoid_better(
 
 TEST(JitKernel, vsigmoid) {
   namespace jit = paddle::operators::math::jitkernel;
-  for (int d : {7, 8, 15, 16, 30, 128}) {
+  for (int d : {7, 8, 15, 16, 30, 32, 64, 100, 128, 256}) {
     std::vector<float> x(d);
     std::vector<float> zref(d), ztgt(d);
     RandomVec<float>(d, x.data(), -2.f, 2.f);
...
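Note on coverage: the new vaddbias size list {7, 8, 15, 16, 30, 64, 100, 128, 256} exercises both intrinsic specializations (d = 8 for kEQ8, d = 16 for kEQ16) as well as sizes that take the generic loop; every size is validated against vaddbias_ref to within 1e-3, with average latency over repeat iterations logged through VLOG(3). The vsigmoid test gains the same larger sizes.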