Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
机器未来
Paddle
提交
e6d8aca3
P
Paddle
项目概览
机器未来
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
e6d8aca3
编写于
10月 08, 2018
作者:
T
tensor-tang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
refine code and fix
上级
ea7dc9cb
变更
5
隐藏空白更改
内联
并排
Showing
5 changed file
with
214 addition
and
213 deletion
+214
-213
paddle/fluid/operators/math/jit_kernel.h
paddle/fluid/operators/math/jit_kernel.h
+6
-6
paddle/fluid/operators/math/jit_kernel_blas.cc
paddle/fluid/operators/math/jit_kernel_blas.cc
+97
-93
paddle/fluid/operators/math/jit_kernel_exp.cc
paddle/fluid/operators/math/jit_kernel_exp.cc
+99
-102
paddle/fluid/operators/math/jit_kernel_macro.h
paddle/fluid/operators/math/jit_kernel_macro.h
+1
-1
paddle/fluid/operators/math/jit_kernel_test.cc
paddle/fluid/operators/math/jit_kernel_test.cc
+11
-11
未找到文件。
paddle/fluid/operators/math/jit_kernel.h
浏览文件 @
e6d8aca3
...
...
@@ -64,32 +64,32 @@ class KernelPool {
template
<
typename
T
>
class
VMulKernel
:
public
Kernel
{
public:
virtual
void
Compute
(
const
int
n
,
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
=
0
;
virtual
void
Compute
(
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
=
0
;
};
template
<
typename
T
>
class
VAddKernel
:
public
Kernel
{
public:
virtual
void
Compute
(
const
int
n
,
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
=
0
;
virtual
void
Compute
(
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
=
0
;
};
template
<
typename
T
>
class
VScalKernel
:
public
Kernel
{
public:
virtual
void
Compute
(
const
int
n
,
const
T
a
,
const
T
*
x
,
T
*
y
)
const
=
0
;
virtual
void
Compute
(
const
int
n
,
const
T
a
,
T
*
x
)
const
=
0
;
virtual
void
Compute
(
const
T
a
,
const
T
*
x
,
T
*
y
)
const
=
0
;
virtual
void
Compute
(
const
T
a
,
T
*
x
)
const
=
0
;
};
template
<
typename
T
>
class
VAddBiasKernel
:
public
Kernel
{
public:
virtual
void
Compute
(
const
int
n
,
const
T
a
,
const
T
*
x
,
T
*
y
)
const
=
0
;
virtual
void
Compute
(
const
T
a
,
const
T
*
x
,
T
*
y
)
const
=
0
;
};
template
<
typename
T
>
class
VExpKernel
:
public
Kernel
{
public:
virtual
void
Compute
(
const
int
n
,
const
T
*
x
,
T
*
y
)
const
=
0
;
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
template
<
typename
T
>
...
...
paddle/fluid/operators/math/jit_kernel_blas.cc
浏览文件 @
e6d8aca3
...
...
@@ -34,41 +34,42 @@ namespace jit = platform::jit;
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VMulKernelImpl
:
public
VMulKernel
<
T
>
{
public:
void
Compute
(
const
int
n
,
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
override
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
explicit
VMulKernelImpl
(
int
d
)
:
VMulKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
z
[
i
]
=
x
[
i
]
*
y
[
i
];
}
}
};
#ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block)
\
template <>
\
void VMulKernelImpl<float, isa, block>::Compute(
\
const
int n, const
float* x, const float* y, float* z) const { \
platform::dynload::vsMul(
n, x, y, z);
\
#define MKL_FLOAT(isa, block) \
template <> \
void VMulKernelImpl<float, isa, block>::Compute( \
const float* x, const float* y, float* z) const { \
platform::dynload::vsMul(
this->num_, x, y, z);
\
}
#define MKL_DOUBLE(isa, block)
\
template <>
\
void VMulKernelImpl<double, isa, block>::Compute(
\
const
int n, const
double* x, const double* y, double* z) const { \
platform::dynload::vdMul(
n, x, y, z);
\
#define MKL_DOUBLE(isa, block) \
template <> \
void VMulKernelImpl<double, isa, block>::Compute( \
const double* x, const double* y, double* z) const { \
platform::dynload::vdMul(
this->num_, x, y, z);
\
}
FOR_EACH_ISA
(
MKL_FLOAT
,
kGT16
);
FOR_EACH_ISA_BLOCK
(
MKL_DOUBLE
);
#endif
#define INTRI8_FLOAT(isa)
\
template <>
\
void VMulKernelImpl<float, isa, kEQ8>::Compute(
\
const
int n, const
float* x, const float* y, float* z) const { \
__m256 tmpx, tmpy;
\
tmpx = _mm256_loadu_ps(x);
\
tmpy = _mm256_loadu_ps(y);
\
tmpx = _mm256_mul_ps(tmpx, tmpy);
\
_mm256_storeu_ps(z, tmpx);
\
#define INTRI8_FLOAT(isa) \
template <> \
void VMulKernelImpl<float, isa, kEQ8>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 tmpx, tmpy; \
tmpx = _mm256_loadu_ps(x); \
tmpy = _mm256_loadu_ps(y); \
tmpx = _mm256_mul_ps(tmpx, tmpy); \
_mm256_storeu_ps(z, tmpx); \
}
// avx > for > mkl
...
...
@@ -90,41 +91,42 @@ INTRI8_FLOAT(jit::avx512f);
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VAddKernelImpl
:
public
VAddKernel
<
T
>
{
public:
void
Compute
(
const
int
n
,
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
override
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
explicit
VAddKernelImpl
(
int
d
)
:
VAddKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
*
x
,
const
T
*
y
,
T
*
z
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
z
[
i
]
=
x
[
i
]
+
y
[
i
];
}
}
};
#ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block)
\
template <>
\
void VAddKernelImpl<float, isa, block>::Compute(
\
const
int n, const
float* x, const float* y, float* z) const { \
platform::dynload::vsAdd(
n, x, y, z);
\
#define MKL_FLOAT(isa, block) \
template <> \
void VAddKernelImpl<float, isa, block>::Compute( \
const float* x, const float* y, float* z) const { \
platform::dynload::vsAdd(
this->num_, x, y, z);
\
}
#define MKL_DOUBLE(isa, block)
\
template <>
\
void VAddKernelImpl<double, isa, block>::Compute(
\
const
int n, const
double* x, const double* y, double* z) const { \
platform::dynload::vdAdd(
n, x, y, z);
\
#define MKL_DOUBLE(isa, block) \
template <> \
void VAddKernelImpl<double, isa, block>::Compute( \
const double* x, const double* y, double* z) const { \
platform::dynload::vdAdd(
this->num_, x, y, z);
\
}
FOR_EACH_ISA
(
MKL_FLOAT
,
kGT16
);
FOR_EACH_ISA_BLOCK
(
MKL_DOUBLE
);
#endif
#define INTRI8_FLOAT(isa)
\
template <>
\
void VAddKernelImpl<float, isa, kEQ8>::Compute(
\
const
int n, const
float* x, const float* y, float* z) const { \
__m256 tmpx, tmpy;
\
tmpx = _mm256_loadu_ps(x);
\
tmpy = _mm256_loadu_ps(y);
\
tmpx = _mm256_add_ps(tmpx, tmpy);
\
_mm256_storeu_ps(z, tmpx);
\
#define INTRI8_FLOAT(isa) \
template <> \
void VAddKernelImpl<float, isa, kEQ8>::Compute( \
const float* x, const float* y, float* z) const { \
__m256 tmpx, tmpy; \
tmpx = _mm256_loadu_ps(x); \
tmpy = _mm256_loadu_ps(y); \
tmpx = _mm256_add_ps(tmpx, tmpy); \
_mm256_storeu_ps(z, tmpx); \
}
#ifdef __AVX__
INTRI8_FLOAT
(
jit
::
avx
);
...
...
@@ -145,56 +147,57 @@ INTRI8_FLOAT(jit::avx512f);
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VScalKernelImpl
:
public
VScalKernel
<
T
>
{
public:
void
Compute
(
const
int
n
,
const
T
a
,
const
T
*
x
,
T
*
y
)
const
override
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
explicit
VScalKernelImpl
(
int
d
)
:
VScalKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
a
,
const
T
*
x
,
T
*
y
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
a
*
x
[
i
];
}
}
void
Compute
(
const
int
n
,
const
T
a
,
T
*
x
)
const
override
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
void
Compute
(
const
T
a
,
T
*
x
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
x
[
i
]
=
a
*
x
[
i
];
}
}
};
#ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block)
\
template <>
\
void VScalKernelImpl<float, isa, block>::Compute(const
int n, const float a,
\
float* x) const {
\
platform::dynload::cblas_sscal(
n, a, x, 1);
\
#define MKL_FLOAT(isa, block) \
template <> \
void VScalKernelImpl<float, isa, block>::Compute(const
float a, float* x)
\
const {
\
platform::dynload::cblas_sscal(
this->num_, a, x, 1);
\
}
#define MKL_DOUBLE(isa, block) \
template <> \
void VScalKernelImpl<double, isa, block>::Compute(
\
const
int n, const double a, double* x) const {
\
platform::dynload::cblas_dscal(
n, a, x, 1);
\
#define MKL_DOUBLE(isa, block)
\
template <>
\
void VScalKernelImpl<double, isa, block>::Compute(
const double a, double* x)
\
const
{
\
platform::dynload::cblas_dscal(
this->num_, a, x, 1);
\
}
FOR_EACH_ISA
(
MKL_FLOAT
,
kGT16
);
FOR_EACH_ISA_BLOCK
(
MKL_DOUBLE
);
#endif
#define INTRI8_FLOAT(isa)
\
template <>
\
void VScalKernelImpl<float, isa, kEQ8>::Compute(
\
const
int n, const
float a, const float* x, float* y) const { \
__m256 tmp;
\
__m256 scalar = _mm256_set1_ps(a);
\
tmp = _mm256_loadu_ps(x);
\
tmp = _mm256_mul_ps(tmp, scalar);
\
_mm256_storeu_ps(y, tmp);
\
#define INTRI8_FLOAT(isa) \
template <> \
void VScalKernelImpl<float, isa, kEQ8>::Compute( \
const float a, const float* x, float* y) const { \
__m256 tmp; \
__m256 scalar = _mm256_set1_ps(a); \
tmp = _mm256_loadu_ps(x); \
tmp = _mm256_mul_ps(tmp, scalar); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI8_INPLACE_FLOAT(isa)
\
template <>
\
void VScalKernelImpl<float, isa, kEQ8>::Compute(const
int n, const float a,
\
float* x) const {
\
__m256 tmp;
\
__m256 scalar = _mm256_set1_ps(a);
\
tmp = _mm256_loadu_ps(x);
\
tmp = _mm256_mul_ps(tmp, scalar);
\
_mm256_storeu_ps(x, tmp);
\
#define INTRI8_INPLACE_FLOAT(isa) \
template <> \
void VScalKernelImpl<float, isa, kEQ8>::Compute(const
float a, float* x)
\
const {
\
__m256 tmp; \
__m256 scalar = _mm256_set1_ps(a); \
tmp = _mm256_loadu_ps(x); \
tmp = _mm256_mul_ps(tmp, scalar); \
_mm256_storeu_ps(x, tmp); \
}
#ifdef __AVX__
...
...
@@ -220,32 +223,33 @@ INTRI8_INPLACE_FLOAT(jit::avx512f);
template
<
typename
T
,
platform
::
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VAddBiasKernelImpl
:
public
VAddBiasKernel
<
T
>
{
public:
void
Compute
(
const
int
n
,
const
T
a
,
const
T
*
x
,
T
*
y
)
const
override
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
explicit
VAddBiasKernelImpl
(
int
d
)
:
VAddBiasKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
a
,
const
T
*
x
,
T
*
y
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
x
[
i
]
+
a
;
}
}
};
#define INTRI8_FLOAT(isa)
\
template <>
\
void VAddBiasKernelImpl<float, isa, kEQ8>::Compute(
\
const
int n, const
float a, const float* x, float* y) const { \
__m256 tmp = _mm256_loadu_ps(x);
\
tmp = _mm256_add_ps(tmp, _mm256_set1_ps(a));
\
_mm256_storeu_ps(y, tmp);
\
#define INTRI8_FLOAT(isa) \
template <> \
void VAddBiasKernelImpl<float, isa, kEQ8>::Compute( \
const float a, const float* x, float* y) const { \
__m256 tmp = _mm256_loadu_ps(x); \
tmp = _mm256_add_ps(tmp, _mm256_set1_ps(a)); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI16_FLOAT(isa)
\
template <>
\
void VAddBiasKernelImpl<float, isa, kEQ16>::Compute(
\
const
int n, const
float a, const float* x, float* y) const { \
__m256 tmp0 = _mm256_loadu_ps(x);
\
__m256 tmp1 = _mm256_loadu_ps(x + 8);
\
tmp0 = _mm256_add_ps(tmp0, _mm256_set1_ps(a));
\
tmp1 = _mm256_add_ps(tmp1, _mm256_set1_ps(a));
\
_mm256_storeu_ps(y, tmp0);
\
_mm256_storeu_ps(y + 8, tmp1);
\
#define INTRI16_FLOAT(isa) \
template <> \
void VAddBiasKernelImpl<float, isa, kEQ16>::Compute( \
const float a, const float* x, float* y) const { \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
tmp0 = _mm256_add_ps(tmp0, _mm256_set1_ps(a)); \
tmp1 = _mm256_add_ps(tmp1, _mm256_set1_ps(a)); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#ifdef __AVX__
...
...
paddle/fluid/operators/math/jit_kernel_exp.cc
浏览文件 @
e6d8aca3
...
...
@@ -40,26 +40,27 @@ namespace jit = platform::jit;
template
<
typename
T
,
jit
::
cpu_isa_t
isa
,
jit_block
>
class
VExpKernelImpl
:
public
VExpKernel
<
T
>
{
public:
void
Compute
(
const
int
n
,
const
T
*
x
,
T
*
y
)
const
override
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
explicit
VExpKernelImpl
(
int
d
)
:
VExpKernel
<
T
>
()
{
this
->
num_
=
d
;
}
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
std
::
exp
(
x
[
i
]);
}
}
};
#ifdef PADDLE_WITH_MKLML
#define MKL_FLOAT(isa, block)
\
template <>
\
void VExpKernelImpl<float, isa, block>::Compute(const
int n, const float* x,
\
float* y) const {
\
platform::dynload::vsExp(
n, x, y);
\
#define MKL_FLOAT(isa, block) \
template <> \
void VExpKernelImpl<float, isa, block>::Compute(const
float* x, float* y)
\
const {
\
platform::dynload::vsExp(
this->num_, x, y);
\
}
#define MKL_DOUBLE(isa, block) \
template <> \
void VExpKernelImpl<double, isa, block>::Compute(
\
const
int n, const double* x, double* y) const {
\
platform::dynload::vdExp(
n, x, y);
\
#define MKL_DOUBLE(isa, block)
\
template <>
\
void VExpKernelImpl<double, isa, block>::Compute(
const double* x, double* y)
\
const
{
\
platform::dynload::vdExp(
this->num_, x, y);
\
}
FOR_EACH_ISA
(
MKL_FLOAT
,
kLT8
);
FOR_EACH_ISA
(
MKL_FLOAT
,
kGT8LT16
);
...
...
@@ -67,24 +68,24 @@ FOR_EACH_ISA(MKL_FLOAT, kGT16);
FOR_EACH_ISA_BLOCK
(
MKL_DOUBLE
);
#endif
#define INTRI8_FLOAT(isa)
\
template <>
\
void VExpKernelImpl<float, isa, kEQ8>::Compute(const
int n, const float* x,
\
float* y) const {
\
__m256 tmp = _mm256_loadu_ps(x);
\
_mm256_storeu_ps(y, detail::Exp(tmp));
\
#define INTRI8_FLOAT(isa) \
template <> \
void VExpKernelImpl<float, isa, kEQ8>::Compute(const
float* x, float* y)
\
const {
\
__m256 tmp = _mm256_loadu_ps(x); \
_mm256_storeu_ps(y, detail::Exp(tmp)); \
}
#define INTRI16_FLOAT(isa)
\
template <>
\
void VExpKernelImpl<float, isa, kEQ16>::Compute(const
int n, const float* x,
\
float* y) const {
\
__m256 tmp0 = _mm256_loadu_ps(x);
\
__m256 tmp1 = _mm256_loadu_ps(x + 8);
\
tmp0 = detail::Exp(tmp0);
\
tmp1 = detail::Exp(tmp1);
\
_mm256_storeu_ps(y, tmp0);
\
_mm256_storeu_ps(y + 8, tmp1);
\
#define INTRI16_FLOAT(isa) \
template <> \
void VExpKernelImpl<float, isa, kEQ16>::Compute(const
float* x, float* y)
\
const {
\
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
tmp0 = detail::Exp(tmp0); \
tmp1 = detail::Exp(tmp1); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
}
#ifdef __AVX__
...
...
@@ -123,7 +124,7 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
y
[
i
]
=
(
x
[
i
]
<
min
)
?
min
:
((
x
[
i
]
>
max
)
?
max
:
x
[
i
]);
y
[
i
]
=
static_cast
<
T
>
(
0
)
-
y
[
i
];
}
vexp_
->
Compute
(
this
->
num_
,
y
,
y
);
vexp_
->
Compute
(
y
,
y
);
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
+
y
[
i
]);
}
...
...
@@ -166,64 +167,66 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
_mm256_storeu_ps(y + 8, tmp1); \
}
#define INTRI_GT8LT16_FLOAT(isa) \
template <> \
VSigmoidKernelImpl<float, isa, kGT8LT16>::VSigmoidKernelImpl(int d) \
: VSigmoidKernel<float>() { \
this->num_ = d; \
this->end_ = AVX_FLOAT_BLOCK; \
this->rest_ = d - this->end_; \
vexp_ = KernelPool::Instance().template Get<VExpKernel<float>>(d); \
} \
template <> \
void VSigmoidKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \
float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y, tmp); \
const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
} \
vexp_->Compute(this->rest_, y + this->end_, y + this->end_); \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
} \
#define INTRI_GT8LT16_FLOAT(isa) \
template <> \
VSigmoidKernelImpl<float, isa, kGT8LT16>::VSigmoidKernelImpl(int d) \
: VSigmoidKernel<float>() { \
this->num_ = d; \
this->end_ = AVX_FLOAT_BLOCK; \
this->rest_ = d - this->end_; \
vexp_ = \
KernelPool::Instance().template Get<VExpKernel<float>>(this->rest_); \
} \
template <> \
void VSigmoidKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \
float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y, tmp); \
const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
} \
vexp_->Compute(y + this->end_, y + this->end_); \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
} \
}
#define INTRI_GT16_FLOAT(isa) \
template <> \
VSigmoidKernelImpl<float, isa, kGT16>::VSigmoidKernelImpl(int d) \
: VSigmoidKernel<float>() { \
this->num_ = d; \
this->rest_ = d % AVX_FLOAT_BLOCK; \
this->end_ = d - this->rest_; \
vexp_ = KernelPool::Instance().template Get<VExpKernel<float>>(d); \
} \
template <> \
void VSigmoidKernelImpl<float, isa, kGT16>::Compute(const float* x, \
float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y + i, tmp); \
} \
const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
} \
vexp_->Compute(this->rest_, y + this->end_, y + this->end_); \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
} \
#define INTRI_GT16_FLOAT(isa) \
template <> \
VSigmoidKernelImpl<float, isa, kGT16>::VSigmoidKernelImpl(int d) \
: VSigmoidKernel<float>() { \
this->num_ = d; \
this->rest_ = d % AVX_FLOAT_BLOCK; \
this->end_ = d - this->rest_; \
vexp_ = \
KernelPool::Instance().template Get<VExpKernel<float>>(this->rest_); \
} \
template <> \
void VSigmoidKernelImpl<float, isa, kGT16>::Compute(const float* x, \
float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y + i, tmp); \
} \
const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
} \
vexp_->Compute(y + this->end_, y + this->end_); \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
} \
}
#ifdef __AVX__
...
...
@@ -251,12 +254,7 @@ INTRI16_FLOAT(jit::avx512f);
#undef INTRI_GT16_FLOAT
#undef INTRI_VSIGMOID
#define JITKERNEL_NEW_ACT_IMPL(ker, dtype, isa, k) \
p = std::dynamic_pointer_cast<ker<dtype>>( \
std::make_shared<ker##Impl<dtype, isa, k>>(d))
REGISTER_JITKERNEL_ARGS
(
vsigmoid
,
VSigmoidKernel
,
JITKERNEL_DECLARE
,
JITKERNEL_KEY
,
JITKERNEL_NEW_ACT_IMPL
);
REGISTER_JITKERNEL
(
vsigmoid
,
VSigmoidKernel
);
/* VTanh JitKernel */
template
<
typename
T
,
jit
::
cpu_isa_t
isa
,
jit_block
>
...
...
@@ -269,10 +267,10 @@ class VTanhKernelImpl : public VTanhKernel<T> {
vaddbias_
=
KernelPool
::
Instance
().
template
Get
<
VAddBiasKernel
<
T
>
>
(
d
);
}
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{
vscal_
->
Compute
(
this
->
num_
,
static_cast
<
T
>
(
2
),
x
,
y
);
vscal_
->
Compute
(
static_cast
<
T
>
(
2
),
x
,
y
);
vsigmoid_
->
Compute
(
y
,
y
);
vscal_
->
Compute
(
this
->
num_
,
static_cast
<
T
>
(
2
),
y
);
vaddbias_
->
Compute
(
this
->
num_
,
static_cast
<
T
>
(
-
1
),
y
,
y
);
vscal_
->
Compute
(
static_cast
<
T
>
(
2
),
y
);
vaddbias_
->
Compute
(
static_cast
<
T
>
(
-
1
),
y
,
y
);
}
private:
...
...
@@ -332,10 +330,10 @@ class VTanhKernelImpl : public VTanhKernel<T> {
_mm256_storeu_ps(y, tmp); \
x += AVX_FLOAT_BLOCK; \
y += AVX_FLOAT_BLOCK; \
vscal_->Compute(
this->rest_, 2.f, x, y);
\
vscal_->Compute(
2.f, x, y);
\
vsigmoid_->Compute(y, y); \
vscal_->Compute(
this->rest_, 2.f, y);
\
vaddbias_->Compute(
this->rest_, -1.f, y, y);
\
vscal_->Compute(
2.f, y);
\
vaddbias_->Compute(
-1.f, y, y);
\
}
#define INTRI_GT16_FLOAT(isa) \
...
...
@@ -362,10 +360,10 @@ class VTanhKernelImpl : public VTanhKernel<T> {
} \
x += this->end_; \
y += this->end_; \
vscal_->Compute(
this->rest_, 2.f, x, y);
\
vscal_->Compute(
2.f, x, y);
\
vsigmoid_->Compute(y, y); \
vscal_->Compute(
this->rest_, 2.f, y);
\
vaddbias_->Compute(
this->rest_, -1.f, y, y);
\
vscal_->Compute(
2.f, y);
\
vaddbias_->Compute(
-1.f, y, y);
\
}
#ifdef __AVX__
...
...
@@ -391,8 +389,7 @@ INTRI16_FLOAT(jit::avx512f);
#undef INTRI_GT16_FLOAT
#undef INTRI_VTANH
REGISTER_JITKERNEL_ARGS
(
vtanh
,
VTanhKernel
,
JITKERNEL_DECLARE
,
JITKERNEL_KEY
,
JITKERNEL_NEW_ACT_IMPL
);
REGISTER_JITKERNEL
(
vtanh
,
VTanhKernel
);
#undef JITKERNEL_NEW_ACT_IMPL
...
...
paddle/fluid/operators/math/jit_kernel_macro.h
浏览文件 @
e6d8aca3
...
...
@@ -57,7 +57,7 @@ namespace jit = platform::jit;
#define JITKERNEL_NEW_IMPL(ker, dtype, isa, k) \
p = std::dynamic_pointer_cast<ker<dtype>>( \
std::make_shared<ker##Impl<dtype, isa, k>>())
std::make_shared<ker##Impl<dtype, isa, k>>(
d
))
#define JITKERNEL_WITH_DTYPE(ker_key, ker_class, ker_dtype, dtype_key, \
marco_declare, macro_key, macro_impl) \
...
...
paddle/fluid/operators/math/jit_kernel_test.cc
浏览文件 @
e6d8aca3
...
...
@@ -73,7 +73,7 @@ TEST(JitKernel, vaddbias) {
auto
trefe
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
d
,
a
,
x_data
,
ztgt_data
);
ker
->
Compute
(
a
,
x_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
...
...
@@ -99,7 +99,7 @@ void vexp_mkl(const int n, const float* x, float* y) {
TEST
(
JitKernel
,
vexp
)
{
namespace
jit
=
paddle
::
operators
::
math
::
jitkernel
;
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
128
})
{
for
(
int
d
:
{
7
,
8
,
15
,
16
,
30
,
128
,
256
})
{
std
::
vector
<
float
>
x
(
d
);
std
::
vector
<
float
>
zref
(
d
),
ztgt
(
d
);
RandomVec
<
float
>
(
d
,
x
.
data
(),
-
2.
f
,
2.
f
);
...
...
@@ -124,7 +124,7 @@ TEST(JitKernel, vexp) {
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
d
,
x_data
,
ztgt_data
);
ker
->
Compute
(
x_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
...
...
@@ -164,7 +164,7 @@ void vsigmoid_better(
y
[
i
]
=
(
x
[
i
]
<
min
)
?
min
:
((
x
[
i
]
>
max
)
?
max
:
x
[
i
]);
y
[
i
]
=
0.
f
-
y
[
i
];
}
vexp
->
Compute
(
n
,
y
,
y
);
vexp
->
Compute
(
y
,
y
);
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
1.
f
/
(
1.
f
+
y
[
i
]);
}
...
...
@@ -226,10 +226,10 @@ void vtanh_better(
const
paddle
::
operators
::
math
::
jitkernel
::
VAddBiasKernel
<
float
>>&
vaddbias
,
const
int
n
,
const
float
*
x
,
float
*
y
)
{
vscal
->
Compute
(
n
,
2.
f
,
x
,
y
);
vscal
->
Compute
(
2.
f
,
x
,
y
);
vsigmoid
->
Compute
(
y
,
y
);
vscal
->
Compute
(
n
,
2.
f
,
y
);
vaddbias
->
Compute
(
n
,
-
1.
f
,
y
,
y
);
vscal
->
Compute
(
2.
f
,
y
);
vaddbias
->
Compute
(
-
1.
f
,
y
,
y
);
}
TEST
(
JitKernel
,
vtanh
)
{
...
...
@@ -359,12 +359,12 @@ TEST(JitKernel, vscal) {
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
d
,
a
,
x_data
,
ztgt_data
);
ker
->
Compute
(
a
,
x_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
auto
ttgts1
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
d
,
a
,
y_data
);
ker
->
Compute
(
a
,
y_data
);
}
auto
ttgte1
=
GetCurrentUS
();
VLOG
(
3
)
<<
"Vec size "
<<
d
<<
": refer takes: "
<<
(
trefe
-
trefs
)
/
repeat
...
...
@@ -444,7 +444,7 @@ TEST(JitKernel, vmul) {
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
d
,
x_data
,
y_data
,
ztgt_data
);
ker
->
Compute
(
x_data
,
y_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
...
...
@@ -523,7 +523,7 @@ TEST(JitKernel, vadd) {
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
d
,
x_data
,
y_data
,
ztgt_data
);
ker
->
Compute
(
x_data
,
y_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录