Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
2513b2cc
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
2513b2cc
编写于
9月 30, 2018
作者:
T
tensor-tang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix bug vtanh
上级
cf8c8e72
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
167 addition
and
129 deletion
+167
-129
paddle/fluid/operators/math/jit_kernel.h
paddle/fluid/operators/math/jit_kernel.h
+5
-5
paddle/fluid/operators/math/jit_kernel_exp.cc
paddle/fluid/operators/math/jit_kernel_exp.cc
+159
-121
paddle/fluid/operators/math/jit_kernel_test.cc
paddle/fluid/operators/math/jit_kernel_test.cc
+3
-3
未找到文件。
paddle/fluid/operators/math/jit_kernel.h
浏览文件 @
2513b2cc
...
...
@@ -29,7 +29,6 @@ namespace jitkernel {
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
#define AVX_FLOAT_BLOCK 8
#define AVX2_FLOAT_BLOCK 8
#define AVX512_FLOAT_BLOCK 16
...
...
@@ -40,8 +39,9 @@ class Kernel {
public:
Kernel
()
=
default
;
virtual
~
Kernel
()
=
default
;
private:
int
num_
{
0
};
int
end_
{
0
};
int
rest_
{
0
};
DISABLE_COPY_AND_ASSIGN
(
Kernel
);
};
...
...
@@ -95,13 +95,13 @@ class VExpKernel : public Kernel {
template
<
typename
T
>
class
VSigmoidKernel
:
public
Kernel
{
public:
virtual
void
Compute
(
const
int
n
,
const
T
*
x
,
T
*
y
)
const
=
0
;
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
template
<
typename
T
>
class
VTanhKernel
:
public
Kernel
{
public:
virtual
void
Compute
(
const
int
n
,
const
T
*
x
,
T
*
y
)
const
=
0
;
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
template
<
typename
T
>
...
...
paddle/fluid/operators/math/jit_kernel_exp.cc
浏览文件 @
2513b2cc
...
...
@@ -113,17 +113,18 @@ template <typename T, jit::cpu_isa_t isa, jit_block>
class
VSigmoidKernelImpl
:
public
VSigmoidKernel
<
T
>
{
public:
explicit
VSigmoidKernelImpl
(
int
d
)
:
VSigmoidKernel
<
T
>
()
{
this
->
num_
=
d
;
vexp_
=
KernelPool
::
Instance
().
template
Get
<
VExpKernel
<
T
>
>
(
d
);
}
void
Compute
(
const
int
n
,
const
T
*
x
,
T
*
y
)
const
override
{
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{
const
T
min
=
SIGMOID_THRESHOLD_MIN
;
const
T
max
=
SIGMOID_THRESHOLD_MAX
;
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
(
x
[
i
]
<
min
)
?
min
:
((
x
[
i
]
>
max
)
?
max
:
x
[
i
]);
y
[
i
]
=
static_cast
<
T
>
(
0
)
-
y
[
i
];
}
vexp_
->
Compute
(
n
,
y
,
y
);
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
vexp_
->
Compute
(
this
->
num_
,
y
,
y
);
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
+
y
[
i
]);
}
}
...
...
@@ -140,76 +141,89 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \
tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp)
#define INTRI8_FLOAT(isa) \
template <> \
void VSigmoidKernelImpl<float, isa, kEQ8>::Compute(
\
const
int n, const float* x, float* y) const {
\
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y, tmp); \
#define INTRI8_FLOAT(isa)
\
template <>
\
void VSigmoidKernelImpl<float, isa, kEQ8>::Compute(
const float* x, float* y)
\
const
{
\
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
\
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
\
__m256 tmp = _mm256_loadu_ps(x);
\
INTRI_SIGMOID(tmp, min, max);
\
_mm256_storeu_ps(y, tmp);
\
}
#define INTRI16_FLOAT(isa) \
template <> \
void VSigmoidKernelImpl<float, isa, kEQ16>::Compute( \
const int n, const float* x, float* y) const {
\
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
INTRI_SIGMOID(tmp0, min, max); \
INTRI_SIGMOID(tmp1, min, max); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
#define INTRI16_FLOAT(isa)
\
template <>
\
void VSigmoidKernelImpl<float, isa, kEQ16>::Compute(
const float* x,
\
float* y) const {
\
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
\
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
\
__m256 tmp0 = _mm256_loadu_ps(x);
\
__m256 tmp1 = _mm256_loadu_ps(x + 8);
\
INTRI_SIGMOID(tmp0, min, max);
\
INTRI_SIGMOID(tmp1, min, max);
\
_mm256_storeu_ps(y, tmp0);
\
_mm256_storeu_ps(y + 8, tmp1);
\
}
#define INTRI_GT8LT16_FLOAT(isa) \
template <> \
void VSigmoidKernelImpl<float, isa, kGT8LT16>::Compute( \
const int n, const float* x, float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y, tmp); \
const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \
for (int i = AVX_FLOAT_BLOCK; i < n; ++i) { \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
} \
vexp_->Compute(n - AVX_FLOAT_BLOCK, y + AVX_FLOAT_BLOCK, \
y + AVX_FLOAT_BLOCK); \
for (int i = AVX_FLOAT_BLOCK; i < n; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
} \
#define INTRI_GT8LT16_FLOAT(isa) \
template <> \
VSigmoidKernelImpl<float, isa, kGT8LT16>::VSigmoidKernelImpl(int d) \
: VSigmoidKernel<float>() { \
this->num_ = d; \
this->end_ = AVX_FLOAT_BLOCK; \
this->rest_ = d - this->end_; \
vexp_ = KernelPool::Instance().template Get<VExpKernel<float>>(d); \
} \
template <> \
void VSigmoidKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \
float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y, tmp); \
const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
} \
vexp_->Compute(this->rest_, y + this->end_, y + this->end_); \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
} \
}
#define INTRI_GT16_FLOAT(isa) \
template <> \
void VSigmoidKernelImpl<float, isa, kGT16>::Compute( \
const int n, const float* x, float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
const int rest = n % AVX_FLOAT_BLOCK; \
const int end = n - rest; \
for (int i = 0; i < end; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y + i, tmp); \
} \
const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \
for (int i = end; i < n; ++i) { \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
} \
vexp_->Compute(rest, y + end, y + end); \
for (int i = end; i < n; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
} \
#define INTRI_GT16_FLOAT(isa) \
template <> \
VSigmoidKernelImpl<float, isa, kGT16>::VSigmoidKernelImpl(int d) \
: VSigmoidKernel<float>() { \
this->num_ = d; \
this->rest_ = d % AVX_FLOAT_BLOCK; \
this->end_ = d - this->rest_; \
vexp_ = KernelPool::Instance().template Get<VExpKernel<float>>(d); \
} \
template <> \
void VSigmoidKernelImpl<float, isa, kGT16>::Compute(const float* x, \
float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y + i, tmp); \
} \
const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
} \
vexp_->Compute(this->rest_, y + this->end_, y + this->end_); \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
} \
}
#ifdef __AVX__
...
...
@@ -249,15 +263,16 @@ template <typename T, jit::cpu_isa_t isa, jit_block>
class
VTanhKernelImpl
:
public
VTanhKernel
<
T
>
{
public:
explicit
VTanhKernelImpl
(
int
d
)
:
VTanhKernel
<
T
>
()
{
this
->
num_
=
d
;
vscal_
=
KernelPool
::
Instance
().
template
Get
<
VScalKernel
<
T
>
>
(
d
);
vsigmoid_
=
KernelPool
::
Instance
().
template
Get
<
VSigmoidKernel
<
T
>
>
(
d
);
vaddbias_
=
KernelPool
::
Instance
().
template
Get
<
VAddBiasKernel
<
T
>
>
(
d
);
}
void
Compute
(
const
int
n
,
const
T
*
x
,
T
*
y
)
const
override
{
vscal_
->
Compute
(
n
,
static_cast
<
T
>
(
2
),
x
,
y
);
vsigmoid_
->
Compute
(
n
,
y
,
y
);
vscal_
->
Compute
(
n
,
static_cast
<
T
>
(
2
),
y
);
vaddbias_
->
Compute
(
n
,
static_cast
<
T
>
(
-
1
),
y
,
y
);
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{
vscal_
->
Compute
(
this
->
num_
,
static_cast
<
T
>
(
2
),
x
,
y
);
vsigmoid_
->
Compute
(
y
,
y
);
vscal_
->
Compute
(
this
->
num_
,
static_cast
<
T
>
(
2
),
y
);
vaddbias_
->
Compute
(
this
->
num_
,
static_cast
<
T
>
(
-
1
),
y
,
y
);
}
private:
...
...
@@ -274,60 +289,83 @@ class VTanhKernelImpl : public VTanhKernel<T> {
tmp = _mm256_div_ps(_mm256_set1_ps(2.0f), tmp); \
tmp = _mm256_sub_ps(tmp, _mm256_set1_ps(1.0f))
#define INTRI8_FLOAT(isa)
\
template <>
\
void VTanhKernelImpl<float, isa, kEQ8>::Compute(const
int n, const float* x,
\
float* y) const {
\
__m256 tmp = _mm256_loadu_ps(x);
\
INTRI_VTANH(tmp);
\
_mm256_storeu_ps(y, tmp);
\
#define INTRI8_FLOAT(isa) \
template <> \
void VTanhKernelImpl<float, isa, kEQ8>::Compute(const
float* x, float* y)
\
const {
\
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_VTANH(tmp); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VTanhKernelImpl<float, isa, kEQ16>::Compute(
\
const
int n, const float* x, float* y) const {
\
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
INTRI_VTANH(tmp0); \
INTRI_VTANH(tmp1); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
#define INTRI16_FLOAT(isa)
\
template <>
\
void VTanhKernelImpl<float, isa, kEQ16>::Compute(
const float* x, float* y)
\
const
{
\
__m256 tmp0 = _mm256_loadu_ps(x);
\
__m256 tmp1 = _mm256_loadu_ps(x + 8);
\
INTRI_VTANH(tmp0);
\
INTRI_VTANH(tmp1);
\
_mm256_storeu_ps(y, tmp0);
\
_mm256_storeu_ps(y + 8, tmp1);
\
}
#define INTRI_GT8LT16_FLOAT(isa) \
template <> \
void VTanhKernelImpl<float, isa, kGT8LT16>::Compute( \
const int n, const float* x, float* y) const { \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_VTANH(tmp); \
_mm256_storeu_ps(y, tmp); \
x += AVX_FLOAT_BLOCK; \
y += AVX_FLOAT_BLOCK; \
const int rest = n - AVX_FLOAT_BLOCK; \
vscal_->Compute(rest, 2.f, x, y); \
vsigmoid_->Compute(rest, y, y); \
vscal_->Compute(rest, 2.f, y); \
vaddbias_->Compute(rest, -1.f, y, y); \
#define INTRI_GT8LT16_FLOAT(isa) \
template <> \
VTanhKernelImpl<float, isa, kGT8LT16>::VTanhKernelImpl(int d) \
: VTanhKernel<float>() { \
this->num_ = d; \
this->end_ = AVX_FLOAT_BLOCK; \
this->rest_ = d - this->end_; \
vscal_ = \
KernelPool::Instance().template Get<VScalKernel<float>>(this->rest_); \
vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<float>>( \
this->rest_); \
vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<float>>( \
this->rest_); \
} \
template <> \
void VTanhKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \
float* y) const { \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_VTANH(tmp); \
_mm256_storeu_ps(y, tmp); \
x += AVX_FLOAT_BLOCK; \
y += AVX_FLOAT_BLOCK; \
vscal_->Compute(this->rest_, 2.f, x, y); \
vsigmoid_->Compute(y, y); \
vscal_->Compute(this->rest_, 2.f, y); \
vaddbias_->Compute(this->rest_, -1.f, y, y); \
}
#define INTRI_GT16_FLOAT(isa) \
template <> \
void VTanhKernelImpl<float, isa, kGT16>::Compute( \
const int n, const float* x, float* y) const { \
const int rest = n % AVX_FLOAT_BLOCK; \
const int end = n - rest; \
for (int i = 0; i < end; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_VTANH(tmp); \
_mm256_storeu_ps(y + i, tmp); \
} \
x += end; \
y += end; \
vscal_->Compute(rest, 2.f, x, y); \
vsigmoid_->Compute(rest, y, y); \
vscal_->Compute(rest, 2.f, y); \
vaddbias_->Compute(rest, -1.f, y, y); \
#define INTRI_GT16_FLOAT(isa) \
template <> \
VTanhKernelImpl<float, isa, kGT16>::VTanhKernelImpl(int d) \
: VTanhKernel<float>() { \
this->num_ = d; \
this->rest_ = d % AVX_FLOAT_BLOCK; \
this->end_ = d - this->rest_; \
vscal_ = \
KernelPool::Instance().template Get<VScalKernel<float>>(this->rest_); \
vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<float>>( \
this->rest_); \
vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<float>>( \
this->rest_); \
} \
template <> \
void VTanhKernelImpl<float, isa, kGT16>::Compute(const float* x, float* y) \
const { \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_VTANH(tmp); \
_mm256_storeu_ps(y + i, tmp); \
} \
x += this->end_; \
y += this->end_; \
vscal_->Compute(this->rest_, 2.f, x, y); \
vsigmoid_->Compute(y, y); \
vscal_->Compute(this->rest_, 2.f, y); \
vaddbias_->Compute(this->rest_, -1.f, y, y); \
}
#ifdef __AVX__
...
...
paddle/fluid/operators/math/jit_kernel_test.cc
浏览文件 @
2513b2cc
...
...
@@ -195,7 +195,7 @@ TEST(JitKernel, vsigmoid) {
auto
trefe
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
d
,
x_data
,
ztgt_data
);
ker
->
Compute
(
x_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
...
...
@@ -227,7 +227,7 @@ void vtanh_better(
vaddbias
,
const
int
n
,
const
float
*
x
,
float
*
y
)
{
vscal
->
Compute
(
n
,
2.
f
,
x
,
y
);
vsigmoid
->
Compute
(
n
,
y
,
y
);
vsigmoid
->
Compute
(
y
,
y
);
vscal
->
Compute
(
n
,
2.
f
,
y
);
vaddbias
->
Compute
(
n
,
-
1.
f
,
y
,
y
);
}
...
...
@@ -261,7 +261,7 @@ TEST(JitKernel, vtanh) {
auto
trefe
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
d
,
x_data
,
ztgt_data
);
ker
->
Compute
(
x_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录