Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleDetection
提交
2513b2cc
P
PaddleDetection
项目概览
PaddlePaddle
/
PaddleDetection
大约 1 年 前同步成功
通知
695
Star
11112
Fork
2696
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
184
列表
看板
标记
里程碑
合并请求
40
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
184
Issue
184
列表
看板
标记
里程碑
合并请求
40
合并请求
40
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
2513b2cc
编写于
9月 30, 2018
作者:
T
tensor-tang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix bug vtanh
上级
cf8c8e72
变更
3
隐藏空白更改
内联
并排
Showing
3 changed file
with
167 addition
and
129 deletion
+167
-129
paddle/fluid/operators/math/jit_kernel.h
paddle/fluid/operators/math/jit_kernel.h
+5
-5
paddle/fluid/operators/math/jit_kernel_exp.cc
paddle/fluid/operators/math/jit_kernel_exp.cc
+159
-121
paddle/fluid/operators/math/jit_kernel_test.cc
paddle/fluid/operators/math/jit_kernel_test.cc
+3
-3
未找到文件。
paddle/fluid/operators/math/jit_kernel.h
浏览文件 @
2513b2cc
...
...
@@ -29,7 +29,6 @@ namespace jitkernel {
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define EXP_MAX_INPUT 40.0
#define AVX_FLOAT_BLOCK 8
#define AVX2_FLOAT_BLOCK 8
#define AVX512_FLOAT_BLOCK 16
...
...
@@ -40,8 +39,9 @@ class Kernel {
public:
Kernel
()
=
default
;
virtual
~
Kernel
()
=
default
;
private:
int
num_
{
0
};
int
end_
{
0
};
int
rest_
{
0
};
DISABLE_COPY_AND_ASSIGN
(
Kernel
);
};
...
...
@@ -95,13 +95,13 @@ class VExpKernel : public Kernel {
template
<
typename
T
>
class
VSigmoidKernel
:
public
Kernel
{
public:
virtual
void
Compute
(
const
int
n
,
const
T
*
x
,
T
*
y
)
const
=
0
;
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
template
<
typename
T
>
class
VTanhKernel
:
public
Kernel
{
public:
virtual
void
Compute
(
const
int
n
,
const
T
*
x
,
T
*
y
)
const
=
0
;
virtual
void
Compute
(
const
T
*
x
,
T
*
y
)
const
=
0
;
};
template
<
typename
T
>
...
...
paddle/fluid/operators/math/jit_kernel_exp.cc
浏览文件 @
2513b2cc
...
...
@@ -113,17 +113,18 @@ template <typename T, jit::cpu_isa_t isa, jit_block>
class
VSigmoidKernelImpl
:
public
VSigmoidKernel
<
T
>
{
public:
explicit
VSigmoidKernelImpl
(
int
d
)
:
VSigmoidKernel
<
T
>
()
{
this
->
num_
=
d
;
vexp_
=
KernelPool
::
Instance
().
template
Get
<
VExpKernel
<
T
>
>
(
d
);
}
void
Compute
(
const
int
n
,
const
T
*
x
,
T
*
y
)
const
override
{
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{
const
T
min
=
SIGMOID_THRESHOLD_MIN
;
const
T
max
=
SIGMOID_THRESHOLD_MAX
;
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
(
x
[
i
]
<
min
)
?
min
:
((
x
[
i
]
>
max
)
?
max
:
x
[
i
]);
y
[
i
]
=
static_cast
<
T
>
(
0
)
-
y
[
i
];
}
vexp_
->
Compute
(
n
,
y
,
y
);
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
vexp_
->
Compute
(
this
->
num_
,
y
,
y
);
for
(
int
i
=
0
;
i
<
this
->
num_
;
++
i
)
{
y
[
i
]
=
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
+
y
[
i
]);
}
}
...
...
@@ -140,76 +141,89 @@ class VSigmoidKernelImpl : public VSigmoidKernel<T> {
tmp = _mm256_add_ps(_mm256_set1_ps(1.0f), tmp); \
tmp = _mm256_div_ps(_mm256_set1_ps(1.0f), tmp)
#define INTRI8_FLOAT(isa) \
template <> \
void VSigmoidKernelImpl<float, isa, kEQ8>::Compute(
\
const
int n, const float* x, float* y) const {
\
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y, tmp); \
#define INTRI8_FLOAT(isa)
\
template <>
\
void VSigmoidKernelImpl<float, isa, kEQ8>::Compute(
const float* x, float* y)
\
const
{
\
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
\
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
\
__m256 tmp = _mm256_loadu_ps(x);
\
INTRI_SIGMOID(tmp, min, max);
\
_mm256_storeu_ps(y, tmp);
\
}
#define INTRI16_FLOAT(isa) \
template <> \
void VSigmoidKernelImpl<float, isa, kEQ16>::Compute( \
const int n, const float* x, float* y) const {
\
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
INTRI_SIGMOID(tmp0, min, max); \
INTRI_SIGMOID(tmp1, min, max); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
#define INTRI16_FLOAT(isa)
\
template <>
\
void VSigmoidKernelImpl<float, isa, kEQ16>::Compute(
const float* x,
\
float* y) const {
\
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
\
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
\
__m256 tmp0 = _mm256_loadu_ps(x);
\
__m256 tmp1 = _mm256_loadu_ps(x + 8);
\
INTRI_SIGMOID(tmp0, min, max);
\
INTRI_SIGMOID(tmp1, min, max);
\
_mm256_storeu_ps(y, tmp0);
\
_mm256_storeu_ps(y + 8, tmp1);
\
}
#define INTRI_GT8LT16_FLOAT(isa) \
template <> \
void VSigmoidKernelImpl<float, isa, kGT8LT16>::Compute( \
const int n, const float* x, float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y, tmp); \
const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \
for (int i = AVX_FLOAT_BLOCK; i < n; ++i) { \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
} \
vexp_->Compute(n - AVX_FLOAT_BLOCK, y + AVX_FLOAT_BLOCK, \
y + AVX_FLOAT_BLOCK); \
for (int i = AVX_FLOAT_BLOCK; i < n; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
} \
#define INTRI_GT8LT16_FLOAT(isa) \
template <> \
VSigmoidKernelImpl<float, isa, kGT8LT16>::VSigmoidKernelImpl(int d) \
: VSigmoidKernel<float>() { \
this->num_ = d; \
this->end_ = AVX_FLOAT_BLOCK; \
this->rest_ = d - this->end_; \
vexp_ = KernelPool::Instance().template Get<VExpKernel<float>>(d); \
} \
template <> \
void VSigmoidKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \
float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y, tmp); \
const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
} \
vexp_->Compute(this->rest_, y + this->end_, y + this->end_); \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
} \
}
#define INTRI_GT16_FLOAT(isa) \
template <> \
void VSigmoidKernelImpl<float, isa, kGT16>::Compute( \
const int n, const float* x, float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
const int rest = n % AVX_FLOAT_BLOCK; \
const int end = n - rest; \
for (int i = 0; i < end; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y + i, tmp); \
} \
const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \
for (int i = end; i < n; ++i) { \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
} \
vexp_->Compute(rest, y + end, y + end); \
for (int i = end; i < n; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
} \
#define INTRI_GT16_FLOAT(isa) \
template <> \
VSigmoidKernelImpl<float, isa, kGT16>::VSigmoidKernelImpl(int d) \
: VSigmoidKernel<float>() { \
this->num_ = d; \
this->rest_ = d % AVX_FLOAT_BLOCK; \
this->end_ = d - this->rest_; \
vexp_ = KernelPool::Instance().template Get<VExpKernel<float>>(d); \
} \
template <> \
void VSigmoidKernelImpl<float, isa, kGT16>::Compute(const float* x, \
float* y) const { \
__m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX); \
__m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN); \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_SIGMOID(tmp, min, max); \
_mm256_storeu_ps(y + i, tmp); \
} \
const float min_ = SIGMOID_THRESHOLD_MIN; \
const float max_ = SIGMOID_THRESHOLD_MAX; \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = (x[i] < min_) ? min_ : ((x[i] > max_) ? max_ : x[i]); \
y[i] = 0.f - y[i]; \
} \
vexp_->Compute(this->rest_, y + this->end_, y + this->end_); \
for (int i = this->end_; i < this->num_; ++i) { \
y[i] = 1.f / (1.f + y[i]); \
} \
}
#ifdef __AVX__
...
...
@@ -249,15 +263,16 @@ template <typename T, jit::cpu_isa_t isa, jit_block>
class
VTanhKernelImpl
:
public
VTanhKernel
<
T
>
{
public:
explicit
VTanhKernelImpl
(
int
d
)
:
VTanhKernel
<
T
>
()
{
this
->
num_
=
d
;
vscal_
=
KernelPool
::
Instance
().
template
Get
<
VScalKernel
<
T
>
>
(
d
);
vsigmoid_
=
KernelPool
::
Instance
().
template
Get
<
VSigmoidKernel
<
T
>
>
(
d
);
vaddbias_
=
KernelPool
::
Instance
().
template
Get
<
VAddBiasKernel
<
T
>
>
(
d
);
}
void
Compute
(
const
int
n
,
const
T
*
x
,
T
*
y
)
const
override
{
vscal_
->
Compute
(
n
,
static_cast
<
T
>
(
2
),
x
,
y
);
vsigmoid_
->
Compute
(
n
,
y
,
y
);
vscal_
->
Compute
(
n
,
static_cast
<
T
>
(
2
),
y
);
vaddbias_
->
Compute
(
n
,
static_cast
<
T
>
(
-
1
),
y
,
y
);
void
Compute
(
const
T
*
x
,
T
*
y
)
const
override
{
vscal_
->
Compute
(
this
->
num_
,
static_cast
<
T
>
(
2
),
x
,
y
);
vsigmoid_
->
Compute
(
y
,
y
);
vscal_
->
Compute
(
this
->
num_
,
static_cast
<
T
>
(
2
),
y
);
vaddbias_
->
Compute
(
this
->
num_
,
static_cast
<
T
>
(
-
1
),
y
,
y
);
}
private:
...
...
@@ -274,60 +289,83 @@ class VTanhKernelImpl : public VTanhKernel<T> {
tmp = _mm256_div_ps(_mm256_set1_ps(2.0f), tmp); \
tmp = _mm256_sub_ps(tmp, _mm256_set1_ps(1.0f))
#define INTRI8_FLOAT(isa)
\
template <>
\
void VTanhKernelImpl<float, isa, kEQ8>::Compute(const
int n, const float* x,
\
float* y) const {
\
__m256 tmp = _mm256_loadu_ps(x);
\
INTRI_VTANH(tmp);
\
_mm256_storeu_ps(y, tmp);
\
#define INTRI8_FLOAT(isa) \
template <> \
void VTanhKernelImpl<float, isa, kEQ8>::Compute(const
float* x, float* y)
\
const {
\
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_VTANH(tmp); \
_mm256_storeu_ps(y, tmp); \
}
#define INTRI16_FLOAT(isa) \
template <> \
void VTanhKernelImpl<float, isa, kEQ16>::Compute(
\
const
int n, const float* x, float* y) const {
\
__m256 tmp0 = _mm256_loadu_ps(x); \
__m256 tmp1 = _mm256_loadu_ps(x + 8); \
INTRI_VTANH(tmp0); \
INTRI_VTANH(tmp1); \
_mm256_storeu_ps(y, tmp0); \
_mm256_storeu_ps(y + 8, tmp1); \
#define INTRI16_FLOAT(isa)
\
template <>
\
void VTanhKernelImpl<float, isa, kEQ16>::Compute(
const float* x, float* y)
\
const
{
\
__m256 tmp0 = _mm256_loadu_ps(x);
\
__m256 tmp1 = _mm256_loadu_ps(x + 8);
\
INTRI_VTANH(tmp0);
\
INTRI_VTANH(tmp1);
\
_mm256_storeu_ps(y, tmp0);
\
_mm256_storeu_ps(y + 8, tmp1);
\
}
#define INTRI_GT8LT16_FLOAT(isa) \
template <> \
void VTanhKernelImpl<float, isa, kGT8LT16>::Compute( \
const int n, const float* x, float* y) const { \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_VTANH(tmp); \
_mm256_storeu_ps(y, tmp); \
x += AVX_FLOAT_BLOCK; \
y += AVX_FLOAT_BLOCK; \
const int rest = n - AVX_FLOAT_BLOCK; \
vscal_->Compute(rest, 2.f, x, y); \
vsigmoid_->Compute(rest, y, y); \
vscal_->Compute(rest, 2.f, y); \
vaddbias_->Compute(rest, -1.f, y, y); \
#define INTRI_GT8LT16_FLOAT(isa) \
template <> \
VTanhKernelImpl<float, isa, kGT8LT16>::VTanhKernelImpl(int d) \
: VTanhKernel<float>() { \
this->num_ = d; \
this->end_ = AVX_FLOAT_BLOCK; \
this->rest_ = d - this->end_; \
vscal_ = \
KernelPool::Instance().template Get<VScalKernel<float>>(this->rest_); \
vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<float>>( \
this->rest_); \
vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<float>>( \
this->rest_); \
} \
template <> \
void VTanhKernelImpl<float, isa, kGT8LT16>::Compute(const float* x, \
float* y) const { \
__m256 tmp = _mm256_loadu_ps(x); \
INTRI_VTANH(tmp); \
_mm256_storeu_ps(y, tmp); \
x += AVX_FLOAT_BLOCK; \
y += AVX_FLOAT_BLOCK; \
vscal_->Compute(this->rest_, 2.f, x, y); \
vsigmoid_->Compute(y, y); \
vscal_->Compute(this->rest_, 2.f, y); \
vaddbias_->Compute(this->rest_, -1.f, y, y); \
}
#define INTRI_GT16_FLOAT(isa) \
template <> \
void VTanhKernelImpl<float, isa, kGT16>::Compute( \
const int n, const float* x, float* y) const { \
const int rest = n % AVX_FLOAT_BLOCK; \
const int end = n - rest; \
for (int i = 0; i < end; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_VTANH(tmp); \
_mm256_storeu_ps(y + i, tmp); \
} \
x += end; \
y += end; \
vscal_->Compute(rest, 2.f, x, y); \
vsigmoid_->Compute(rest, y, y); \
vscal_->Compute(rest, 2.f, y); \
vaddbias_->Compute(rest, -1.f, y, y); \
#define INTRI_GT16_FLOAT(isa) \
template <> \
VTanhKernelImpl<float, isa, kGT16>::VTanhKernelImpl(int d) \
: VTanhKernel<float>() { \
this->num_ = d; \
this->rest_ = d % AVX_FLOAT_BLOCK; \
this->end_ = d - this->rest_; \
vscal_ = \
KernelPool::Instance().template Get<VScalKernel<float>>(this->rest_); \
vsigmoid_ = KernelPool::Instance().template Get<VSigmoidKernel<float>>( \
this->rest_); \
vaddbias_ = KernelPool::Instance().template Get<VAddBiasKernel<float>>( \
this->rest_); \
} \
template <> \
void VTanhKernelImpl<float, isa, kGT16>::Compute(const float* x, float* y) \
const { \
for (int i = 0; i < this->end_; i += AVX_FLOAT_BLOCK) { \
__m256 tmp = _mm256_loadu_ps(x + i); \
INTRI_VTANH(tmp); \
_mm256_storeu_ps(y + i, tmp); \
} \
x += this->end_; \
y += this->end_; \
vscal_->Compute(this->rest_, 2.f, x, y); \
vsigmoid_->Compute(y, y); \
vscal_->Compute(this->rest_, 2.f, y); \
vaddbias_->Compute(this->rest_, -1.f, y, y); \
}
#ifdef __AVX__
...
...
paddle/fluid/operators/math/jit_kernel_test.cc
浏览文件 @
2513b2cc
...
...
@@ -195,7 +195,7 @@ TEST(JitKernel, vsigmoid) {
auto
trefe
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
d
,
x_data
,
ztgt_data
);
ker
->
Compute
(
x_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
...
...
@@ -227,7 +227,7 @@ void vtanh_better(
vaddbias
,
const
int
n
,
const
float
*
x
,
float
*
y
)
{
vscal
->
Compute
(
n
,
2.
f
,
x
,
y
);
vsigmoid
->
Compute
(
n
,
y
,
y
);
vsigmoid
->
Compute
(
y
,
y
);
vscal
->
Compute
(
n
,
2.
f
,
y
);
vaddbias
->
Compute
(
n
,
-
1.
f
,
y
,
y
);
}
...
...
@@ -261,7 +261,7 @@ TEST(JitKernel, vtanh) {
auto
trefe
=
GetCurrentUS
();
auto
ttgts
=
GetCurrentUS
();
for
(
int
i
=
0
;
i
<
repeat
;
++
i
)
{
ker
->
Compute
(
d
,
x_data
,
ztgt_data
);
ker
->
Compute
(
x_data
,
ztgt_data
);
}
auto
ttgte
=
GetCurrentUS
();
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录