Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
PaddleDetection
提交
b1516783
P
PaddleDetection
项目概览
PaddlePaddle
/
PaddleDetection
大约 1 年 前同步成功
通知
695
Star
11112
Fork
2696
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
184
列表
看板
标记
里程碑
合并请求
40
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
PaddleDetection
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
184
Issue
184
列表
看板
标记
里程碑
合并请求
40
合并请求
40
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
b1516783
编写于
12月 18, 2018
作者:
T
tensor-tang
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
enable crf decoding intrinsic code
上级
4cc7707d
变更
4
显示空白变更内容
内联
并排
Showing
4 changed file
with
150 addition
and
164 deletion
+150
-164
paddle/fluid/operators/jit/README.md
paddle/fluid/operators/jit/README.md
+4
-0
paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt
paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt
+0
-1
paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
+137
-104
paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
+9
-59
未找到文件。
paddle/fluid/operators/jit/README.md
浏览文件 @
b1516783
...
@@ -19,6 +19,10 @@ PaddlePaddle/Paddle/paddle/fluid/
...
@@ -19,6 +19,10 @@ PaddlePaddle/Paddle/paddle/fluid/
│ ├── ...
│ ├── ...
│ ├── mkl/
│ ├── mkl/
│ │ └── ...
│ │ └── ...
│ ├── mkldnn/
│ │ └── ...
│ ├── intrinsic/
│ │ └── ...
│ └── openblas/
│ └── openblas/
│ └── ...
│ └── ...
└── refer/
└── refer/
...
...
paddle/fluid/operators/jit/more/intrinsic/CMakeLists.txt
浏览文件 @
b1516783
...
@@ -6,4 +6,3 @@ set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_intrinsic PARENT_SCOPE)
...
@@ -6,4 +6,3 @@ set(JIT_KERNEL_DEPS ${JIT_KERNEL_DEPS} jit_kernel_intrinsic PARENT_SCOPE)
# use mkl kernels by name and type
# use mkl kernels by name and type
USE_JITKERNEL_MORE
(
crfdecoding, intrinsic
)
USE_JITKERNEL_MORE
(
crfdecoding, intrinsic
)
USE_JITKERNEL_MORE
(
layernorm, intrinsic
)
paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
浏览文件 @
b1516783
...
@@ -13,7 +13,7 @@
...
@@ -13,7 +13,7 @@
* limitations under the License. */
* limitations under the License. */
#include "paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h"
#include "paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h"
#include
"paddle/fluid/operators/jit/refer/refer.h"
#include
<limits>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/cpu_info.h"
...
@@ -21,118 +21,151 @@ namespace paddle {
...
@@ -21,118 +21,151 @@ namespace paddle {
namespace
operators
{
namespace
operators
{
namespace
jit
{
namespace
jit
{
namespace
more
{
namespace
more
{
namespace
mkl
{
namespace
intrinsic
{
template
<
>
void
CRFDecoding
(
const
int
seq_len
,
const
float
*
x
,
const
float
*
w
,
void
VMul
<
float
>
(
const
float
*
x
,
const
float
*
y
,
float
*
z
,
int
n
)
{
float
*
alpha
,
int
*
track
,
int
tag_num
)
{
platform
::
dynload
::
vsMul
(
n
,
x
,
y
,
z
);
const
int
step_size
=
}
platform
::
MayIUse
(
platform
::
avx512f
)
?
ZMM_FLOAT_BLOCK
:
YMM_FLOAT_BLOCK
;
const
int
end
=
tag_num
/
step_size
;
template
<
>
const
int
rest
=
tag_num
%
step_size
;
void
VMul
<
double
>
(
const
double
*
x
,
const
double
*
y
,
double
*
z
,
int
n
)
{
/* Setup the alpha initial value.*/
platform
::
dynload
::
vdMul
(
n
,
x
,
y
,
z
);
int
i_offset
=
0
;
}
int
last_offset
=
rest
-
step_size
;
for
(
int
i
=
0
;
i
<=
end
;
++
i
)
{
template
<
>
#ifdef __AVX512F__
void
VAdd
<
float
>
(
const
float
*
x
,
const
float
*
y
,
float
*
z
,
int
n
)
{
// Declare the variable for the content of weights, input and alpha values.
platform
::
dynload
::
vsAdd
(
n
,
x
,
y
,
z
);
__m512
w_content
,
x_content
,
alpha_content
;
}
// Load the relevant data into the variables from un-aligned address.
w_content
=
_mm512_loadu_ps
(
w
+
i_offset
);
template
<
>
x_content
=
_mm512_loadu_ps
(
x
+
i_offset
);
void
VAdd
<
double
>
(
const
double
*
x
,
const
double
*
y
,
double
*
z
,
int
n
)
{
alpha_content
=
_mm512_add_ps
(
w_content
,
x_content
);
platform
::
dynload
::
vdAdd
(
n
,
x
,
y
,
z
);
// Save the alpha value.
}
_mm512_storeu_ps
(
alpha_value
+
i_offset
,
alpha_content
);
#else
template
<
>
// AVX or AVX2
void
VScal
<
float
>
(
const
float
*
a
,
const
float
*
x
,
float
*
y
,
int
n
)
{
// weights, input and alpha values.
if
(
x
==
y
)
{
__m256
w_content
,
x_content
,
alpha_content
;
platform
::
dynload
::
cblas_sscal
(
n
,
*
a
,
y
,
1
);
// Load the relevant data into the variables from un-aligned address.
w_content
=
_mm256_loadu_ps
(
w
+
i_offset
);
x_content
=
_mm256_loadu_ps
(
x
+
i_offset
);
alpha_content
=
_mm256_add_ps
(
w_content
,
x_content
);
_mm256_storeu_ps
(
alpha
+
i_offset
,
alpha_content
);
#endif
i_offset
+=
step_size
;
if
(
i
==
end
-
1
)
{
if
(
rest
>
0
)
{
i_offset
+=
last_offset
;
}
else
{
}
else
{
refer
::
VScal
<
float
>
(
a
,
x
,
y
,
n
)
;
break
;
}
}
}
}
}
template
<
>
// Use the column-major strategy to get the location of maximum score.
void
VScal
<
double
>
(
const
double
*
a
,
const
double
*
x
,
double
*
y
,
int
n
)
{
int
seq_offset
=
0
;
if
(
x
==
y
)
{
constexpr
int
state_trans_base_idx
=
2
;
platform
::
dynload
::
cblas_dscal
(
n
,
*
a
,
y
,
1
);
for
(
int
k
=
1
;
k
<
seq_len
;
++
k
)
{
int
j_offset
=
0
;
for
(
int
j
=
0
;
j
<=
end
;
++
j
)
{
/* Initialize the variables of maximum score and location.*/
#ifdef __AVX512F__
__m512
max_score
=
_mm512_set1_ps
(
-
std
::
numeric_limits
<
float
>::
max
());
__m512i
max_j
=
_mm512_setzero_si512
();
#else
__m256
max_score
=
_mm256_set1_ps
(
-
std
::
numeric_limits
<
float
>::
max
());
__m256i
max_j
=
_mm256_set1_epi32
(
0
);
#endif
/* Calculate the offset of transition_weights.*/
int
trans_offset
=
state_trans_base_idx
*
tag_num
+
j_offset
;
for
(
int
i
=
0
;
i
<
tag_num
;
++
i
)
{
/* Initalize the content of alpha variable with related offset.*/
#ifdef __AVX512F__
__m512
alpha_content
=
_mm512_set1_ps
(
*
(
alpha
+
seq_offset
+
i
));
/* Obtain the content of weights from un-aligned address.*/
__m512
w_content
=
_mm512_loadu_ps
(
w
+
trans_offset
);
__m512
score_v
=
_mm512_add_ps
(
alpha_content
,
w_content
);
__mmask16
mask
=
_mm512_cmp_ps_mask
(
score_v
,
max_score
,
_CMP_GT_OS
);
/* AVX512 instructions.*/
max_j
=
_mm512_mask_set1_epi32
(
max_j
,
mask
,
i
);
/* Update the max_score value.*/
max_score
=
_mm512_max_ps
(
max_score
,
score_v
);
#else
__m256
alpha_content
=
_mm256_broadcast_ss
(
alpha
+
seq_offset
+
i
);
/* Obtain the content of weights from un-aligned address.*/
__m256
w_content
=
_mm256_loadu_ps
(
w
+
trans_offset
);
__m256
score_v
=
_mm256_add_ps
(
alpha_content
,
w_content
);
__m256
mask
=
_mm256_cmp_ps
(
score_v
,
max_score
,
_CMP_GT_OS
);
/* According to the mask value, update the index of the max_score.*/
#ifdef __AVX2__
max_j
=
_mm256_or_si256
(
_mm256_andnot_si256
((
__m256i
)
mask
,
max_j
),
_mm256_and_si256
((
__m256i
)
mask
,
_mm256_set1_epi32
(
i
)));
#else
__m128i
lo_max_j
=
_mm256_extractf128_si256
(
max_j
,
0
);
__m128i
hi_max_j
=
_mm256_extractf128_si256
(
max_j
,
1
);
__m128i
lo_mask
=
_mm256_extractf128_si256
(
*
(
__m256i
*
)
&
mask
,
0
);
// NOLINT
__m128i
hi_mask
=
_mm256_extractf128_si256
(
*
(
__m256i
*
)
&
mask
,
1
);
// NOLINT
lo_max_j
=
_mm_andnot_si128
(
lo_mask
,
lo_max_j
);
hi_max_j
=
_mm_andnot_si128
(
hi_mask
,
hi_max_j
);
lo_mask
=
_mm_and_si128
(
lo_mask
,
_mm_set1_epi32
(
i
));
hi_mask
=
_mm_and_si128
(
hi_mask
,
_mm_set1_epi32
(
i
));
lo_max_j
=
_mm_or_si128
(
lo_mask
,
lo_max_j
);
hi_max_j
=
_mm_or_si128
(
hi_mask
,
hi_max_j
);
max_j
=
_mm256_insertf128_si256
(
max_j
,
lo_max_j
,
0
);
max_j
=
_mm256_insertf128_si256
(
max_j
,
hi_max_j
,
1
);
#endif
/* Update the max_score value.*/
max_score
=
_mm256_max_ps
(
max_score
,
score_v
);
#endif
trans_offset
+=
tag_num
;
}
/* Update the alpha and track values. */
#ifdef __AVX512F__
__m512
x_content
=
_mm512_loadu_ps
(
x
+
seq_offset
+
this
->
num_
+
j_offset
);
max_score
=
_mm512_add_ps
(
max_score
,
x_content
);
_mm512_storeu_ps
(
alpha
+
seq_offset
+
this
->
num_
+
j_offset
,
max_score
);
_mm512_storeu_si512
(
reinterpret_cast
<
__m512i
*>
(
track
+
seq_offset
+
this
->
num_
+
j_offset
),
max_j
);
#else
__m256
x_content
=
_mm256_loadu_ps
(
x
+
seq_offset
+
tag_num
+
j_offset
);
max_score
=
_mm256_add_ps
(
max_score
,
x_content
);
_mm256_storeu_ps
(
alpha
+
seq_offset
+
tag_num
+
j_offset
,
max_score
);
_mm256_storeu_si256
(
reinterpret_cast
<
__m256i
*>
(
track
+
seq_offset
+
tag_num
+
j_offset
),
max_j
);
#endif
/* Calculate the offset of next step*/
j_offset
+=
step_size
;
if
(
j
==
end
-
1
)
{
if
(
rest
>
0
)
{
j_offset
+=
last_offset
;
}
else
{
}
else
{
refer
::
VScal
<
double
>
(
a
,
x
,
y
,
n
);
break
;
}
}
}
seq_offset
+=
tag_num
;
}
}
}
}
template
<
>
bool
CRFDecodingKernel
::
UseMe
(
int
d
)
const
{
void
VExp
<
float
>
(
const
float
*
x
,
float
*
y
,
int
n
)
{
return
platform
::
MayIUse
(
platform
::
avx
);
platform
::
dynload
::
vsExp
(
n
,
x
,
y
);
}
template
<
>
void
VExp
<
double
>
(
const
double
*
x
,
double
*
y
,
int
n
)
{
platform
::
dynload
::
vdExp
(
n
,
x
,
y
);
}
// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
template
<
>
bool
VMulKernel
<
float
>::
UseMe
(
int
d
)
const
{
return
platform
::
MayIUse
(
platform
::
avx512f
)
&&
d
>
512
;
}
template
<
>
bool
VAddKernel
<
float
>::
UseMe
(
int
d
)
const
{
return
platform
::
MayIUse
(
platform
::
avx512f
)
&&
d
>
512
;
}
template
<
>
bool
VScalKernel
<
float
>::
UseMe
(
int
d
)
const
{
return
platform
::
MayIUse
(
platform
::
avx512f
)
&&
d
>
512
;
}
template
<
>
bool
VExpKernel
<
float
>::
UseMe
(
int
d
)
const
{
return
d
>
7
;
}
template
<
>
bool
VSigmoidKernel
<
float
>::
UseMe
(
int
d
)
const
{
return
d
>
7
;
}
template
<
>
bool
VTanhKernel
<
float
>::
UseMe
(
int
d
)
const
{
return
d
>
7
;
}
}
#define AWALYS_USE_ME_WITH_DOUBLE(func) \
}
// namespace intrinsic
template <> \
bool func##Kernel<double>::UseMe(int d) const { \
return true; \
}
AWALYS_USE_ME_WITH_DOUBLE
(
VMul
);
AWALYS_USE_ME_WITH_DOUBLE
(
VAdd
);
AWALYS_USE_ME_WITH_DOUBLE
(
VScal
);
AWALYS_USE_ME_WITH_DOUBLE
(
VExp
);
AWALYS_USE_ME_WITH_DOUBLE
(
VSigmoid
);
AWALYS_USE_ME_WITH_DOUBLE
(
VTanh
);
#undef AWALYS_USE_ME_WITH_DOUBLE
}
// namespace mkl
}
// namespace more
}
// namespace more
}
// namespace jit
}
// namespace jit
}
// namespace operators
}
// namespace operators
}
// namespace paddle
}
// namespace paddle
namespace
mkl
=
paddle
::
operators
::
jit
::
more
::
mkl
;
namespace
intrinsic
=
paddle
::
operators
::
jit
::
more
::
intrinsic
;
#define REGISTER_MKL_KERNEL(key, func) \
REGISTER_JITKERNEL_MORE(key, mkl, mkl::func##Kernel<float>, \
mkl::func##Kernel<double>)
REGISTER_MKL_KERNEL
(
vmul
,
VMul
);
REGISTER_MKL_KERNEL
(
vadd
,
VAdd
);
REGISTER_MKL_KERNEL
(
vscal
,
VScal
);
REGISTER_MKL_KERNEL
(
vexp
,
VExp
);
REGISTER_MKL_KERNEL
(
vsigmoid
,
VSigmoid
);
REGISTER_MKL_KERNEL
(
vtanh
,
VTanh
);
#undef REGISTER_MKL_KERNEL
REGISTER_JITKERNEL_MORE
(
crfdecoding
,
intrinsic
,
intrinsic
::
CRFDecodingKernel
);
paddle/fluid/operators/jit/more/intrinsic/crf_decoding.h
浏览文件 @
b1516783
...
@@ -21,68 +21,18 @@ namespace paddle {
...
@@ -21,68 +21,18 @@ namespace paddle {
namespace
operators
{
namespace
operators
{
namespace
jit
{
namespace
jit
{
namespace
more
{
namespace
more
{
namespace
mkl
{
namespace
intrinsic
{
template
<
typename
T
>
void
CRFDecoding
(
const
int
seq_len
,
const
float
*
x
,
const
float
*
w
,
void
VMul
(
const
T
*
x
,
const
T
*
y
,
T
*
z
,
int
n
);
float
*
alpha
,
int
*
track
,
int
tag_num
);
template
<
typename
T
>
class
CRFDecodingKernel
:
public
KernelImpl
<
CRFDecodingTuples
<
float
>>
{
void
VAdd
(
const
T
*
x
,
const
T
*
y
,
T
*
z
,
int
n
);
public:
CRFDecodingKernel
()
{
this
->
func
=
CRFDecoding
;
}
bool
UseMe
(
typename
CRFDecodingTuples
<
float
>::
attr_type
)
const
override
;
};
template
<
typename
T
>
}
// namespace intrinsic
void
VScal
(
const
T
*
a
,
const
T
*
x
,
T
*
y
,
int
n
);
template
<
typename
T
>
void
VExp
(
const
T
*
x
,
T
*
y
,
int
n
);
template
<
typename
T
>
void
VSigmoid
(
const
T
*
x
,
T
*
y
,
int
n
)
{
const
T
min
=
SIGMOID_THRESHOLD_MIN
;
const
T
max
=
SIGMOID_THRESHOLD_MAX
;
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
(
x
[
i
]
<
min
)
?
min
:
((
x
[
i
]
>
max
)
?
max
:
x
[
i
]);
y
[
i
]
=
static_cast
<
T
>
(
0
)
-
y
[
i
];
}
VExp
(
y
,
y
,
n
);
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
+
y
[
i
]);
}
}
template
<
typename
T
>
void
VTanh
(
const
T
*
x
,
T
*
y
,
int
n
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
static_cast
<
T
>
(
2
)
*
x
[
i
];
}
VSigmoid
(
y
,
y
,
n
);
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
static_cast
<
T
>
(
2
)
*
y
[
i
]
-
static_cast
<
T
>
(
1
);
}
}
#define DECLARE_MKL_KERNEL(name, tuples) \
template <typename T> \
class name##Kernel : public KernelImpl<tuples<T>> { \
public: \
name##Kernel() { this->func = name<T>; } \
bool UseMe(typename tuples<T>::attr_type) const override; \
}
// XYZN
DECLARE_MKL_KERNEL
(
VMul
,
XYZNTuples
);
DECLARE_MKL_KERNEL
(
VAdd
,
XYZNTuples
);
// AXYN
DECLARE_MKL_KERNEL
(
VScal
,
AXYNTuples
);
// XYN
DECLARE_MKL_KERNEL
(
VExp
,
XYNTuples
);
DECLARE_MKL_KERNEL
(
VSigmoid
,
XYNTuples
);
DECLARE_MKL_KERNEL
(
VTanh
,
XYNTuples
);
#undef DECLARE_MKL_KERNEL
}
// namespace mkl
}
// namespace more
}
// namespace more
}
// namespace jit
}
// namespace jit
}
// namespace operators
}
// namespace operators
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录