Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
01dd9061
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
01dd9061
编写于
12月 17, 2018
作者:
P
peizhilin
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
add avx support for windows
test=develop
上级
363bf8a4
变更
9
展开全部
显示空白变更内容
内联
并排
Showing
9 changed file
with
757 addition
and
43 deletion
+757
-43
CMakeLists.txt
CMakeLists.txt
+0
-2
paddle/fluid/operators/math/cpu_vec.h
paddle/fluid/operators/math/cpu_vec.h
+0
-3
paddle/fluid/operators/math/detail/activation_functions.h
paddle/fluid/operators/math/detail/activation_functions.h
+1
-5
paddle/fluid/operators/math/detail/avx_functions.cc
paddle/fluid/operators/math/detail/avx_functions.cc
+1
-3
paddle/fluid/operators/math/detail/avx_mathfun.h
paddle/fluid/operators/math/detail/avx_mathfun.h
+731
-0
paddle/fluid/operators/math/jit_code.cc
paddle/fluid/operators/math/jit_code.cc
+20
-19
paddle/fluid/operators/math/jit_code.h
paddle/fluid/operators/math/jit_code.h
+0
-1
paddle/fluid/operators/math/jit_kernel_crf_decode.cc
paddle/fluid/operators/math/jit_kernel_crf_decode.cc
+2
-5
paddle/fluid/operators/math/jit_kernel_layer_norm.cc
paddle/fluid/operators/math/jit_kernel_layer_norm.cc
+2
-5
未找到文件。
CMakeLists.txt
浏览文件 @
01dd9061
...
@@ -131,8 +131,6 @@ if (APPLE OR WIN32)
...
@@ -131,8 +131,6 @@ if (APPLE OR WIN32)
endif
()
endif
()
if
(
WIN32
)
if
(
WIN32
)
set
(
WITH_AVX OFF CACHE STRING
"Disable AVX when compiling for Windows"
FORCE
)
set
(
WITH_DSO OFF CACHE STRING
set
(
WITH_DSO OFF CACHE STRING
"Disable DSO when compiling for Windows"
FORCE
)
"Disable DSO when compiling for Windows"
FORCE
)
set
(
WITH_MKL OFF CACHE STRING
set
(
WITH_MKL OFF CACHE STRING
...
...
paddle/fluid/operators/math/cpu_vec.h
浏览文件 @
01dd9061
...
@@ -18,9 +18,6 @@ limitations under the License. */
...
@@ -18,9 +18,6 @@ limitations under the License. */
#include <string>
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
#ifdef PADDLE_WITH_MKLML
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#include "paddle/fluid/platform/dynload/mklml.h"
...
...
paddle/fluid/operators/math/detail/activation_functions.h
浏览文件 @
01dd9061
...
@@ -15,14 +15,10 @@ limitations under the License. */
...
@@ -15,14 +15,10 @@ limitations under the License. */
#pragma once
#pragma once
#include <math.h>
#include <math.h>
#include <string>
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/hostdevice.h"
#include "paddle/fluid/platform/hostdevice.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
namespace
math
{
namespace
math
{
...
...
paddle/fluid/operators/math/detail/avx_functions.cc
浏览文件 @
01dd9061
...
@@ -14,10 +14,8 @@ limitations under the License. */
...
@@ -14,10 +14,8 @@ limitations under the License. */
#ifdef __AVX__
#ifdef __AVX__
#include <immintrin.h>
#include "paddle/fluid/operators/math/detail/activation_functions.h"
#include "paddle/fluid/operators/math/detail/activation_functions.h"
// TODO(qingqing) refine this dependence
#include "paddle/fluid/operators/math/detail/avx_mathfun.h"
#include "paddle/legacy/cuda/src/avx_mathfun.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
...
paddle/fluid/operators/math/detail/avx_mathfun.h
0 → 100644
浏览文件 @
01dd9061
此差异已折叠。
点击以展开。
paddle/fluid/operators/math/jit_code.cc
浏览文件 @
01dd9061
...
@@ -113,7 +113,8 @@ void VXXJitCode::generate() {
...
@@ -113,7 +113,8 @@ void VXXJitCode::generate() {
ret
();
ret
();
}
}
const
float
exp_float_consts
[]
ALIGN32
=
{
REPEAT_8TIMES
(
1.
f
),
const
float
ALIGN32_BEG
exp_float_consts
[]
ALIGN32_END
=
{
REPEAT_8TIMES
(
1.
f
),
REPEAT_8TIMES
(
2.
f
),
REPEAT_8TIMES
(
2.
f
),
REPEAT_8TIMES
(
0.5
f
),
REPEAT_8TIMES
(
0.5
f
),
REPEAT_8TIMES
(
EXP_HIG
),
REPEAT_8TIMES
(
EXP_HIG
),
...
@@ -131,8 +132,8 @@ const float exp_float_consts[] ALIGN32 = {REPEAT_8TIMES(1.f),
...
@@ -131,8 +132,8 @@ const float exp_float_consts[] ALIGN32 = {REPEAT_8TIMES(1.f),
REPEAT_8TIMES
(
SIGMOID_THRESHOLD_MAX
),
REPEAT_8TIMES
(
SIGMOID_THRESHOLD_MAX
),
REPEAT_8TIMES
(
SIGMOID_THRESHOLD_MIN
)};
REPEAT_8TIMES
(
SIGMOID_THRESHOLD_MIN
)};
const
int
exp_int_0x7f
[]
ALIGN32
=
{
REPEAT_8TIMES
(
0x7f
)};
const
int
ALIGN32_BEG
exp_int_0x7f
[]
ALIGN32_END
=
{
REPEAT_8TIMES
(
0x7f
)};
int
g_tmp_mem
[
16
]
ALIGN32
=
{
0
};
int
ALIGN32_BEG
g_tmp_mem
[
16
]
ALIGN32_END
=
{
0
};
bool
VActJitCode
::
init
(
int
d
,
operand_type
type
)
{
bool
VActJitCode
::
init
(
int
d
,
operand_type
type
)
{
// TODO(TJ): implement avx512, avx_exp is slower than mkl when d >= 256
// TODO(TJ): implement avx512, avx_exp is slower than mkl when d >= 256
...
...
paddle/fluid/operators/math/jit_code.h
浏览文件 @
01dd9061
...
@@ -47,7 +47,6 @@ extern const float exp_float_consts[];
...
@@ -47,7 +47,6 @@ extern const float exp_float_consts[];
extern
const
int
exp_int_0x7f
[];
extern
const
int
exp_int_0x7f
[];
extern
int
g_tmp_mem
[];
extern
int
g_tmp_mem
[];
#define ALIGN32 __attribute__((aligned(32)))
#define EXP_HIG 88.3762626647949f
#define EXP_HIG 88.3762626647949f
#define EXP_LOW -88.3762626647949f
#define EXP_LOW -88.3762626647949f
#define CEPHES_LOG2EF 1.44269504088896341
#define CEPHES_LOG2EF 1.44269504088896341
...
...
paddle/fluid/operators/math/jit_kernel_crf_decode.cc
浏览文件 @
01dd9061
...
@@ -16,9 +16,6 @@ limitations under the License. */
...
@@ -16,9 +16,6 @@ limitations under the License. */
#include <limits>
#include <limits>
#include <string>
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -133,8 +130,8 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
...
@@ -133,8 +130,8 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel<T> {
/* AVX instructions.*/
\
/* AVX instructions.*/
\
__m128i lo_max_j = _mm256_extractf128_si256(max_j, 0); \
__m128i lo_max_j = _mm256_extractf128_si256(max_j, 0); \
__m128i hi_max_j = _mm256_extractf128_si256(max_j, 1); \
__m128i hi_max_j = _mm256_extractf128_si256(max_j, 1); \
__m128i lo_mask = _mm256_extractf128_si256(
(__m256i)mask, 0);
\
__m128i lo_mask = _mm256_extractf128_si256(
*(__m256i*)&mask, 0);
\
__m128i hi_mask = _mm256_extractf128_si256(
(__m256i)mask, 1);
\
__m128i hi_mask = _mm256_extractf128_si256(
*(__m256i*)&mask, 1);
\
lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j); \
lo_max_j = _mm_andnot_si128(lo_mask, lo_max_j); \
hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j); \
hi_max_j = _mm_andnot_si128(hi_mask, hi_max_j); \
lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i)); \
lo_mask = _mm_and_si128(lo_mask, _mm_set1_epi32(i)); \
...
...
paddle/fluid/operators/math/jit_kernel_layer_norm.cc
浏览文件 @
01dd9061
...
@@ -13,9 +13,6 @@ limitations under the License. */
...
@@ -13,9 +13,6 @@ limitations under the License. */
#include <limits>
#include <limits>
#include <string>
#include <string>
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#include "paddle/fluid/operators/math/jit_kernel_macro.h"
#ifdef __AVX__
#include <immintrin.h>
#endif
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -121,7 +118,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
...
@@ -121,7 +118,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
if (rest_ != 0) { \
if (rest_ != 0) { \
j = offset + this->num_ - block; \
j = offset + this->num_ - block; \
tmp = _mm256_loadu_ps((const float*)x + j); \
tmp = _mm256_loadu_ps((const float*)x + j); \
tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp,
(__m256)mask_vec);
\
tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp,
*(__m256*)&mask_vec);
\
sum = _mm256_add_ps(sum, tmp); \
sum = _mm256_add_ps(sum, tmp); \
} \
} \
hi = _mm256_extractf128_ps(sum, 1); \
hi = _mm256_extractf128_ps(sum, 1); \
...
@@ -145,7 +142,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
...
@@ -145,7 +142,7 @@ class LayerNormKernelImpl : public LayerNormKernel<T> {
j = offset + this->num_ - block; \
j = offset + this->num_ - block; \
tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); \
tmp = _mm256_sub_ps(_mm256_loadu_ps((const float*)x + j), mean_vec); \
tmp = _mm256_mul_ps(tmp, tmp); \
tmp = _mm256_mul_ps(tmp, tmp); \
tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp,
(__m256)mask_vec);
\
tmp = _mm256_blendv_ps(_mm256_setzero_ps(), tmp,
*(__m256*)&mask_vec);
\
sum = _mm256_add_ps(sum, tmp); \
sum = _mm256_add_ps(sum, tmp); \
} \
} \
hi = _mm256_extractf128_ps(sum, 1); \
hi = _mm256_extractf128_ps(sum, 1); \
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录