Unverified commit 344b99e1

[PHI decoupling] move softmax from fluid to phi and remove cpu_vec.h in fluid (#48970)

Authored by huangjiyi on Dec 15, 2022; committed via GitHub on Dec 15, 2022
Parent: 4672ea8e

Showing 65 changed files with 371 additions and 1074 deletions (+371 -1074)
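Most of the 65 files change in the same mechanical way: call sites that used the
fluid platform layer for CPU feature detection now go through the phi backend,
so phi code no longer pulls in fluid headers. A minimal before/after sketch of
the pattern (illustrative only; CanUseAvx512 is a hypothetical helper, not a
function in this diff):

// Before this commit: ISA probing went through paddle::platform.
//   #include "paddle/fluid/platform/cpu_info.h"
//   bool ok = platform::MayIUse(platform::avx512f);

// After this commit: the same probe lives in the phi backend.
#include "paddle/phi/backends/cpu/cpu_info.h"

bool CanUseAvx512() {
  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f);
}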
Changed files:

  paddle/fluid/inference/api/analysis_config.cc                                      +4  -3
  paddle/fluid/inference/api/analysis_predictor_tester.cc                            +2  -2
  paddle/fluid/inference/api/onnxruntime_predictor_tester.cc                         +1  -1
  paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc  +2  -2
  paddle/fluid/memory/allocation/buddy_allocator.h                                   +1  -1
  paddle/fluid/memory/allocation/naive_best_fit_allocator.cc                         +6  -6
  paddle/fluid/memory/allocation/system_allocator.cc                                 +3  -3
  paddle/fluid/memory/pinned_memory_test.cu                                          +1  -1
  paddle/fluid/operators/attention_lstm_op.cc                                        +10 -9
  paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu               +21 -15
  paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h                +1  -1
  paddle/fluid/operators/elementwise/elementwise_mul_op.h                            +1  -1
  paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc                         +4  -4
  paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc                      +4  -4
  paddle/fluid/operators/jit/CMakeLists.txt                                          +1  -1
  paddle/fluid/operators/jit/gen/act.cc                                              +7  -7
  paddle/fluid/operators/jit/gen/act.h                                               +9  -8
  paddle/fluid/operators/jit/gen/adam.cc                                             +2  -2
  paddle/fluid/operators/jit/gen/adamw.cc                                            +2  -2
  paddle/fluid/operators/jit/gen/blas.cc                                             +4  -3
  paddle/fluid/operators/jit/gen/embseqpool.cc                                       +2  -2
  paddle/fluid/operators/jit/gen/gru.cc                                              +16 -15
  paddle/fluid/operators/jit/gen/hopv.cc                                             +2  -2
  paddle/fluid/operators/jit/gen/jitcode.h                                           +2  -2
  paddle/fluid/operators/jit/gen/lstm.cc                                             +16 -15
  paddle/fluid/operators/jit/gen/matmul.cc                                           +4  -3
  paddle/fluid/operators/jit/gen/seqpool.cc                                          +2  -2
  paddle/fluid/operators/jit/gen/sgd.cc                                              +2  -2
  paddle/fluid/operators/jit/gen/vbroadcast.cc                                       +3  -2
  paddle/fluid/operators/jit/gen_base.cc                                             +2  -2
  paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc                          +2  -2
  paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc                            +3  -2
  paddle/fluid/operators/jit/more/mkl/mkl.cc                                         +6  -6
  paddle/fluid/operators/jit/test.cc                                                 +3  -3
  paddle/fluid/operators/math/CMakeLists.txt                                         +0  -1
  paddle/fluid/operators/math/cpu_vec.h                                              +0  -664
  paddle/fluid/operators/sample_logits_op.cu                                         +1  -1
  paddle/fluid/operators/sample_logits_op.h                                          +1  -1
  paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc                +3  -3
  paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc                        +1  -1
  paddle/fluid/platform/CMakeLists.txt                                               +1  -10
  paddle/fluid/platform/cpu_info.h                                                   +0  -92
  paddle/fluid/platform/cpu_info_test.cc                                             +3  -2
  paddle/fluid/platform/init.cc                                                      +1  -1
  paddle/fluid/platform/profiler/CMakeLists.txt                                      +1  -1
  paddle/fluid/pybind/parallel_executor.cc                                           +1  -1
  paddle/fluid/pybind/place.cc                                                       +1  -1
  paddle/fluid/pybind/pybind.cc                                                      +8  -7
  paddle/fluid/pybind/tensor.cc                                                      +1  -1
  paddle/phi/backends/CMakeLists.txt                                                 +4  -1
  paddle/phi/backends/cpu/cpu_info.cc                                                +17 -10
  paddle/phi/backends/cpu/cpu_info.h                                                 +43 -4
  paddle/phi/kernels/funcs/CMakeLists.txt                                            +1  -0
  paddle/phi/kernels/funcs/eigen/common.h                                            +1  -0
  paddle/phi/kernels/funcs/softmax.cc                                                +6  -8
  paddle/phi/kernels/funcs/softmax.cu                                                +62 -64
  paddle/phi/kernels/funcs/softmax.h                                                 +5  -7
  paddle/phi/kernels/funcs/softmax_impl.h                                            +44 -41
  paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu                                +1  -1
  paddle/phi/kernels/gpu/cross_entropy_kernel.cu                                     +2  -2
  paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h                          +3  -3
  paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h                               +3  -4
  paddle/phi/kernels/impl/softmax_grad_kernel_impl.h                                 +2  -2
  paddle/phi/kernels/impl/softmax_kernel_impl.h                                      +2  -3
  paddle/phi/tests/kernels/CMakeLists.txt                                            +1  -1
paddle/fluid/inference/api/analysis_config.cc

@@ -19,9 +19,9 @@
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/api/paddle_pass_builder.h"
 #include "paddle/fluid/inference/utils/table_printer.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/utils/string/split.h"

 #ifdef PADDLE_WITH_TENSORRT
@@ -624,10 +624,11 @@ void AnalysisConfig::EnableMkldnnQuantizer() {
 void AnalysisConfig::EnableMkldnnBfloat16() {
 #ifdef PADDLE_WITH_MKLDNN
-  if (platform::MayIUse(platform::cpu_isa_t::avx512_core)) {
+  if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_core)) {
     use_mkldnn_bfloat16_ = true;
     LOG(INFO) << "Hardware support for BFLOAT16"
-              << (platform::MayIUse(platform::cpu_isa_t::avx512_bf16)
+              << (phi::backends::cpu::MayIUse(
+                      phi::backends::cpu::cpu_isa_t::avx512_bf16)
                       ? " is enabled"
                       : " is disabled. Simulation will be used");
   } else {
paddle/fluid/inference/api/analysis_predictor_tester.cc

@@ -29,7 +29,7 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 #include "paddle/fluid/inference/utils/io_utils.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 DEFINE_string(dirname, "", "dirname to tests.");
@@ -327,7 +327,7 @@ TEST(AnalysisPredictor, bf16_gpu_pass_strategy) {
   config.EnableUseGpu(100, 0);
   config.EnableMkldnnBfloat16();
 #ifdef PADDLE_WITH_MKLDNN
-  if (platform::MayIUse(platform::cpu_isa_t::avx512_core))
+  if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_core))
     ASSERT_EQ(config.mkldnn_bfloat16_enabled(), true);
   else
     ASSERT_EQ(config.mkldnn_bfloat16_enabled(), false);
paddle/fluid/inference/api/onnxruntime_predictor_tester.cc

@@ -27,7 +27,7 @@
 #include "paddle/fluid/inference/api/paddle_inference_api.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
 #include "paddle/fluid/inference/utils/io_utils.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 DEFINE_string(dirname, "", "dirname to tests.");
paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc

@@ -14,7 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/inference/api/paddle_analysis_config.h"
 #include "paddle/fluid/inference/tests/api/tester_helper.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 DEFINE_bool(enable_mkldnn, true, "Enable MKLDNN");
@@ -47,7 +47,7 @@ TEST(Analyzer_bfloat16_image_classification, bfloat16) {
   std::vector<std::vector<PaddleTensor>> input_slots_all;
   SetInputs(&input_slots_all);
   if (FLAGS_enable_mkldnn && FLAGS_enable_bf16 &&
-      platform::MayIUse(platform::cpu_isa_t::avx512_bf16)) {
+      phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_bf16)) {
     b_cfg.EnableMkldnnBfloat16();
   } else {
     FLAGS_enable_bf16 = false;
paddle/fluid/memory/allocation/buddy_allocator.h

@@ -27,9 +27,9 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/memory_block.h"
 #include "paddle/fluid/memory/allocation/system_allocator.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/npu/npu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace memory {
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc

@@ -78,8 +78,8 @@ BuddyAllocator *GetCPUBuddyAllocator() {
   std::call_once(init_flag, []() {
     a = new detail::BuddyAllocator(
         std::unique_ptr<detail::SystemAllocator>(new detail::CPUAllocator),
-        platform::CpuMinChunkSize(),
-        platform::CpuMaxChunkSize());
+        phi::backends::cpu::CpuMinChunkSize(),
+        phi::backends::cpu::CpuMaxChunkSize());
   });

   return a;
@@ -290,8 +290,8 @@ BuddyAllocator *GetNPUPinnedBuddyAllocator() {
   std::call_once(init_flag, []() {
     ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                 new detail::NPUPinnedAllocator),
-                            platform::NPUPinnedMinChunkSize(),
-                            platform::NPUPinnedMaxChunkSize());
+                            phi::backends::cpu::NPUPinnedMinChunkSize(),
+                            phi::backends::cpu::NPUPinnedMaxChunkSize());
   });

   return ba;
@@ -562,8 +562,8 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
   std::call_once(init_flag, []() {
     ba = new BuddyAllocator(std::unique_ptr<detail::SystemAllocator>(
                                 new detail::CUDAPinnedAllocator),
-                            platform::CUDAPinnedMinChunkSize(),
-                            platform::CUDAPinnedMaxChunkSize());
+                            phi::backends::cpu::CUDAPinnedMinChunkSize(),
+                            phi::backends::cpu::CUDAPinnedMaxChunkSize());
   });

   return ba;
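All three getters in this file share the same lazy-initialization idiom: a
function-local pointer guarded by std::call_once, so the allocator is built
exactly once even under concurrent first use. A generic sketch of the idiom
(AllocatorStub is a hypothetical stand-in, not Paddle code):

#include <cstddef>
#include <mutex>

// Stand-in for detail::BuddyAllocator: takes min/max chunk sizes.
struct AllocatorStub {
  AllocatorStub(std::size_t min_chunk, std::size_t max_chunk)
      : min_chunk_(min_chunk), max_chunk_(max_chunk) {}
  std::size_t min_chunk_;
  std::size_t max_chunk_;
};

// The lambda runs exactly once; later and concurrent callers block until
// initialization finishes, then all return the same pointer.
AllocatorStub* GetAllocator() {
  static std::once_flag init_flag;
  static AllocatorStub* a = nullptr;
  std::call_once(init_flag, []() {
    a = new AllocatorStub(/*min_chunk=*/1 << 12, /*max_chunk=*/1 << 26);
  });
  return a;
}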
paddle/fluid/memory/allocation/system_allocator.cc

@@ -28,10 +28,10 @@ limitations under the License. */
 #endif

 #include "gflags/gflags.h"
 #include "paddle/fluid/memory/allocation/allocator.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/device/npu/npu_info.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 #ifdef PADDLE_WITH_MLU
 #include "paddle/fluid/platform/device/mlu/mlu_info.h"
 #endif
@@ -206,7 +206,7 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
   // of host pinned allocation. Allocates too much would reduce
   // the amount of memory available to the underlying system for paging.
   size_t usable =
-      paddle::platform::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_;
+      phi::backends::cpu::CUDAPinnedMaxAllocSize() - cuda_pinnd_alloc_size_;

   if (size > usable) {
     LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
@@ -362,7 +362,7 @@ void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
   if (size <= 0) return nullptr;

   size_t usable =
-      paddle::platform::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;
+      phi::backends::cpu::NPUPinnedMaxAllocSize() - npu_pinnd_alloc_size_;

   if (size > usable) {
     LOG(WARNING) << "Cannot malloc " << size / 1024.0 / 1024.0
paddle/fluid/memory/pinned_memory_test.cu

@@ -18,9 +18,9 @@ limitations under the License. */
 #include "paddle/fluid/memory/allocation/memory_block.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/device/gpu/gpu_info.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 // This unit test is an example comparing the performance between using pinned
 // memory and not. In general, using pinned memory will be faster.
paddle/fluid/operators/attention_lstm_op.cc

@@ -16,7 +16,7 @@ limitations under the License. */
 #include <string>

-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/cpu_vec.h"
 #include "paddle/phi/kernels/funcs/fc_functor.h"
@@ -315,10 +315,10 @@ use lstm_x_t as input and compute as standard LSTM.
 template <typename T>
 inline void bias_relu(const int n, const T* x, const T* bias, T* y) {
   if (bias) {
-    phi::funcs::vec_add_bias<T, platform::avx>(n, *bias, x, y);
-    phi::funcs::vec_relu<T, platform::avx>(n, y, y);
+    phi::funcs::vec_add_bias<T, phi::backends::cpu::avx>(n, *bias, x, y);
+    phi::funcs::vec_relu<T, phi::backends::cpu::avx>(n, y, y);
   } else {
-    phi::funcs::vec_relu<T, platform::avx>(n, x, y);
+    phi::funcs::vec_relu<T, phi::backends::cpu::avx>(n, x, y);
   }
 }
@@ -329,8 +329,9 @@ inline void vec_softmax(const int n, const T* x, T* y) {
   for (int i = 1; i < n; ++i) {
     scalar = scalar < x[i] ? x[i] : scalar;
   }
-  phi::funcs::vec_add_bias<T, platform::avx>(n, -scalar, x, y);  // sub
-  phi::funcs::vec_exp<T>(n, y, y);                               // exp
+  phi::funcs::vec_add_bias<T, phi::backends::cpu::avx>(
+      n, -scalar, x, y);            // sub
+  phi::funcs::vec_exp<T>(n, y, y);  // exp
   // sum
   scalar = T(0);
   for (int i = 0; i < n; ++i) {
@@ -393,13 +394,13 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
     auto& act_gate_str = ctx.Attr<std::string>("gate_activation");
     auto& act_cell_str = ctx.Attr<std::string>("cell_activation");
     auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");
-    if (platform::MayIUse(platform::avx)) {
-      phi::funcs::VecActivations<T, platform::avx> act_functor;
+    if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
+      phi::funcs::VecActivations<T, phi::backends::cpu::avx> act_functor;
       act_gate = act_functor(act_gate_str);
       act_cell = act_functor(act_cell_str);
       act_cand = act_functor(act_cand_str);
     } else {
-      phi::funcs::VecActivations<T, platform::isa_any> act_functor;
+      phi::funcs::VecActivations<T, phi::backends::cpu::isa_any> act_functor;
       act_gate = act_functor(act_gate_str);
       act_cell = act_functor(act_cell_str);
       act_cand = act_functor(act_cand_str);
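The vec_softmax hunk above is one step of the standard numerically stable
softmax: shift by the row max, exponentiate, sum, then scale. A scalar
reference version of the same computation (a sketch for clarity; the Paddle
code uses the vectorized phi::funcs helpers instead):

#include <algorithm>
#include <cmath>

// y = exp(x - max(x)) / sum(exp(x - max(x))); subtracting the max keeps
// exp() from overflowing without changing the result.
template <typename T>
void vec_softmax_ref(const int n, const T* x, T* y) {
  T max_val = x[0];
  for (int i = 1; i < n; ++i) max_val = std::max(max_val, x[i]);  // max
  T sum = T(0);
  for (int i = 0; i < n; ++i) {
    y[i] = std::exp(x[i] - max_val);  // sub + exp
    sum += y[i];                      // sum
  }
  for (int i = 0; i < n; ++i) y[i] /= sum;  // scale
}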
paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu

@@ -13,13 +13,15 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h"
-#include "paddle/fluid/operators/math/softmax_impl.h"
+
+#include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #include "paddle/fluid/string/string_helper.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
 #include "paddle/phi/kernels/funcs/cross_entropy.h"
+#include "paddle/phi/kernels/funcs/softmax_impl.h"

 namespace paddle {
 namespace operators {
@@ -129,15 +131,15 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
     softmax_2d.ShareDataWith(*softmax).Resize({N, D});
     loss_2d.ShareDataWith(*loss).Resize({N, 1});

-    auto eigen_logits = math::EigenMatrix<T>::From(logits_2d);
-    auto eigen_softmax = math::EigenMatrix<T>::From(softmax_2d);
+    auto eigen_logits = phi::funcs::EigenMatrix<T>::From(logits_2d);
+    auto eigen_softmax = phi::funcs::EigenMatrix<T>::From(softmax_2d);

     // step 1, obtain logit_max
     phi::DenseTensor logits_max;
     logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
     void* logits_max_buff = logits_max.mutable_data<T>(place);
-    auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
+    auto eigen_logits_max = phi::funcs::EigenMatrix<T>::From(logits_max);
     Eigen::DSizes<int, 1> along_axis(1);
     eigen_logits_max.device(*dev_ctx.eigen_device()) =
         eigen_logits.maximum(along_axis);
@@ -158,7 +160,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
     eigen_softmax.device(*dev_ctx.eigen_device()) =
         (eigen_logits -
          eigen_logits_max.reshape(batch_by_one).broadcast(one_by_class))
-            .unaryExpr(math::ValueClip<T>());
+            .unaryExpr(phi::funcs::ValueClip<T>());

     // step 3, obtain predict target
     phi::DenseTensor predicted_logits;
@@ -217,7 +219,8 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
     sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
     void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
-    auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
+    auto eigen_sum_exp_logits =
+        phi::funcs::EigenMatrix<T>::From(sum_exp_logits);
     eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) =
         eigen_softmax.sum(along_axis);
@@ -231,8 +234,9 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
         comm->comm(),
         stream));

-    auto eigen_loss = math::EigenMatrix<T>::From(loss_2d);
-    auto eigen_predicted_logits = math::EigenMatrix<T>::From(predicted_logits);
+    auto eigen_loss = phi::funcs::EigenMatrix<T>::From(loss_2d);
+    auto eigen_predicted_logits =
+        phi::funcs::EigenMatrix<T>::From(predicted_logits);

     eigen_loss.device(*dev_ctx.eigen_device()) =
         (eigen_sum_exp_logits.log().unaryExpr(phi::funcs::TolerableValue<T>()) -
@@ -281,14 +285,14 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
     softmax_2d.ShareDataWith(*softmax).Resize({N, D});
     loss_2d.ShareDataWith(*loss).Resize({N, 1});

-    auto eigen_logits = math::EigenMatrix<T>::From(logits_2d);
-    auto eigen_softmax = math::EigenMatrix<T>::From(softmax_2d);
+    auto eigen_logits = phi::funcs::EigenMatrix<T>::From(logits_2d);
+    auto eigen_softmax = phi::funcs::EigenMatrix<T>::From(softmax_2d);

     // step 1, obtain logit_max
     phi::DenseTensor logits_max;
     logits_max = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
-    auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
+    auto eigen_logits_max = phi::funcs::EigenMatrix<T>::From(logits_max);
     Eigen::DSizes<int, 1> along_axis(1);
     eigen_logits_max.device(*dev_ctx.eigen_device()) =
         eigen_logits.maximum(along_axis);
@@ -304,7 +308,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
     eigen_softmax.device(*dev_ctx.eigen_device()) =
         (eigen_logits -
          eigen_logits_max.reshape(batch_by_one).broadcast(one_by_class))
-            .unaryExpr(math::ValueClip<T>());
+            .unaryExpr(phi::funcs::ValueClip<T>());

     // step 3, obtain predict target
     phi::DenseTensor predicted_logits;
@@ -357,7 +361,8 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
     sum_exp_logits = ctx.AllocateTmpTensor<T, phi::GPUContext>({N, 1}, dev_ctx);
     void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
-    auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
+    auto eigen_sum_exp_logits =
+        phi::funcs::EigenMatrix<T>::From(sum_exp_logits);
     eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) =
         eigen_softmax.sum(along_axis);
@@ -366,8 +371,9 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
     opts.reduce_op = distributed::ReduceOp::SUM;
     pg->AllReduce(in_out, in_out, opts)->Synchronize();

-    auto eigen_loss = math::EigenMatrix<T>::From(loss_2d);
-    auto eigen_predicted_logits = math::EigenMatrix<T>::From(predicted_logits);
+    auto eigen_loss = phi::funcs::EigenMatrix<T>::From(loss_2d);
+    auto eigen_predicted_logits =
+        phi::funcs::EigenMatrix<T>::From(predicted_logits);

     eigen_loss.device(*dev_ctx.eigen_device()) =
         (eigen_sum_exp_logits.log().unaryExpr(phi::funcs::TolerableValue<T>()) -
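Numerically, both functors compute the per-row loss as
log(sum_j exp(logit_j - max)) - (logit_label - max), with the row max and the
partial exp-sums combined across ranks by all-reduce. A single-process scalar
sketch of that arithmetic (softmax_xent_row is a hypothetical helper; the NCCL
reduction steps are omitted):

#include <algorithm>
#include <cmath>
#include <vector>

// Softmax cross-entropy for one row via the log-sum-exp trick, mirroring
// the functor's steps: (1) row max, (2) shifted exp-sum, (3) predicted
// logit, then loss = log(sum_exp) - predicted.
double softmax_xent_row(const std::vector<double>& logits, int label) {
  double max_logit = logits[0];
  for (double v : logits) max_logit = std::max(max_logit, v);   // step 1
  double sum_exp = 0.0;
  for (double v : logits) sum_exp += std::exp(v - max_logit);   // step 2
  double predicted = logits[label] - max_logit;                 // step 3
  return std::log(sum_exp) - predicted;
}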
paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h

@@ -22,9 +22,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/phi/api/include/tensor.h"
 #include "paddle/phi/kernels/funcs/cross_entropy.h"
+#include "paddle/phi/kernels/funcs/softmax.h"

 namespace paddle {
 namespace operators {
paddle/fluid/operators/elementwise/elementwise_mul_op.h

@@ -17,7 +17,7 @@ limitations under the License. */
 #include <string>

 #include "paddle/fluid/operators/elementwise/elementwise_op.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/kernels/elementwise_kernel.h"

 namespace paddle {
paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc

@@ -16,7 +16,7 @@ limitations under the License. */
 #include <string>

-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/cpu_vec.h"
 #include "paddle/phi/kernels/funcs/sequence2batch.h"
@@ -278,13 +278,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
   auto& act_gate_str = ctx.Attr<std::string>("gate_activation");           \
   auto& act_cell_str = ctx.Attr<std::string>("cell_activation");           \
   auto& act_cand_str = ctx.Attr<std::string>("candidate_activation");      \
-  if (platform::MayIUse(platform::avx)) {                                  \
-    phi::funcs::VecActivations<T, platform::avx> act_functor;              \
+  if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {              \
+    phi::funcs::VecActivations<T, phi::backends::cpu::avx> act_functor;    \
     act_gate = act_functor(act_gate_str);                                  \
     act_cell = act_functor(act_cell_str);                                  \
     act_cand = act_functor(act_cand_str);                                  \
   } else {                                                                 \
-    phi::funcs::VecActivations<T, platform::isa_any> act_functor;          \
+    phi::funcs::VecActivations<T, phi::backends::cpu::isa_any> act_functor; \
     act_gate = act_functor(act_gate_str);                                  \
     act_cell = act_functor(act_cell_str);                                  \
     act_cand = act_functor(act_cand_str);                                  \
paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc

@@ -16,7 +16,7 @@ limitations under the License. */
 #include <string>

-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/kernels/funcs/blas/blas.h"
 #include "paddle/phi/kernels/funcs/cpu_vec.h"
 #include "paddle/phi/kernels/funcs/fc_functor.h"
@@ -225,11 +225,11 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
     std::function<void(const int, const T*, T*)> fc_act;
     auto& fc_act_str = ctx.Attr<std::string>("fc_activation");
-    if (platform::MayIUse(platform::avx)) {
-      phi::funcs::VecActivations<T, platform::avx> act_functor;
+    if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
+      phi::funcs::VecActivations<T, phi::backends::cpu::avx> act_functor;
       fc_act = act_functor(fc_act_str);
     } else {
-      phi::funcs::VecActivations<T, platform::isa_any> act_functor;
+      phi::funcs::VecActivations<T, phi::backends::cpu::isa_any> act_functor;
       fc_act = act_functor(fc_act_str);
     }
paddle/fluid/operators/jit/CMakeLists.txt

@@ -9,7 +9,7 @@ file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n")
 file(APPEND ${jit_file}
      "\#include \"paddle/fluid/operators/jit/registry.h\"\n\n")
-set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce place xxhash)
+set(JIT_KERNEL_DEPS device_context cblas gflags enforce place xxhash)
 file(
   GLOB jit_kernel_cc_srcs
paddle/fluid/operators/jit/gen/act.cc

@@ -15,7 +15,7 @@
 #include "paddle/fluid/operators/jit/gen/act.h"

 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -98,27 +98,27 @@ DECLARE_ACT_CREATOR(VTanh);
 // TODO(TJ): tuning use me
 bool VReluCreator::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx);
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
 }

 bool VSquareCreator::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx);
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
 }

 bool VIdentityCreator::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx);
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
 }

 bool VExpCreator::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx) && d < 32;
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && d < 32;
 }

 bool VSigmoidCreator::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx);
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
 }

 bool VTanhCreator::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx);
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
 }

 size_t VReluCreator::CodeSize(const int& d) const {
paddle/fluid/operators/jit/gen/act.h

@@ -84,8 +84,8 @@ class VActFunc : public JitCode {
   // compute EXP with ymm, xmm
   template <typename JMM>
-  void exp_jmm(JMM& dst,
-               JMM& src,
+  void exp_jmm(JMM& dst,  // NOLINT
+               JMM& src,  // NOLINT
                int src_idx = 11,
                int fx_idx = 12,
                int fy_idx = 13,
@@ -144,10 +144,11 @@ class VActFunc : public JitCode {
     vcvttps2dq(ymm_int, jmm_fx);
     mov(reg_ptr_global, reinterpret_cast<size_t>(exp_int_0x7f));
     vmovdqa(jmm_tmp, ptr[reg_ptr_global]);
-    if (MayIUse(avx2) || std::is_same<JMM, xmm_t>::value) {
+    if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx2) ||
+        std::is_same<JMM, xmm_t>::value) {
       vpaddd(ymm_int, ymm_int, jmm_tmp);
       vpslld(ymm_int, ymm_int, 23);
-    } else if (MayIUse(avx)) {
+    } else if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
       xmm_t xtmp1 = xmm_t(ymm_int.getIdx());
       xmm_t xtmp2 = xmm_t(jmm_tmp.getIdx());
       reg64_t reg_ptr_tmp = reg_ptr_global;
@@ -174,8 +175,8 @@ class VActFunc : public JitCode {
   // compute SIGMOID with ymm, xmm
   template <typename JMM>
-  void sigmoid_jmm(JMM& dst,
-                   JMM& src,
+  void sigmoid_jmm(JMM& dst,  // NOLINT
+                   JMM& src,  // NOLINT
                    int src_idx = 11,
                    int fx_idx = 12,
                    int fy_idx = 13,
@@ -203,8 +204,8 @@ class VActFunc : public JitCode {
   // compute TANH with ymm, xmm
   template <typename JMM>
-  void tanh_jmm(JMM& dst,
-                JMM& src,
+  void tanh_jmm(JMM& dst,  // NOLINT
+                JMM& src,  // NOLINT
                 int src_idx = 11,
                 int fx_idx = 12,
                 int fy_idx = 13,
paddle/fluid/operators/jit/gen/adam.cc

@@ -17,7 +17,7 @@
 #include <stddef.h>  // offsetof

 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -132,7 +132,7 @@ void AdamJitCode::genCode() {
 class AdamCreator : public JitCodeCreator<adam_attr_t> {
  public:
   bool CanBeUsed(const adam_attr_t& attr) const override {
-    return platform::MayIUse(platform::avx512f);
+    return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f);
   }
   size_t CodeSize(const adam_attr_t& attr) const override {
     return 96 + 32 * 8;
paddle/fluid/operators/jit/gen/adamw.cc

@@ -17,7 +17,7 @@
 #include <stddef.h>  // offsetof

 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -147,7 +147,7 @@ void AdamWJitCode::genCode() {
 class AdamWCreator : public JitCodeCreator<int> {
  public:
   bool CanBeUsed(const int& attr) const override {
-    return platform::MayIUse(platform::avx512f);
+    return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f);
   }
   size_t CodeSize(const int& attr) const override { return 96 + 32 * 8; }
   std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override {
paddle/fluid/operators/jit/gen/blas.cc

@@ -16,7 +16,7 @@
 #include "paddle/fluid/operators/jit/macro.h"
 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -145,7 +145,7 @@ void NCHW16CMulNCJitCode::genCode() {
 class NCHW16CMulNCCreator : public JitCodeCreator<int> {
  public:
   bool CanBeUsed(const int& attr) const override {
-    return platform::MayIUse(platform::avx512f);
+    return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f);
   }
   size_t CodeSize(const int& d) const override { return 256 * 1024; }
   std::unique_ptr<GenBase> CreateJitCode(const int& attr) const override {
@@ -157,7 +157,8 @@ class NCHW16CMulNCCreator : public JitCodeCreator<int> {
   class name##Creator : public JitCodeCreator<int> {                  \
    public:                                                            \
     bool CanBeUsed(const int& attr) const override {                  \
-      return platform::MayIUse(platform::avx) && attr <= 1024;        \
+      return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) &&  \
+             attr <= 1024;                                            \
     }                                                                 \
     size_t CodeSize(const int& d) const override {                    \
       return 96 + d / YMM_FLOAT_BLOCK * 4 * 8;                        \
paddle/fluid/operators/jit/gen/embseqpool.cc

@@ -18,7 +18,7 @@
 #include "paddle/fluid/operators/jit/macro.h"
 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -123,7 +123,7 @@ void EmbSeqPoolJitCode::genCode() {
 class EmbSeqPoolCreator : public JitCodeCreator<emb_seq_pool_attr_t> {
  public:
   bool CanBeUsed(const emb_seq_pool_attr_t& attr) const override {
-    return platform::MayIUse(platform::avx) &&
+    return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) &&
            attr.table_width % YMM_FLOAT_BLOCK == 0;
   }
   size_t CodeSize(const emb_seq_pool_attr_t& attr) const override {
paddle/fluid/operators/jit/gen/gru.cc

@@ -18,7 +18,7 @@
 #include "paddle/fluid/operators/jit/macro.h"
 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -85,20 +85,21 @@ void GRUJitCode::genCode() {
   ret();
 }

 #define DECLARE_GRU_CREATOR(name)                                      \
   class name##Creator : public JitCodeCreator<gru_attr_t> {            \
    public:                                                             \
     /* TODO(TJ): enable more */                                        \
     bool CanBeUsed(const gru_attr_t& attr) const override {            \
-      return platform::MayIUse(platform::avx) && attr.d % 8 == 0;      \
+      return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) &&   \
+             attr.d % 8 == 0;                                          \
     }                                                                  \
     size_t CodeSize(const gru_attr_t& attr) const override {           \
       return 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8;               \
     }                                                                  \
     std::unique_ptr<GenBase> CreateJitCode(                            \
         const gru_attr_t& attr) const override {                       \
       return make_unique<name##JitCode>(attr, CodeSize(attr));         \
     }                                                                  \
   }

 DECLARE_GRU_CREATOR(GRUH1);
paddle/fluid/operators/jit/gen/hopv.cc

@@ -15,7 +15,7 @@
 #include "paddle/fluid/operators/jit/gen/hopv.h"

 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -78,7 +78,7 @@ void HOPVJitCode::genCode() {
   class name##Creator : public JitCodeCreator<int> {                 \
    public:                                                           \
     bool CanBeUsed(const int& attr) const override {                 \
-      return platform::MayIUse(platform::avx);                       \
+      return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);   \
     }                                                                \
     size_t CodeSize(const int& d) const override {                   \
       return 96 + d / YMM_FLOAT_BLOCK * 4 * 8;                       \
paddle/fluid/operators/jit/gen/jitcode.h

@@ -18,7 +18,7 @@
 #include <type_traits>

 #include "paddle/fluid/operators/jit/gen_base.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 #define XBYAK_USE_MMAP_ALLOCATOR
 #include "xbyak/xbyak.h"
@@ -92,7 +92,7 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
     for (int i = 0; i < num_g_abi_regs; ++i) {
       push(Xbyak::Reg64(g_abi_regs[i]));
     }
-    if (platform::MayIUse(platform::avx512f)) {
+    if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
       mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt);
     }
   }
paddle/fluid/operators/jit/gen/lstm.cc

@@ -18,7 +18,7 @@
 #include "paddle/fluid/operators/jit/macro.h"
 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -113,20 +113,21 @@ void LSTMJitCode::genCode() {
 }

 #define DECLARE_LSTM_CREATOR(name)                                     \
   class name##Creator : public JitCodeCreator<lstm_attr_t> {           \
    public:                                                             \
     /* TODO(TJ): enable more */                                        \
     bool CanBeUsed(const lstm_attr_t& attr) const override {           \
-      return platform::MayIUse(platform::avx) && attr.d % 8 == 0;      \
+      return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) &&   \
+             attr.d % 8 == 0;                                          \
     }                                                                  \
     size_t CodeSize(const lstm_attr_t& attr) const override {          \
       return 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8;               \
     }                                                                  \
     std::unique_ptr<GenBase> CreateJitCode(                            \
         const lstm_attr_t& attr) const override {                      \
       return make_unique<name##JitCode>(attr, CodeSize(attr));         \
     }                                                                  \
   }

 DECLARE_LSTM_CREATOR(LSTMCtHt);
paddle/fluid/operators/jit/gen/matmul.cc

@@ -17,7 +17,7 @@
 #include <stddef.h>  // offsetof

 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -110,12 +110,13 @@ void MatMulJitCode::genCode() {
 class MatMulCreator : public JitCodeCreator<matmul_attr_t> {
  public:
   bool CanBeUsed(const matmul_attr_t& attr) const override {
-    return attr.m == 1 && platform::MayIUse(platform::avx512f) &&
+    return attr.m == 1 &&
+           phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f) &&
            attr.n % ZMM_FLOAT_BLOCK == 0 && attr.k < 512;
   }
   size_t CodeSize(const matmul_attr_t& attr) const override {
     int block = YMM_FLOAT_BLOCK;
-    if (platform::MayIUse(platform::avx512f)) {
+    if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
       block = ZMM_FLOAT_BLOCK;
     }
     return 96 + 4 * attr.k * (attr.n / block + 1) * 8;
paddle/fluid/operators/jit/gen/seqpool.cc

@@ -16,7 +16,7 @@
 #include "paddle/fluid/operators/jit/gen/act.h"  // for exp_float_consts ones
 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
@@ -59,7 +59,7 @@ void SeqPoolJitCode::genCode() {
 class SeqPoolCreator : public JitCodeCreator<seq_pool_attr_t> {
  public:
   bool CanBeUsed(const seq_pool_attr_t& attr) const override {
-    return platform::MayIUse(platform::avx);
+    return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
   }
   size_t CodeSize(const seq_pool_attr_t& attr) const override {
     return 96 + ((attr.w / YMM_FLOAT_BLOCK + 4 /* for rest */) *
paddle/fluid/operators/jit/gen/sgd.cc
浏览文件 @
344b99e1
...
@@ -17,7 +17,7 @@
...
@@ -17,7 +17,7 @@
#include <stddef.h> // offsetof
#include <stddef.h> // offsetof
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
namespace
paddle
{
namespace
paddle
{
namespace
operators
{
namespace
operators
{
...
@@ -109,7 +109,7 @@ void SgdJitCode::genCode() {
...
@@ -109,7 +109,7 @@ void SgdJitCode::genCode() {
class
SgdCreator
:
public
JitCodeCreator
<
sgd_attr_t
>
{
class
SgdCreator
:
public
JitCodeCreator
<
sgd_attr_t
>
{
public:
public:
bool
CanBeUsed
(
const
sgd_attr_t
&
attr
)
const
override
{
bool
CanBeUsed
(
const
sgd_attr_t
&
attr
)
const
override
{
return
p
latform
::
MayIUse
(
platform
::
avx
)
&&
return
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
)
&&
attr
.
grad_width
%
YMM_FLOAT_BLOCK
==
0
;
attr
.
grad_width
%
YMM_FLOAT_BLOCK
==
0
;
}
}
size_t
CodeSize
(
const
sgd_attr_t
&
attr
)
const
override
{
return
96
+
32
*
8
;
}
size_t
CodeSize
(
const
sgd_attr_t
&
attr
)
const
override
{
return
96
+
32
*
8
;
}
...

paddle/fluid/operators/jit/gen/vbroadcast.cc
@@ -15,7 +15,7 @@
 #include "paddle/fluid/operators/jit/gen/vbroadcast.h"

 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
...
@@ -69,7 +69,8 @@ void VBroadcastJitCode::genCode() {
 class VBroadcastCreator : public JitCodeCreator<int64_t> {
  public:
   bool CanBeUsed(const int64_t& w) const override {
-    return platform::MayIUse(platform::avx) && w % YMM_FLOAT_BLOCK == 0;
+    return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) &&
+           w % YMM_FLOAT_BLOCK == 0;
   }
   size_t CodeSize(const int64_t& w) const override {
     return 96 + (w / YMM_FLOAT_BLOCK) * 16 * 8;
...

paddle/fluid/operators/jit/gen_base.cc
@@ -17,8 +17,8 @@
 #include <fstream>

 #include "paddle/fluid/memory/allocation/cpu_allocator.h"  // for posix_memalign
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 #ifndef _WIN32
 #define posix_memalign_free free
...
@@ -66,7 +66,7 @@ void GenBase::operator delete(void* ptr) { posix_memalign_free(ptr); }
 std::vector<int> packed_groups(int n, int k, int* block_out, int* rest_out) {
   int block;
   int max_num_regs;
-  if (platform::MayIUse(platform::avx512f)) {
+  if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
     block = ZMM_FLOAT_BLOCK;
     max_num_regs = 32;
   } else {
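
For orientation, packed_groups selects the vector block and register budget that the weight-packing code divides n and k across. A sketch of just that selection, with the constants taken from this commit's headers (the struct and function names are ours, and the non-AVX-512 register count is our reading of the else branch, which the hunk truncates):

    #include "paddle/phi/backends/cpu/cpu_info.h"

    struct PackChoice {
      int block;         // floats per vector register
      int max_num_regs;  // architectural register budget
    };

    PackChoice ChoosePack() {
      if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
        return {16, 32};  // ZMM_FLOAT_BLOCK, zmm0-zmm31
      }
      return {8, 16};  // YMM_FLOAT_BLOCK, ymm0-ymm15 (assumed else branch)
    }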
...

paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
@@ -17,7 +17,7 @@
 #include <limits>

 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
...
@@ -172,7 +172,7 @@ bool CRFDecodingKernel::CanBeUsed(const int& d) const {
 #else
   constexpr int block = YMM_FLOAT_BLOCK;
 #endif
-  return platform::MayIUse(platform::avx) && d >= block;
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && d >= block;
 }

 }  // namespace intrinsic
...

paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc
@@ -17,7 +17,7 @@
 #include <limits>

 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
...
@@ -179,7 +179,8 @@ void LayerNorm(float* x,
 }

 bool LayerNormKernel::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx) && d >= YMM_FLOAT_BLOCK;
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) &&
+         d >= YMM_FLOAT_BLOCK;
 }

 }  // namespace intrinsic
...

paddle/fluid/operators/jit/more/mkl/mkl.cc
@@ -16,8 +16,8 @@
 #include "paddle/fluid/operators/jit/refer/refer.h"
 #include "paddle/fluid/operators/jit/registry.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/dynload/mklml.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 namespace paddle {
 namespace operators {
...
@@ -188,17 +188,17 @@ void StrideASum<double>(const double* x, double* res, int n, int stride) {
 // TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
 template <>
 bool VMulKernel<float>::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx512f) && d > 512;
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f) && d > 512;
 }

 template <>
 bool VAddKernel<float>::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx) && d > 512;
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && d > 512;
 }

 template <>
 bool VScalKernel<float>::CanBeUsed(const int& d) const {
-  return platform::MayIUse(platform::avx512f) && d > 512;
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f) && d > 512;
 }

 template <>
...
@@ -274,7 +274,7 @@ bool SgdKernel<double>::CanBeUsed(const sgd_attr_t& attr) const {
 template <>
 bool MatMulKernel<float>::CanBeUsed(const matmul_attr_t& attr) const {
-  return platform::MayIUse(platform::avx);
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx);
 }

 template <>
...
@@ -285,7 +285,7 @@ bool MatMulKernel<double>::CanBeUsed(const matmul_attr_t& attr) const {
 template <>
 bool SoftmaxKernel<float>::CanBeUsed(const int& d) const {
   // tuned on avx2
-  return platform::MayIUse(platform::avx) && d < 60;
+  return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && d < 60;
 }

 #define AWALYS_USE_ME_WITH_DOUBLE(func) \
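
The CanBeUsed predicates above encode a size-based dispatch policy: the MKL vector kernels only win for long arrays (d > 512), while the MKL softmax is only taken for short rows (d < 60, tuned on AVX2). An illustrative check under those thresholds (the function name is ours; this is not the actual registry code):

    #include "paddle/phi/backends/cpu/cpu_info.h"

    // Whether the registry would accept the MKL implementation of VMul<float>,
    // following the predicate above.
    bool MklVMulUsable(int d) {
      return phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f) && d > 512;
    }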
...

paddle/fluid/operators/jit/test.cc
@@ -19,8 +19,8 @@ limitations under the License. */
 #include "glog/logging.h"
 #include "gtest/gtest.h"
 #include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/place.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 DEFINE_double(acc, 1e-5, "Test accuracy threshold.");
...
@@ -437,7 +437,7 @@ void TestKernelNCHW16CMulNC() {
   EXPECT_TRUE(tgt != nullptr);

   if (std::is_same<T, float>::value &&
-      paddle::platform::MayIUse(paddle::platform::avx512f)) {
+      phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
     EXPECT_TRUE(jitcode != nullptr);
   }
   for (int ni = 0; ni < n; ni++) {
...
@@ -1393,7 +1393,7 @@ TEST(JITKernel_helper, pack_weights) {
   }

   int block = 0;
   std::vector<int> groups;
-  if (paddle::platform::MayIUse(paddle::platform::avx512f)) {
+  if (phi::backends::cpu::MayIUse(phi::backends::cpu::avx512f)) {
     block = ZMM_FLOAT_BLOCK;
     groups.push_back(30);
   } else {
...

paddle/fluid/operators/math/CMakeLists.txt
@@ -32,7 +32,6 @@ math_library(maxouting)
 math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function jit_kernel_helper)
 math_library(sequence_scale)
-math_library(softmax DEPS math_function jit_kernel_helper)

 if(WITH_ASCEND_CL)
   math_library(beam_search DEPS math_function beam_search_npu)
 elseif(WITH_XPU)
...

paddle/fluid/operators/math/cpu_vec.h
deleted (100644 → 0)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cmath>
#include <functional>
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
namespace paddle {
namespace operators {
namespace math {
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define YMM_FLOAT_BLOCK 8
#define AVX_DOUBLE_BLOCK 4
#define YMM_FLOAT_BLOCK 8
#define AVX2_DOUBLE_BLOCK 4
#define ZMM_FLOAT_BLOCK 16
#define AVX512_DOUBLE_BLOCK 8
template <typename T>
inline void vec_exp(const int n, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = std::exp(x[i]);
  }
}

template <typename T>
inline void vec_scal(const int n, const T a, T* x) {
  for (int i = 0; i < n; ++i) {
    x[i] = a * x[i];
  }
}
#ifdef PADDLE_WITH_MKLML
template <>
inline void vec_exp<float>(const int n, const float* x, float* y) {
  constexpr int small_enough = 128;
  if (n < small_enough) {
    for (int i = 0; i < n; ++i) {
      y[i] = std::exp(x[i]);
    }
  } else {
    platform::dynload::vsExp(n, x, y);
  }
}

template <>
inline void vec_exp<double>(const int n, const double* x, double* y) {
  platform::dynload::vdExp(n, x, y);
}

template <>
inline void vec_scal<float>(const int n, const float a, float* x) {
  platform::dynload::cblas_sscal(n, a, x, 1);
}

template <>
inline void vec_scal<double>(const int n, const double a, double* x) {
  platform::dynload::cblas_dscal(n, a, x, 1);
}
#endif
// MKL scal only support inplace, choose this if src and dst are not equal
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_scal(const int n, const T a, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = a * x[i];
  }
}

template <>
inline void vec_scal<float, platform::avx>(const int n,
                                           const float a,
                                           const float* x,
                                           float* y) {
#ifdef __AVX__
  constexpr int block = YMM_FLOAT_BLOCK;
  if (n < block) {
    vec_scal<float, platform::isa_any>(n, a, x, y);
    return;
  }
  const int rest = n % block;
  const int end = n - rest;
  int i = 0;
  __m256 scalar = _mm256_set1_ps(a);
  __m256 tmp;
#define MOVE_ONE_STEP               \
  tmp = _mm256_loadu_ps(x + i);     \
  tmp = _mm256_mul_ps(tmp, scalar); \
  _mm256_storeu_ps(y + i, tmp)
  for (i = 0; i < end; i += block) {
    MOVE_ONE_STEP;
  }
#undef MOVE_ONE_STEP
  if (rest == 0) {
    return;
  }
  // can not continue move step if src and dst are inplace
  for (i = n - rest; i < n; ++i) {
    y[i] = a * x[i];
  }
#else
  vec_scal<float, platform::isa_any>(n, a, x, y);
#endif
}

template <>
inline void vec_scal<float, platform::avx2>(const int n,
                                            const float a,
                                            const float* x,
                                            float* y) {
  vec_scal<float, platform::avx>(n, a, x, y);
}

template <>
inline void vec_scal<float, platform::avx512f>(const int n,
                                               const float a,
                                               const float* x,
                                               float* y) {
  // TODO(TJ): enable me
  vec_scal<float, platform::avx2>(n, a, x, y);
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_sum(const size_t n, const T* x, T* s) {
  s[0] = x[0];
  for (size_t i = 1; i < n; ++i) {
    s[0] += x[i];
  }
}

template <>
inline void vec_sum<float, platform::avx>(const size_t n,
                                          const float* x,
                                          float* s) {
#ifdef __AVX__
  constexpr unsigned int block = YMM_FLOAT_BLOCK;
  if (n < block) {
    vec_sum<float, platform::isa_any>(n, x, s);
    return;
  }

  unsigned int i, end;
  i = end = 0;
  s[0] = 0.f;

  end = n & ~(block - 1);
  __m256 tmp = _mm256_setzero_ps();
  for (i = 0; i < end; i += block) {
    tmp = _mm256_add_ps(tmp, _mm256_loadu_ps(x + i));
  }

  __m256 hsum = _mm256_hadd_ps(tmp, tmp);
  hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1));
  _mm_store_ss(
      s,
      _mm_hadd_ps(_mm256_castps256_ps128(hsum), _mm256_castps256_ps128(hsum)));

  for (; i < n; i++) {
    s[0] += x[i];
  }
#else
  vec_sum<float, platform::isa_any>(n, x, s);
#endif
}
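
The three intrinsics after the main loop perform a horizontal reduction of the eight accumulator lanes. A standalone, compilable restatement of that reduction (the helper name is ours; assumes an AVX-capable compiler):

    #include <immintrin.h>

    // Collapse the 8 float lanes of v into one scalar sum, exactly as
    // vec_sum/vec_mul_reduce do above.
    static inline float HSum256(__m256 v) {
      __m256 h = _mm256_hadd_ps(v, v);  // pairwise sums within each 128-bit half
      h = _mm256_add_ps(h, _mm256_permute2f128_ps(h, h, 0x1));  // fold the halves
      float out;
      _mm_store_ss(&out,
                   _mm_hadd_ps(_mm256_castps256_ps128(h),
                               _mm256_castps256_ps128(h)));
      return out;
    }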
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_mul(const size_t n, const T* x, const T* y, T* z) {
  for (size_t i = 0; i < n; ++i) {
    z[i] = x[i] * y[i];
  }
}

template <>
inline void vec_mul<float, platform::avx>(const size_t n,
                                          const float* x,
                                          const float* y,
                                          float* z) {
#ifdef __AVX__
  constexpr unsigned int block = YMM_FLOAT_BLOCK;
  if (n < block) {
    vec_mul<float, platform::isa_any>(n, x, y, z);
    return;
  }

  unsigned int i = 0, end = 0;
  end = n & ~(block - 1);
  for (i = 0; i < end; i += block) {
    _mm256_storeu_ps(
        z + i, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i)));
  }

  for (; i < n; i++) {
    z[i] = x[i] * y[i];
  }
#else
  vec_mul<float, platform::isa_any>(n, x, y, z);
#endif
}

template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_mul_reduce(const size_t n, const T* x, const T* y, T* z) {
  z[0] = x[0] * y[0];
  for (size_t i = 1; i < n; ++i) {
    z[0] += x[i] * y[i];
  }
}

template <>
inline void vec_mul_reduce<float, platform::avx>(const size_t n,
                                                 const float* x,
                                                 const float* y,
                                                 float* z) {
#ifdef __AVX__
  constexpr unsigned int block = YMM_FLOAT_BLOCK;
  if (n < block) {
    vec_mul_reduce<float, platform::isa_any>(n, x, y, z);
    return;
  }

  unsigned int i = 0, end = 0;
  z[0] = 0.f;
  end = n & ~(block - 1);
  __m256 tmp = _mm256_setzero_ps();
  for (i = 0; i < end; i += block) {
    tmp = _mm256_add_ps(
        tmp, _mm256_mul_ps(_mm256_loadu_ps(x + i), _mm256_loadu_ps(y + i)));
  }

  __m256 hsum = _mm256_hadd_ps(tmp, tmp);
  hsum = _mm256_add_ps(hsum, _mm256_permute2f128_ps(hsum, hsum, 0x1));
  _mm_store_ss(
      z,
      _mm_hadd_ps(_mm256_castps256_ps128(hsum), _mm256_castps256_ps128(hsum)));

  for (; i < n; i++) {
    z[0] += x[i] * y[i];
  }
#else
  vec_mul_reduce<float, platform::isa_any>(n, x, y, z);
#endif
}

template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_bias_sub(const int n, const T a, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = a - x[i];
  }
}

template <>
inline void vec_bias_sub<float, platform::avx>(const int n,
                                               const float a,
                                               const float* x,
                                               float* y) {
#ifdef __AVX__
  constexpr int block = YMM_FLOAT_BLOCK;
  if (n < block) {
    vec_bias_sub<float, platform::isa_any>(n, a, x, y);
    return;
  }
  const int rest = n % block;
  const int end = n - rest;
  int i = 0;
  __m256 bias = _mm256_set1_ps(a);
  __m256 tmp;
#define MOVE_ONE_STEP             \
  tmp = _mm256_loadu_ps(x + i);   \
  tmp = _mm256_sub_ps(bias, tmp); \
  _mm256_storeu_ps(y + i, tmp)
  for (i = 0; i < end; i += block) {
    MOVE_ONE_STEP;
  }
#undef MOVE_ONE_STEP
  if (rest == 0) {
    return;
  }
  // can not continue move step if src and dst are inplace
  for (i = n - rest; i < n; ++i) {
    y[i] = a - x[i];
  }
#else
  vec_bias_sub<float, platform::isa_any>(n, a, x, y);
#endif
}

template <>
inline void vec_bias_sub<float, platform::avx2>(const int n,
                                                const float a,
                                                const float* x,
                                                float* y) {
  vec_bias_sub<float, platform::avx>(n, a, x, y);
}

template <>
inline void vec_bias_sub<float, platform::avx512f>(const int n,
                                                   const float a,
                                                   const float* x,
                                                   float* y) {
  // TODO(TJ): enable me
  vec_bias_sub<float, platform::avx2>(n, a, x, y);
}
// out = x*y + (1-x)*z
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_cross(
    const int n, const T* x, const T* y, const T* z, T* out) {
  for (int i = 0; i < n; ++i) {
    out[i] = x[i] * y[i] + (static_cast<T>(1) - x[i]) * z[i];
  }
}

template <>
inline void vec_cross<float, platform::avx>(
    const int n, const float* x, const float* y, const float* z, float* out) {
#ifdef __AVX__
  constexpr int block = YMM_FLOAT_BLOCK;
  if (n < block) {
    vec_cross<float, platform::isa_any>(n, x, y, z, out);
    return;
  }
  const int rest = n % block;
  const int end = n - rest;
  int i = 0;
  __m256 bias = _mm256_set1_ps(1.f);
  __m256 tmpx, tmpy, tmpz;
  for (i = 0; i < end; i += block) {
    tmpx = _mm256_loadu_ps(x + i);
    tmpy = _mm256_loadu_ps(y + i);
    tmpz = _mm256_loadu_ps(z + i);
    tmpy = _mm256_mul_ps(tmpx, tmpy);
    tmpx = _mm256_sub_ps(bias, tmpx);
    tmpz = _mm256_mul_ps(tmpx, tmpz);
    tmpz = _mm256_add_ps(tmpy, tmpz);
    _mm256_storeu_ps(out + i, tmpz);
  }
  if (rest == 0) {
    return;
  }
  // can not continue move step if src and dst are inplace
  for (i = n - rest; i < n; ++i) {
    out[i] = x[i] * y[i] + (1.f - x[i]) * z[i];
  }
#else
  vec_cross<float, platform::isa_any>(n, x, y, z, out);
#endif
}

template <>
inline void vec_cross<float, platform::avx2>(
    const int n, const float* x, const float* y, const float* z, float* out) {
  vec_cross<float, platform::avx>(n, x, y, z, out);
}

template <>
inline void vec_cross<float, platform::avx512f>(
    const int n, const float* x, const float* y, const float* z, float* out) {
  // TODO(TJ): enable me
  vec_cross<float, platform::avx>(n, x, y, z, out);
}

template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_clip(const size_t n, const T a, const T* x, T* y) {
  for (size_t i = 0; i < n; ++i) {
    y[i] = x[i] < a ? a : x[i];
  }
}

template <>
inline void vec_clip<float, platform::avx>(const size_t n,
                                           const float a,
                                           const float* x,
                                           float* y) {
#ifdef __AVX__
  constexpr unsigned int block = YMM_FLOAT_BLOCK;
  if (n < block) {
    vec_clip<float, platform::isa_any>(n, a, x, y);
    return;
  }

  unsigned int i = 0, end = 0;
  end = n & ~(block - 1);
  __m256 threshold = _mm256_set1_ps(a);
  for (i = 0; i < end; i += block) {
    _mm256_storeu_ps(y + i, _mm256_max_ps(_mm256_loadu_ps(x + i), threshold));
  }

  for (; i < n; i++) {
    y[i] = x[i] < a ? a : x[i];
  }
#else
  vec_clip<float, platform::isa_any>(n, a, x, y);
#endif
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_add_bias(const int n, const T a, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = x[i] + a;
  }
}

template <>
inline void vec_add_bias<float, platform::avx>(const int n,
                                               const float a,
                                               const float* x,
                                               float* y) {
#ifdef __AVX__
  constexpr int block = YMM_FLOAT_BLOCK;
  if (n < block) {
    vec_add_bias<float, platform::isa_any>(n, a, x, y);
    return;
  }
  const int rest = n % block;
  const int end = n - rest;
  int i = 0;
  __m256 bias = _mm256_set1_ps(a);
  __m256 tmp;
#define MOVE_ONE_STEP             \
  tmp = _mm256_loadu_ps(x + i);   \
  tmp = _mm256_add_ps(tmp, bias); \
  _mm256_storeu_ps(y + i, tmp)
  for (i = 0; i < end; i += block) {
    MOVE_ONE_STEP;
  }
#undef MOVE_ONE_STEP
  if (rest == 0) {
    return;
  }
  // can not continue move step if src and dst are inplace
  for (i = n - rest; i < n; ++i) {
    y[i] = x[i] + a;
  }
#else
  vec_add_bias<float, platform::isa_any>(n, a, x, y);
#endif
}

template <>
inline void vec_add_bias<float, platform::avx2>(const int n,
                                                const float a,
                                                const float* x,
                                                float* y) {
  vec_add_bias<float, platform::avx>(n, a, x, y);
}

template <>
inline void vec_add_bias<float, platform::avx512f>(const int n,
                                                   const float a,
                                                   const float* x,
                                                   float* y) {
  // TODO(TJ): enable me
  vec_add_bias<float, platform::avx2>(n, a, x, y);
}

template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_identity(const int n, const T* x, T* y) {
  // do nothing
  return;
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_sigmoid(const int n, const T* x, T* y) {
  const T min = SIGMOID_THRESHOLD_MIN;
  const T max = SIGMOID_THRESHOLD_MAX;
  for (int i = 0; i < n; ++i) {
    y[i] = (x[i] < min) ? min : ((x[i] > max) ? max : x[i]);
    y[i] = static_cast<T>(0) - y[i];
  }
  vec_exp<T>(n, y, y);
  for (int i = 0; i < n; ++i) {
    y[i] = static_cast<T>(1) / (static_cast<T>(1) + y[i]);
  }
}
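
In other words, the clamp, negate, vec_exp, and reciprocal steps above compute a numerically guarded sigmoid:

    $y_i = \sigma(\mathrm{clip}(x_i)) = \dfrac{1}{1 + e^{-\mathrm{clip}(x_i,\,-40,\,13)}}$

Clamping the input to [SIGMOID_THRESHOLD_MIN, SIGMOID_THRESHOLD_MAX] = [-40, 13] (defined at the top of this header) keeps exp() away from overflow and underflow while leaving the result indistinguishable from the unclamped sigmoid in float precision.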
template <>
inline void vec_sigmoid<float, platform::avx>(const int n,
                                              const float* x,
                                              float* y) {
#ifdef __AVX__
  constexpr int block = YMM_FLOAT_BLOCK;
  if (n < block) {
    vec_sigmoid<float, platform::isa_any>(n, x, y);
    return;
  }
  const int rest = n % block;
  const int end = n - rest;
  int i = 0;
  __m256 max = _mm256_set1_ps(SIGMOID_THRESHOLD_MAX);
  __m256 min = _mm256_set1_ps(SIGMOID_THRESHOLD_MIN);
  __m256 zeros = _mm256_setzero_ps();
  __m256 tmp;
#define MOVE_ONE_STEP              \
  tmp = _mm256_loadu_ps(x + i);    \
  tmp = _mm256_max_ps(tmp, min);   \
  tmp = _mm256_min_ps(tmp, max);   \
  tmp = _mm256_sub_ps(zeros, tmp); \
  _mm256_storeu_ps(y + i, tmp)
  for (i = 0; i < end; i += block) {
    MOVE_ONE_STEP;
  }
#undef MOVE_ONE_STEP
  if (rest != 0) {
    // can not continue move step since the src and dst address could be equal
    const float xmin = SIGMOID_THRESHOLD_MIN;
    const float xmax = SIGMOID_THRESHOLD_MAX;
    for (i = n - rest; i < n; ++i) {
      y[i] = 0.f - ((x[i] < xmin) ? xmin : ((x[i] > xmax) ? xmax : x[i]));
    }
  }
  vec_exp<float>(n, y, y);
  __m256 ones = _mm256_set1_ps(1.0f);
#define MOVE_ONE_STEP             \
  tmp = _mm256_loadu_ps(y + i);   \
  tmp = _mm256_add_ps(ones, tmp); \
  tmp = _mm256_div_ps(ones, tmp); \
  _mm256_storeu_ps(y + i, tmp)
  for (i = 0; i < end; i += block) {
    MOVE_ONE_STEP;
  }
#undef MOVE_ONE_STEP
  if (rest == 0) {
    return;
  }
  // can not continue move step
  for (i = n - rest; i < n; ++i) {
    y[i] = 1.f / (1.f + y[i]);
  }
#else
  vec_sigmoid<float, platform::isa_any>(n, x, y);
#endif
}

template <>
inline void vec_sigmoid<float, platform::avx2>(const int n,
                                               const float* x,
                                               float* y) {
  vec_sigmoid<float, platform::avx>(n, x, y);
}

template <>
inline void vec_sigmoid<float, platform::avx512f>(const int n,
                                                  const float* x,
                                                  float* y) {
  // TODO(TJ): enable me
  vec_sigmoid<float, platform::avx2>(n, x, y);
}
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_tanh(const int n, const T* x, T* y) {
  vec_scal<T, isa>(n, static_cast<T>(2), x, y);
  vec_sigmoid<T, isa>(n, y, y);
  vec_scal<T>(n, static_cast<T>(2), y);
  vec_add_bias<T, isa>(n, static_cast<T>(-1), y, y);
}
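
The four calls implement the standard identity

    $\tanh(x) = 2\,\sigma(2x) - 1$

so tanh reuses the vectorized sigmoid above: scale the input by 2, apply sigmoid, scale the result by 2, then subtract 1 via vec_add_bias.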
// TODO(TJ): make relu clip
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
inline void vec_relu(const int n, const T* x, T* y) {
  for (int i = 0; i < n; ++i) {
    y[i] = x[i] > 0 ? x[i] : 0;
  }
}

template <>
inline void vec_relu<float, platform::avx>(const int n,
                                           const float* x,
                                           float* y) {
#ifdef __AVX__
  constexpr int block = YMM_FLOAT_BLOCK;
  if (n < block * 4) {
    vec_relu<float, platform::isa_any>(n, x, y);
    return;
  }

  const int rest = n % block;
  const int end = n - rest;
  int i = 0;
  __m256 zeros = _mm256_setzero_ps();
  __m256 tmp;
#define MOVE_ONE_STEP              \
  tmp = _mm256_loadu_ps(x + i);    \
  tmp = _mm256_max_ps(tmp, zeros); \
  _mm256_storeu_ps(y + i, tmp)
  for (i = 0; i < end; i += block) {
    MOVE_ONE_STEP;
  }
  if (rest == 0) {
    return;
  }
  i = n - block;
  MOVE_ONE_STEP;
#undef MOVE_ONE_STEP
#else
  vec_relu<float, platform::isa_any>(n, x, y);
#endif
}
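
Unlike the other AVX kernels in this header, vec_relu finishes the remainder with one more full vector step at i = n - block, overlapping elements that were already written; that is safe because max(x, 0) is idempotent, so recomputing an element yields the same value. A self-contained restatement of the idiom (the function name is ours; assumes n >= 8 and AVX):

    #include <immintrin.h>

    void ReluOverlappingTail(int n, const float* x, float* y) {
      const __m256 zeros = _mm256_setzero_ps();
      int i = 0;
      for (; i + 8 <= n; i += 8) {
        _mm256_storeu_ps(y + i, _mm256_max_ps(_mm256_loadu_ps(x + i), zeros));
      }
      if (i < n) {  // redo the last full block instead of a scalar tail loop
        _mm256_storeu_ps(y + n - 8,
                         _mm256_max_ps(_mm256_loadu_ps(x + n - 8), zeros));
      }
    }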
template <>
inline void vec_relu<float, platform::avx2>(const int n,
                                            const float* x,
                                            float* y) {
  vec_relu<float, platform::avx>(n, x, y);
}

template <>
inline void vec_relu<float, platform::avx512f>(const int n,
                                               const float* x,
                                               float* y) {
  // TODO(TJ): enable me
  vec_relu<float, platform::avx2>(n, x, y);
}
// TODO(TJ): optimize double of sigmoid, tanh and relu if necessary
template <typename T, platform::cpu_isa_t isa = platform::isa_any>
class VecActivations {
 public:
  std::function<void(const int, const T*, T*)> operator()(
      const std::string& type) {
    if (type == "sigmoid") {
      return vec_sigmoid<T, isa>;
    } else if (type == "relu") {
      return vec_relu<T, isa>;
    } else if (type == "tanh") {
      return vec_tanh<T, isa>;
    } else if (type == "identity" || type == "") {
      return vec_identity<T, isa>;
    }
    PADDLE_THROW(platform::errors::InvalidArgument(
        "Expected type should be one of sigmoid, relu, tanh, identity. But got "
        "not support type: %s.",
        type));
  }
};

}  // namespace math
}  // namespace operators
}  // namespace paddle
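
Before deletion, a typical consumer of this header looked like the following (the harness is hypothetical; the class and kernels are from the file above). With this commit the header is gone and its callers were migrated elsewhere in the same change.

    #include <vector>
    #include "paddle/fluid/operators/math/cpu_vec.h"

    void Demo() {
      paddle::operators::math::VecActivations<float> act;
      auto sigmoid = act("sigmoid");  // std::function<void(int, const float*, float*)>
      std::vector<float> in(16, 0.5f), out(16);
      sigmoid(static_cast<int>(in.size()), in.data(), out.data());
    }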
paddle/fluid/operators/sample_logits_op.cu
@@ -21,9 +21,9 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/math/sample_prob.h"
-#include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/fluid/operators/sample_logits_op.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/softmax.h"

 namespace paddle {
 namespace operators {
...

paddle/fluid/operators/sample_logits_op.h
@@ -21,8 +21,8 @@ limitations under the License. */
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/tensor_util.h"
 #include "paddle/fluid/operators/math/sample_prob.h"
-#include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/softmax.h"

 namespace paddle {
 namespace operators {
...

paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/softmax.h"

 namespace paddle {
 namespace operators {
...
@@ -61,7 +61,7 @@ class SequenceSoftmaxCUDNNKernel : public framework::OpKernel<T> {
           phi::make_ddim({1UL, end_pos - start_pos});
       x_i.Resize(dims_i);
       out_i.Resize(dims_i);
-      math::SoftmaxCUDNNFunctor<T, phi::GPUContext>()(
+      phi::funcs::SoftmaxCUDNNFunctor<T, phi::GPUContext>()(
           ctx.template device_context<phi::GPUContext>(), &x_i, &out_i);
     }
   }
...
@@ -95,7 +95,7 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
       out_i.Resize(dims_i);
       out_grad_i.Resize(dims_i);
       x_grad_i.Resize(dims_i);
-      math::SoftmaxGradCUDNNFunctor<T, phi::GPUContext>()(
+      phi::funcs::SoftmaxGradCUDNNFunctor<T, phi::GPUContext>()(
           ctx.template device_context<phi::GPUContext>(),
           &out_i,
           &out_grad_i,
...

paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc
@@ -16,10 +16,10 @@ limitations under the License. */
 #include <string>

 #include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
 #include "paddle/phi/kernels/funcs/cross_entropy.h"
+#include "paddle/phi/kernels/funcs/softmax.h"

 namespace paddle {
 namespace operators {
...

paddle/fluid/platform/CMakeLists.txt
@@ -56,18 +56,10 @@ cc_test(
   SRCS enforce_test.cc
   DEPS enforce)

-set(CPU_INFO_DEPS gflags glog enforce)
-if(WITH_XBYAK)
-  list(APPEND CPU_INFO_DEPS xbyak)
-endif()
-cc_library(
-  cpu_info
-  SRCS cpu_info.cc
-  DEPS ${CPU_INFO_DEPS})
 cc_test(
   cpu_info_test
   SRCS cpu_info_test.cc
-  DEPS cpu_info)
+  DEPS phi_backends)

 cc_library(
   os_info
   SRCS os_info.cc
...
@@ -194,7 +186,6 @@ cc_library(
   phi_place
   eigen3
   cpu_helper
-  cpu_info
   framework_proto
   ${IPU_CTX_DEPS}
   ${GPU_CTX_DEPS}
...

paddle/fluid/platform/cpu_info.h
deleted (100644 → 0)
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stddef.h>
#ifdef _WIN32
#if defined(__AVX2__)
#include <immintrin.h> // avx2
#elif defined(__AVX__)
#include <intrin.h> // avx
#endif // AVX
#else // WIN32
#ifdef __AVX__
#include <immintrin.h>
#endif
#endif // WIN32
#if defined(_WIN32)
#define ALIGN32_BEG __declspec(align(32))
#define ALIGN32_END
#else
#define ALIGN32_BEG
#define ALIGN32_END __attribute__((aligned(32)))
#endif // _WIN32
#ifndef PADDLE_WITH_XBYAK
#ifdef _WIN32
#define cpuid(reg, x) __cpuidex(reg, x, 0)
#else
#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) && \
    !defined(PADDLE_WITH_SW) && !defined(PADDLE_WITH_MIPS)
#include <cpuid.h>
inline void cpuid(int reg[4], int x) {
  __cpuid_count(x, 0, reg[0], reg[1], reg[2], reg[3]);
}
#endif
#endif
#endif
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace paddle {
namespace platform {

size_t CpuTotalPhysicalMemory();

//! Get the maximum allocation size for a machine.
size_t CpuMaxAllocSize();

//! Get the maximum allocation size for a machine.
size_t CUDAPinnedMaxAllocSize();

using phi::backends::cpu::CpuMinChunkSize;

//! Get the maximum chunk size for buddy allocator.
size_t CpuMaxChunkSize();

//! Get the minimum chunk size for buddy allocator.
size_t CUDAPinnedMinChunkSize();

//! Get the maximum chunk size for buddy allocator.
size_t CUDAPinnedMaxChunkSize();

//! Get the maximum allocation size for a machine.
size_t NPUPinnedMaxAllocSize();

//! Get the minimum chunk size for buddy allocator.
size_t NPUPinnedMinChunkSize();

//! Get the maximum chunk size for buddy allocator.
size_t NPUPinnedMaxChunkSize();

using namespace phi::backends::cpu;  // NOLINT

// May I use some instruction
bool MayIUse(const cpu_isa_t cpu_isa);

}  // namespace platform
}  // namespace paddle
paddle/fluid/platform/cpu_info_test.cc
@@ -11,7 +11,7 @@
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
-#include "paddle/fluid/platform/cpu_info.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 #include <sstream>
...
@@ -23,7 +23,8 @@ DECLARE_double(fraction_of_cpu_memory_to_use);
 TEST(CpuMemoryUsage, Print) {
   std::stringstream ss;
-  size_t memory_size = paddle::platform::CpuMaxAllocSize() / 1024 / 1024 / 1024;
+  size_t memory_size =
+      phi::backends::cpu::CpuMaxAllocSize() / 1024 / 1024 / 1024;
   float use_percent = FLAGS_fraction_of_cpu_memory_to_use * 100;

   std::cout << paddle::string::Sprintf("\n%.2f %% of CPU Memory Usage: %d GB\n",
...

paddle/fluid/platform/init.cc
@@ -16,9 +16,9 @@ limitations under the License. */
 #include <string>

 #include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/device/npu/npu_info.h"
 #include "paddle/fluid/string/split.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"

 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 #include "paddle/fluid/platform/cuda_device_guard.h"
 #endif
...

paddle/fluid/platform/profiler/CMakeLists.txt
@@ -29,7 +29,7 @@ cc_library(
 cc_library(
   cpu_utilization
   SRCS cpu_utilization.cc
-  DEPS cpu_info os_info enforce glog)
+  DEPS phi_backends os_info enforce glog)

 cc_library(
   new_profiler
   SRCS profiler.cc
...

paddle/fluid/pybind/parallel_executor.cc
@@ -72,7 +72,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
 #include "paddle/fluid/operators/py_func_op.h"
 #include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
...
@@ -89,6 +88,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/eager.h"
 #include "paddle/fluid/pybind/imperative.h"
 #include "paddle/fluid/pybind/io.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/lod_utils.h"
 #include "paddle/utils/none.h"
...

paddle/fluid/pybind/place.cc
@@ -72,7 +72,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
 #include "paddle/fluid/operators/py_func_op.h"
 #include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
...
@@ -89,6 +88,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/eager.h"
 #include "paddle/fluid/pybind/imperative.h"
 #include "paddle/fluid/pybind/io.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/lod_utils.h"
 #include "paddle/utils/none.h"
...

paddle/fluid/pybind/pybind.cc
@@ -75,7 +75,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/ops_extra_info.h"
 #include "paddle/fluid/operators/py_func_op.h"
 #include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
...
@@ -94,6 +93,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/io.h"
 #include "paddle/fluid/pybind/jit.h"
 #include "paddle/fluid/pybind/xpu_streams_py.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/lod_utils.h"
 #include "paddle/utils/none.h"
...
@@ -327,7 +327,7 @@ bool SupportsBfloat16() {
 #ifndef PADDLE_WITH_MKLDNN
   return false;
 #else
-  if (platform::MayIUse(platform::cpu_isa_t::avx512_core))
+  if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_core))
     return true;
   else
     return false;
...
@@ -338,7 +338,7 @@ bool SupportsBfloat16FastPerformance() {
 #ifndef PADDLE_WITH_MKLDNN
   return false;
 #else
-  if (platform::MayIUse(platform::cpu_isa_t::avx512_bf16))
+  if (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512_bf16))
     return true;
   else
     return false;
...
@@ -349,8 +349,8 @@ bool SupportsInt8() {
 #ifndef PADDLE_WITH_MKLDNN
   return false;
 #else
-  return (platform::MayIUse(platform::cpu_isa_t::avx2) ||
-          platform::MayIUse(platform::cpu_isa_t::avx512f));
+  return (phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx2) ||
+          phi::backends::cpu::MayIUse(phi::backends::cpu::cpu_isa_t::avx512f));
 #endif
 }
...
@@ -358,7 +358,8 @@ bool SupportsVNNI() {
 #ifndef PADDLE_WITH_MKLDNN
   return false;
 #else
-  return platform::MayIUse(platform::cpu_isa_t::avx512_core_vnni);
+  return phi::backends::cpu::MayIUse(
+      phi::backends::cpu::cpu_isa_t::avx512_core_vnni);
 #endif
 }
...
@@ -615,7 +616,7 @@ PYBIND11_MODULE(libpaddle, m) {
   BindJit(&m);
   // Not used, just make sure cpu_info.cc is linked.
-  paddle::platform::CpuTotalPhysicalMemory();
+  phi::backends::cpu::CpuTotalPhysicalMemory();
   paddle::memory::allocation::UseAllocatorStrategyGFlag();
...

paddle/fluid/pybind/tensor.cc
@@ -72,7 +72,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/common_infer_shape_functions.h"
 #include "paddle/fluid/operators/py_func_op.h"
 #include "paddle/fluid/platform/cpu_helper.h"
-#include "paddle/fluid/platform/cpu_info.h"
 #include "paddle/fluid/platform/device/device_wrapper.h"
 #include "paddle/fluid/platform/device_context.h"
 #include "paddle/fluid/platform/dynload/dynamic_loader.h"
...
@@ -89,6 +88,7 @@ limitations under the License. */
 #include "paddle/fluid/pybind/eager.h"
 #include "paddle/fluid/pybind/imperative.h"
 #include "paddle/fluid/pybind/io.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/core/compat/convert_utils.h"
 #include "paddle/phi/core/lod_utils.h"
 #include "paddle/utils/none.h"
...

paddle/phi/backends/CMakeLists.txt
 add_subdirectory(dynload)
 add_subdirectory(gpu)

-set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc)
+set(BACKENDS_SRCS all_context.cc cpu/cpu_context.cc cpu/cpu_info.cc)
 set(BACKENDS_DEPS enforce place flags eigen3 phi_device_context)
+if(WITH_XBYAK)
+  list(APPEND BACKENDS_DEPS xbyak)
+endif()

 if(WITH_GPU OR WITH_ROCM)
   list(APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc
...

paddle/fluid/platform/cpu_info.cc → paddle/phi/backends/cpu/cpu_info.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.

 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
...
@@ -12,11 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/platform/cpu_info.h"
-
-#ifdef PADDLE_WITH_XBYAK
-#include "xbyak/xbyak_util.h"
-#endif
+#include "paddle/phi/backends/cpu/cpu_info.h"

 #ifdef __APPLE__
 #include <sys/sysctl.h>
...
@@ -30,6 +26,10 @@ limitations under the License. */
 #include <unistd.h>
 #endif  // _WIN32

+#ifdef PADDLE_WITH_XBYAK
+#include "xbyak/xbyak_util.h"
+#endif
+
 #include <algorithm>

 #include "paddle/phi/core/flags.h"
...
@@ -47,8 +47,9 @@ PADDLE_DEFINE_EXPORTED_bool(use_pinned_memory,
                             true,
                             "If set, allocate cpu pinned memory.");

-namespace paddle {
-namespace platform {
+namespace phi {
+namespace backends {
+namespace cpu {

 size_t CpuTotalPhysicalMemory() {
 #ifdef __APPLE__
...
@@ -87,6 +88,11 @@ size_t CpuMaxChunkSize() {
       static_cast<size_t>(FLAGS_initial_cpu_memory_in_mb * 1 << 20));
 }

+size_t CpuMinChunkSize() {
+  // Allow to allocate the minimum chunk size is 4 KB.
+  return 1 << 12;
+}
+
 size_t CUDAPinnedMaxAllocSize() {
   // For distributed systems, it requires configuring and limiting
   // the fraction of memory to use.
...
@@ -206,5 +212,6 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
 }
 #endif

-}  // namespace platform
-}  // namespace paddle
+}  // namespace cpu
+}  // namespace backends
+}  // namespace phi
paddle/phi/backends/cpu/cpu_info.h
@@ -36,15 +36,52 @@
 #define ALIGN32_END __attribute__((aligned(32)))
 #endif  // _WIN32

+#ifndef PADDLE_WITH_XBYAK
+#ifdef _WIN32
+#define cpuid(reg, x) __cpuidex(reg, x, 0)
+#else
+#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) && \
+    !defined(PADDLE_WITH_SW) && !defined(PADDLE_WITH_MIPS)
+#include <cpuid.h>
+inline void cpuid(int reg[4], int x) {
+  __cpuid_count(x, 0, reg[0], reg[1], reg[2], reg[3]);
+}
+#endif
+#endif
+#endif
+
 namespace phi {
 namespace backends {
 namespace cpu {

+size_t CpuTotalPhysicalMemory();
+
+//! Get the maximum allocation size for a machine.
+size_t CpuMaxAllocSize();
+
+//! Get the maximum allocation size for a machine.
+size_t CUDAPinnedMaxAllocSize();
+
 //! Get the minimum chunk size for buddy allocator.
-inline size_t CpuMinChunkSize() {
-  // Allow to allocate the minimum chunk size is 4 KB.
-  return 1 << 12;
-}
+size_t CpuMinChunkSize();
+
+//! Get the maximum chunk size for buddy allocator.
+size_t CpuMaxChunkSize();
+
+//! Get the minimum chunk size for buddy allocator.
+size_t CUDAPinnedMinChunkSize();
+
+//! Get the maximum chunk size for buddy allocator.
+size_t CUDAPinnedMaxChunkSize();
+
+//! Get the maximum allocation size for a machine.
+size_t NPUPinnedMaxAllocSize();
+
+//! Get the minimum chunk size for buddy allocator.
+size_t NPUPinnedMinChunkSize();
+
+//! Get the maximum chunk size for buddy allocator.
+size_t NPUPinnedMaxChunkSize();
+
 typedef enum {
   isa_any,
...
@@ -59,6 +96,8 @@ typedef enum {
   avx512_bf16,
 } cpu_isa_t;  // Instruction set architecture

+// May I use some instruction
+bool MayIUse(const cpu_isa_t cpu_isa);
+
 }  // namespace cpu
 }  // namespace backends
 }  // namespace phi
paddle/phi/kernels/funcs/CMakeLists.txt
@@ -19,6 +19,7 @@ math_library(matrix_solve DEPS dense_tensor eigen3 blas math_function)
 math_library(cross_entropy)
 math_library(im2col)
 math_library(vol2col)
+math_library(softmax DEPS math_function)

 cc_library(
   phi_data_layout_transform
...

paddle/phi/kernels/funcs/eigen/common.h
@@ -17,6 +17,7 @@ limitations under the License. */
 #include <stdint.h>

 #include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/eigen/extensions.h"
 #include "unsupported/Eigen/CXX11/Tensor"

 namespace phi {
...

paddle/fluid/operators/math/softmax.cc → paddle/phi/kernels/funcs/softmax.cc
@@ -12,20 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

-#include "paddle/fluid/operators/math/softmax.h"
+#include "paddle/phi/kernels/funcs/softmax.h"

-#include "paddle/fluid/operators/math/softmax_impl.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/kernels/funcs/softmax_impl.h"

-namespace paddle {
-namespace operators {
-namespace math {
+namespace phi {
+namespace funcs {

 template class SoftmaxFunctor<phi::CPUContext, float>;
 template class SoftmaxFunctor<phi::CPUContext, double>;
 template class SoftmaxGradFunctor<phi::CPUContext, float>;
 template class SoftmaxGradFunctor<phi::CPUContext, double>;

-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
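
Call sites change namespace only; the functor interface carries over. A sketch of the migration at a hypothetical call site (written as comments; the argument list is our assumption based on softmax.h at this commit, with axis_dim the length of the softmax axis):

    // before:
    //   paddle::operators::math::SoftmaxFunctor<phi::CPUContext, float>()(
    //       dev_ctx, axis_dim, &x, &out);
    // after:
    //   phi::funcs::SoftmaxFunctor<phi::CPUContext, float>()(
    //       dev_ctx, axis_dim, &x, &out);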
paddle/fluid/operators/math/softmax.cu → paddle/phi/kernels/funcs/softmax.cu
Browse file @ 344b99e1
@@ -13,20 +13,19 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <vector>
 
-#include "paddle/fluid/operators/math/softmax.h"
-#include "paddle/fluid/operators/math/softmax_impl.h"
-#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/backends/gpu/gpu_dnn.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/softmax.h"
+#include "paddle/phi/kernels/funcs/softmax_impl.h"
 
-namespace paddle {
-namespace operators {
-namespace math {
+namespace phi {
+namespace funcs {
 
-using ScopedTensorDescriptor = platform::ScopedTensorDescriptor;
-using DataLayout = platform::DataLayout;
+using ScopedTensorDescriptor = phi::backends::gpu::ScopedTensorDescriptor;
+using DataLayout = phi::backends::gpu::DataLayout;
 template <typename T>
-using CudnnDataType = platform::CudnnDataType<T>;
+using CudnnDataType = phi::backends::gpu::CudnnDataType<T>;
 
 template <typename T, typename DeviceContext>
 void SoftmaxCUDNNFunctor<T, DeviceContext>::operator()(
@@ -51,31 +50,31 @@ void SoftmaxCUDNNFunctor<T, DeviceContext>::operator()(
       xDesc.descriptor<T>(layout, cudnn_tensor_dims);
   miopenTensorDescriptor_t cudnn_y_desc =
       xDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxForward_V2(
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSoftmaxForward_V2(
       context.cudnn_handle(),
       CudnnDataType<T>::kOne(),
       cudnn_x_desc,
       X->data<T>(),
       CudnnDataType<T>::kZero(),
       cudnn_y_desc,
-      Y->mutable_data<T>(context.GetPlace()),
+      context.template Alloc<T>(Y),
       MIOPEN_SOFTMAX_ACCURATE,
       MIOPEN_SOFTMAX_MODE_INSTANCE));
 #else
   cudnnTensorDescriptor_t cudnn_x_desc =
       xDesc.descriptor<T>(layout, cudnn_tensor_dims);
   cudnnTensorDescriptor_t cudnn_y_desc =
       xDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxForward(
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSoftmaxForward(
       context.cudnn_handle(),
       CUDNN_SOFTMAX_ACCURATE,
       CUDNN_SOFTMAX_MODE_INSTANCE,
       CudnnDataType<T>::kOne(),
       cudnn_x_desc,
       X->data<T>(),
       CudnnDataType<T>::kZero(),
       cudnn_y_desc,
-      Y->mutable_data<T>(context.GetPlace())));
+      context.template Alloc<T>(Y)));
 #endif
 }
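Beyond the namespace renames, the recurring substitution in this file is the output-allocation call. A minimal before/after sketch (Y and context as in the hunks above):

// Before (fluid): allocate through the tensor, naming a Place explicitly.
//   T* y = Y->mutable_data<T>(context.GetPlace());
// After (phi): allocate through the device context; the "template" keyword
// is required because Alloc is a member template called on a dependent type.
T* y = context.template Alloc<T>(Y);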
@@ -106,18 +105,18 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
       dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
   miopenTensorDescriptor_t cudnn_ygrad_desc =
       dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::miopenSoftmaxBackward_V2(
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::miopenSoftmaxBackward_V2(
       context.cudnn_handle(),
       CudnnDataType<T>::kOne(),
       cudnn_y_desc,
       Y->data<T>(),
       cudnn_ygrad_desc,
       YGrad->data<T>(),
       CudnnDataType<T>::kZero(),
       cudnn_xgrad_desc,
-      XGrad->mutable_data<T>(context.GetPlace()),
+      context.template Alloc<T>(XGrad),
       MIOPEN_SOFTMAX_ACCURATE,
       MIOPEN_SOFTMAX_MODE_INSTANCE));
 #else
   cudnnTensorDescriptor_t cudnn_y_desc =
       yDesc.descriptor<T>(layout, cudnn_tensor_dims);
@@ -125,28 +124,28 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
       dxDesc.descriptor<T>(layout, cudnn_tensor_dims);
   cudnnTensorDescriptor_t cudnn_ygrad_desc =
       dyDesc.descriptor<T>(layout, cudnn_tensor_dims);
-  PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::cudnnSoftmaxBackward(
+  PADDLE_ENFORCE_GPU_SUCCESS(phi::dynload::cudnnSoftmaxBackward(
       context.cudnn_handle(),
       CUDNN_SOFTMAX_ACCURATE,
       CUDNN_SOFTMAX_MODE_INSTANCE,
       CudnnDataType<T>::kOne(),
       cudnn_y_desc,
       Y->data<T>(),
       cudnn_ygrad_desc,
       YGrad->data<T>(),
       CudnnDataType<T>::kZero(),
       cudnn_xgrad_desc,
-      XGrad->mutable_data<T>(context.GetPlace())));
+      context.template Alloc<T>(XGrad)));
 #endif
 }
 
 template class SoftmaxCUDNNFunctor<float, phi::GPUContext>;
-template class SoftmaxCUDNNFunctor<platform::float16, phi::GPUContext>;
+template class SoftmaxCUDNNFunctor<phi::dtype::float16, phi::GPUContext>;
 template class SoftmaxGradCUDNNFunctor<float, phi::GPUContext>;
-template class SoftmaxGradCUDNNFunctor<platform::float16, phi::GPUContext>;
+template class SoftmaxGradCUDNNFunctor<phi::dtype::float16, phi::GPUContext>;
 #if CUDNN_VERSION_MIN(8, 1, 0)
-template class SoftmaxCUDNNFunctor<platform::bfloat16, phi::GPUContext>;
-template class SoftmaxGradCUDNNFunctor<platform::bfloat16, phi::GPUContext>;
+template class SoftmaxCUDNNFunctor<phi::dtype::bfloat16, phi::GPUContext>;
+template class SoftmaxGradCUDNNFunctor<phi::dtype::bfloat16, phi::GPUContext>;
 #endif
 
 // MIOPEN do not support double
@@ -155,15 +154,14 @@ template class SoftmaxCUDNNFunctor<double, phi::GPUContext>;
 template class SoftmaxGradCUDNNFunctor<double, phi::GPUContext>;
 #endif
 
-template class SoftmaxFunctor<phi::GPUContext, platform::float16>;
-template class SoftmaxFunctor<phi::GPUContext, platform::bfloat16>;
+template class SoftmaxFunctor<phi::GPUContext, phi::dtype::float16>;
+template class SoftmaxFunctor<phi::GPUContext, phi::dtype::bfloat16>;
 template class SoftmaxFunctor<phi::GPUContext, float>;
 template class SoftmaxFunctor<phi::GPUContext, double>;
 template class SoftmaxGradFunctor<phi::GPUContext, float>;
 template class SoftmaxGradFunctor<phi::GPUContext, double>;
-template class SoftmaxGradFunctor<phi::GPUContext, platform::float16>;
-template class SoftmaxGradFunctor<phi::GPUContext, platform::bfloat16>;
+template class SoftmaxGradFunctor<phi::GPUContext, phi::dtype::float16>;
+template class SoftmaxGradFunctor<phi::GPUContext, phi::dtype::bfloat16>;
 
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
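Taken together, the renames in softmax.cc and softmax.cu follow a small dictionary that the remaining files below apply as well:

paddle::operators::math → phi::funcs
platform::float16 / platform::bfloat16 → phi::dtype::float16 / phi::dtype::bfloat16
platform::dynload::* → phi::dynload::*
platform::ScopedTensorDescriptor / DataLayout / CudnnDataType<T> → phi::backends::gpu::*
Y->mutable_data<T>(context.GetPlace()) → context.template Alloc<T>(Y)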
paddle/fluid/operators/math/softmax.h → paddle/phi/kernels/funcs/softmax.h
Browse file @ 344b99e1
@@ -13,11 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #pragma once
-#include "paddle/fluid/framework/tensor.h"
+#include "paddle/phi/core/dense_tensor.h"
 
-namespace paddle {
-namespace operators {
-namespace math {
+namespace phi {
+namespace funcs {
 
 template <typename DeviceContext, typename T, typename Enable = void>
 class SoftmaxFunctor {
@@ -58,6 +57,5 @@ class SoftmaxGradCUDNNFunctor {
 #endif
 
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
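The third "Enable" template parameter on SoftmaxFunctor is what lets softmax_impl.h (the next file) provide a CPU-only partial specialization. A sketch of the mechanism; the enable_if_CPU alias is an assumption here, since the diff shows only its use in the hunk headers below, not its definition:

#include <type_traits>

// Assumed definition of the SFINAE alias used by the specializations.
template <typename DeviceContext>
using enable_if_CPU = typename std::enable_if<
    std::is_same<DeviceContext, phi::CPUContext>::value>::type;

// Primary template: the generic Eigen implementation, any device.
template <typename DeviceContext, typename T, typename Enable = void>
class SoftmaxFunctor;

// Chosen only when DeviceContext is the CPU context, which is where the
// AVX fast path shown later can be dispatched.
template <typename DeviceContext, typename T>
class SoftmaxFunctor<DeviceContext, T, enable_if_CPU<DeviceContext>>;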
paddle/fluid/operators/math/softmax_impl.h → paddle/phi/kernels/funcs/softmax_impl.h
Browse file @ 344b99e1
@@ -15,24 +15,22 @@ limitations under the License. */
 #pragma once
 #include <vector>
 
-#include "paddle/fluid/framework/eigen.h"
-#include "paddle/fluid/framework/tensor.h"
-#include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/operators/math/cpu_vec.h"
-#include "paddle/fluid/platform/bfloat16.h"
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/float16.h"
 #include "paddle/phi/backends/cpu/cpu_context.h"
+#include "paddle/phi/backends/cpu/cpu_info.h"
 #include "paddle/phi/backends/gpu/gpu_context.h"
+#include "paddle/phi/common/bfloat16.h"
+#include "paddle/phi/common/float16.h"
+#include "paddle/phi/core/dense_tensor.h"
+#include "paddle/phi/kernels/funcs/cpu_vec.h"
+#include "paddle/phi/kernels/funcs/eigen/common.h"
 
-namespace paddle {
-namespace operators {
-namespace math {
+namespace phi {
+namespace funcs {
 
 template <typename T,
           int MajorType = Eigen::RowMajor,
           typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+using EigenMatrix = phi::EigenMatrix<T, MajorType, IndexType>;
 
 template <typename T>
 struct ValueClip {
@@ -104,7 +102,7 @@ class SoftmaxEigen {
 };
 
 template <typename DeviceContext>
-class SoftmaxEigen<DeviceContext, platform::float16> {
+class SoftmaxEigen<DeviceContext, phi::dtype::float16> {
  public:
   void operator()(const DeviceContext& context,
                   const int axis_dim,
@@ -114,8 +112,8 @@ class SoftmaxEigen<DeviceContext, platform::float16> {
     constexpr int kClassDim = 1;
     constexpr int kAxisDim = 1;
 
-    auto logits = EigenMatrix<platform::float16>::From(*X);
-    auto softmax = EigenMatrix<platform::float16>::From(*Y);
+    auto logits = EigenMatrix<phi::dtype::float16>::From(*X);
+    auto softmax = EigenMatrix<phi::dtype::float16>::From(*Y);
 
     const int batch_size = logits.dimension(kBatchDim);
     const int num_classes = logits.dimension(kClassDim);
@@ -139,7 +137,7 @@ class SoftmaxEigen<DeviceContext, platform::float16> {
           (logits -
            logits.maximum(along_axis)
                .reshape(batch_by_one)
                .broadcast(one_by_class))
-              .unaryExpr(ValueClip<platform::float16>());
+              .unaryExpr(ValueClip<phi::dtype::float16>());
     } else {
       // axis != -1, class dimension split into (axis, remain), max and sum
       // should be calculated along axis dimension
@@ -149,7 +147,7 @@ class SoftmaxEigen<DeviceContext, platform::float16> {
               .reshape(batch_one_remain)
               .broadcast(one_axis_one)
               .reshape(batch_classes))
-              .unaryExpr(ValueClip<platform::float16>());
+              .unaryExpr(ValueClip<phi::dtype::float16>());
     }
 
     softmax.device(*context.eigen_device()) = softmax.exp();
@@ -162,7 +160,7 @@ class SoftmaxEigen<DeviceContext, platform::float16> {
 };
 
 template <typename DeviceContext>
-class SoftmaxEigen<DeviceContext, platform::bfloat16> {
+class SoftmaxEigen<DeviceContext, phi::dtype::bfloat16> {
  public:
   void operator()(const DeviceContext& context,
                   const int axis_dim,
@@ -172,8 +170,8 @@ class SoftmaxEigen<DeviceContext, platform::bfloat16> {
     constexpr int kClassDim = 1;
     constexpr int kAxisDim = 1;
 
-    auto logits = EigenMatrix<platform::bfloat16>::From(*X);
-    auto softmax = EigenMatrix<platform::bfloat16>::From(*Y);
+    auto logits = EigenMatrix<phi::dtype::bfloat16>::From(*X);
+    auto softmax = EigenMatrix<phi::dtype::bfloat16>::From(*Y);
 
     const int batch_size = logits.dimension(kBatchDim);
     const int num_classes = logits.dimension(kClassDim);
@@ -197,7 +195,7 @@ class SoftmaxEigen<DeviceContext, platform::bfloat16> {
           (logits -
            logits.maximum(along_axis)
                .reshape(batch_by_one)
                .broadcast(one_by_class))
-              .unaryExpr(ValueClip<platform::bfloat16>());
+              .unaryExpr(ValueClip<phi::dtype::bfloat16>());
     } else {
       // axis != -1, class dimension split into (axis, remain), max and sum
       // should be calculated along axis dimension
@@ -207,7 +205,7 @@ class SoftmaxEigen<DeviceContext, platform::bfloat16> {
              .reshape(batch_one_remain)
              .broadcast(one_axis_one)
              .reshape(batch_classes))
-              .unaryExpr(ValueClip<platform::bfloat16>());
+              .unaryExpr(ValueClip<phi::dtype::bfloat16>());
     }
 
     softmax.device(*context.eigen_device()) = softmax.exp();
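What the SoftmaxEigen classes above compute, restated as plain loops for the common axis == -1 case. This is a readability sketch only; the real code stays in Eigen expressions, and ValueClip is what applies the -64 floor before exponentiation:

#include <algorithm>
#include <cmath>

// logits and out point at row-major [batch_size, num_classes] buffers.
void SoftmaxRows(const float* logits, float* out,
                 int batch_size, int num_classes) {
  for (int b = 0; b < batch_size; ++b) {
    const float* x = logits + b * num_classes;
    float* y = out + b * num_classes;
    const float mx = *std::max_element(x, x + num_classes);
    float sum = 0.f;
    for (int c = 0; c < num_classes; ++c) {
      y[c] = std::exp(std::max(x[c] - mx, -64.f));  // shift, clip, exp
      sum += y[c];
    }
    for (int c = 0; c < num_classes; ++c) y[c] /= sum;  // normalize
  }
}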
@@ -247,21 +245,24 @@ class SoftmaxFunctor<DeviceContext, T, enable_if_CPU<DeviceContext>> {
     const int batch_size = in_dims[kBatchDim];
     const int num_remain = num_classes / axis_dim;
 
-    if (num_remain == 1 && platform::MayIUse(platform::avx)) {
+    if (num_remain == 1 && phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
       const T* in_data = X->data<T>();
       T* out_data = Y->data<T>();
       for (int bs = 0; bs < batch_size; ++bs) {
         T max_val = *std::max_element(in_data, in_data + num_classes);
         max_val *= static_cast<T>(-1);
-        vec_add_bias<T, platform::avx>(num_classes, max_val, in_data, out_data);
-        vec_clip<T, platform::avx>(
-            num_classes, static_cast<T>(-64), out_data, out_data);
+        vec_add_bias<T, phi::backends::cpu::avx>(
+            num_classes, max_val, in_data, out_data);
+        vec_clip<T, phi::backends::cpu::avx>(
+            num_classes, static_cast<T>(-64), out_data, out_data);
         vec_exp<T>(num_classes, out_data, out_data);
 
         T sum = 0;
-        vec_sum<T, platform::avx>(num_classes, out_data, &sum);
+        vec_sum<T, phi::backends::cpu::avx>(num_classes, out_data, &sum);
         sum = static_cast<T>(1) / sum;
-        vec_scal<T, platform::avx>(num_classes, sum, out_data, out_data);
+        vec_scal<T, phi::backends::cpu::avx>(num_classes, sum, out_data, out_data);
 
         in_data += num_classes;
         out_data += num_classes;
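In formula form, each row of the AVX fast path above computes

y_i = \frac{\exp(\max(x_i - \max_j x_j,\,-64))}{\sum_k \exp(\max(x_k - \max_j x_j,\,-64))}

Subtracting the row maximum keeps every exponent non-positive, so exp() cannot overflow; the -64 floor (presumably chosen because exp(-64) is already negligible at single precision) bounds how far the argument can drop before vec_exp runs.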
@@ -308,16 +309,16 @@ class SoftmaxGradEigen {
 };
 
 template <typename DeviceContext>
-class SoftmaxGradEigen<DeviceContext, platform::float16> {
+class SoftmaxGradEigen<DeviceContext, phi::dtype::float16> {
  public:
   void operator()(const DeviceContext& context,
                   const int axis_dim,
                   const phi::DenseTensor* y,
                   const phi::DenseTensor* y_grad,
                   phi::DenseTensor* x_grad) {
-    auto softmax = EigenMatrix<platform::float16>::From(*y);
-    auto softmax_grad = EigenMatrix<platform::float16>::From(*y_grad);
-    auto logits_grad = EigenMatrix<platform::float16>::From(*x_grad);
+    auto softmax = EigenMatrix<phi::dtype::float16>::From(*y);
+    auto softmax_grad = EigenMatrix<phi::dtype::float16>::From(*y_grad);
+    auto logits_grad = EigenMatrix<phi::dtype::float16>::From(*x_grad);
 
     constexpr int kBatchDim = 0;
     constexpr int kClassDim = 1;
@@ -342,16 +343,16 @@ class SoftmaxGradEigen<DeviceContext, platform::float16> {
 };
 
 template <typename DeviceContext>
-class SoftmaxGradEigen<DeviceContext, platform::bfloat16> {
+class SoftmaxGradEigen<DeviceContext, phi::dtype::bfloat16> {
  public:
   void operator()(const DeviceContext& context,
                   const int axis_dim,
                   const phi::DenseTensor* y,
                   const phi::DenseTensor* y_grad,
                   phi::DenseTensor* x_grad) {
-    auto softmax = EigenMatrix<platform::bfloat16>::From(*y);
-    auto softmax_grad = EigenMatrix<platform::bfloat16>::From(*y_grad);
-    auto logits_grad = EigenMatrix<platform::bfloat16>::From(*x_grad);
+    auto softmax = EigenMatrix<phi::dtype::bfloat16>::From(*y);
+    auto softmax_grad = EigenMatrix<phi::dtype::bfloat16>::From(*y_grad);
+    auto logits_grad = EigenMatrix<phi::dtype::bfloat16>::From(*x_grad);
 
     constexpr int kBatchDim = 0;
     constexpr int kClassDim = 1;
@@ -400,17 +401,20 @@ class SoftmaxGradFunctor<DeviceContext, T, enable_if_CPU<DeviceContext>> {
     const int batch_size = out_dims[kBatchDim];
     const int num_remain = num_classes / axis_dim;
 
-    if (num_remain == 1 && platform::MayIUse(platform::avx)) {
+    if (num_remain == 1 && phi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
       const T* out_data = y->data<T>();
       const T* out_grad = y_grad->data<T>();
       T* in_grad = x_grad->data<T>();
       for (int bs = 0; bs < batch_size; ++bs) {
         T scalar;
-        vec_mul_reduce<T, platform::avx>(
+        vec_mul_reduce<T, phi::backends::cpu::avx>(
             num_classes, out_grad, out_data, &scalar);
         scalar *= static_cast<T>(-1);
-        vec_add_bias<T, platform::avx>(num_classes, scalar, out_grad, in_grad);
-        vec_mul<T, platform::avx>(num_classes, out_data, in_grad, in_grad);
+        vec_add_bias<T, phi::backends::cpu::avx>(
+            num_classes, scalar, out_grad, in_grad);
+        vec_mul<T, phi::backends::cpu::avx>(
+            num_classes, out_data, in_grad, in_grad);
         out_data += num_classes;
         out_grad += num_classes;
         in_grad += num_classes;
@@ -422,6 +426,5 @@ class SoftmaxGradFunctor<DeviceContext, T, enable_if_CPU<DeviceContext>> {
   }
 };
 
-}  // namespace math
-}  // namespace operators
-}  // namespace paddle
+}  // namespace funcs
+}  // namespace phi
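The backward fast path above applies the standard softmax Jacobian row by row: with y the softmax output and dy the incoming gradient,

dx_i = \left(dy_i - \sum_k dy_k\, y_k\right) y_i

which maps one-to-one onto the three calls in the hunk: vec_mul_reduce forms the inner product \sum_k dy_k y_k, vec_add_bias subtracts it from dy, and vec_mul scales the result by y.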
paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
Browse file @ 344b99e1
@@ -22,7 +22,6 @@ limitations under the License. */
 namespace cub = hipcub;
 #endif
 
-#include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/backends/gpu/gpu_dnn.h"
 #include "paddle/phi/common/amp_type_traits.h"
@@ -32,6 +31,7 @@ namespace cub = hipcub;
 #include "paddle/phi/kernels/funcs/axis_utils.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/softmax.h"
 #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
 
 namespace phi {
paddle/phi/kernels/gpu/cross_entropy_kernel.cu
Browse file @ 344b99e1
@@ -22,7 +22,6 @@ limitations under the License. */
 namespace cub = hipcub;
 #endif
 
-#include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/phi/backends/gpu/gpu_device_function.h"
 #include "paddle/phi/backends/gpu/gpu_dnn.h"
 #include "paddle/phi/common/amp_type_traits.h"
@@ -33,6 +32,7 @@ namespace cub = hipcub;
 #include "paddle/phi/kernels/funcs/cross_entropy.h"
 #include "paddle/phi/kernels/funcs/for_range.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/softmax.h"
 #include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
 
 namespace phi {
@@ -1386,7 +1386,7 @@ void CrossEntropyWithSoftmaxCUDAKernel(const GPUContext& dev_ctx,
   labels_2d.Resize({n, label.numel() / n});
   DenseTensor loss_2d(*loss);
   loss_2d.Resize({n, 1});
-  paddle::operators::math::SoftmaxCUDNNFunctor<T, GPUContext>()(
+  phi::funcs::SoftmaxCUDNNFunctor<T, GPUContext>()(
       dev_ctx, &logits_2d, &softmax_2d);
   phi::funcs::CrossEntropyFunctor<GPUContext, T>()(dev_ctx,
                                                    &loss_2d,
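The renamed call sits in a two-step pipeline: flatten logits, labels, and loss to 2-D, run the cuDNN softmax, then feed the result to the cross-entropy functor. Only the softmax call is fully visible in this hunk; a minimal sketch of that step, with shapes assumed from the surrounding Resize calls:

// logits_2d and softmax_2d are DenseTensors already resized to {n, d}.
phi::funcs::SoftmaxCUDNNFunctor<T, GPUContext>()(dev_ctx, &logits_2d, &softmax_2d);
// The phi::funcs::CrossEntropyFunctor call that follows consumes softmax_2d
// and labels_2d to fill loss_2d; its full argument list is elided in the hunk.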
paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h
Browse file @ 344b99e1
@@ -14,11 +14,11 @@
 #pragma once
 
-#include "paddle/fluid/operators/math/softmax.h"
-#include "paddle/fluid/operators/math/softmax_impl.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/softmax.h"
+#include "paddle/phi/kernels/funcs/softmax_impl.h"
 
 namespace phi {
@@ -50,7 +50,7 @@ void GumbelSoftmaxGradKernel(const Context& ctx,
   dx_2d.Resize({size_to_axis, size_from_axis});
   out_2d.Resize({size_to_axis, size_from_axis});
   dout_2d.Resize({size_to_axis, size_from_axis});
-  paddle::operators::math::SoftmaxGradFunctor<Context, T>()(
+  phi::funcs::SoftmaxGradFunctor<Context, T>()(
       ctx, axis_dim, &out_2d, &dout_2d, &dx_2d);
 }
paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h
Browse file @ 344b99e1
@@ -16,12 +16,12 @@
 #include <random>
 
-#include "paddle/fluid/operators/math/softmax.h"
-#include "paddle/fluid/operators/math/softmax_impl.h"
 #include "paddle/phi/core/dense_tensor.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
 #include "paddle/phi/kernels/funcs/eigen/common.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/softmax.h"
+#include "paddle/phi/kernels/funcs/softmax_impl.h"
 
 namespace phi {
@@ -87,8 +87,7 @@ void GumbelSoftmaxKernelHelper(const Context& ctx,
                                size_to_axis,
                                size_from_axis,
                                temperature);
-  paddle::operators::math::SoftmaxFunctor<Context, T>()(
-      ctx, axis_dim, &x_noise_2d, &out_2d);
+  phi::funcs::SoftmaxFunctor<Context, T>()(ctx, axis_dim, &x_noise_2d, &out_2d);
   if (hard) {
     OneHotGenerator<Context, T>::Transform(ctx, x, out, axis);
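For context, the helper being edited implements Gumbel-softmax: perturb the logits with temperature-scaled Gumbel noise, softmax the result, and optionally harden the sample to a straight-through one-hot. A condensed sketch of the post-change control flow, with the noise generation elided and names taken from the hunk:

// x_noise_2d holds the noisy, temperature-scaled logits flattened to 2-D.
phi::funcs::SoftmaxFunctor<Context, T>()(ctx, axis_dim, &x_noise_2d, &out_2d);
if (hard) {
  // Replace the soft sample with a one-hot argmax along `axis`.
  OneHotGenerator<Context, T>::Transform(ctx, x, out, axis);
}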
paddle/phi/kernels/impl/softmax_grad_kernel_impl.h
Browse file @ 344b99e1
@@ -14,9 +14,9 @@ limitations under the License. */
 #pragma once
 
-#include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/softmax.h"
 #include "paddle/phi/kernels/softmax_grad_kernel.h"
 
 namespace phi {
@@ -50,7 +50,7 @@ void SoftmaxGradKernel(const Context& dev_ctx,
   Out_2d.ShareDataWith(out).Resize({n, d});
   dOut_2d.ShareDataWith(out_grad).Resize({n, d});
-  paddle::operators::math::SoftmaxGradFunctor<Context, T>()(
+  phi::funcs::SoftmaxGradFunctor<Context, T>()(
       dev_ctx, axis_dim, &Out_2d, &dOut_2d, &dX_2d);
 }
paddle/phi/kernels/impl/softmax_kernel_impl.h
Browse file @ 344b99e1
@@ -14,9 +14,9 @@ limitations under the License. */
 #pragma once
 
-#include "paddle/fluid/operators/math/softmax.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
+#include "paddle/phi/kernels/funcs/softmax.h"
 #include "paddle/phi/kernels/softmax_kernel.h"
 
 namespace phi {
@@ -47,8 +47,7 @@ void SoftmaxKernel(const Context& dev_ctx,
   DenseTensor X_2d, Out_2d;
   X_2d.ShareDataWith(x).Resize({n, d});
   Out_2d.ShareDataWith(*out).Resize({n, d});
-  paddle::operators::math::SoftmaxFunctor<Context, T>()(
-      dev_ctx, axis_dim, &X_2d, &Out_2d);
+  phi::funcs::SoftmaxFunctor<Context, T>()(dev_ctx, axis_dim, &X_2d, &Out_2d);
 }
 
 }  // namespace phi
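SoftmaxKernel reduces the general N-D, arbitrary-axis case to the 2-D case the functor handles. A sketch of that flattening, assuming the CanonicalAxis/SizeToAxis/SizeFromAxis helpers from the axis_utils.h header included above:

const int calc_axis = phi::funcs::CanonicalAxis(axis, x.dims().size());
const int n = phi::funcs::SizeToAxis(calc_axis, x.dims());    // rows
const int d = phi::funcs::SizeFromAxis(calc_axis, x.dims());  // cols
const int axis_dim = x.dims()[calc_axis];

phi::DenseTensor X_2d, Out_2d;
X_2d.ShareDataWith(x).Resize({n, d});       // views, no copies
Out_2d.ShareDataWith(*out).Resize({n, d});
phi::funcs::SoftmaxFunctor<Context, T>()(dev_ctx, axis_dim, &X_2d, &Out_2d);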
paddle/phi/tests/kernels/CMakeLists.txt
Browse file @ 344b99e1
@@ -22,7 +22,7 @@ endif()
 cc_test(
   test_cpu_vec
   SRCS test_cpu_vec.cc
-  DEPS blas cpu_info)
+  DEPS blas phi_backends)
 
 # For String Kernels
 cc_test(