Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
344b99e1
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
344b99e1
编写于
12月 15, 2022
作者:
H
huangjiyi
提交者:
GitHub
12月 15, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[PHI decoupling] move softmax from fluid to phi and remove cpu_vec.h in fluid (#48970)
上级
4672ea8e
变更
65
隐藏空白更改
内联
并排
Showing
65 changed file
with
371 addition
and
1074 deletion
+371
-1074
paddle/fluid/inference/api/analysis_config.cc
paddle/fluid/inference/api/analysis_config.cc
+4
-3
paddle/fluid/inference/api/analysis_predictor_tester.cc
paddle/fluid/inference/api/analysis_predictor_tester.cc
+2
-2
paddle/fluid/inference/api/onnxruntime_predictor_tester.cc
paddle/fluid/inference/api/onnxruntime_predictor_tester.cc
+1
-1
paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc
...ests/api/analyzer_bfloat16_image_classification_tester.cc
+2
-2
paddle/fluid/memory/allocation/buddy_allocator.h
paddle/fluid/memory/allocation/buddy_allocator.h
+1
-1
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
+6
-6
paddle/fluid/memory/allocation/system_allocator.cc
paddle/fluid/memory/allocation/system_allocator.cc
+3
-3
paddle/fluid/memory/pinned_memory_test.cu
paddle/fluid/memory/pinned_memory_test.cu
+1
-1
paddle/fluid/operators/attention_lstm_op.cc
paddle/fluid/operators/attention_lstm_op.cc
+10
-9
paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
...d/operators/collective/c_softmax_with_cross_entropy_op.cu
+21
-15
paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h
...id/operators/collective/c_softmax_with_cross_entropy_op.h
+1
-1
paddle/fluid/operators/elementwise/elementwise_mul_op.h
paddle/fluid/operators/elementwise/elementwise_mul_op.h
+1
-1
paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
+4
-4
paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
...le/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
+4
-4
paddle/fluid/operators/jit/CMakeLists.txt
paddle/fluid/operators/jit/CMakeLists.txt
+1
-1
paddle/fluid/operators/jit/gen/act.cc
paddle/fluid/operators/jit/gen/act.cc
+7
-7
paddle/fluid/operators/jit/gen/act.h
paddle/fluid/operators/jit/gen/act.h
+9
-8
paddle/fluid/operators/jit/gen/adam.cc
paddle/fluid/operators/jit/gen/adam.cc
+2
-2
paddle/fluid/operators/jit/gen/adamw.cc
paddle/fluid/operators/jit/gen/adamw.cc
+2
-2
paddle/fluid/operators/jit/gen/blas.cc
paddle/fluid/operators/jit/gen/blas.cc
+4
-3
paddle/fluid/operators/jit/gen/embseqpool.cc
paddle/fluid/operators/jit/gen/embseqpool.cc
+2
-2
paddle/fluid/operators/jit/gen/gru.cc
paddle/fluid/operators/jit/gen/gru.cc
+16
-15
paddle/fluid/operators/jit/gen/hopv.cc
paddle/fluid/operators/jit/gen/hopv.cc
+2
-2
paddle/fluid/operators/jit/gen/jitcode.h
paddle/fluid/operators/jit/gen/jitcode.h
+2
-2
paddle/fluid/operators/jit/gen/lstm.cc
paddle/fluid/operators/jit/gen/lstm.cc
+16
-15
paddle/fluid/operators/jit/gen/matmul.cc
paddle/fluid/operators/jit/gen/matmul.cc
+4
-3
paddle/fluid/operators/jit/gen/seqpool.cc
paddle/fluid/operators/jit/gen/seqpool.cc
+2
-2
paddle/fluid/operators/jit/gen/sgd.cc
paddle/fluid/operators/jit/gen/sgd.cc
+2
-2
paddle/fluid/operators/jit/gen/vbroadcast.cc
paddle/fluid/operators/jit/gen/vbroadcast.cc
+3
-2
paddle/fluid/operators/jit/gen_base.cc
paddle/fluid/operators/jit/gen_base.cc
+2
-2
paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
+2
-2
paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc
paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc
+3
-2
paddle/fluid/operators/jit/more/mkl/mkl.cc
paddle/fluid/operators/jit/more/mkl/mkl.cc
+6
-6
paddle/fluid/operators/jit/test.cc
paddle/fluid/operators/jit/test.cc
+3
-3
paddle/fluid/operators/math/CMakeLists.txt
paddle/fluid/operators/math/CMakeLists.txt
+0
-1
paddle/fluid/operators/math/cpu_vec.h
paddle/fluid/operators/math/cpu_vec.h
+0
-664
paddle/fluid/operators/sample_logits_op.cu
paddle/fluid/operators/sample_logits_op.cu
+1
-1
paddle/fluid/operators/sample_logits_op.h
paddle/fluid/operators/sample_logits_op.h
+1
-1
paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc
...id/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc
+3
-3
paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc
paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc
+1
-1
paddle/fluid/platform/CMakeLists.txt
paddle/fluid/platform/CMakeLists.txt
+1
-10
paddle/fluid/platform/cpu_info.h
paddle/fluid/platform/cpu_info.h
+0
-92
paddle/fluid/platform/cpu_info_test.cc
paddle/fluid/platform/cpu_info_test.cc
+3
-2
paddle/fluid/platform/init.cc
paddle/fluid/platform/init.cc
+1
-1
paddle/fluid/platform/profiler/CMakeLists.txt
paddle/fluid/platform/profiler/CMakeLists.txt
+1
-1
paddle/fluid/pybind/parallel_executor.cc
paddle/fluid/pybind/parallel_executor.cc
+1
-1
paddle/fluid/pybind/place.cc
paddle/fluid/pybind/place.cc
+1
-1
paddle/fluid/pybind/pybind.cc
paddle/fluid/pybind/pybind.cc
+8
-7
paddle/fluid/pybind/tensor.cc
paddle/fluid/pybind/tensor.cc
+1
-1
paddle/phi/backends/CMakeLists.txt
paddle/phi/backends/CMakeLists.txt
+4
-1
paddle/phi/backends/cpu/cpu_info.cc
paddle/phi/backends/cpu/cpu_info.cc
+17
-10
paddle/phi/backends/cpu/cpu_info.h
paddle/phi/backends/cpu/cpu_info.h
+43
-4
paddle/phi/kernels/funcs/CMakeLists.txt
paddle/phi/kernels/funcs/CMakeLists.txt
+1
-0
paddle/phi/kernels/funcs/eigen/common.h
paddle/phi/kernels/funcs/eigen/common.h
+1
-0
paddle/phi/kernels/funcs/softmax.cc
paddle/phi/kernels/funcs/softmax.cc
+6
-8
paddle/phi/kernels/funcs/softmax.cu
paddle/phi/kernels/funcs/softmax.cu
+62
-64
paddle/phi/kernels/funcs/softmax.h
paddle/phi/kernels/funcs/softmax.h
+5
-7
paddle/phi/kernels/funcs/softmax_impl.h
paddle/phi/kernels/funcs/softmax_impl.h
+44
-41
paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
+1
-1
paddle/phi/kernels/gpu/cross_entropy_kernel.cu
paddle/phi/kernels/gpu/cross_entropy_kernel.cu
+2
-2
paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h
paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h
+3
-3
paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h
paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h
+3
-4
paddle/phi/kernels/impl/softmax_grad_kernel_impl.h
paddle/phi/kernels/impl/softmax_grad_kernel_impl.h
+2
-2
paddle/phi/kernels/impl/softmax_kernel_impl.h
paddle/phi/kernels/impl/softmax_kernel_impl.h
+2
-3
paddle/phi/tests/kernels/CMakeLists.txt
paddle/phi/tests/kernels/CMakeLists.txt
+1
-1
未找到文件。
paddle/fluid/inference/api/analysis_config.cc
浏览文件 @
344b99e1
...
...
@@ -19,9 +19,9 @@
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/api/paddle_pass_builder.h"
#include "paddle/fluid/inference/utils/table_printer.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/utils/string/split.h"
#ifdef PADDLE_WITH_TENSORRT
...
...
@@ -624,10 +624,11 @@ void AnalysisConfig::EnableMkldnnQuantizer() {
void
AnalysisConfig
::
EnableMkldnnBfloat16
()
{
#ifdef PADDLE_WITH_MKLDNN
if
(
p
latform
::
MayIUse
(
platform
::
cpu_isa_t
::
avx512_core
))
{
if
(
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
cpu_isa_t
::
avx512_core
))
{
use_mkldnn_bfloat16_
=
true
;
LOG
(
INFO
)
<<
"Hardware support for BFLOAT16"
<<
(
platform
::
MayIUse
(
platform
::
cpu_isa_t
::
avx512_bf16
)
<<
(
phi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
cpu_isa_t
::
avx512_bf16
)
?
" is enabled"
:
" is disabled. Simulation will be used"
);
}
else
{
...
...
paddle/fluid/inference/api/analysis_predictor_tester.cc
浏览文件 @
344b99e1
...
...
@@ -29,7 +29,7 @@
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
#include "paddle/fluid/inference/utils/io_utils.h"
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
DEFINE_string
(
dirname
,
""
,
"dirname to tests."
);
...
...
@@ -327,7 +327,7 @@ TEST(AnalysisPredictor, bf16_gpu_pass_strategy) {
config
.
EnableUseGpu
(
100
,
0
);
config
.
EnableMkldnnBfloat16
();
#ifdef PADDLE_WITH_MKLDNN
if
(
p
latform
::
MayIUse
(
platform
::
cpu_isa_t
::
avx512_core
))
if
(
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
cpu_isa_t
::
avx512_core
))
ASSERT_EQ
(
config
.
mkldnn_bfloat16_enabled
(),
true
);
else
ASSERT_EQ
(
config
.
mkldnn_bfloat16_enabled
(),
false
);
...
...
paddle/fluid/inference/api/onnxruntime_predictor_tester.cc
浏览文件 @
344b99e1
...
...
@@ -27,7 +27,7 @@
#include "paddle/fluid/inference/api/paddle_inference_api.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
#include "paddle/fluid/inference/utils/io_utils.h"
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
DEFINE_string
(
dirname
,
""
,
"dirname to tests."
);
...
...
paddle/fluid/inference/tests/api/analyzer_bfloat16_image_classification_tester.cc
浏览文件 @
344b99e1
...
...
@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/fluid/inference/api/paddle_analysis_config.h"
#include "paddle/fluid/inference/tests/api/tester_helper.h"
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
DEFINE_bool
(
enable_mkldnn
,
true
,
"Enable MKLDNN"
);
...
...
@@ -47,7 +47,7 @@ TEST(Analyzer_bfloat16_image_classification, bfloat16) {
std
::
vector
<
std
::
vector
<
PaddleTensor
>>
input_slots_all
;
SetInputs
(
&
input_slots_all
);
if
(
FLAGS_enable_mkldnn
&&
FLAGS_enable_bf16
&&
p
latform
::
MayIUse
(
platform
::
cpu_isa_t
::
avx512_bf16
))
{
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
cpu_isa_t
::
avx512_bf16
))
{
b_cfg
.
EnableMkldnnBfloat16
();
}
else
{
FLAGS_enable_bf16
=
false
;
...
...
paddle/fluid/memory/allocation/buddy_allocator.h
浏览文件 @
344b99e1
...
...
@@ -27,9 +27,9 @@ limitations under the License. */
#include "paddle/fluid/memory/allocation/memory_block.h"
#include "paddle/fluid/memory/allocation/system_allocator.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace
paddle
{
namespace
memory
{
...
...
paddle/fluid/memory/allocation/naive_best_fit_allocator.cc
浏览文件 @
344b99e1
...
...
@@ -78,8 +78,8 @@ BuddyAllocator *GetCPUBuddyAllocator() {
std
::
call_once
(
init_flag
,
[]()
{
a
=
new
detail
::
BuddyAllocator
(
std
::
unique_ptr
<
detail
::
SystemAllocator
>
(
new
detail
::
CPUAllocator
),
p
latform
::
CpuMinChunkSize
(),
p
latform
::
CpuMaxChunkSize
());
p
hi
::
backends
::
cpu
::
CpuMinChunkSize
(),
p
hi
::
backends
::
cpu
::
CpuMaxChunkSize
());
});
return
a
;
...
...
@@ -290,8 +290,8 @@ BuddyAllocator *GetNPUPinnedBuddyAllocator() {
std
::
call_once
(
init_flag
,
[]()
{
ba
=
new
BuddyAllocator
(
std
::
unique_ptr
<
detail
::
SystemAllocator
>
(
new
detail
::
NPUPinnedAllocator
),
p
latform
::
NPUPinnedMinChunkSize
(),
p
latform
::
NPUPinnedMaxChunkSize
());
p
hi
::
backends
::
cpu
::
NPUPinnedMinChunkSize
(),
p
hi
::
backends
::
cpu
::
NPUPinnedMaxChunkSize
());
});
return
ba
;
...
...
@@ -562,8 +562,8 @@ BuddyAllocator *GetCUDAPinnedBuddyAllocator() {
std
::
call_once
(
init_flag
,
[]()
{
ba
=
new
BuddyAllocator
(
std
::
unique_ptr
<
detail
::
SystemAllocator
>
(
new
detail
::
CUDAPinnedAllocator
),
p
latform
::
CUDAPinnedMinChunkSize
(),
p
latform
::
CUDAPinnedMaxChunkSize
());
p
hi
::
backends
::
cpu
::
CUDAPinnedMinChunkSize
(),
p
hi
::
backends
::
cpu
::
CUDAPinnedMaxChunkSize
());
});
return
ba
;
...
...
paddle/fluid/memory/allocation/system_allocator.cc
浏览文件 @
344b99e1
...
...
@@ -28,10 +28,10 @@ limitations under the License. */
#endif
#include "gflags/gflags.h"
#include "paddle/fluid/memory/allocation/allocator.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#endif
...
...
@@ -206,7 +206,7 @@ void* CUDAPinnedAllocator::Alloc(size_t* index, size_t size) {
// of host pinned allocation. Allocates too much would reduce
// the amount of memory available to the underlying system for paging.
size_t
usable
=
p
addle
::
platform
::
CUDAPinnedMaxAllocSize
()
-
cuda_pinnd_alloc_size_
;
p
hi
::
backends
::
cpu
::
CUDAPinnedMaxAllocSize
()
-
cuda_pinnd_alloc_size_
;
if
(
size
>
usable
)
{
LOG
(
WARNING
)
<<
"Cannot malloc "
<<
size
/
1024.0
/
1024.0
...
...
@@ -362,7 +362,7 @@ void* NPUPinnedAllocator::Alloc(size_t* index, size_t size) {
if
(
size
<=
0
)
return
nullptr
;
size_t
usable
=
p
addle
::
platform
::
NPUPinnedMaxAllocSize
()
-
npu_pinnd_alloc_size_
;
p
hi
::
backends
::
cpu
::
NPUPinnedMaxAllocSize
()
-
npu_pinnd_alloc_size_
;
if
(
size
>
usable
)
{
LOG
(
WARNING
)
<<
"Cannot malloc "
<<
size
/
1024.0
/
1024.0
...
...
paddle/fluid/memory/pinned_memory_test.cu
浏览文件 @
344b99e1
...
...
@@ -18,9 +18,9 @@ limitations under the License. */
#include "paddle/fluid/memory/allocation/memory_block.h"
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
// This unit test is an example comparing the performance between using pinned
// memory and not. In general, using pinned memory will be faster.
...
...
paddle/fluid/operators/attention_lstm_op.cc
浏览文件 @
344b99e1
...
...
@@ -16,7 +16,7 @@ limitations under the License. */
#include <string>
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/cpu_vec.h"
#include "paddle/phi/kernels/funcs/fc_functor.h"
...
...
@@ -315,10 +315,10 @@ use lstm_x_t as input and compute as standard LSTM.
template
<
typename
T
>
inline
void
bias_relu
(
const
int
n
,
const
T
*
x
,
const
T
*
bias
,
T
*
y
)
{
if
(
bias
)
{
phi
::
funcs
::
vec_add_bias
<
T
,
p
latform
::
avx
>
(
n
,
*
bias
,
x
,
y
);
phi
::
funcs
::
vec_relu
<
T
,
p
latform
::
avx
>
(
n
,
y
,
y
);
phi
::
funcs
::
vec_add_bias
<
T
,
p
hi
::
backends
::
cpu
::
avx
>
(
n
,
*
bias
,
x
,
y
);
phi
::
funcs
::
vec_relu
<
T
,
p
hi
::
backends
::
cpu
::
avx
>
(
n
,
y
,
y
);
}
else
{
phi
::
funcs
::
vec_relu
<
T
,
p
latform
::
avx
>
(
n
,
x
,
y
);
phi
::
funcs
::
vec_relu
<
T
,
p
hi
::
backends
::
cpu
::
avx
>
(
n
,
x
,
y
);
}
}
...
...
@@ -329,8 +329,9 @@ inline void vec_softmax(const int n, const T* x, T* y) {
for
(
int
i
=
1
;
i
<
n
;
++
i
)
{
scalar
=
scalar
<
x
[
i
]
?
x
[
i
]
:
scalar
;
}
phi
::
funcs
::
vec_add_bias
<
T
,
platform
::
avx
>
(
n
,
-
scalar
,
x
,
y
);
// sub
phi
::
funcs
::
vec_exp
<
T
>
(
n
,
y
,
y
);
// exp
phi
::
funcs
::
vec_add_bias
<
T
,
phi
::
backends
::
cpu
::
avx
>
(
n
,
-
scalar
,
x
,
y
);
// sub
phi
::
funcs
::
vec_exp
<
T
>
(
n
,
y
,
y
);
// exp
// sum
scalar
=
T
(
0
);
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
...
...
@@ -393,13 +394,13 @@ class AttentionLSTMKernel : public framework::OpKernel<T> {
auto
&
act_gate_str
=
ctx
.
Attr
<
std
::
string
>
(
"gate_activation"
);
auto
&
act_cell_str
=
ctx
.
Attr
<
std
::
string
>
(
"cell_activation"
);
auto
&
act_cand_str
=
ctx
.
Attr
<
std
::
string
>
(
"candidate_activation"
);
if
(
p
latform
::
MayIUse
(
platform
::
avx
))
{
phi
::
funcs
::
VecActivations
<
T
,
p
latform
::
avx
>
act_functor
;
if
(
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
))
{
phi
::
funcs
::
VecActivations
<
T
,
p
hi
::
backends
::
cpu
::
avx
>
act_functor
;
act_gate
=
act_functor
(
act_gate_str
);
act_cell
=
act_functor
(
act_cell_str
);
act_cand
=
act_functor
(
act_cand_str
);
}
else
{
phi
::
funcs
::
VecActivations
<
T
,
p
latform
::
isa_any
>
act_functor
;
phi
::
funcs
::
VecActivations
<
T
,
p
hi
::
backends
::
cpu
::
isa_any
>
act_functor
;
act_gate
=
act_functor
(
act_gate_str
);
act_cell
=
act_functor
(
act_cell_str
);
act_cand
=
act_functor
(
act_cand_str
);
...
...
paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
浏览文件 @
344b99e1
...
...
@@ -13,13 +13,15 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/cross_entropy.h"
#include "paddle/phi/kernels/funcs/softmax_impl.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -129,15 +131,15 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
softmax_2d
.
ShareDataWith
(
*
softmax
).
Resize
({
N
,
D
});
loss_2d
.
ShareDataWith
(
*
loss
).
Resize
({
N
,
1
});
auto
eigen_logits
=
math
::
EigenMatrix
<
T
>::
From
(
logits_2d
);
auto
eigen_softmax
=
math
::
EigenMatrix
<
T
>::
From
(
softmax_2d
);
auto
eigen_logits
=
phi
::
funcs
::
EigenMatrix
<
T
>::
From
(
logits_2d
);
auto
eigen_softmax
=
phi
::
funcs
::
EigenMatrix
<
T
>::
From
(
softmax_2d
);
// step 1, obtain logit_max
phi
::
DenseTensor
logits_max
;
logits_max
=
ctx
.
AllocateTmpTensor
<
T
,
phi
::
GPUContext
>
({
N
,
1
},
dev_ctx
);
void
*
logits_max_buff
=
logits_max
.
mutable_data
<
T
>
(
place
);
auto
eigen_logits_max
=
math
::
EigenMatrix
<
T
>::
From
(
logits_max
);
auto
eigen_logits_max
=
phi
::
funcs
::
EigenMatrix
<
T
>::
From
(
logits_max
);
Eigen
::
DSizes
<
int
,
1
>
along_axis
(
1
);
eigen_logits_max
.
device
(
*
dev_ctx
.
eigen_device
())
=
eigen_logits
.
maximum
(
along_axis
);
...
...
@@ -158,7 +160,7 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
eigen_softmax
.
device
(
*
dev_ctx
.
eigen_device
())
=
(
eigen_logits
-
eigen_logits_max
.
reshape
(
batch_by_one
).
broadcast
(
one_by_class
))
.
unaryExpr
(
math
::
ValueClip
<
T
>
());
.
unaryExpr
(
phi
::
funcs
::
ValueClip
<
T
>
());
// step 3, obtain predict target
phi
::
DenseTensor
predicted_logits
;
...
...
@@ -217,7 +219,8 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
sum_exp_logits
=
ctx
.
AllocateTmpTensor
<
T
,
phi
::
GPUContext
>
({
N
,
1
},
dev_ctx
);
void
*
sum_exp_logits_buff
=
sum_exp_logits
.
mutable_data
<
T
>
(
place
);
auto
eigen_sum_exp_logits
=
math
::
EigenMatrix
<
T
>::
From
(
sum_exp_logits
);
auto
eigen_sum_exp_logits
=
phi
::
funcs
::
EigenMatrix
<
T
>::
From
(
sum_exp_logits
);
eigen_sum_exp_logits
.
device
(
*
dev_ctx
.
eigen_device
())
=
eigen_softmax
.
sum
(
along_axis
);
...
...
@@ -231,8 +234,9 @@ struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
comm
->
comm
(),
stream
));
auto
eigen_loss
=
math
::
EigenMatrix
<
T
>::
From
(
loss_2d
);
auto
eigen_predicted_logits
=
math
::
EigenMatrix
<
T
>::
From
(
predicted_logits
);
auto
eigen_loss
=
phi
::
funcs
::
EigenMatrix
<
T
>::
From
(
loss_2d
);
auto
eigen_predicted_logits
=
phi
::
funcs
::
EigenMatrix
<
T
>::
From
(
predicted_logits
);
eigen_loss
.
device
(
*
dev_ctx
.
eigen_device
())
=
(
eigen_sum_exp_logits
.
log
().
unaryExpr
(
phi
::
funcs
::
TolerableValue
<
T
>
())
-
...
...
@@ -281,14 +285,14 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
softmax_2d
.
ShareDataWith
(
*
softmax
).
Resize
({
N
,
D
});
loss_2d
.
ShareDataWith
(
*
loss
).
Resize
({
N
,
1
});
auto
eigen_logits
=
math
::
EigenMatrix
<
T
>::
From
(
logits_2d
);
auto
eigen_softmax
=
math
::
EigenMatrix
<
T
>::
From
(
softmax_2d
);
auto
eigen_logits
=
phi
::
funcs
::
EigenMatrix
<
T
>::
From
(
logits_2d
);
auto
eigen_softmax
=
phi
::
funcs
::
EigenMatrix
<
T
>::
From
(
softmax_2d
);
// step 1, obtain logit_max
phi
::
DenseTensor
logits_max
;
logits_max
=
ctx
.
AllocateTmpTensor
<
T
,
phi
::
GPUContext
>
({
N
,
1
},
dev_ctx
);
auto
eigen_logits_max
=
math
::
EigenMatrix
<
T
>::
From
(
logits_max
);
auto
eigen_logits_max
=
phi
::
funcs
::
EigenMatrix
<
T
>::
From
(
logits_max
);
Eigen
::
DSizes
<
int
,
1
>
along_axis
(
1
);
eigen_logits_max
.
device
(
*
dev_ctx
.
eigen_device
())
=
eigen_logits
.
maximum
(
along_axis
);
...
...
@@ -304,7 +308,7 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
eigen_softmax
.
device
(
*
dev_ctx
.
eigen_device
())
=
(
eigen_logits
-
eigen_logits_max
.
reshape
(
batch_by_one
).
broadcast
(
one_by_class
))
.
unaryExpr
(
math
::
ValueClip
<
T
>
());
.
unaryExpr
(
phi
::
funcs
::
ValueClip
<
T
>
());
// step 3, obtain predict target
phi
::
DenseTensor
predicted_logits
;
...
...
@@ -357,7 +361,8 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
sum_exp_logits
=
ctx
.
AllocateTmpTensor
<
T
,
phi
::
GPUContext
>
({
N
,
1
},
dev_ctx
);
void
*
sum_exp_logits_buff
=
sum_exp_logits
.
mutable_data
<
T
>
(
place
);
auto
eigen_sum_exp_logits
=
math
::
EigenMatrix
<
T
>::
From
(
sum_exp_logits
);
auto
eigen_sum_exp_logits
=
phi
::
funcs
::
EigenMatrix
<
T
>::
From
(
sum_exp_logits
);
eigen_sum_exp_logits
.
device
(
*
dev_ctx
.
eigen_device
())
=
eigen_softmax
.
sum
(
along_axis
);
...
...
@@ -366,8 +371,9 @@ struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
opts
.
reduce_op
=
distributed
::
ReduceOp
::
SUM
;
pg
->
AllReduce
(
in_out
,
in_out
,
opts
)
->
Synchronize
();
auto
eigen_loss
=
math
::
EigenMatrix
<
T
>::
From
(
loss_2d
);
auto
eigen_predicted_logits
=
math
::
EigenMatrix
<
T
>::
From
(
predicted_logits
);
auto
eigen_loss
=
phi
::
funcs
::
EigenMatrix
<
T
>::
From
(
loss_2d
);
auto
eigen_predicted_logits
=
phi
::
funcs
::
EigenMatrix
<
T
>::
From
(
predicted_logits
);
eigen_loss
.
device
(
*
dev_ctx
.
eigen_device
())
=
(
eigen_sum_exp_logits
.
log
().
unaryExpr
(
phi
::
funcs
::
TolerableValue
<
T
>
())
-
...
...
paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h
浏览文件 @
344b99e1
...
...
@@ -22,9 +22,9 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/kernels/funcs/cross_entropy.h"
#include "paddle/phi/kernels/funcs/softmax.h"
namespace
paddle
{
namespace
operators
{
...
...
paddle/fluid/operators/elementwise/elementwise_mul_op.h
浏览文件 @
344b99e1
...
...
@@ -17,7 +17,7 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/operators/elementwise/elementwise_op.h"
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
#include "paddle/phi/kernels/elementwise_kernel.h"
namespace
paddle
{
...
...
paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc
浏览文件 @
344b99e1
...
...
@@ -16,7 +16,7 @@ limitations under the License. */
#include <string>
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/cpu_vec.h"
#include "paddle/phi/kernels/funcs/sequence2batch.h"
...
...
@@ -278,13 +278,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel<T> {
auto& act_gate_str = ctx.Attr<std::string>("gate_activation"); \
auto& act_cell_str = ctx.Attr<std::string>("cell_activation"); \
auto& act_cand_str = ctx.Attr<std::string>("candidate_activation"); \
if (p
latform::MayIUse(platform::avx)) {
\
phi::funcs::VecActivations<T, p
latform::avx> act_functor;
\
if (p
hi::backends::cpu::MayIUse(phi::backends::cpu::avx)) {
\
phi::funcs::VecActivations<T, p
hi::backends::cpu::avx> act_functor;
\
act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \
} else { \
phi::funcs::VecActivations<T, p
latform::isa_any> act_functor;
\
phi::funcs::VecActivations<T, p
hi::backends::cpu::isa_any> act_functor;
\
act_gate = act_functor(act_gate_str); \
act_cell = act_functor(act_cell_str); \
act_cand = act_functor(act_cand_str); \
...
...
paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc
浏览文件 @
344b99e1
...
...
@@ -16,7 +16,7 @@ limitations under the License. */
#include <string>
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
#include "paddle/phi/kernels/funcs/blas/blas.h"
#include "paddle/phi/kernels/funcs/cpu_vec.h"
#include "paddle/phi/kernels/funcs/fc_functor.h"
...
...
@@ -225,11 +225,11 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel<T> {
std
::
function
<
void
(
const
int
,
const
T
*
,
T
*
)
>
fc_act
;
auto
&
fc_act_str
=
ctx
.
Attr
<
std
::
string
>
(
"fc_activation"
);
if
(
p
latform
::
MayIUse
(
platform
::
avx
))
{
phi
::
funcs
::
VecActivations
<
T
,
p
latform
::
avx
>
act_functor
;
if
(
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
))
{
phi
::
funcs
::
VecActivations
<
T
,
p
hi
::
backends
::
cpu
::
avx
>
act_functor
;
fc_act
=
act_functor
(
fc_act_str
);
}
else
{
phi
::
funcs
::
VecActivations
<
T
,
p
latform
::
isa_any
>
act_functor
;
phi
::
funcs
::
VecActivations
<
T
,
p
hi
::
backends
::
cpu
::
isa_any
>
act_functor
;
fc_act
=
act_functor
(
fc_act_str
);
}
...
...
paddle/fluid/operators/jit/CMakeLists.txt
浏览文件 @
344b99e1
...
...
@@ -9,7 +9,7 @@ file(APPEND ${jit_file} "\#include \"paddle/fluid/operators/jit/helper.h\"\n")
file
(
APPEND
${
jit_file
}
"
\#
include
\"
paddle/fluid/operators/jit/registry.h
\"\n\n
"
)
set
(
JIT_KERNEL_DEPS
cpu_info
cblas gflags enforce place xxhash
)
set
(
JIT_KERNEL_DEPS
device_context
cblas gflags enforce place xxhash
)
file
(
GLOB jit_kernel_cc_srcs
...
...
paddle/fluid/operators/jit/gen/act.cc
浏览文件 @
344b99e1
...
...
@@ -15,7 +15,7 @@
#include "paddle/fluid/operators/jit/gen/act.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -98,27 +98,27 @@ DECLARE_ACT_CREATOR(VTanh);
// TODO(TJ): tuning use me
bool
VReluCreator
::
CanBeUsed
(
const
int
&
d
)
const
{
return
p
latform
::
MayIUse
(
platform
::
avx
);
return
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
);
}
bool
VSquareCreator
::
CanBeUsed
(
const
int
&
d
)
const
{
return
p
latform
::
MayIUse
(
platform
::
avx
);
return
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
);
}
bool
VIdentityCreator
::
CanBeUsed
(
const
int
&
d
)
const
{
return
p
latform
::
MayIUse
(
platform
::
avx
);
return
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
);
}
bool
VExpCreator
::
CanBeUsed
(
const
int
&
d
)
const
{
return
p
latform
::
MayIUse
(
platform
::
avx
)
&&
d
<
32
;
return
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
)
&&
d
<
32
;
}
bool
VSigmoidCreator
::
CanBeUsed
(
const
int
&
d
)
const
{
return
p
latform
::
MayIUse
(
platform
::
avx
);
return
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
);
}
bool
VTanhCreator
::
CanBeUsed
(
const
int
&
d
)
const
{
return
p
latform
::
MayIUse
(
platform
::
avx
);
return
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
);
}
size_t
VReluCreator
::
CodeSize
(
const
int
&
d
)
const
{
...
...
paddle/fluid/operators/jit/gen/act.h
浏览文件 @
344b99e1
...
...
@@ -84,8 +84,8 @@ class VActFunc : public JitCode {
// compute EXP with ymm, xmm
template
<
typename
JMM
>
void
exp_jmm
(
JMM
&
dst
,
JMM
&
src
,
void
exp_jmm
(
JMM
&
dst
,
// NOLINT
JMM
&
src
,
// NOLINT
int
src_idx
=
11
,
int
fx_idx
=
12
,
// NOLINT
int
fy_idx
=
13
,
...
...
@@ -144,10 +144,11 @@ class VActFunc : public JitCode {
vcvttps2dq
(
ymm_int
,
jmm_fx
);
mov
(
reg_ptr_global
,
reinterpret_cast
<
size_t
>
(
exp_int_0x7f
));
vmovdqa
(
jmm_tmp
,
ptr
[
reg_ptr_global
]);
if
(
MayIUse
(
avx2
)
||
std
::
is_same
<
JMM
,
xmm_t
>::
value
)
{
if
(
phi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx2
)
||
std
::
is_same
<
JMM
,
xmm_t
>::
value
)
{
vpaddd
(
ymm_int
,
ymm_int
,
jmm_tmp
);
vpslld
(
ymm_int
,
ymm_int
,
23
);
}
else
if
(
MayIUse
(
avx
))
{
}
else
if
(
phi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
))
{
xmm_t
xtmp1
=
xmm_t
(
ymm_int
.
getIdx
());
xmm_t
xtmp2
=
xmm_t
(
jmm_tmp
.
getIdx
());
reg64_t
reg_ptr_tmp
=
reg_ptr_global
;
...
...
@@ -174,8 +175,8 @@ class VActFunc : public JitCode {
// compute SIGMOID with ymm, xmm
template
<
typename
JMM
>
void
sigmoid_jmm
(
JMM
&
dst
,
JMM
&
src
,
void
sigmoid_jmm
(
JMM
&
dst
,
// NOLINT
JMM
&
src
,
// NOLINT
int
src_idx
=
11
,
// NOLINT
int
fx_idx
=
12
,
int
fy_idx
=
13
,
...
...
@@ -203,8 +204,8 @@ class VActFunc : public JitCode {
// compute TANH with ymm, xmm
template
<
typename
JMM
>
void
tanh_jmm
(
JMM
&
dst
,
JMM
&
src
,
void
tanh_jmm
(
JMM
&
dst
,
// NOLINT
JMM
&
src
,
// NOLINT
int
src_idx
=
11
,
// NOLINT
int
fx_idx
=
12
,
int
fy_idx
=
13
,
...
...
paddle/fluid/operators/jit/gen/adam.cc
浏览文件 @
344b99e1
...
...
@@ -17,7 +17,7 @@
#include <stddef.h> // offsetof
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -132,7 +132,7 @@ void AdamJitCode::genCode() {
class
AdamCreator
:
public
JitCodeCreator
<
adam_attr_t
>
{
public:
bool
CanBeUsed
(
const
adam_attr_t
&
attr
)
const
override
{
return
p
latform
::
MayIUse
(
platform
::
avx512f
);
return
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx512f
);
}
size_t
CodeSize
(
const
adam_attr_t
&
attr
)
const
override
{
return
96
+
32
*
8
;
...
...
paddle/fluid/operators/jit/gen/adamw.cc
浏览文件 @
344b99e1
...
...
@@ -17,7 +17,7 @@
#include <stddef.h> // offsetof
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -147,7 +147,7 @@ void AdamWJitCode::genCode() {
class
AdamWCreator
:
public
JitCodeCreator
<
int
>
{
public:
bool
CanBeUsed
(
const
int
&
attr
)
const
override
{
return
p
latform
::
MayIUse
(
platform
::
avx512f
);
return
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx512f
);
}
size_t
CodeSize
(
const
int
&
attr
)
const
override
{
return
96
+
32
*
8
;
}
std
::
unique_ptr
<
GenBase
>
CreateJitCode
(
const
int
&
attr
)
const
override
{
...
...
paddle/fluid/operators/jit/gen/blas.cc
浏览文件 @
344b99e1
...
...
@@ -16,7 +16,7 @@
#include "paddle/fluid/operators/jit/macro.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -145,7 +145,7 @@ void NCHW16CMulNCJitCode::genCode() {
class
NCHW16CMulNCCreator
:
public
JitCodeCreator
<
int
>
{
public:
bool
CanBeUsed
(
const
int
&
attr
)
const
override
{
return
p
latform
::
MayIUse
(
platform
::
avx512f
);
return
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx512f
);
}
size_t
CodeSize
(
const
int
&
d
)
const
override
{
return
256
*
1024
;
}
std
::
unique_ptr
<
GenBase
>
CreateJitCode
(
const
int
&
attr
)
const
override
{
...
...
@@ -157,7 +157,8 @@ class NCHW16CMulNCCreator : public JitCodeCreator<int> {
class name##Creator : public JitCodeCreator<int> { \
public: \
bool CanBeUsed(const int& attr) const override { \
return platform::MayIUse(platform::avx) && attr <= 1024; \
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && \
attr <= 1024; \
} \
size_t CodeSize(const int& d) const override { \
return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \
...
...
paddle/fluid/operators/jit/gen/embseqpool.cc
浏览文件 @
344b99e1
...
...
@@ -18,7 +18,7 @@
#include "paddle/fluid/operators/jit/macro.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -123,7 +123,7 @@ void EmbSeqPoolJitCode::genCode() {
class
EmbSeqPoolCreator
:
public
JitCodeCreator
<
emb_seq_pool_attr_t
>
{
public:
bool
CanBeUsed
(
const
emb_seq_pool_attr_t
&
attr
)
const
override
{
return
p
latform
::
MayIUse
(
platform
::
avx
)
&&
return
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
)
&&
attr
.
table_width
%
YMM_FLOAT_BLOCK
==
0
;
}
size_t
CodeSize
(
const
emb_seq_pool_attr_t
&
attr
)
const
override
{
...
...
paddle/fluid/operators/jit/gen/gru.cc
浏览文件 @
344b99e1
...
...
@@ -18,7 +18,7 @@
#include "paddle/fluid/operators/jit/macro.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -85,20 +85,21 @@ void GRUJitCode::genCode() {
ret
();
}
#define DECLARE_GRU_CREATOR(name) \
class name##Creator : public JitCodeCreator<gru_attr_t> { \
public: \
/* TODO(TJ): enable more */
\
bool CanBeUsed(const gru_attr_t& attr) const override { \
return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
} \
size_t CodeSize(const gru_attr_t& attr) const override { \
return 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8; \
} \
std::unique_ptr<GenBase> CreateJitCode( \
const gru_attr_t& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \
} \
#define DECLARE_GRU_CREATOR(name) \
class name##Creator : public JitCodeCreator<gru_attr_t> { \
public: \
/* TODO(TJ): enable more */
\
bool CanBeUsed(const gru_attr_t& attr) const override { \
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && \
attr.d % 8 == 0; \
} \
size_t CodeSize(const gru_attr_t& attr) const override { \
return 96 + attr.d / YMM_FLOAT_BLOCK * 96 * 2 * 8; \
} \
std::unique_ptr<GenBase> CreateJitCode( \
const gru_attr_t& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \
} \
}
DECLARE_GRU_CREATOR
(
GRUH1
);
...
...
paddle/fluid/operators/jit/gen/hopv.cc
浏览文件 @
344b99e1
...
...
@@ -15,7 +15,7 @@
#include "paddle/fluid/operators/jit/gen/hopv.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -78,7 +78,7 @@ void HOPVJitCode::genCode() {
class name##Creator : public JitCodeCreator<int> { \
public: \
bool CanBeUsed(const int& attr) const override { \
return p
latform::MayIUse(platform::avx);
\
return p
hi::backends::cpu::MayIUse(phi::backends::cpu::avx);
\
} \
size_t CodeSize(const int& d) const override { \
return 96 + d / YMM_FLOAT_BLOCK * 4 * 8; \
...
...
paddle/fluid/operators/jit/gen/jitcode.h
浏览文件 @
344b99e1
...
...
@@ -18,7 +18,7 @@
#include <type_traits>
#include "paddle/fluid/operators/jit/gen_base.h"
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
#define XBYAK_USE_MMAP_ALLOCATOR
#include "xbyak/xbyak.h"
...
...
@@ -92,7 +92,7 @@ class JitCode : public GenBase, public Xbyak::CodeGenerator {
for
(
int
i
=
0
;
i
<
num_g_abi_regs
;
++
i
)
{
push
(
Xbyak
::
Reg64
(
g_abi_regs
[
i
]));
}
if
(
p
latform
::
MayIUse
(
platform
::
avx512f
))
{
if
(
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx512f
))
{
mov
(
reg_EVEX_max_8b_offt
,
2
*
EVEX_max_8b_offt
);
}
}
...
...
paddle/fluid/operators/jit/gen/lstm.cc
浏览文件 @
344b99e1
...
...
@@ -18,7 +18,7 @@
#include "paddle/fluid/operators/jit/macro.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -113,20 +113,21 @@ void LSTMJitCode::genCode() {
}
}
#define DECLARE_LSTM_CREATOR(name) \
class name##Creator : public JitCodeCreator<lstm_attr_t> { \
public: \
/* TODO(TJ): enable more */
\
bool CanBeUsed(const lstm_attr_t& attr) const override { \
return platform::MayIUse(platform::avx) && attr.d % 8 == 0; \
} \
size_t CodeSize(const lstm_attr_t& attr) const override { \
return 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8; \
} \
std::unique_ptr<GenBase> CreateJitCode( \
const lstm_attr_t& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \
} \
#define DECLARE_LSTM_CREATOR(name) \
class name##Creator : public JitCodeCreator<lstm_attr_t> { \
public: \
/* TODO(TJ): enable more */
\
bool CanBeUsed(const lstm_attr_t& attr) const override { \
return phi::backends::cpu::MayIUse(phi::backends::cpu::avx) && \
attr.d % 8 == 0; \
} \
size_t CodeSize(const lstm_attr_t& attr) const override { \
return 96 + attr.d / YMM_FLOAT_BLOCK * 90 * 4 * 8; \
} \
std::unique_ptr<GenBase> CreateJitCode( \
const lstm_attr_t& attr) const override { \
return make_unique<name##JitCode>(attr, CodeSize(attr)); \
} \
}
DECLARE_LSTM_CREATOR
(
LSTMCtHt
);
...
...
paddle/fluid/operators/jit/gen/matmul.cc
浏览文件 @
344b99e1
...
...
@@ -17,7 +17,7 @@
#include <stddef.h> // offsetof
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -110,12 +110,13 @@ void MatMulJitCode::genCode() {
class
MatMulCreator
:
public
JitCodeCreator
<
matmul_attr_t
>
{
public:
bool
CanBeUsed
(
const
matmul_attr_t
&
attr
)
const
override
{
return
attr
.
m
==
1
&&
platform
::
MayIUse
(
platform
::
avx512f
)
&&
return
attr
.
m
==
1
&&
phi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx512f
)
&&
attr
.
n
%
ZMM_FLOAT_BLOCK
==
0
&&
attr
.
k
<
512
;
}
size_t
CodeSize
(
const
matmul_attr_t
&
attr
)
const
override
{
int
block
=
YMM_FLOAT_BLOCK
;
if
(
p
latform
::
MayIUse
(
platform
::
avx512f
))
{
if
(
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx512f
))
{
block
=
ZMM_FLOAT_BLOCK
;
}
return
96
+
4
*
attr
.
k
*
(
attr
.
n
/
block
+
1
)
*
8
;
...
...
paddle/fluid/operators/jit/gen/seqpool.cc
浏览文件 @
344b99e1
...
...
@@ -16,7 +16,7 @@
#include "paddle/fluid/operators/jit/gen/act.h" // for exp_float_consts ones
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -59,7 +59,7 @@ void SeqPoolJitCode::genCode() {
class
SeqPoolCreator
:
public
JitCodeCreator
<
seq_pool_attr_t
>
{
public:
bool
CanBeUsed
(
const
seq_pool_attr_t
&
attr
)
const
override
{
return
p
latform
::
MayIUse
(
platform
::
avx
);
return
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
);
}
size_t
CodeSize
(
const
seq_pool_attr_t
&
attr
)
const
override
{
return
96
+
((
attr
.
w
/
YMM_FLOAT_BLOCK
+
4
/* for rest */
)
*
...
...
paddle/fluid/operators/jit/gen/sgd.cc
浏览文件 @
344b99e1
...
...
@@ -17,7 +17,7 @@
#include <stddef.h> // offsetof
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -109,7 +109,7 @@ void SgdJitCode::genCode() {
class
SgdCreator
:
public
JitCodeCreator
<
sgd_attr_t
>
{
public:
bool
CanBeUsed
(
const
sgd_attr_t
&
attr
)
const
override
{
return
p
latform
::
MayIUse
(
platform
::
avx
)
&&
return
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
)
&&
attr
.
grad_width
%
YMM_FLOAT_BLOCK
==
0
;
}
size_t
CodeSize
(
const
sgd_attr_t
&
attr
)
const
override
{
return
96
+
32
*
8
;
}
...
...
paddle/fluid/operators/jit/gen/vbroadcast.cc
浏览文件 @
344b99e1
...
...
@@ -15,7 +15,7 @@
#include "paddle/fluid/operators/jit/gen/vbroadcast.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -69,7 +69,8 @@ void VBroadcastJitCode::genCode() {
class
VBroadcastCreator
:
public
JitCodeCreator
<
int64_t
>
{
public:
bool
CanBeUsed
(
const
int64_t
&
w
)
const
override
{
return
platform
::
MayIUse
(
platform
::
avx
)
&&
w
%
YMM_FLOAT_BLOCK
==
0
;
return
phi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
)
&&
w
%
YMM_FLOAT_BLOCK
==
0
;
}
size_t
CodeSize
(
const
int64_t
&
w
)
const
override
{
return
96
+
(
w
/
YMM_FLOAT_BLOCK
)
*
16
*
8
;
...
...
paddle/fluid/operators/jit/gen_base.cc
浏览文件 @
344b99e1
...
...
@@ -17,8 +17,8 @@
#include <fstream>
#include "paddle/fluid/memory/allocation/cpu_allocator.h" // for posix_memalign
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#ifndef _WIN32
#define posix_memalign_free free
...
...
@@ -66,7 +66,7 @@ void GenBase::operator delete(void* ptr) { posix_memalign_free(ptr); }
std
::
vector
<
int
>
packed_groups
(
int
n
,
int
k
,
int
*
block_out
,
int
*
rest_out
)
{
int
block
;
int
max_num_regs
;
if
(
p
latform
::
MayIUse
(
platform
::
avx512f
))
{
if
(
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx512f
))
{
block
=
ZMM_FLOAT_BLOCK
;
max_num_regs
=
32
;
}
else
{
...
...
paddle/fluid/operators/jit/more/intrinsic/crf_decoding.cc
浏览文件 @
344b99e1
...
...
@@ -17,7 +17,7 @@
#include <limits>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -172,7 +172,7 @@ bool CRFDecodingKernel::CanBeUsed(const int& d) const {
#else
constexpr
int
block
=
YMM_FLOAT_BLOCK
;
#endif
return
p
latform
::
MayIUse
(
platform
::
avx
)
&&
d
>=
block
;
return
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
)
&&
d
>=
block
;
}
}
// namespace intrinsic
...
...
paddle/fluid/operators/jit/more/intrinsic/layer_norm.cc
浏览文件 @
344b99e1
...
...
@@ -17,7 +17,7 @@
#include <limits>
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -179,7 +179,8 @@ void LayerNorm(float* x,
}
bool
LayerNormKernel
::
CanBeUsed
(
const
int
&
d
)
const
{
return
platform
::
MayIUse
(
platform
::
avx
)
&&
d
>=
YMM_FLOAT_BLOCK
;
return
phi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
)
&&
d
>=
YMM_FLOAT_BLOCK
;
}
}
// namespace intrinsic
...
...
paddle/fluid/operators/jit/more/mkl/mkl.cc
浏览文件 @
344b99e1
...
...
@@ -16,8 +16,8 @@
#include "paddle/fluid/operators/jit/refer/refer.h"
#include "paddle/fluid/operators/jit/registry.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/dynload/mklml.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -188,17 +188,17 @@ void StrideASum<double>(const double* x, double* res, int n, int stride) {
// TODO(TJ): tuning me carefully on AVX, AVX2 and AVX512
template
<
>
bool
VMulKernel
<
float
>::
CanBeUsed
(
const
int
&
d
)
const
{
return
p
latform
::
MayIUse
(
platform
::
avx512f
)
&&
d
>
512
;
return
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx512f
)
&&
d
>
512
;
}
template
<
>
bool
VAddKernel
<
float
>::
CanBeUsed
(
const
int
&
d
)
const
{
return
p
latform
::
MayIUse
(
platform
::
avx
)
&&
d
>
512
;
return
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
)
&&
d
>
512
;
}
template
<
>
bool
VScalKernel
<
float
>::
CanBeUsed
(
const
int
&
d
)
const
{
return
p
latform
::
MayIUse
(
platform
::
avx512f
)
&&
d
>
512
;
return
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx512f
)
&&
d
>
512
;
}
template
<
>
...
...
@@ -274,7 +274,7 @@ bool SgdKernel<double>::CanBeUsed(const sgd_attr_t& attr) const {
template
<
>
bool
MatMulKernel
<
float
>::
CanBeUsed
(
const
matmul_attr_t
&
attr
)
const
{
return
p
latform
::
MayIUse
(
platform
::
avx
);
return
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
);
}
template
<
>
...
...
@@ -285,7 +285,7 @@ bool MatMulKernel<double>::CanBeUsed(const matmul_attr_t& attr) const {
template
<
>
bool
SoftmaxKernel
<
float
>::
CanBeUsed
(
const
int
&
d
)
const
{
// tuned on avx2
return
p
latform
::
MayIUse
(
platform
::
avx
)
&&
d
<
60
;
return
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
)
&&
d
<
60
;
}
#define AWALYS_USE_ME_WITH_DOUBLE(func) \
...
...
paddle/fluid/operators/jit/test.cc
浏览文件 @
344b99e1
...
...
@@ -19,8 +19,8 @@ limitations under the License. */
#include "glog/logging.h"
#include "gtest/gtest.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/place.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
DEFINE_double
(
acc
,
1e-5
,
"Test accuracy threshold."
);
...
...
@@ -437,7 +437,7 @@ void TestKernelNCHW16CMulNC() {
EXPECT_TRUE
(
tgt
!=
nullptr
);
if
(
std
::
is_same
<
T
,
float
>::
value
&&
p
addle
::
platform
::
MayIUse
(
paddle
::
platform
::
avx512f
))
{
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx512f
))
{
EXPECT_TRUE
(
jitcode
!=
nullptr
);
}
for
(
int
ni
=
0
;
ni
<
n
;
ni
++
)
{
...
...
@@ -1393,7 +1393,7 @@ TEST(JITKernel_helper, pack_weights) {
}
int
block
=
0
;
std
::
vector
<
int
>
groups
;
if
(
p
addle
::
platform
::
MayIUse
(
paddle
::
platform
::
avx512f
))
{
if
(
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx512f
))
{
block
=
ZMM_FLOAT_BLOCK
;
groups
.
push_back
(
30
);
}
else
{
...
...
paddle/fluid/operators/math/CMakeLists.txt
浏览文件 @
344b99e1
...
...
@@ -32,7 +32,6 @@ math_library(maxouting)
math_library
(
sequence_padding
)
math_library
(
sequence_pooling DEPS math_function jit_kernel_helper
)
math_library
(
sequence_scale
)
math_library
(
softmax DEPS math_function jit_kernel_helper
)
if
(
WITH_ASCEND_CL
)
math_library
(
beam_search DEPS math_function beam_search_npu
)
elseif
(
WITH_XPU
)
...
...
paddle/fluid/operators/math/cpu_vec.h
已删除
100644 → 0
浏览文件 @
4672ea8e
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cmath>
#include <functional>
#include <string>
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_MKLML
#include "paddle/fluid/platform/dynload/mklml.h"
#endif
namespace
paddle
{
namespace
operators
{
namespace
math
{
#define SIGMOID_THRESHOLD_MIN -40.0
#define SIGMOID_THRESHOLD_MAX 13.0
#define YMM_FLOAT_BLOCK 8
#define AVX_DOUBLE_BLOCK 4
#define YMM_FLOAT_BLOCK 8
#define AVX2_DOUBLE_BLOCK 4
#define ZMM_FLOAT_BLOCK 16
#define AVX512_DOUBLE_BLOCK 8
template
<
typename
T
>
inline
void
vec_exp
(
const
int
n
,
const
T
*
x
,
T
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
std
::
exp
(
x
[
i
]);
}
}
template
<
typename
T
>
inline
void
vec_scal
(
const
int
n
,
const
T
a
,
T
*
x
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
x
[
i
]
=
a
*
x
[
i
];
}
}
#ifdef PADDLE_WITH_MKLML
template
<
>
inline
void
vec_exp
<
float
>
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
constexpr
int
small_enough
=
128
;
if
(
n
<
small_enough
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
std
::
exp
(
x
[
i
]);
}
}
else
{
platform
::
dynload
::
vsExp
(
n
,
x
,
y
);
}
}
template
<
>
inline
void
vec_exp
<
double
>
(
const
int
n
,
const
double
*
x
,
double
*
y
)
{
platform
::
dynload
::
vdExp
(
n
,
x
,
y
);
}
template
<
>
inline
void
vec_scal
<
float
>
(
const
int
n
,
const
float
a
,
float
*
x
)
{
platform
::
dynload
::
cblas_sscal
(
n
,
a
,
x
,
1
);
}
template
<
>
inline
void
vec_scal
<
double
>
(
const
int
n
,
const
double
a
,
double
*
x
)
{
platform
::
dynload
::
cblas_dscal
(
n
,
a
,
x
,
1
);
}
#endif
// MKL scal only support inplace, choose this if src and dst are not equal
template
<
typename
T
,
platform
::
cpu_isa_t
isa
=
platform
::
isa_any
>
inline
void
vec_scal
(
const
int
n
,
const
T
a
,
const
T
*
x
,
T
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
a
*
x
[
i
];
}
}
template
<
>
inline
void
vec_scal
<
float
,
platform
::
avx
>
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
#ifdef __AVX__
constexpr
int
block
=
YMM_FLOAT_BLOCK
;
if
(
n
<
block
)
{
vec_scal
<
float
,
platform
::
isa_any
>
(
n
,
a
,
x
,
y
);
return
;
}
const
int
rest
=
n
%
block
;
const
int
end
=
n
-
rest
;
int
i
=
0
;
__m256
scalar
=
_mm256_set1_ps
(
a
);
__m256
tmp
;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_mul_ps(tmp, scalar); \
_mm256_storeu_ps(y + i, tmp)
for
(
i
=
0
;
i
<
end
;
i
+=
block
)
{
MOVE_ONE_STEP
;
}
#undef MOVE_ONE_STEP
if
(
rest
==
0
)
{
return
;
}
// can not continue move step if src and dst are inplace
for
(
i
=
n
-
rest
;
i
<
n
;
++
i
)
{
y
[
i
]
=
a
*
x
[
i
];
}
#else
vec_scal
<
float
,
platform
::
isa_any
>
(
n
,
a
,
x
,
y
);
#endif
}
template
<
>
inline
void
vec_scal
<
float
,
platform
::
avx2
>
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
vec_scal
<
float
,
platform
::
avx
>
(
n
,
a
,
x
,
y
);
}
template
<
>
inline
void
vec_scal
<
float
,
platform
::
avx512f
>
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
// TODO(TJ): enable me
vec_scal
<
float
,
platform
::
avx2
>
(
n
,
a
,
x
,
y
);
}
template
<
typename
T
,
platform
::
cpu_isa_t
isa
=
platform
::
isa_any
>
inline
void
vec_sum
(
const
size_t
n
,
const
T
*
x
,
T
*
s
)
{
s
[
0
]
=
x
[
0
];
for
(
size_t
i
=
1
;
i
<
n
;
++
i
)
{
s
[
0
]
+=
x
[
i
];
}
}
template
<
>
inline
void
vec_sum
<
float
,
platform
::
avx
>
(
const
size_t
n
,
const
float
*
x
,
float
*
s
)
{
#ifdef __AVX__
constexpr
unsigned
int
block
=
YMM_FLOAT_BLOCK
;
if
(
n
<
block
)
{
vec_sum
<
float
,
platform
::
isa_any
>
(
n
,
x
,
s
);
return
;
}
unsigned
int
i
,
end
;
i
=
end
=
0
;
s
[
0
]
=
0.
f
;
end
=
n
&
~
(
block
-
1
);
__m256
tmp
=
_mm256_setzero_ps
();
for
(
i
=
0
;
i
<
end
;
i
+=
block
)
{
tmp
=
_mm256_add_ps
(
tmp
,
_mm256_loadu_ps
(
x
+
i
));
}
__m256
hsum
=
_mm256_hadd_ps
(
tmp
,
tmp
);
hsum
=
_mm256_add_ps
(
hsum
,
_mm256_permute2f128_ps
(
hsum
,
hsum
,
0x1
));
_mm_store_ss
(
s
,
_mm_hadd_ps
(
_mm256_castps256_ps128
(
hsum
),
_mm256_castps256_ps128
(
hsum
)));
for
(;
i
<
n
;
i
++
)
{
s
[
0
]
+=
x
[
i
];
}
#else
vec_sum
<
float
,
platform
::
isa_any
>
(
n
,
x
,
s
);
#endif
}
template
<
typename
T
,
platform
::
cpu_isa_t
isa
=
platform
::
isa_any
>
inline
void
vec_mul
(
const
size_t
n
,
const
T
*
x
,
const
T
*
y
,
T
*
z
)
{
for
(
size_t
i
=
0
;
i
<
n
;
++
i
)
{
z
[
i
]
=
x
[
i
]
*
y
[
i
];
}
}
template
<
>
inline
void
vec_mul
<
float
,
platform
::
avx
>
(
const
size_t
n
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
#ifdef __AVX__
constexpr
unsigned
int
block
=
YMM_FLOAT_BLOCK
;
if
(
n
<
block
)
{
vec_mul
<
float
,
platform
::
isa_any
>
(
n
,
x
,
y
,
z
);
return
;
}
unsigned
int
i
=
0
,
end
=
0
;
end
=
n
&
~
(
block
-
1
);
for
(
i
=
0
;
i
<
end
;
i
+=
block
)
{
_mm256_storeu_ps
(
z
+
i
,
_mm256_mul_ps
(
_mm256_loadu_ps
(
x
+
i
),
_mm256_loadu_ps
(
y
+
i
)));
}
for
(;
i
<
n
;
i
++
)
{
z
[
i
]
=
x
[
i
]
*
y
[
i
];
}
#else
vec_mul
<
float
,
platform
::
isa_any
>
(
n
,
x
,
y
,
z
);
#endif
}
template
<
typename
T
,
platform
::
cpu_isa_t
isa
=
platform
::
isa_any
>
inline
void
vec_mul_reduce
(
const
size_t
n
,
const
T
*
x
,
const
T
*
y
,
T
*
z
)
{
z
[
0
]
=
x
[
0
]
*
y
[
0
];
for
(
size_t
i
=
1
;
i
<
n
;
++
i
)
{
z
[
0
]
+=
x
[
i
]
*
y
[
i
];
}
}
template
<
>
inline
void
vec_mul_reduce
<
float
,
platform
::
avx
>
(
const
size_t
n
,
const
float
*
x
,
const
float
*
y
,
float
*
z
)
{
#ifdef __AVX__
constexpr
unsigned
int
block
=
YMM_FLOAT_BLOCK
;
if
(
n
<
block
)
{
vec_mul_reduce
<
float
,
platform
::
isa_any
>
(
n
,
x
,
y
,
z
);
return
;
}
unsigned
int
i
=
0
,
end
=
0
;
z
[
0
]
=
0.
f
;
end
=
n
&
~
(
block
-
1
);
__m256
tmp
=
_mm256_setzero_ps
();
for
(
i
=
0
;
i
<
end
;
i
+=
block
)
{
tmp
=
_mm256_add_ps
(
tmp
,
_mm256_mul_ps
(
_mm256_loadu_ps
(
x
+
i
),
_mm256_loadu_ps
(
y
+
i
)));
}
__m256
hsum
=
_mm256_hadd_ps
(
tmp
,
tmp
);
hsum
=
_mm256_add_ps
(
hsum
,
_mm256_permute2f128_ps
(
hsum
,
hsum
,
0x1
));
_mm_store_ss
(
z
,
_mm_hadd_ps
(
_mm256_castps256_ps128
(
hsum
),
_mm256_castps256_ps128
(
hsum
)));
for
(;
i
<
n
;
i
++
)
{
z
[
0
]
+=
x
[
i
]
*
y
[
i
];
}
#else
vec_mul_reduce
<
float
,
platform
::
isa_any
>
(
n
,
x
,
y
,
z
);
#endif
}
template
<
typename
T
,
platform
::
cpu_isa_t
isa
=
platform
::
isa_any
>
inline
void
vec_bias_sub
(
const
int
n
,
const
T
a
,
const
T
*
x
,
T
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
a
-
x
[
i
];
}
}
template
<
>
inline
void
vec_bias_sub
<
float
,
platform
::
avx
>
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
#ifdef __AVX__
constexpr
int
block
=
YMM_FLOAT_BLOCK
;
if
(
n
<
block
)
{
vec_bias_sub
<
float
,
platform
::
isa_any
>
(
n
,
a
,
x
,
y
);
return
;
}
const
int
rest
=
n
%
block
;
const
int
end
=
n
-
rest
;
int
i
=
0
;
__m256
bias
=
_mm256_set1_ps
(
a
);
__m256
tmp
;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_sub_ps(bias, tmp); \
_mm256_storeu_ps(y + i, tmp)
for
(
i
=
0
;
i
<
end
;
i
+=
block
)
{
MOVE_ONE_STEP
;
}
#undef MOVE_ONE_STEP
if
(
rest
==
0
)
{
return
;
}
// can not continue move step if src and dst are inplace
for
(
i
=
n
-
rest
;
i
<
n
;
++
i
)
{
y
[
i
]
=
a
-
x
[
i
];
}
#else
vec_bias_sub
<
float
,
platform
::
isa_any
>
(
n
,
a
,
x
,
y
);
#endif
}
template
<
>
inline
void
vec_bias_sub
<
float
,
platform
::
avx2
>
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
vec_bias_sub
<
float
,
platform
::
avx
>
(
n
,
a
,
x
,
y
);
}
template
<
>
inline
void
vec_bias_sub
<
float
,
platform
::
avx512f
>
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
// TODO(TJ): enable me
vec_bias_sub
<
float
,
platform
::
avx2
>
(
n
,
a
,
x
,
y
);
}
// out = x*y + (1-x)*z
template
<
typename
T
,
platform
::
cpu_isa_t
isa
=
platform
::
isa_any
>
inline
void
vec_cross
(
const
int
n
,
const
T
*
x
,
const
T
*
y
,
const
T
*
z
,
T
*
out
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
out
[
i
]
=
x
[
i
]
*
y
[
i
]
+
(
static_cast
<
T
>
(
1
)
-
x
[
i
])
*
z
[
i
];
}
}
template
<
>
inline
void
vec_cross
<
float
,
platform
::
avx
>
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
const
float
*
z
,
float
*
out
)
{
#ifdef __AVX__
constexpr
int
block
=
YMM_FLOAT_BLOCK
;
if
(
n
<
block
)
{
vec_cross
<
float
,
platform
::
isa_any
>
(
n
,
x
,
y
,
z
,
out
);
return
;
}
const
int
rest
=
n
%
block
;
const
int
end
=
n
-
rest
;
int
i
=
0
;
__m256
bias
=
_mm256_set1_ps
(
1.
f
);
__m256
tmpx
,
tmpy
,
tmpz
;
for
(
i
=
0
;
i
<
end
;
i
+=
block
)
{
tmpx
=
_mm256_loadu_ps
(
x
+
i
);
tmpy
=
_mm256_loadu_ps
(
y
+
i
);
tmpz
=
_mm256_loadu_ps
(
z
+
i
);
tmpy
=
_mm256_mul_ps
(
tmpx
,
tmpy
);
tmpx
=
_mm256_sub_ps
(
bias
,
tmpx
);
tmpz
=
_mm256_mul_ps
(
tmpx
,
tmpz
);
tmpz
=
_mm256_add_ps
(
tmpy
,
tmpz
);
_mm256_storeu_ps
(
out
+
i
,
tmpz
);
}
if
(
rest
==
0
)
{
return
;
}
// can not continue move step if src and dst are inplace
for
(
i
=
n
-
rest
;
i
<
n
;
++
i
)
{
out
[
i
]
=
x
[
i
]
*
y
[
i
]
+
(
1.
f
-
x
[
i
])
*
z
[
i
];
}
#else
vec_cross
<
float
,
platform
::
isa_any
>
(
n
,
x
,
y
,
z
,
out
);
#endif
}
template
<
>
inline
void
vec_cross
<
float
,
platform
::
avx2
>
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
const
float
*
z
,
float
*
out
)
{
vec_cross
<
float
,
platform
::
avx
>
(
n
,
x
,
y
,
z
,
out
);
}
template
<
>
inline
void
vec_cross
<
float
,
platform
::
avx512f
>
(
const
int
n
,
const
float
*
x
,
const
float
*
y
,
const
float
*
z
,
float
*
out
)
{
// TODO(TJ): enable me
vec_cross
<
float
,
platform
::
avx
>
(
n
,
x
,
y
,
z
,
out
);
}
template
<
typename
T
,
platform
::
cpu_isa_t
isa
=
platform
::
isa_any
>
inline
void
vec_clip
(
const
size_t
n
,
const
T
a
,
const
T
*
x
,
T
*
y
)
{
for
(
size_t
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
x
[
i
]
<
a
?
a
:
x
[
i
];
}
}
template
<
>
inline
void
vec_clip
<
float
,
platform
::
avx
>
(
const
size_t
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
#ifdef __AVX__
constexpr
unsigned
int
block
=
YMM_FLOAT_BLOCK
;
if
(
n
<
block
)
{
vec_clip
<
float
,
platform
::
isa_any
>
(
n
,
a
,
x
,
y
);
return
;
}
unsigned
int
i
=
0
,
end
=
0
;
end
=
n
&
~
(
block
-
1
);
__m256
threshold
=
_mm256_set1_ps
(
a
);
for
(
i
=
0
;
i
<
end
;
i
+=
block
)
{
_mm256_storeu_ps
(
y
+
i
,
_mm256_max_ps
(
_mm256_loadu_ps
(
x
+
i
),
threshold
));
}
for
(;
i
<
n
;
i
++
)
{
y
[
i
]
=
x
[
i
]
<
a
?
a
:
x
[
i
];
}
#else
vec_clip
<
float
,
platform
::
isa_any
>
(
n
,
a
,
x
,
y
);
#endif
}
template
<
typename
T
,
platform
::
cpu_isa_t
isa
=
platform
::
isa_any
>
inline
void
vec_add_bias
(
const
int
n
,
const
T
a
,
const
T
*
x
,
T
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
x
[
i
]
+
a
;
}
}
template
<
>
inline
void
vec_add_bias
<
float
,
platform
::
avx
>
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
#ifdef __AVX__
constexpr
int
block
=
YMM_FLOAT_BLOCK
;
if
(
n
<
block
)
{
vec_add_bias
<
float
,
platform
::
isa_any
>
(
n
,
a
,
x
,
y
);
return
;
}
const
int
rest
=
n
%
block
;
const
int
end
=
n
-
rest
;
int
i
=
0
;
__m256
bias
=
_mm256_set1_ps
(
a
);
__m256
tmp
;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_add_ps(tmp, bias); \
_mm256_storeu_ps(y + i, tmp)
for
(
i
=
0
;
i
<
end
;
i
+=
block
)
{
MOVE_ONE_STEP
;
}
#undef MOVE_ONE_STEP
if
(
rest
==
0
)
{
return
;
}
// can not continue move step if src and dst are inplace
for
(
i
=
n
-
rest
;
i
<
n
;
++
i
)
{
y
[
i
]
=
x
[
i
]
+
a
;
}
#else
vec_add_bias
<
float
,
platform
::
isa_any
>
(
n
,
a
,
x
,
y
);
#endif
}
template
<
>
inline
void
vec_add_bias
<
float
,
platform
::
avx2
>
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
vec_add_bias
<
float
,
platform
::
avx
>
(
n
,
a
,
x
,
y
);
}
template
<
>
inline
void
vec_add_bias
<
float
,
platform
::
avx512f
>
(
const
int
n
,
const
float
a
,
const
float
*
x
,
float
*
y
)
{
// TODO(TJ): enable me
vec_add_bias
<
float
,
platform
::
avx2
>
(
n
,
a
,
x
,
y
);
}
template
<
typename
T
,
platform
::
cpu_isa_t
isa
=
platform
::
isa_any
>
inline
void
vec_identity
(
const
int
n
,
const
T
*
x
,
T
*
y
)
{
// do nothing
return
;
}
template
<
typename
T
,
platform
::
cpu_isa_t
isa
=
platform
::
isa_any
>
inline
void
vec_sigmoid
(
const
int
n
,
const
T
*
x
,
T
*
y
)
{
const
T
min
=
SIGMOID_THRESHOLD_MIN
;
const
T
max
=
SIGMOID_THRESHOLD_MAX
;
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
(
x
[
i
]
<
min
)
?
min
:
((
x
[
i
]
>
max
)
?
max
:
x
[
i
]);
y
[
i
]
=
static_cast
<
T
>
(
0
)
-
y
[
i
];
}
vec_exp
<
T
>
(
n
,
y
,
y
);
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
static_cast
<
T
>
(
1
)
/
(
static_cast
<
T
>
(
1
)
+
y
[
i
]);
}
}
template
<
>
inline
void
vec_sigmoid
<
float
,
platform
::
avx
>
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
#ifdef __AVX__
constexpr
int
block
=
YMM_FLOAT_BLOCK
;
if
(
n
<
block
)
{
vec_sigmoid
<
float
,
platform
::
isa_any
>
(
n
,
x
,
y
);
return
;
}
const
int
rest
=
n
%
block
;
const
int
end
=
n
-
rest
;
int
i
=
0
;
__m256
max
=
_mm256_set1_ps
(
SIGMOID_THRESHOLD_MAX
);
__m256
min
=
_mm256_set1_ps
(
SIGMOID_THRESHOLD_MIN
);
__m256
zeros
=
_mm256_setzero_ps
();
__m256
tmp
;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_max_ps(tmp, min); \
tmp = _mm256_min_ps(tmp, max); \
tmp = _mm256_sub_ps(zeros, tmp); \
_mm256_storeu_ps(y + i, tmp)
for
(
i
=
0
;
i
<
end
;
i
+=
block
)
{
MOVE_ONE_STEP
;
}
#undef MOVE_ONE_STEP
if
(
rest
!=
0
)
{
// can not continue move step since the src and dst address could be equal
const
float
xmin
=
SIGMOID_THRESHOLD_MIN
;
const
float
xmax
=
SIGMOID_THRESHOLD_MAX
;
for
(
i
=
n
-
rest
;
i
<
n
;
++
i
)
{
y
[
i
]
=
0.
f
-
((
x
[
i
]
<
xmin
)
?
xmin
:
((
x
[
i
]
>
xmax
)
?
xmax
:
x
[
i
]));
}
}
vec_exp
<
float
>
(
n
,
y
,
y
);
__m256
ones
=
_mm256_set1_ps
(
1.0
f
);
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(y + i); \
tmp = _mm256_add_ps(ones, tmp); \
tmp = _mm256_div_ps(ones, tmp); \
_mm256_storeu_ps(y + i, tmp)
for
(
i
=
0
;
i
<
end
;
i
+=
block
)
{
MOVE_ONE_STEP
;
}
#undef MOVE_ONE_STEP
if
(
rest
==
0
)
{
return
;
}
// can not continue move step
for
(
i
=
n
-
rest
;
i
<
n
;
++
i
)
{
y
[
i
]
=
1.
f
/
(
1.
f
+
y
[
i
]);
}
#else
vec_sigmoid
<
float
,
platform
::
isa_any
>
(
n
,
x
,
y
);
#endif
}
template
<
>
inline
void
vec_sigmoid
<
float
,
platform
::
avx2
>
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
vec_sigmoid
<
float
,
platform
::
avx
>
(
n
,
x
,
y
);
}
template
<
>
inline
void
vec_sigmoid
<
float
,
platform
::
avx512f
>
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
// TODO(TJ): enable me
vec_sigmoid
<
float
,
platform
::
avx2
>
(
n
,
x
,
y
);
}
template
<
typename
T
,
platform
::
cpu_isa_t
isa
=
platform
::
isa_any
>
inline
void
vec_tanh
(
const
int
n
,
const
T
*
x
,
T
*
y
)
{
vec_scal
<
T
,
isa
>
(
n
,
static_cast
<
T
>
(
2
),
x
,
y
);
vec_sigmoid
<
T
,
isa
>
(
n
,
y
,
y
);
vec_scal
<
T
>
(
n
,
static_cast
<
T
>
(
2
),
y
);
vec_add_bias
<
T
,
isa
>
(
n
,
static_cast
<
T
>
(
-
1
),
y
,
y
);
}
// TODO(TJ): make relu clip
template
<
typename
T
,
platform
::
cpu_isa_t
isa
=
platform
::
isa_any
>
inline
void
vec_relu
(
const
int
n
,
const
T
*
x
,
T
*
y
)
{
for
(
int
i
=
0
;
i
<
n
;
++
i
)
{
y
[
i
]
=
x
[
i
]
>
0
?
x
[
i
]
:
0
;
}
}
template
<
>
inline
void
vec_relu
<
float
,
platform
::
avx
>
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
#ifdef __AVX__
constexpr
int
block
=
YMM_FLOAT_BLOCK
;
if
(
n
<
block
*
4
)
{
vec_relu
<
float
,
platform
::
isa_any
>
(
n
,
x
,
y
);
return
;
}
const
int
rest
=
n
%
block
;
const
int
end
=
n
-
rest
;
int
i
=
0
;
__m256
zeros
=
_mm256_setzero_ps
();
__m256
tmp
;
#define MOVE_ONE_STEP \
tmp = _mm256_loadu_ps(x + i); \
tmp = _mm256_max_ps(tmp, zeros); \
_mm256_storeu_ps(y + i, tmp)
for
(
i
=
0
;
i
<
end
;
i
+=
block
)
{
MOVE_ONE_STEP
;
}
if
(
rest
==
0
)
{
return
;
}
i
=
n
-
block
;
MOVE_ONE_STEP
;
#undef MOVE_ONE_STEP
#else
vec_relu
<
float
,
platform
::
isa_any
>
(
n
,
x
,
y
);
#endif
}
template
<
>
inline
void
vec_relu
<
float
,
platform
::
avx2
>
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
vec_relu
<
float
,
platform
::
avx
>
(
n
,
x
,
y
);
}
template
<
>
inline
void
vec_relu
<
float
,
platform
::
avx512f
>
(
const
int
n
,
const
float
*
x
,
float
*
y
)
{
// TODO(TJ): enable me
vec_relu
<
float
,
platform
::
avx2
>
(
n
,
x
,
y
);
}
// TODO(TJ): optimize double of sigmoid, tanh and relu if necessary
template
<
typename
T
,
platform
::
cpu_isa_t
isa
=
platform
::
isa_any
>
class
VecActivations
{
public:
std
::
function
<
void
(
const
int
,
const
T
*
,
T
*
)
>
operator
()(
const
std
::
string
&
type
)
{
if
(
type
==
"sigmoid"
)
{
return
vec_sigmoid
<
T
,
isa
>
;
}
else
if
(
type
==
"relu"
)
{
return
vec_relu
<
T
,
isa
>
;
}
else
if
(
type
==
"tanh"
)
{
return
vec_tanh
<
T
,
isa
>
;
}
else
if
(
type
==
"identity"
||
type
==
""
)
{
return
vec_identity
<
T
,
isa
>
;
}
PADDLE_THROW
(
platform
::
errors
::
InvalidArgument
(
"Expected type should be one of sigmod, relu, tanh, identity. But got "
"not support type: %s."
,
type
));
}
};
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/sample_logits_op.cu
浏览文件 @
344b99e1
...
...
@@ -21,9 +21,9 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/sample_logits_op.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
namespace
paddle
{
namespace
operators
{
...
...
paddle/fluid/operators/sample_logits_op.h
浏览文件 @
344b99e1
...
...
@@ -21,8 +21,8 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/math/sample_prob.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
namespace
paddle
{
namespace
operators
{
...
...
paddle/fluid/operators/sequence_ops/sequence_softmax_cudnn_op.cu.cc
浏览文件 @
344b99e1
...
...
@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
namespace
paddle
{
namespace
operators
{
...
...
@@ -61,7 +61,7 @@ class SequenceSoftmaxCUDNNKernel : public framework::OpKernel<T> {
phi
::
make_ddim
({
1UL
,
end_pos
-
start_pos
});
x_i
.
Resize
(
dims_i
);
out_i
.
Resize
(
dims_i
);
math
::
SoftmaxCUDNNFunctor
<
T
,
phi
::
GPUContext
>
()(
phi
::
funcs
::
SoftmaxCUDNNFunctor
<
T
,
phi
::
GPUContext
>
()(
ctx
.
template
device_context
<
phi
::
GPUContext
>(),
&
x_i
,
&
out_i
);
}
}
...
...
@@ -95,7 +95,7 @@ class SequenceSoftmaxGradCUDNNKernel : public framework::OpKernel<T> {
out_i
.
Resize
(
dims_i
);
out_grad_i
.
Resize
(
dims_i
);
x_grad_i
.
Resize
(
dims_i
);
math
::
SoftmaxGradCUDNNFunctor
<
T
,
phi
::
GPUContext
>
()(
phi
::
funcs
::
SoftmaxGradCUDNNFunctor
<
T
,
phi
::
GPUContext
>
()(
ctx
.
template
device_context
<
phi
::
GPUContext
>(),
&
out_i
,
&
out_grad_i
,
...
...
paddle/fluid/operators/softmax_with_cross_entropy_op_npu.cc
浏览文件 @
344b99e1
...
...
@@ -16,10 +16,10 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/cross_entropy.h"
#include "paddle/phi/kernels/funcs/softmax.h"
namespace
paddle
{
namespace
operators
{
...
...
paddle/fluid/platform/CMakeLists.txt
浏览文件 @
344b99e1
...
...
@@ -56,18 +56,10 @@ cc_test(
SRCS enforce_test.cc
DEPS enforce
)
set
(
CPU_INFO_DEPS gflags glog enforce
)
if
(
WITH_XBYAK
)
list
(
APPEND CPU_INFO_DEPS xbyak
)
endif
()
cc_library
(
cpu_info
SRCS cpu_info.cc
DEPS
${
CPU_INFO_DEPS
}
)
cc_test
(
cpu_info_test
SRCS cpu_info_test.cc
DEPS
cpu_info
)
DEPS
phi_backends
)
cc_library
(
os_info
SRCS os_info.cc
...
...
@@ -194,7 +186,6 @@ cc_library(
phi_place
eigen3
cpu_helper
cpu_info
framework_proto
${
IPU_CTX_DEPS
}
${
GPU_CTX_DEPS
}
...
...
paddle/fluid/platform/cpu_info.h
已删除
100644 → 0
浏览文件 @
4672ea8e
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stddef.h>
#ifdef _WIN32
#if defined(__AVX2__)
#include <immintrin.h> // avx2
#elif defined(__AVX__)
#include <intrin.h> // avx
#endif // AVX
#else // WIN32
#ifdef __AVX__
#include <immintrin.h>
#endif
#endif // WIN32
#if defined(_WIN32)
#define ALIGN32_BEG __declspec(align(32))
#define ALIGN32_END
#else
#define ALIGN32_BEG
#define ALIGN32_END __attribute__((aligned(32)))
#endif // _WIN32
#ifndef PADDLE_WITH_XBYAK
#ifdef _WIN32
#define cpuid(reg, x) __cpuidex(reg, x, 0)
#else
#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) && \
!defined(PADDLE_WITH_SW) && !defined(PADDLE_WITH_MIPS)
#include <cpuid.h>
inline
void
cpuid
(
int
reg
[
4
],
int
x
)
{
__cpuid_count
(
x
,
0
,
reg
[
0
],
reg
[
1
],
reg
[
2
],
reg
[
3
]);
}
#endif
#endif
#endif
#include "paddle/phi/backends/cpu/cpu_info.h"
namespace
paddle
{
namespace
platform
{
size_t
CpuTotalPhysicalMemory
();
//! Get the maximum allocation size for a machine.
size_t
CpuMaxAllocSize
();
//! Get the maximum allocation size for a machine.
size_t
CUDAPinnedMaxAllocSize
();
using
phi
::
backends
::
cpu
::
CpuMinChunkSize
;
//! Get the maximum chunk size for buddy allocator.
size_t
CpuMaxChunkSize
();
//! Get the minimum chunk size for buddy allocator.
size_t
CUDAPinnedMinChunkSize
();
//! Get the maximum chunk size for buddy allocator.
size_t
CUDAPinnedMaxChunkSize
();
//! Get the maximum allocation size for a machine.
size_t
NPUPinnedMaxAllocSize
();
//! Get the minimum chunk size for buddy allocator.
size_t
NPUPinnedMinChunkSize
();
//! Get the maximum chunk size for buddy allocator.
size_t
NPUPinnedMaxChunkSize
();
using
namespace
phi
::
backends
::
cpu
;
// NOLINT
// May I use some instruction
bool
MayIUse
(
const
cpu_isa_t
cpu_isa
);
}
// namespace platform
}
// namespace paddle
paddle/fluid/platform/cpu_info_test.cc
浏览文件 @
344b99e1
...
...
@@ -11,7 +11,7 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/
fluid/platform
/cpu_info.h"
#include "paddle/
phi/backends/cpu
/cpu_info.h"
#include <sstream>
...
...
@@ -23,7 +23,8 @@ DECLARE_double(fraction_of_cpu_memory_to_use);
TEST
(
CpuMemoryUsage
,
Print
)
{
std
::
stringstream
ss
;
size_t
memory_size
=
paddle
::
platform
::
CpuMaxAllocSize
()
/
1024
/
1024
/
1024
;
size_t
memory_size
=
phi
::
backends
::
cpu
::
CpuMaxAllocSize
()
/
1024
/
1024
/
1024
;
float
use_percent
=
FLAGS_fraction_of_cpu_memory_to_use
*
100
;
std
::
cout
<<
paddle
::
string
::
Sprintf
(
"
\n
%.2f %% of CPU Memory Usage: %d GB
\n
"
,
...
...
paddle/fluid/platform/init.cc
浏览文件 @
344b99e1
...
...
@@ -16,9 +16,9 @@ limitations under the License. */
#include <string>
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/npu/npu_info.h"
#include "paddle/fluid/string/split.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include "paddle/fluid/platform/cuda_device_guard.h"
#endif
...
...
paddle/fluid/platform/profiler/CMakeLists.txt
浏览文件 @
344b99e1
...
...
@@ -29,7 +29,7 @@ cc_library(
cc_library
(
cpu_utilization
SRCS cpu_utilization.cc
DEPS
cpu_info
os_info enforce glog
)
DEPS
phi_backends
os_info enforce glog
)
cc_library
(
new_profiler
SRCS profiler.cc
...
...
paddle/fluid/pybind/parallel_executor.cc
浏览文件 @
344b99e1
...
...
@@ -72,7 +72,6 @@ limitations under the License. */
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
...
...
@@ -89,6 +88,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h"
...
...
paddle/fluid/pybind/place.cc
浏览文件 @
344b99e1
...
...
@@ -72,7 +72,6 @@ limitations under the License. */
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
...
...
@@ -89,6 +88,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h"
...
...
paddle/fluid/pybind/pybind.cc
浏览文件 @
344b99e1
...
...
@@ -75,7 +75,6 @@ limitations under the License. */
#include "paddle/fluid/operators/ops_extra_info.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
...
...
@@ -94,6 +93,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/io.h"
#include "paddle/fluid/pybind/jit.h"
#include "paddle/fluid/pybind/xpu_streams_py.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h"
...
...
@@ -327,7 +327,7 @@ bool SupportsBfloat16() {
#ifndef PADDLE_WITH_MKLDNN
return
false
;
#else
if
(
p
latform
::
MayIUse
(
platform
::
cpu_isa_t
::
avx512_core
))
if
(
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
cpu_isa_t
::
avx512_core
))
return
true
;
else
return
false
;
...
...
@@ -338,7 +338,7 @@ bool SupportsBfloat16FastPerformance() {
#ifndef PADDLE_WITH_MKLDNN
return
false
;
#else
if
(
p
latform
::
MayIUse
(
platform
::
cpu_isa_t
::
avx512_bf16
))
if
(
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
cpu_isa_t
::
avx512_bf16
))
return
true
;
else
return
false
;
...
...
@@ -349,8 +349,8 @@ bool SupportsInt8() {
#ifndef PADDLE_WITH_MKLDNN
return
false
;
#else
return
(
p
latform
::
MayIUse
(
platform
::
cpu_isa_t
::
avx2
)
||
p
latform
::
MayIUse
(
platform
::
cpu_isa_t
::
avx512f
));
return
(
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
cpu_isa_t
::
avx2
)
||
p
hi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
cpu_isa_t
::
avx512f
));
#endif
}
...
...
@@ -358,7 +358,8 @@ bool SupportsVNNI() {
#ifndef PADDLE_WITH_MKLDNN
return
false
;
#else
return
platform
::
MayIUse
(
platform
::
cpu_isa_t
::
avx512_core_vnni
);
return
phi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
cpu_isa_t
::
avx512_core_vnni
);
#endif
}
...
...
@@ -615,7 +616,7 @@ PYBIND11_MODULE(libpaddle, m) {
BindJit
(
&
m
);
// Not used, just make sure cpu_info.cc is linked.
p
addle
::
platform
::
CpuTotalPhysicalMemory
();
p
hi
::
backends
::
cpu
::
CpuTotalPhysicalMemory
();
paddle
::
memory
::
allocation
::
UseAllocatorStrategyGFlag
();
...
...
paddle/fluid/pybind/tensor.cc
浏览文件 @
344b99e1
...
...
@@ -72,7 +72,6 @@ limitations under the License. */
#include "paddle/fluid/operators/common_infer_shape_functions.h"
#include "paddle/fluid/operators/py_func_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/dynload/dynamic_loader.h"
...
...
@@ -89,6 +88,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/eager.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/io.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/core/compat/convert_utils.h"
#include "paddle/phi/core/lod_utils.h"
#include "paddle/utils/none.h"
...
...
paddle/phi/backends/CMakeLists.txt
浏览文件 @
344b99e1
add_subdirectory
(
dynload
)
add_subdirectory
(
gpu
)
set
(
BACKENDS_SRCS all_context.cc cpu/cpu_context.cc
)
set
(
BACKENDS_SRCS all_context.cc cpu/cpu_context.cc
cpu/cpu_info.cc
)
set
(
BACKENDS_DEPS enforce place flags eigen3 phi_device_context
)
if
(
WITH_XBYAK
)
list
(
APPEND BACKENDS_DEPS xbyak
)
endif
()
if
(
WITH_GPU OR WITH_ROCM
)
list
(
APPEND BACKENDS_SRCS gpu/gpu_context.cc gpu/gpu_info.cc
...
...
paddle/
fluid/platform
/cpu_info.cc
→
paddle/
phi/backends/cpu
/cpu_info.cc
浏览文件 @
344b99e1
/* Copyright (c) 20
16
PaddlePaddle Authors. All Rights Reserved.
/* Copyright (c) 20
22
PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
...
...
@@ -12,11 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/cpu_info.h"
#ifdef PADDLE_WITH_XBYAK
#include "xbyak/xbyak_util.h"
#endif
#include "paddle/phi/backends/cpu/cpu_info.h"
#ifdef __APPLE__
#include <sys/sysctl.h>
...
...
@@ -30,6 +26,10 @@ limitations under the License. */
#include <unistd.h>
#endif // _WIN32
#ifdef PADDLE_WITH_XBYAK
#include "xbyak/xbyak_util.h"
#endif
#include <algorithm>
#include "paddle/phi/core/flags.h"
...
...
@@ -47,8 +47,9 @@ PADDLE_DEFINE_EXPORTED_bool(use_pinned_memory,
true
,
"If set, allocate cpu pinned memory."
);
namespace
paddle
{
namespace
platform
{
namespace
phi
{
namespace
backends
{
namespace
cpu
{
size_t
CpuTotalPhysicalMemory
()
{
#ifdef __APPLE__
...
...
@@ -87,6 +88,11 @@ size_t CpuMaxChunkSize() {
static_cast
<
size_t
>
(
FLAGS_initial_cpu_memory_in_mb
*
1
<<
20
));
}
size_t
CpuMinChunkSize
()
{
// Allow to allocate the minimum chunk size is 4 KB.
return
1
<<
12
;
}
size_t
CUDAPinnedMaxAllocSize
()
{
// For distributed systems, it requires configuring and limiting
// the fraction of memory to use.
...
...
@@ -206,5 +212,6 @@ bool MayIUse(const cpu_isa_t cpu_isa) {
}
#endif
}
// namespace platform
}
// namespace paddle
}
// namespace cpu
}
// namespace backends
}
// namespace phi
paddle/phi/backends/cpu/cpu_info.h
浏览文件 @
344b99e1
...
...
@@ -36,15 +36,52 @@
#define ALIGN32_END __attribute__((aligned(32)))
#endif // _WIN32
#ifndef PADDLE_WITH_XBYAK
#ifdef _WIN32
#define cpuid(reg, x) __cpuidex(reg, x, 0)
#else
#if !defined(WITH_NV_JETSON) && !defined(PADDLE_WITH_ARM) && \
!defined(PADDLE_WITH_SW) && !defined(PADDLE_WITH_MIPS)
#include <cpuid.h>
inline
void
cpuid
(
int
reg
[
4
],
int
x
)
{
__cpuid_count
(
x
,
0
,
reg
[
0
],
reg
[
1
],
reg
[
2
],
reg
[
3
]);
}
#endif
#endif
#endif
namespace
phi
{
namespace
backends
{
namespace
cpu
{
size_t
CpuTotalPhysicalMemory
();
//! Get the maximum allocation size for a machine.
size_t
CpuMaxAllocSize
();
//! Get the maximum allocation size for a machine.
size_t
CUDAPinnedMaxAllocSize
();
//! Get the minimum chunk size for buddy allocator.
inline
size_t
CpuMinChunkSize
()
{
// Allow to allocate the minimum chunk size is 4 KB.
return
1
<<
12
;
}
size_t
CpuMinChunkSize
();
//! Get the maximum chunk size for buddy allocator.
size_t
CpuMaxChunkSize
();
//! Get the minimum chunk size for buddy allocator.
size_t
CUDAPinnedMinChunkSize
();
//! Get the maximum chunk size for buddy allocator.
size_t
CUDAPinnedMaxChunkSize
();
//! Get the maximum allocation size for a machine.
size_t
NPUPinnedMaxAllocSize
();
//! Get the minimum chunk size for buddy allocator.
size_t
NPUPinnedMinChunkSize
();
//! Get the maximum chunk size for buddy allocator.
size_t
NPUPinnedMaxChunkSize
();
typedef
enum
{
isa_any
,
...
...
@@ -59,6 +96,8 @@ typedef enum {
avx512_bf16
,
}
cpu_isa_t
;
// Instruction set architecture
// May I use some instruction
bool
MayIUse
(
const
cpu_isa_t
cpu_isa
);
}
// namespace cpu
}
// namespace backends
}
// namespace phi
paddle/phi/kernels/funcs/CMakeLists.txt
浏览文件 @
344b99e1
...
...
@@ -19,6 +19,7 @@ math_library(matrix_solve DEPS dense_tensor eigen3 blas math_function)
math_library
(
cross_entropy
)
math_library
(
im2col
)
math_library
(
vol2col
)
math_library
(
softmax DEPS math_function
)
cc_library
(
phi_data_layout_transform
...
...
paddle/phi/kernels/funcs/eigen/common.h
浏览文件 @
344b99e1
...
...
@@ -17,6 +17,7 @@ limitations under the License. */
#include <stdint.h>
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/eigen/extensions.h"
#include "unsupported/Eigen/CXX11/Tensor"
namespace
phi
{
...
...
paddle/
fluid/operators/math
/softmax.cc
→
paddle/
phi/kernels/funcs
/softmax.cc
浏览文件 @
344b99e1
...
...
@@ -12,20 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/
fluid/operators/math
/softmax.h"
#include "paddle/
phi/kernels/funcs
/softmax.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/kernels/funcs/softmax_impl.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
namespace
phi
{
namespace
funcs
{
template
class
SoftmaxFunctor
<
phi
::
CPUContext
,
float
>;
template
class
SoftmaxFunctor
<
phi
::
CPUContext
,
double
>;
template
class
SoftmaxGradFunctor
<
phi
::
CPUContext
,
float
>;
template
class
SoftmaxGradFunctor
<
phi
::
CPUContext
,
double
>;
}
// namespace math
}
// namespace operators
}
// namespace paddle
}
// namespace funcs
}
// namespace phi
paddle/
fluid/operators/math
/softmax.cu
→
paddle/
phi/kernels/funcs
/softmax.cu
浏览文件 @
344b99e1
...
...
@@ -13,20 +13,19 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/fluid/platform/device/gpu/gpu_dnn.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/funcs/softmax_impl.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
namespace
phi
{
namespace
funcs
{
using
ScopedTensorDescriptor
=
p
latform
::
ScopedTensorDescriptor
;
using
DataLayout
=
p
latform
::
DataLayout
;
using
ScopedTensorDescriptor
=
p
hi
::
backends
::
gpu
::
ScopedTensorDescriptor
;
using
DataLayout
=
p
hi
::
backends
::
gpu
::
DataLayout
;
template
<
typename
T
>
using
CudnnDataType
=
p
latform
::
CudnnDataType
<
T
>
;
using
CudnnDataType
=
p
hi
::
backends
::
gpu
::
CudnnDataType
<
T
>
;
template
<
typename
T
,
typename
DeviceContext
>
void
SoftmaxCUDNNFunctor
<
T
,
DeviceContext
>::
operator
()(
...
...
@@ -51,31 +50,31 @@ void SoftmaxCUDNNFunctor<T, DeviceContext>::operator()(
xDesc
.
descriptor
<
T
>
(
layout
,
cudnn_tensor_dims
);
miopenTensorDescriptor_t
cudnn_y_desc
=
xDesc
.
descriptor
<
T
>
(
layout
,
cudnn_tensor_dims
);
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
miopenSoftmaxForward_V2
(
context
.
cudnn_handle
(),
CudnnDataType
<
T
>::
kOne
(),
cudnn_x_desc
,
X
->
data
<
T
>
(),
CudnnDataType
<
T
>::
kZero
(),
cudnn_y_desc
,
Y
->
mutable_data
<
T
>
(
context
.
GetPlace
()
),
MIOPEN_SOFTMAX_ACCURATE
,
MIOPEN_SOFTMAX_MODE_INSTANCE
));
PADDLE_ENFORCE_GPU_SUCCESS
(
phi
::
dynload
::
miopenSoftmaxForward_V2
(
context
.
cudnn_handle
(),
CudnnDataType
<
T
>::
kOne
(),
cudnn_x_desc
,
X
->
data
<
T
>
(),
CudnnDataType
<
T
>::
kZero
(),
cudnn_y_desc
,
context
.
template
Alloc
<
T
>(
Y
),
MIOPEN_SOFTMAX_ACCURATE
,
MIOPEN_SOFTMAX_MODE_INSTANCE
));
#else
cudnnTensorDescriptor_t
cudnn_x_desc
=
xDesc
.
descriptor
<
T
>
(
layout
,
cudnn_tensor_dims
);
cudnnTensorDescriptor_t
cudnn_y_desc
=
xDesc
.
descriptor
<
T
>
(
layout
,
cudnn_tensor_dims
);
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnSoftmaxForward
(
context
.
cudnn_handle
(),
CUDNN_SOFTMAX_ACCURATE
,
CUDNN_SOFTMAX_MODE_INSTANCE
,
CudnnDataType
<
T
>::
kOne
(),
cudnn_x_desc
,
X
->
data
<
T
>
(),
CudnnDataType
<
T
>::
kZero
(),
cudnn_y_desc
,
Y
->
mutable_data
<
T
>
(
context
.
GetPlace
()
)));
PADDLE_ENFORCE_GPU_SUCCESS
(
phi
::
dynload
::
cudnnSoftmaxForward
(
context
.
cudnn_handle
(),
CUDNN_SOFTMAX_ACCURATE
,
CUDNN_SOFTMAX_MODE_INSTANCE
,
CudnnDataType
<
T
>::
kOne
(),
cudnn_x_desc
,
X
->
data
<
T
>
(),
CudnnDataType
<
T
>::
kZero
(),
cudnn_y_desc
,
context
.
template
Alloc
<
T
>(
Y
)));
#endif
}
...
...
@@ -106,18 +105,18 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
dxDesc
.
descriptor
<
T
>
(
layout
,
cudnn_tensor_dims
);
miopenTensorDescriptor_t
cudnn_ygrad_desc
=
dyDesc
.
descriptor
<
T
>
(
layout
,
cudnn_tensor_dims
);
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
miopenSoftmaxBackward_V2
(
context
.
cudnn_handle
(),
CudnnDataType
<
T
>::
kOne
(),
cudnn_y_desc
,
Y
->
data
<
T
>
(),
cudnn_ygrad_desc
,
YGrad
->
data
<
T
>
(),
CudnnDataType
<
T
>::
kZero
(),
cudnn_xgrad_desc
,
XGrad
->
mutable_data
<
T
>
(
context
.
GetPlace
()
),
MIOPEN_SOFTMAX_ACCURATE
,
MIOPEN_SOFTMAX_MODE_INSTANCE
));
PADDLE_ENFORCE_GPU_SUCCESS
(
phi
::
dynload
::
miopenSoftmaxBackward_V2
(
context
.
cudnn_handle
(),
CudnnDataType
<
T
>::
kOne
(),
cudnn_y_desc
,
Y
->
data
<
T
>
(),
cudnn_ygrad_desc
,
YGrad
->
data
<
T
>
(),
CudnnDataType
<
T
>::
kZero
(),
cudnn_xgrad_desc
,
context
.
template
Alloc
<
T
>(
XGrad
),
MIOPEN_SOFTMAX_ACCURATE
,
MIOPEN_SOFTMAX_MODE_INSTANCE
));
#else
cudnnTensorDescriptor_t
cudnn_y_desc
=
yDesc
.
descriptor
<
T
>
(
layout
,
cudnn_tensor_dims
);
...
...
@@ -125,28 +124,28 @@ void SoftmaxGradCUDNNFunctor<T, DeviceContext>::operator()(
dxDesc
.
descriptor
<
T
>
(
layout
,
cudnn_tensor_dims
);
cudnnTensorDescriptor_t
cudnn_ygrad_desc
=
dyDesc
.
descriptor
<
T
>
(
layout
,
cudnn_tensor_dims
);
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
cudnnSoftmaxBackward
(
context
.
cudnn_handle
(),
CUDNN_SOFTMAX_ACCURATE
,
CUDNN_SOFTMAX_MODE_INSTANCE
,
CudnnDataType
<
T
>::
kOne
(),
cudnn_y_desc
,
Y
->
data
<
T
>
(),
cudnn_ygrad_desc
,
YGrad
->
data
<
T
>
(),
CudnnDataType
<
T
>::
kZero
(),
cudnn_xgrad_desc
,
XGrad
->
mutable_data
<
T
>
(
context
.
GetPlace
()
)));
PADDLE_ENFORCE_GPU_SUCCESS
(
phi
::
dynload
::
cudnnSoftmaxBackward
(
context
.
cudnn_handle
(),
CUDNN_SOFTMAX_ACCURATE
,
CUDNN_SOFTMAX_MODE_INSTANCE
,
CudnnDataType
<
T
>::
kOne
(),
cudnn_y_desc
,
Y
->
data
<
T
>
(),
cudnn_ygrad_desc
,
YGrad
->
data
<
T
>
(),
CudnnDataType
<
T
>::
kZero
(),
cudnn_xgrad_desc
,
context
.
template
Alloc
<
T
>(
XGrad
)));
#endif
}
template
class
SoftmaxCUDNNFunctor
<
float
,
phi
::
GPUContext
>;
template
class
SoftmaxCUDNNFunctor
<
p
latform
::
float16
,
phi
::
GPUContext
>;
template
class
SoftmaxCUDNNFunctor
<
p
hi
::
dtype
::
float16
,
phi
::
GPUContext
>;
template
class
SoftmaxGradCUDNNFunctor
<
float
,
phi
::
GPUContext
>;
template
class
SoftmaxGradCUDNNFunctor
<
p
latform
::
float16
,
phi
::
GPUContext
>;
template
class
SoftmaxGradCUDNNFunctor
<
p
hi
::
dtype
::
float16
,
phi
::
GPUContext
>;
#if CUDNN_VERSION_MIN(8, 1, 0)
template
class
SoftmaxCUDNNFunctor
<
p
latform
::
bfloat16
,
phi
::
GPUContext
>;
template
class
SoftmaxGradCUDNNFunctor
<
p
latform
::
bfloat16
,
phi
::
GPUContext
>;
template
class
SoftmaxCUDNNFunctor
<
p
hi
::
dtype
::
bfloat16
,
phi
::
GPUContext
>;
template
class
SoftmaxGradCUDNNFunctor
<
p
hi
::
dtype
::
bfloat16
,
phi
::
GPUContext
>;
#endif
// MIOPEN do not support double
...
...
@@ -155,15 +154,14 @@ template class SoftmaxCUDNNFunctor<double, phi::GPUContext>;
template
class
SoftmaxGradCUDNNFunctor
<
double
,
phi
::
GPUContext
>;
#endif
template
class
SoftmaxFunctor
<
phi
::
GPUContext
,
p
latform
::
float16
>;
template
class
SoftmaxFunctor
<
phi
::
GPUContext
,
p
latform
::
bfloat16
>;
template
class
SoftmaxFunctor
<
phi
::
GPUContext
,
p
hi
::
dtype
::
float16
>;
template
class
SoftmaxFunctor
<
phi
::
GPUContext
,
p
hi
::
dtype
::
bfloat16
>;
template
class
SoftmaxFunctor
<
phi
::
GPUContext
,
float
>;
template
class
SoftmaxFunctor
<
phi
::
GPUContext
,
double
>;
template
class
SoftmaxGradFunctor
<
phi
::
GPUContext
,
float
>;
template
class
SoftmaxGradFunctor
<
phi
::
GPUContext
,
double
>;
template
class
SoftmaxGradFunctor
<
phi
::
GPUContext
,
p
latform
::
float16
>;
template
class
SoftmaxGradFunctor
<
phi
::
GPUContext
,
p
latform
::
bfloat16
>;
template
class
SoftmaxGradFunctor
<
phi
::
GPUContext
,
p
hi
::
dtype
::
float16
>;
template
class
SoftmaxGradFunctor
<
phi
::
GPUContext
,
p
hi
::
dtype
::
bfloat16
>;
}
// namespace math
}
// namespace operators
}
// namespace paddle
}
// namespace funcs
}
// namespace phi
paddle/
fluid/operators/math
/softmax.h
→
paddle/
phi/kernels/funcs
/softmax.h
浏览文件 @
344b99e1
...
...
@@ -13,11 +13,10 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/
fluid/framework/
tensor.h"
#include "paddle/
phi/core/dense_
tensor.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
namespace
phi
{
namespace
funcs
{
template
<
typename
DeviceContext
,
typename
T
,
typename
Enable
=
void
>
class
SoftmaxFunctor
{
...
...
@@ -58,6 +57,5 @@ class SoftmaxGradCUDNNFunctor {
#endif
}
// namespace math
}
// namespace operators
}
// namespace paddle
}
// namespace funcs
}
// namespace phi
paddle/
fluid/operators/math
/softmax_impl.h
→
paddle/
phi/kernels/funcs
/softmax_impl.h
浏览文件 @
344b99e1
...
...
@@ -15,24 +15,22 @@ limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/operators/jit/kernels.h"
#include "paddle/fluid/operators/math/cpu_vec.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/cpu_info.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/phi/backends/cpu/cpu_context.h"
#include "paddle/phi/backends/cpu/cpu_info.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/cpu_vec.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
namespace
paddle
{
namespace
operators
{
namespace
math
{
namespace
phi
{
namespace
funcs
{
template
<
typename
T
,
int
MajorType
=
Eigen
::
RowMajor
,
typename
IndexType
=
Eigen
::
DenseIndex
>
using
EigenMatrix
=
framework
::
EigenMatrix
<
T
,
MajorType
,
IndexType
>
;
using
EigenMatrix
=
phi
::
EigenMatrix
<
T
,
MajorType
,
IndexType
>
;
template
<
typename
T
>
struct
ValueClip
{
...
...
@@ -104,7 +102,7 @@ class SoftmaxEigen {
};
template
<
typename
DeviceContext
>
class
SoftmaxEigen
<
DeviceContext
,
p
latform
::
float16
>
{
class
SoftmaxEigen
<
DeviceContext
,
p
hi
::
dtype
::
float16
>
{
public:
void
operator
()(
const
DeviceContext
&
context
,
const
int
axis_dim
,
...
...
@@ -114,8 +112,8 @@ class SoftmaxEigen<DeviceContext, platform::float16> {
constexpr
int
kClassDim
=
1
;
constexpr
int
kAxisDim
=
1
;
auto
logits
=
EigenMatrix
<
p
latform
::
float16
>::
From
(
*
X
);
auto
softmax
=
EigenMatrix
<
p
latform
::
float16
>::
From
(
*
Y
);
auto
logits
=
EigenMatrix
<
p
hi
::
dtype
::
float16
>::
From
(
*
X
);
auto
softmax
=
EigenMatrix
<
p
hi
::
dtype
::
float16
>::
From
(
*
Y
);
const
int
batch_size
=
logits
.
dimension
(
kBatchDim
);
const
int
num_classes
=
logits
.
dimension
(
kClassDim
);
...
...
@@ -139,7 +137,7 @@ class SoftmaxEigen<DeviceContext, platform::float16> {
(
logits
-
logits
.
maximum
(
along_axis
)
.
reshape
(
batch_by_one
)
.
broadcast
(
one_by_class
))
.
unaryExpr
(
ValueClip
<
p
latform
::
float16
>
());
.
unaryExpr
(
ValueClip
<
p
hi
::
dtype
::
float16
>
());
}
else
{
// axis != -1, class dimension split into (axis, remain), max and sum
// should be calculated along axis dimension
...
...
@@ -149,7 +147,7 @@ class SoftmaxEigen<DeviceContext, platform::float16> {
.
reshape
(
batch_one_remain
)
.
broadcast
(
one_axis_one
)
.
reshape
(
batch_classes
))
.
unaryExpr
(
ValueClip
<
p
latform
::
float16
>
());
.
unaryExpr
(
ValueClip
<
p
hi
::
dtype
::
float16
>
());
}
softmax
.
device
(
*
context
.
eigen_device
())
=
softmax
.
exp
();
...
...
@@ -162,7 +160,7 @@ class SoftmaxEigen<DeviceContext, platform::float16> {
};
template
<
typename
DeviceContext
>
class
SoftmaxEigen
<
DeviceContext
,
p
latform
::
bfloat16
>
{
class
SoftmaxEigen
<
DeviceContext
,
p
hi
::
dtype
::
bfloat16
>
{
public:
void
operator
()(
const
DeviceContext
&
context
,
const
int
axis_dim
,
...
...
@@ -172,8 +170,8 @@ class SoftmaxEigen<DeviceContext, platform::bfloat16> {
constexpr
int
kClassDim
=
1
;
constexpr
int
kAxisDim
=
1
;
auto
logits
=
EigenMatrix
<
p
latform
::
bfloat16
>::
From
(
*
X
);
auto
softmax
=
EigenMatrix
<
p
latform
::
bfloat16
>::
From
(
*
Y
);
auto
logits
=
EigenMatrix
<
p
hi
::
dtype
::
bfloat16
>::
From
(
*
X
);
auto
softmax
=
EigenMatrix
<
p
hi
::
dtype
::
bfloat16
>::
From
(
*
Y
);
const
int
batch_size
=
logits
.
dimension
(
kBatchDim
);
const
int
num_classes
=
logits
.
dimension
(
kClassDim
);
...
...
@@ -197,7 +195,7 @@ class SoftmaxEigen<DeviceContext, platform::bfloat16> {
(
logits
-
logits
.
maximum
(
along_axis
)
.
reshape
(
batch_by_one
)
.
broadcast
(
one_by_class
))
.
unaryExpr
(
ValueClip
<
p
latform
::
bfloat16
>
());
.
unaryExpr
(
ValueClip
<
p
hi
::
dtype
::
bfloat16
>
());
}
else
{
// axis != -1, class dimension split into (axis, remain), max and sum
// should be calculated along axis dimension
...
...
@@ -207,7 +205,7 @@ class SoftmaxEigen<DeviceContext, platform::bfloat16> {
.
reshape
(
batch_one_remain
)
.
broadcast
(
one_axis_one
)
.
reshape
(
batch_classes
))
.
unaryExpr
(
ValueClip
<
p
latform
::
bfloat16
>
());
.
unaryExpr
(
ValueClip
<
p
hi
::
dtype
::
bfloat16
>
());
}
softmax
.
device
(
*
context
.
eigen_device
())
=
softmax
.
exp
();
...
...
@@ -247,21 +245,24 @@ class SoftmaxFunctor<DeviceContext, T, enable_if_CPU<DeviceContext>> {
const
int
batch_size
=
in_dims
[
kBatchDim
];
const
int
num_remain
=
num_classes
/
axis_dim
;
if
(
num_remain
==
1
&&
platform
::
MayIUse
(
platform
::
avx
))
{
if
(
num_remain
==
1
&&
phi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
))
{
const
T
*
in_data
=
X
->
data
<
T
>
();
T
*
out_data
=
Y
->
data
<
T
>
();
for
(
int
bs
=
0
;
bs
<
batch_size
;
++
bs
)
{
T
max_val
=
*
std
::
max_element
(
in_data
,
in_data
+
num_classes
);
max_val
*=
static_cast
<
T
>
(
-
1
);
vec_add_bias
<
T
,
platform
::
avx
>
(
num_classes
,
max_val
,
in_data
,
out_data
);
vec_clip
<
T
,
platform
::
avx
>
(
vec_add_bias
<
T
,
phi
::
backends
::
cpu
::
avx
>
(
num_classes
,
max_val
,
in_data
,
out_data
);
vec_clip
<
T
,
phi
::
backends
::
cpu
::
avx
>
(
num_classes
,
static_cast
<
T
>
(
-
64
),
out_data
,
out_data
);
vec_exp
<
T
>
(
num_classes
,
out_data
,
out_data
);
T
sum
=
0
;
vec_sum
<
T
,
p
latform
::
avx
>
(
num_classes
,
out_data
,
&
sum
);
vec_sum
<
T
,
p
hi
::
backends
::
cpu
::
avx
>
(
num_classes
,
out_data
,
&
sum
);
sum
=
static_cast
<
T
>
(
1
)
/
sum
;
vec_scal
<
T
,
platform
::
avx
>
(
num_classes
,
sum
,
out_data
,
out_data
);
vec_scal
<
T
,
phi
::
backends
::
cpu
::
avx
>
(
num_classes
,
sum
,
out_data
,
out_data
);
in_data
+=
num_classes
;
out_data
+=
num_classes
;
...
...
@@ -308,16 +309,16 @@ class SoftmaxGradEigen {
};
template
<
typename
DeviceContext
>
class
SoftmaxGradEigen
<
DeviceContext
,
p
latform
::
float16
>
{
class
SoftmaxGradEigen
<
DeviceContext
,
p
hi
::
dtype
::
float16
>
{
public:
void
operator
()(
const
DeviceContext
&
context
,
const
int
axis_dim
,
const
phi
::
DenseTensor
*
y
,
const
phi
::
DenseTensor
*
y_grad
,
phi
::
DenseTensor
*
x_grad
)
{
auto
softmax
=
EigenMatrix
<
p
latform
::
float16
>::
From
(
*
y
);
auto
softmax_grad
=
EigenMatrix
<
p
latform
::
float16
>::
From
(
*
y_grad
);
auto
logits_grad
=
EigenMatrix
<
p
latform
::
float16
>::
From
(
*
x_grad
);
auto
softmax
=
EigenMatrix
<
p
hi
::
dtype
::
float16
>::
From
(
*
y
);
auto
softmax_grad
=
EigenMatrix
<
p
hi
::
dtype
::
float16
>::
From
(
*
y_grad
);
auto
logits_grad
=
EigenMatrix
<
p
hi
::
dtype
::
float16
>::
From
(
*
x_grad
);
constexpr
int
kBatchDim
=
0
;
constexpr
int
kClassDim
=
1
;
...
...
@@ -342,16 +343,16 @@ class SoftmaxGradEigen<DeviceContext, platform::float16> {
};
template
<
typename
DeviceContext
>
class
SoftmaxGradEigen
<
DeviceContext
,
p
latform
::
bfloat16
>
{
class
SoftmaxGradEigen
<
DeviceContext
,
p
hi
::
dtype
::
bfloat16
>
{
public:
void
operator
()(
const
DeviceContext
&
context
,
const
int
axis_dim
,
const
phi
::
DenseTensor
*
y
,
const
phi
::
DenseTensor
*
y_grad
,
phi
::
DenseTensor
*
x_grad
)
{
auto
softmax
=
EigenMatrix
<
p
latform
::
bfloat16
>::
From
(
*
y
);
auto
softmax_grad
=
EigenMatrix
<
p
latform
::
bfloat16
>::
From
(
*
y_grad
);
auto
logits_grad
=
EigenMatrix
<
p
latform
::
bfloat16
>::
From
(
*
x_grad
);
auto
softmax
=
EigenMatrix
<
p
hi
::
dtype
::
bfloat16
>::
From
(
*
y
);
auto
softmax_grad
=
EigenMatrix
<
p
hi
::
dtype
::
bfloat16
>::
From
(
*
y_grad
);
auto
logits_grad
=
EigenMatrix
<
p
hi
::
dtype
::
bfloat16
>::
From
(
*
x_grad
);
constexpr
int
kBatchDim
=
0
;
constexpr
int
kClassDim
=
1
;
...
...
@@ -400,17 +401,20 @@ class SoftmaxGradFunctor<DeviceContext, T, enable_if_CPU<DeviceContext>> {
const
int
batch_size
=
out_dims
[
kBatchDim
];
const
int
num_remain
=
num_classes
/
axis_dim
;
if
(
num_remain
==
1
&&
platform
::
MayIUse
(
platform
::
avx
))
{
if
(
num_remain
==
1
&&
phi
::
backends
::
cpu
::
MayIUse
(
phi
::
backends
::
cpu
::
avx
))
{
const
T
*
out_data
=
y
->
data
<
T
>
();
const
T
*
out_grad
=
y_grad
->
data
<
T
>
();
T
*
in_grad
=
x_grad
->
data
<
T
>
();
for
(
int
bs
=
0
;
bs
<
batch_size
;
++
bs
)
{
T
scalar
;
vec_mul_reduce
<
T
,
p
latform
::
avx
>
(
vec_mul_reduce
<
T
,
p
hi
::
backends
::
cpu
::
avx
>
(
num_classes
,
out_grad
,
out_data
,
&
scalar
);
scalar
*=
static_cast
<
T
>
(
-
1
);
vec_add_bias
<
T
,
platform
::
avx
>
(
num_classes
,
scalar
,
out_grad
,
in_grad
);
vec_mul
<
T
,
platform
::
avx
>
(
num_classes
,
out_data
,
in_grad
,
in_grad
);
vec_add_bias
<
T
,
phi
::
backends
::
cpu
::
avx
>
(
num_classes
,
scalar
,
out_grad
,
in_grad
);
vec_mul
<
T
,
phi
::
backends
::
cpu
::
avx
>
(
num_classes
,
out_data
,
in_grad
,
in_grad
);
out_data
+=
num_classes
;
out_grad
+=
num_classes
;
in_grad
+=
num_classes
;
...
...
@@ -422,6 +426,5 @@ class SoftmaxGradFunctor<DeviceContext, T, enable_if_CPU<DeviceContext>> {
}
};
}
// namespace math
}
// namespace operators
}
// namespace paddle
}
// namespace funcs
}
// namespace phi
paddle/phi/kernels/gpu/cross_entropy_grad_kernel.cu
浏览文件 @
344b99e1
...
...
@@ -22,7 +22,6 @@ limitations under the License. */
namespace
cub
=
hipcub
;
#endif
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/common/amp_type_traits.h"
...
...
@@ -32,6 +31,7 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
namespace
phi
{
...
...
paddle/phi/kernels/gpu/cross_entropy_kernel.cu
浏览文件 @
344b99e1
...
...
@@ -22,7 +22,6 @@ limitations under the License. */
namespace
cub
=
hipcub
;
#endif
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/backends/gpu/gpu_device_function.h"
#include "paddle/phi/backends/gpu/gpu_dnn.h"
#include "paddle/phi/common/amp_type_traits.h"
...
...
@@ -33,6 +32,7 @@ namespace cub = hipcub;
#include "paddle/phi/kernels/funcs/cross_entropy.h"
#include "paddle/phi/kernels/funcs/for_range.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/gpudnn/softmax_gpudnn.h"
namespace
phi
{
...
...
@@ -1386,7 +1386,7 @@ void CrossEntropyWithSoftmaxCUDAKernel(const GPUContext& dev_ctx,
labels_2d
.
Resize
({
n
,
label
.
numel
()
/
n
});
DenseTensor
loss_2d
(
*
loss
);
loss_2d
.
Resize
({
n
,
1
});
p
addle
::
operators
::
math
::
SoftmaxCUDNNFunctor
<
T
,
GPUContext
>
()(
p
hi
::
funcs
::
SoftmaxCUDNNFunctor
<
T
,
GPUContext
>
()(
dev_ctx
,
&
logits_2d
,
&
softmax_2d
);
phi
::
funcs
::
CrossEntropyFunctor
<
GPUContext
,
T
>
()(
dev_ctx
,
&
loss_2d
,
...
...
paddle/phi/kernels/impl/gumbel_softmax_grad_kernel_impl.h
浏览文件 @
344b99e1
...
...
@@ -14,11 +14,11 @@
#pragma once
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/funcs/softmax_impl.h"
namespace
phi
{
...
...
@@ -50,7 +50,7 @@ void GumbelSoftmaxGradKernel(const Context& ctx,
dx_2d
.
Resize
({
size_to_axis
,
size_from_axis
});
out_2d
.
Resize
({
size_to_axis
,
size_from_axis
});
dout_2d
.
Resize
({
size_to_axis
,
size_from_axis
});
p
addle
::
operators
::
math
::
SoftmaxGradFunctor
<
Context
,
T
>
()(
p
hi
::
funcs
::
SoftmaxGradFunctor
<
Context
,
T
>
()(
ctx
,
axis_dim
,
&
out_2d
,
&
dout_2d
,
&
dx_2d
);
}
...
...
paddle/phi/kernels/impl/gumbel_softmax_kernel_impl.h
浏览文件 @
344b99e1
...
...
@@ -16,12 +16,12 @@
#include <random>
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/funcs/softmax_impl.h"
namespace
phi
{
...
...
@@ -87,8 +87,7 @@ void GumbelSoftmaxKernelHelper(const Context& ctx,
size_to_axis
,
size_from_axis
,
temperature
);
paddle
::
operators
::
math
::
SoftmaxFunctor
<
Context
,
T
>
()(
ctx
,
axis_dim
,
&
x_noise_2d
,
&
out_2d
);
phi
::
funcs
::
SoftmaxFunctor
<
Context
,
T
>
()(
ctx
,
axis_dim
,
&
x_noise_2d
,
&
out_2d
);
if
(
hard
)
{
OneHotGenerator
<
Context
,
T
>::
Transform
(
ctx
,
x
,
out
,
axis
);
...
...
paddle/phi/kernels/impl/softmax_grad_kernel_impl.h
浏览文件 @
344b99e1
...
...
@@ -14,9 +14,9 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/softmax_grad_kernel.h"
namespace
phi
{
...
...
@@ -50,7 +50,7 @@ void SoftmaxGradKernel(const Context& dev_ctx,
Out_2d
.
ShareDataWith
(
out
).
Resize
({
n
,
d
});
dOut_2d
.
ShareDataWith
(
out_grad
).
Resize
({
n
,
d
});
p
addle
::
operators
::
math
::
SoftmaxGradFunctor
<
Context
,
T
>
()(
p
hi
::
funcs
::
SoftmaxGradFunctor
<
Context
,
T
>
()(
dev_ctx
,
axis_dim
,
&
Out_2d
,
&
dOut_2d
,
&
dX_2d
);
}
...
...
paddle/phi/kernels/impl/softmax_kernel_impl.h
浏览文件 @
344b99e1
...
...
@@ -14,9 +14,9 @@ limitations under the License. */
#pragma once
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#include "paddle/phi/kernels/funcs/softmax.h"
#include "paddle/phi/kernels/softmax_kernel.h"
namespace
phi
{
...
...
@@ -47,8 +47,7 @@ void SoftmaxKernel(const Context& dev_ctx,
DenseTensor
X_2d
,
Out_2d
;
X_2d
.
ShareDataWith
(
x
).
Resize
({
n
,
d
});
Out_2d
.
ShareDataWith
(
*
out
).
Resize
({
n
,
d
});
paddle
::
operators
::
math
::
SoftmaxFunctor
<
Context
,
T
>
()(
dev_ctx
,
axis_dim
,
&
X_2d
,
&
Out_2d
);
phi
::
funcs
::
SoftmaxFunctor
<
Context
,
T
>
()(
dev_ctx
,
axis_dim
,
&
X_2d
,
&
Out_2d
);
}
}
// namespace phi
paddle/phi/tests/kernels/CMakeLists.txt
浏览文件 @
344b99e1
...
...
@@ -22,7 +22,7 @@ endif()
cc_test
(
test_cpu_vec
SRCS test_cpu_vec.cc
DEPS blas
cpu_info
)
DEPS blas
phi_backends
)
# For String Kernels
cc_test
(
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录