Commit 8776f751 (unverified) — [CUDA] Add cudnn_softmax. (#4040)
Authored by Wilber on Aug 04, 2020; committed via GitHub on Aug 04, 2020. Parent: 737fedeb.
Showing 12 changed files with 683 additions and 186 deletions (+683 −186).
lite/backends/cuda/math/CMakeLists.txt       +5   −1
lite/backends/cuda/math/cudnn_conv.cc        +10  −19
lite/backends/cuda/math/cudnn_helper.cc      +38  −0
lite/backends/cuda/math/cudnn_helper.h       +12  −1
lite/backends/cuda/math/cudnn_softmax.cc     +105 −0
lite/backends/cuda/math/cudnn_softmax.h      +64  −0
lite/kernels/cuda/CMakeLists.txt             +1   −1
lite/kernels/cuda/softmax_compute.cu         +270 −71
lite/kernels/cuda/softmax_compute.h          +4   −1
lite/kernels/cuda/softmax_compute_test.cc    +170 −92
lite/operators/op_params.h                   +1   −0
lite/operators/softmax_op.cc                 +3   −0
lite/backends/cuda/math/CMakeLists.txt
@@ -8,7 +8,9 @@ nv_library(cuda_activation SRCS activation.cu DEPS ${cuda_static_deps})
 nv_library(cuda_scale SRCS scale.cu DEPS ${cuda_static_deps})
 nv_library(cuda_type_trans SRCS type_trans.cu DEPS ${cuda_static_deps})
 nv_library(cuda_transpose SRCS transpose.cu DEPS ${cuda_static_deps})
-nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale cuda_type_trans ${cuda_static_deps})
+nv_library(cudnn_helper SRCS cudnn_helper.cc DEPS ${cuda_static_deps})
+nv_library(cudnn_conv SRCS cudnn_conv.cc DEPS cuda_activation cuda_scale cuda_type_trans cudnn_helper ${cuda_static_deps})
+nv_library(cudnn_softmax SRCS cudnn_softmax.cc DEPS cudnn_helper ${cuda_static_deps} tensor)
 nv_library(cuda_elementwise SRCS elementwise.cu DEPS ${cuda_static_deps})
 nv_library(cudnn_pool SRCS cudnn_pool.cc DEPS ${cuda_static_deps})
 nv_library(cuda_gru_forward SRCS gru_forward.cu DEPS cuda_activation ${cuda_static_deps})
@@ -22,6 +24,7 @@ nv_library(cuda_bias SRCS bias.cu DEPS ${cuda_static_deps})
 set(math_cuda
     cudnn_conv
+    cudnn_softmax
     cuda_activation
     cuda_scale
     cuda_type_trans
@@ -35,6 +38,7 @@ set (
     cuda_strided_gemm
     cuda_sequence_padding
     cuda_bias
+    cudnn_helper
 )
 set(math_cuda "${math_cuda}" CACHE GLOBAL "math cuda")
lite/backends/cuda/math/cudnn_conv.cc
@@ -15,6 +15,7 @@
 #include "lite/backends/cuda/math/cudnn_conv.h"
 #include "lite/backends/cuda/math/activation.h"
 #include "lite/backends/cuda/math/conv_op_cache_cudnn.h"
+#include "lite/backends/cuda/math/cudnn_helper.h"
 #include "lite/backends/cuda/math/scale.h"
 #include "lite/backends/cuda/math/type_trans.h"
@@ -23,19 +24,6 @@ namespace lite {
 namespace cuda {
 namespace math {

-template <PrecisionType PType>
-cudnnDataType_t GetDataType();
-
-template <>
-cudnnDataType_t GetDataType<PRECISION(kFloat)>() {
-  return CUDNN_DATA_FLOAT;
-}
-
-template <>
-cudnnDataType_t GetDataType<PRECISION(kFP16)>() {
-  return CUDNN_DATA_HALF;
-}
-
 template <typename T, PrecisionType Ptype_out>
 bool CudnnConv2D<T, Ptype_out>::create(const operators::ConvParam& param,
                                        Context<TARGET(kCUDA)>* ctx) {
@@ -67,13 +55,13 @@ bool CudnnConv2D<T, Ptype_out>::create(const operators::ConvParam& param,
   CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->input_desc_,
                                          CUDNN_TENSOR_NCHW,
-                                         GetDataType<Ptype_out>(),
+                                         GetCudnnDataType<Ptype_out>(),
                                          batch,
                                          ic,
                                          ih,
                                          iw));
   CUDNN_CHECK(cudnnSetFilter4dDescriptor(this->filter_desc_,
-                                         GetDataType<Ptype_out>(),
+                                         GetCudnnDataType<Ptype_out>(),
                                          CUDNN_TENSOR_NCHW,
                                          oc,
                                          ic / param.groups,
@@ -87,11 +75,11 @@ bool CudnnConv2D<T, Ptype_out>::create(const operators::ConvParam& param,
                                              dh,
                                              dw,
                                              CUDNN_CROSS_CORRELATION,
-                                             GetDataType<Ptype_out>()));
+                                             GetCudnnDataType<Ptype_out>()));
   CUDNN_CHECK(cudnnSetConvolutionGroupCount(this->conv_desc_, param.groups));
   CUDNN_CHECK(cudnnSetTensor4dDescriptor(this->output_desc_,
                                          CUDNN_TENSOR_NCHW,
-                                         GetDataType<Ptype_out>(),
+                                         GetCudnnDataType<Ptype_out>(),
                                          batch,
                                          oc,
                                          oh,
@@ -190,8 +178,11 @@ bool CudnnConv2D<T, Ptype_out>::create(const operators::ConvParam& param,
   if (param.bias) {
     int dim_bias[] = {1, oc, 1, 1};
     int stride_bias[] = {oc, 1, 1, 1};
-    cudnnSetTensorNdDescriptor(
-        this->bias_desc_, GetDataType<Ptype_out>(), 4, dim_bias, stride_bias);
+    cudnnSetTensorNdDescriptor(this->bias_desc_,
+                               GetCudnnDataType<Ptype_out>(),
+                               4,
+                               dim_bias,
+                               stride_bias);
   }
   return true;
 }
lite/backends/cuda/math/cudnn_helper.cc
new file mode 100644
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/cuda/math/cudnn_helper.h"
#include <string>
#include <vector>

namespace paddle {
namespace lite {
namespace cuda {
namespace math {

template <>
cudnnDataType_t GetCudnnDataType<PRECISION(kFloat)>() {
  return CUDNN_DATA_FLOAT;
}

template <>
cudnnDataType_t GetCudnnDataType<PRECISION(kFP16)>() {
  return CUDNN_DATA_HALF;
}

}  // namespace math
}  // namespace cuda
}  // namespace lite
}  // namespace paddle
lite/backends/cuda/math/cudnn_helper.h
@@ -13,12 +13,23 @@
 // limitations under the License.

 #pragma once

+#include <cudnn.h>
+#include <string>
+#include <vector>
+
+#include "lite/api/paddle_place.h"
+#include "lite/backends/cuda/cuda_utils.h"
+
 namespace paddle {
 namespace lite {
 namespace cuda {
-namespace math {}  // namespace math
+namespace math {
+
+template <lite_api::PrecisionType PType>
+cudnnDataType_t GetCudnnDataType();
+
+}  // namespace math
 }  // namespace cuda
 }  // namespace lite
 }  // namespace paddle
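
Note: the header declares only the primary template; the two specializations live in cudnn_helper.cc, so the set of supported precisions is fixed at link time — instantiating GetCudnnDataType for any other PrecisionType compiles but fails to link. A hedged sketch of what adding one more precision would look like (kInt8 is hypothetical here, not part of this commit):

// Hypothetical addition to cudnn_helper.cc, not part of this commit.
template <>
cudnnDataType_t GetCudnnDataType<PRECISION(kInt8)>() {
  return CUDNN_DATA_INT8;  // int8 tensor data type, available since cuDNN 6
}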
lite/backends/cuda/math/cudnn_softmax.cc
new file mode 100644
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/backends/cuda/math/cudnn_softmax.h"
#include "lite/backends/cuda/math/cudnn_helper.h"

namespace paddle {
namespace lite {
namespace cuda {
namespace math {

template <typename T, PrecisionType Ptype>
bool CudnnSoftmax<T, Ptype>::Init(const operators::SoftmaxParam& param,
                                  Context<TARGET(kCUDA)>* ctx) {
  this->stream_ = ctx->exec_stream();
  CUDNN_CHECK(cudnnCreate(&this->handle_));
  CUDNN_CHECK(cudnnSetStream(this->handle_, this->stream_));
  cudnnCreateTensorDescriptor(&this->bottom_desc_);
  cudnnCreateTensorDescriptor(&this->top_desc_);
  return Create(param, ctx);
}

template <typename T, PrecisionType Ptype>
bool CudnnSoftmax<T, Ptype>::Create(const operators::SoftmaxParam& param,
                                    Context<TARGET(kCUDA)>* ctx) {
  int axis = param.axis;
  if (axis < 0) {
    axis += param.x->dims().size();
  }
  int outer_num = param.x->dims().count(0, axis);
  int inner_num = param.x->dims().count(axis + 1, param.x->dims().size());
  int N = outer_num;
  int C = param.x->dims()[axis];
  int H = inner_num;
  int W = 1;

  const int stride_w = 1;
  const int stride_h = W * stride_w;
  const int stride_c = H * stride_h;
  const int stride_n = C * stride_c;
  CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(bottom_desc_,
                                           GetCudnnDataType<Ptype>(),
                                           N,
                                           C,
                                           H,
                                           W,
                                           stride_n,
                                           stride_c,
                                           stride_h,
                                           stride_w));
  CUDNN_CHECK(cudnnSetTensor4dDescriptorEx(top_desc_,
                                           GetCudnnDataType<Ptype>(),
                                           N,
                                           C,
                                           H,
                                           W,
                                           stride_n,
                                           stride_c,
                                           stride_h,
                                           stride_w));
  handle_setup_ = true;
  return true;
}

template <typename T, PrecisionType Ptype>
bool CudnnSoftmax<T, Ptype>::Run(const operators::SoftmaxParam& param) {
  T* output_data = param.output->mutable_data<T>(TARGET(kCUDA));
  const T* input_data = param.x->data<T>();
  float alpha = 1.0f;
  float beta = 0.f;
  CUDNN_CHECK(cudnnSoftmaxForward(handle_,
                                  CUDNN_SOFTMAX_ACCURATE,
                                  CUDNN_SOFTMAX_MODE_CHANNEL,
                                  &alpha,
                                  bottom_desc_,
                                  reinterpret_cast<const void*>(input_data),
                                  &beta,
                                  top_desc_,
                                  reinterpret_cast<void*>(output_data)));
  return true;
}

template class CudnnSoftmax<float, PRECISION(kFloat)>;
template class CudnnSoftmax<half, PRECISION(kFP16)>;

}  // namespace math
}  // namespace cuda
}  // namespace lite
}  // namespace paddle
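
Create() flattens an arbitrary-rank softmax into the 4-D NCHW descriptor cuDNN expects: everything before the axis becomes N, the axis itself becomes C, everything after it becomes H, and W is fixed at 1, so CUDNN_SOFTMAX_MODE_CHANNEL normalizes exactly over the requested axis. A quick self-contained check of that mapping, using a hypothetical shape not taken from the diff:

#include <cassert>

int main() {
  const int dims[] = {8, 16, 4, 4};  // stand-in for param.x->dims()
  const int axis = 1;
  const int outer_num = 8;           // product of dims before axis
  const int inner_num = 4 * 4;       // product of dims after axis
  const int N = outer_num, C = dims[axis], H = inner_num, W = 1;
  const int stride_w = 1;
  const int stride_h = W * stride_w;  // 1
  const int stride_c = H * stride_h;  // 16
  const int stride_n = C * stride_c;  // 256
  assert(N == 8 && C == 16 && H == 16 && W == 1);
  assert(stride_n == 256 && stride_c == 16 && stride_h == 1);
  return 0;
}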
lite/backends/cuda/math/cudnn_softmax.h
new file mode 100644
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <cudnn.h>
#include <string>
#include <vector>
#include "lite/api/paddle_place.h"
#include "lite/backends/cuda/cuda_utils.h"
#include "lite/core/context.h"
#include "lite/core/target_wrapper.h"
#include "lite/operators/op_params.h"

namespace paddle {
namespace lite {
namespace cuda {
namespace math {

template <typename T, PrecisionType Ptype>
class CudnnSoftmax {
 public:
  CudnnSoftmax()
      : handle_(nullptr),
        bottom_desc_(nullptr),
        top_desc_(nullptr),
        handle_setup_(false) {}

  virtual ~CudnnSoftmax() {
    if (!handle_setup_) return;
    cudnnDestroyTensorDescriptor(bottom_desc_);
    cudnnDestroyTensorDescriptor(top_desc_);
    cudnnDestroy(handle_);
  }

  bool Init(const operators::SoftmaxParam& param, Context<TARGET(kCUDA)>* ctx);
  bool Create(const operators::SoftmaxParam& param,
              Context<TARGET(kCUDA)>* ctx);
  bool Run(const operators::SoftmaxParam& param);

 private:
  cudaStream_t stream_;
  cudnnHandle_t handle_;
  cudnnTensorDescriptor_t bottom_desc_;
  cudnnTensorDescriptor_t top_desc_;
  bool handle_setup_;
};

}  // namespace math
}  // namespace cuda
}  // namespace lite
}  // namespace paddle
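
The Init/Create split mirrors the kernel lifecycle in softmax_compute.cu below: Init builds the cuDNN handle, binds the stream, and allocates the descriptors once, while Create only refills the two 4-D descriptors and is cheap enough to call before every Run in case input shapes changed. A minimal caller sketch, assuming a populated SoftmaxParam `param` and a CUDA context `ctx`:

// Sketch only; `param` and `ctx` are assumed to be set up by the caller.
lite::cuda::math::CudnnSoftmax<float, PRECISION(kFloat)> softmax;
softmax.Init(param, &ctx);    // once: cuDNN handle + stream + descriptors
softmax.Create(param, &ctx);  // per shape change: refill descriptors
softmax.Run(param);           // enqueues cudnnSoftmaxForward on ctx's stream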
lite/kernels/cuda/CMakeLists.txt
@@ -30,7 +30,7 @@ add_kernel(feed_compute_cuda CUDA basic SRCS feed_compute.cc DEPS ${lite_kernel_
 add_kernel(fetch_compute_cuda CUDA basic SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
 add_kernel(scale_compute_cuda CUDA basic SRCS scale_compute.cc DEPS ${lite_kernel_deps} cuda_scale)
 add_kernel(dropout_compute_cuda CUDA basic SRCS dropout_compute.cc DEPS ${lite_kernel_deps} cuda_scale)
-add_kernel(softmax_compute_cuda CUDA basic SRCS softmax_compute.cu DEPS ${lite_kernel_deps})
+add_kernel(softmax_compute_cuda CUDA basic SRCS softmax_compute.cu DEPS ${lite_kernel_deps} ${math_cuda})
 add_kernel(pool_compute_cuda CUDA basic SRCS pool_compute.cu DEPS ${lite_kernel_deps} cudnn_pool)
 add_kernel(bilinear_interp_compute_cuda CUDA basic SRCS bilinear_interp_compute.cu DEPS ${lite_kernel_deps})
lite/kernels/cuda/softmax_compute.cu
@@ -12,6 +12,8 @@ limitations under the License. */
 #pragma once
 #include <limits>
 #include <vector>
+#include "lite/backends/cuda/cuda_utils.h"
+#include "lite/core/op_registry.h"
 #include "lite/kernels/cuda/softmax_compute.h"
@@ -21,8 +23,6 @@ namespace kernels {
 namespace cuda {

-using Tensor = lite::Tensor;
-
 const int CUDA_NUM_THREADS = 512;
 extern __shared__ char tile[];

 template <typename dtype>
 __global__ void sharemem_softmax_kernel(int total_size,
@@ -73,6 +73,69 @@ __global__ void sharemem_softmax_kernel(int total_size,
   }
 }

+template <>
+__global__ void sharemem_softmax_kernel(int total_size,
+                                        const half* in_data,
+                                        half* out_data,
+                                        int inner_num,
+                                        int outer_num,
+                                        int axis_size) {
+  half* data = reinterpret_cast<half*>(tile) + threadIdx.x;
+  //! compute thread index and real data index
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < total_size) {
+    int idx_inner = idx % inner_num;
+    int idx_outer = (idx / inner_num) * axis_size;
+    int blocksize = blockDim.x;
+    int real_index = idx_outer * inner_num + idx_inner;
+    int loop_idx = real_index;
+//! read all data to sharemem in softmax channel
+#pragma unroll
+    for (int i = 0; i < axis_size; ++i) {
+      data[i * blocksize] = in_data[loop_idx];
+      loop_idx += inner_num;
+    }
+    //! get maximum value in softmax channel
+    half max_data = data[0];
+#pragma unroll
+    for (int i = 1; i < axis_size; ++i) {
+      half dt = data[i * blocksize];
+#if __CUDA_ARCH__ >= 530
+      if (__hlt(max_data, dt)) {
+#else
+      if (__half2float(max_data) < __half2float(dt)) {
+#endif
+        max_data = dt;
+      }
+    }
+    //! subtract then summarize
+    half sum = 0;
+#pragma unroll
+    for (int i = 0; i < axis_size; ++i) {
+      half* dt = data + i * blocksize;
+#if __CUDA_ARCH__ >= 530
+      *dt = hexp(__hsub(*dt, max_data));
+      sum = __hadd(sum, *dt);
+#else
+      *dt = __float2half(expf(__half2float(*dt) - __half2float(max_data)));
+      sum = __float2half(__half2float(sum) + __half2float(*dt));
+#endif
+    }
+    //! write back result
+    loop_idx = real_index;
+#pragma unroll
+    for (int i = 0; i < axis_size; ++i) {
+#if __CUDA_ARCH__ >= 530
+      out_data[loop_idx] = __hdiv(data[i * blocksize], sum);
+#else
+      out_data[loop_idx] =
+          __float2half(__half2float(data[i * blocksize]) / __half2float(sum));
+#endif
+      loop_idx += inner_num;
+    }
+  }
+}
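
Each fp16 specialization in this file repeats the same guard: native half intrinsics (__hlt, __hgt, __hsub, __hadd, __hdiv, hexp) require compute capability 5.3+, so every operation carries a float round-trip fallback for older devices. The pattern, distilled into a hypothetical helper that is not part of the diff:

// Hypothetical helper showing the sm_53 guard pattern used by the kernels above.
__device__ inline half half_max(half a, half b) {
#if __CUDA_ARCH__ >= 530
  return __hgt(a, b) ? a : b;  // native half comparison
#else
  // Round-trip through float on pre-sm_53 devices.
  return __float2half(fmaxf(__half2float(a), __half2float(b)));
#endif
}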
 //! general kernel for softmax
 template <typename dtype>
 __global__ void softmax_max_kernel(int total_size,
@@ -99,6 +162,38 @@ __global__ void softmax_max_kernel(int total_size,
   }
 }

+template <>
+__global__ void softmax_max_kernel(int total_size,
+                                   const half* in_data,
+                                   half* out_data,
+                                   half min_data,
+                                   int inner_num,
+                                   int outer_num,
+                                   int axis_size) {
+  //! compute data index
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < total_size) {
+    int idx_inner = idx % inner_num;
+    int idx_outer = (idx / inner_num) * axis_size;
+    int real_index = idx_outer * inner_num + idx_inner;
+    //! get maximum data across softmax axis
+    half max_data = min_data;
+    for (int i = 0; i < axis_size; ++i) {
+#if __CUDA_ARCH__ >= 530
+      max_data = __hgt(in_data[real_index], max_data) ? in_data[real_index]
+                                                      : max_data;
+#else
+      float a = __half2float(in_data[real_index]);
+      float b = __half2float(max_data);
+      float res = a > b ? a : b;
+      max_data = __float2half(res);
+#endif
+      real_index += inner_num;
+    }
+    out_data[idx] = max_data;
+  }
+}
 template <typename dtype>
 __global__ void softmax_sub_exp_sum_kernel(int total_size,
                                            const dtype* in_data,
@@ -129,6 +224,44 @@ __global__ void softmax_sub_exp_sum_kernel(int total_size,
   }
 }

+template <>
+__global__ void softmax_sub_exp_sum_kernel(int total_size,
+                                           const half* in_data,
+                                           half* out_data,
+                                           const half* max_data,
+                                           half* sum_data,
+                                           int inner_num,
+                                           int outer_num,
+                                           int axis_size) {
+  //! compute data index
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < total_size) {
+    int idx_inner = idx % inner_num;
+    int idx_outer = (idx / inner_num) * axis_size;
+    half max_data_cur = max_data[idx];
+    half sum_data_cur = 0;
+    int real_index = idx_outer * inner_num + idx_inner;
+    //! compute exp and summarize across the softmax axis
+    for (int i = 0; i < axis_size; ++i) {
+#if __CUDA_ARCH__ >= 530
+      half sub_data = __hsub(in_data[real_index], max_data_cur);
+      sub_data = hexp(sub_data);
+      sum_data_cur = __hadd(sum_data_cur, sub_data);
+#else
+      half sub_data = __float2half(__half2float(in_data[real_index]) -
+                                   __half2float(max_data_cur));
+      sub_data = __float2half(expf(__half2float(sub_data)));
+      sum_data_cur = __float2half(__half2float(sum_data_cur) +
+                                  __half2float(sub_data));
+#endif
+      out_data[real_index] = sub_data;
+      real_index += inner_num;
+    }
+    sum_data[idx] = sum_data_cur;
+  }
+}
+
 template <typename dtype>
 __global__ void softmax_divid_output_kernel(int total_size,
                                             dtype* io_data,
@@ -151,75 +284,122 @@ __global__ void softmax_divid_output_kernel(int total_size,
   }
 }

-void SoftmaxCompute::PrepareForRun() {
+template <>
+__global__ void softmax_divid_output_kernel(int total_size,
+                                            half* io_data,
+                                            const half* sum_data,
+                                            int inner_num,
+                                            int outer_num,
+                                            int axis_size) {
+  //! compute data index
+  int idx = blockIdx.x * blockDim.x + threadIdx.x;
+  if (idx < total_size) {
+    int idx_inner = idx % inner_num;
+    int idx_outer = (idx / inner_num) * axis_size;
+#if __CUDA_ARCH__ >= 530
+    half sum_data_cur = __hdiv(__float2half(1.f), sum_data[idx]);
+#else
+    half sum_data_cur = __float2half(1.f / __half2float(sum_data[idx]));
+#endif
+    int real_index = idx_outer * inner_num + idx_inner;
+    //! compute final result
+    for (int i = 0; i < axis_size; ++i) {
+#if __CUDA_ARCH__ >= 530
+      io_data[real_index] = __hmul(io_data[real_index], sum_data_cur);
+#else
+      io_data[real_index] = __float2half(__half2float(io_data[real_index]) *
+                                         __half2float(sum_data_cur));
+#endif
+      real_index += inner_num;
+    }
+  }
+}
+
+template <typename Dtype, PrecisionType Ptype>
+void SoftmaxCompute<Dtype, Ptype>::PrepareForRun() {
+  auto& param = this->template Param<param_t>();
+  auto& ctx = this->ctx_->template As<CUDAContext>();
   int device_id;
   cudaGetDevice(&device_id);
   cudaDeviceProp deviceProp;
   cudaGetDeviceProperties(&deviceProp, device_id);
   sharedmem_size_ = deviceProp.sharedMemPerBlock;
   max_dimsize_ = sharedmem_size_ / sizeof(float) / CUDA_NUM_THREADS;
+  if (param.use_cudnn) {
+    cudnn_softmax_.Init(param, &ctx);
+  }
 }

-void SoftmaxCompute::Run() {
-  auto& param = this->Param<param_t>();
+template <typename Dtype, PrecisionType Ptype>
+void SoftmaxCompute<Dtype, Ptype>::Run() {
+  auto& param = this->template Param<param_t>();
   auto& ctx = this->ctx_->template As<CUDAContext>();
   auto stream = ctx.exec_stream();

-  auto x_dims = param.x->dims();
-  auto x_rank = x_dims.size();
-  int axis = param.axis;
-  if (axis < 0) {
-    axis += x_rank;
-  }
-  int outer_num = x_dims.Slice(0, axis).production();
-  int inner_num = x_dims.Slice(axis + 1, x_rank).production();
-  int total_threads = inner_num * outer_num;
-  axis_size_ = x_dims[axis];
-
-  const int threads = CUDA_NUM_THREADS;
-  const int blocks = (total_threads + threads - 1) / threads;
-  auto input_data = param.x->data<float>();
-  auto output_data = param.output->mutable_data<float>(TARGET(kCUDA));
-  if (axis_size_ <= max_dimsize_) {
-    int use_sharemem_size = axis_size_ * threads * sizeof(float);
-    sharemem_softmax_kernel<<<blocks, threads, use_sharemem_size, stream>>>(
-        total_threads,
-        input_data,
-        output_data,
-        inner_num,
-        outer_num,
-        axis_size_);
+  if (param.use_cudnn) {
+    cudnn_softmax_.Create(param, &ctx);
+    cudnn_softmax_.Run(param);
   } else {
-    //! re_alloc device memory
-    tmax_data_.Resize({1, 1, 1, outer_num * inner_num});
-    tsum_data_.Resize({1, 1, 1, outer_num * inner_num});
-    auto max_data = tmax_data_.mutable_data<float>(TARGET(kCUDA));
-    auto sum_data = tsum_data_.mutable_data<float>(TARGET(kCUDA));
-    //! firstly, get maximum data
-    float min_data = std::numeric_limits<float>::lowest();
-    softmax_max_kernel<float><<<blocks, threads, 0, stream>>>(total_threads,
-                                                              input_data,
-                                                              max_data,
-                                                              min_data,
-                                                              inner_num,
-                                                              outer_num,
-                                                              axis_size_);
-    //! then, compute exp and sum data
-    softmax_sub_exp_sum_kernel<float><<<blocks, threads, 0, stream>>>(
-        total_threads,
-        input_data,
-        output_data,
-        max_data,
-        sum_data,
-        inner_num,
-        outer_num,
-        axis_size_);
-    //! last, compute divided output
-    softmax_divid_output_kernel<float><<<blocks, threads, 0, stream>>>(
-        total_threads,
-        output_data,
-        sum_data,
-        inner_num,
-        outer_num,
-        axis_size_);
+    auto x_dims = param.x->dims();
+    auto x_rank = x_dims.size();
+    int axis = param.axis;
+    if (axis < 0) {
+      axis += x_rank;
+    }
+    int outer_num = x_dims.Slice(0, axis).production();
+    int inner_num = x_dims.Slice(axis + 1, x_rank).production();
+    int total_threads = inner_num * outer_num;
+    axis_size_ = x_dims[axis];
+
+    const int threads = CUDA_NUM_THREADS;
+    const int blocks = (total_threads + threads - 1) / threads;
+    auto input_data = param.x->template data<Dtype>();
+    auto output_data = param.output->template mutable_data<Dtype>(TARGET(kCUDA));
+    if (axis_size_ <= max_dimsize_) {
+      int use_sharemem_size = axis_size_ * threads * sizeof(Dtype);
+      sharemem_softmax_kernel<Dtype><<<blocks, threads, use_sharemem_size, stream>>>(
+          total_threads,
+          input_data,
+          output_data,
+          inner_num,
+          outer_num,
+          axis_size_);
+    } else {
+      //! re_alloc device memory
+      tmax_data_.Resize({1, 1, 1, outer_num * inner_num});
+      tsum_data_.Resize({1, 1, 1, outer_num * inner_num});
+      auto max_data = tmax_data_.mutable_data<Dtype>(TARGET(kCUDA));
+      auto sum_data = tsum_data_.mutable_data<Dtype>(TARGET(kCUDA));
+      //! firstly, get maximum data
+      float min_data = std::numeric_limits<float>::lowest();
+      softmax_max_kernel<Dtype><<<blocks, threads, 0, stream>>>(total_threads,
+                                                                input_data,
+                                                                max_data,
+                                                                min_data,
+                                                                inner_num,
+                                                                outer_num,
+                                                                axis_size_);
+      //! then, compute exp and sum data
+      softmax_sub_exp_sum_kernel<Dtype><<<blocks, threads, 0, stream>>>(
+          total_threads,
+          input_data,
+          output_data,
+          max_data,
+          sum_data,
+          inner_num,
+          outer_num,
+          axis_size_);
+      //! last, compute divided output
+      softmax_divid_output_kernel<Dtype><<<blocks, threads, 0, stream>>>(
+          total_threads,
+          output_data,
+          sum_data,
+          inner_num,
+          outer_num,
+          axis_size_);
+    }
   }
   cudaError_t error = cudaGetLastError();
   if (error != cudaSuccess) LOG(ERROR) << cudaGetErrorString(error);
   CUDA_POST_KERNEL_CHECK;
 }

 }  // namespace cuda
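
The dispatch threshold in the non-cuDNN path comes from PrepareForRun: max_dimsize_ = sharedMemPerBlock / sizeof(float) / CUDA_NUM_THREADS, so with a typical 48 KB of shared memory per block (an assumption; the real value comes from cudaGetDeviceProperties) a 512-thread block handles softmax axes up to 24 elements in the shared-memory kernel, and longer axes take the three-kernel path. A small check of that arithmetic:

#include <cstdio>

int main() {
  const int shared_mem_per_block = 48 * 1024;  // assumed deviceProp.sharedMemPerBlock
  const int threads = 512;                     // CUDA_NUM_THREADS
  const int max_dimsize =
      static_cast<int>(shared_mem_per_block / sizeof(float)) / threads;
  const int total_threads = 3 * 112 * 112;     // hypothetical outer_num * inner_num
  const int blocks = (total_threads + threads - 1) / threads;
  printf("max_dimsize = %d, blocks = %d\n", max_dimsize, blocks);  // 24, 74
  return 0;
}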
@@ -227,12 +407,12 @@ void SoftmaxCompute::Run() {
 }  // namespace lite
 }  // namespace paddle

-REGISTER_LITE_KERNEL(softmax,
-                     kCUDA,
-                     kFloat,
-                     kNCHW,
-                     paddle::lite::kernels::cuda::SoftmaxCompute,
-                     def)
+using SoftmaxFp32 =
+    paddle::lite::kernels::cuda::SoftmaxCompute<float, PRECISION(kFloat)>;
+using SoftmaxFp16 =
+    paddle::lite::kernels::cuda::SoftmaxCompute<half, PRECISION(kFP16)>;
+
+REGISTER_LITE_KERNEL(softmax, kCUDA, kFloat, kNCHW, SoftmaxFp32, def)
     .BindInput("X",
                {LiteType::GetTensorTy(TARGET(kCUDA),
                                       PRECISION(kFloat),
@@ -246,12 +426,19 @@ REGISTER_LITE_KERNEL(softmax,
                                        PRECISION(kFloat),
                                        DATALAYOUT(kNCHW))})
     .Finalize();

-REGISTER_LITE_KERNEL(search_seq_softmax,
-                     kCUDA,
-                     kFloat,
-                     kNCHW,
-                     paddle::lite::kernels::cuda::SoftmaxCompute,
-                     def)
+REGISTER_LITE_KERNEL(softmax, kCUDA, kFP16, kNCHW, SoftmaxFp16, def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(
+                   TARGET(kCUDA), PRECISION(kFP16), DATALAYOUT(kNCHW))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(
+                    TARGET(kCUDA), PRECISION(kFP16), DATALAYOUT(kNCHW))})
+    .BindOutput("Out_log",
+                {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))})
+    .Finalize();
+
+REGISTER_LITE_KERNEL(search_seq_softmax, kCUDA, kFloat, kNCHW, SoftmaxFp32, def)
     .BindInput("X",
                {LiteType::GetTensorTy(TARGET(kCUDA),
                                       PRECISION(kFloat),
@@ -262,3 +449,15 @@ REGISTER_LITE_KERNEL(search_seq_softmax,
                                       DATALAYOUT(kNCHW))})
     .BindOutput("Out_log", {LiteType::GetTensorTy(TARGET(kCUDA))})
     .Finalize();
+
+REGISTER_LITE_KERNEL(search_seq_softmax, kCUDA, kFP16, kNCHW, SoftmaxFp16, def)
+    .BindInput("X",
+               {LiteType::GetTensorTy(
+                   TARGET(kCUDA), PRECISION(kFP16), DATALAYOUT(kNCHW))})
+    .BindOutput("Out",
+                {LiteType::GetTensorTy(
+                    TARGET(kCUDA), PRECISION(kFP16), DATALAYOUT(kNCHW))})
+    .BindOutput("Out_log",
+                {LiteType::GetTensorTy(TARGET(kCUDA), PRECISION(kFP16))})
+    .Finalize();
lite/kernels/cuda/softmax_compute.h
@@ -13,6 +13,7 @@
 // limitations under the License.

 #pragma once
+#include "lite/backends/cuda/math/cudnn_softmax.h"
 #include "lite/core/kernel.h"
@@ -20,8 +21,9 @@ namespace lite {
 namespace kernels {
 namespace cuda {

+template <typename Dtype, PrecisionType Ptype>
 class SoftmaxCompute
-    : public KernelLite<TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW)> {
+    : public KernelLite<TARGET(kCUDA), Ptype, DATALAYOUT(kNCHW)> {
  public:
   using param_t = operators::SoftmaxParam;
@@ -30,6 +32,7 @@ class SoftmaxCompute
   virtual ~SoftmaxCompute() = default;

  private:
+  lite::cuda::math::CudnnSoftmax<Dtype, Ptype> cudnn_softmax_;
   lite::Tensor tmax_data_;
   lite::Tensor tsum_data_;
   size_t sharedmem_size_;
lite/kernels/cuda/softmax_compute_test.cc
@@ -20,114 +20,192 @@
 #include <utility>
 #include <vector>
 #include "lite/api/test_helper.h"
+#include "lite/utils/float16.h"

 namespace paddle {
 namespace lite {
 namespace kernels {
 namespace cuda {

 using Tensor = lite::Tensor;
 using DDim = lite::DDim;

-template <typename dtype>
-static void softmax_compute_ref(const operators::SoftmaxParam& param) {
-  const dtype* x_data = param.x->mutable_data<const dtype>();
-  dtype* output_data = param.output->mutable_data<dtype>();
-  DDim x_dims = param.x->dims();
-  ASSERT_EQ(x_dims.data(), param.output->dims().data());
-  auto x_rank = x_dims.size();
-  int axis = param.axis;
-  if (axis < 0) {
-    axis += x_rank;
-  }
-  int axis_size = x_dims[axis];
-  int outer_num = x_dims.Slice(0, axis).production();
-  int inner_num = x_dims.Slice(axis + 1, x_rank).production();
-  int compute_size = outer_num * inner_num;
-  for (int i = 0; i < compute_size; i++) {
-    int idx_inner = i % inner_num;
-    int idx_outer = (i / inner_num) * axis_size;
-    int start = idx_outer * inner_num + idx_inner;
-    int offset;
-    offset = start;
-    dtype max_data = std::numeric_limits<dtype>::lowest();
-    for (int j = 0; j < axis_size; j++) {
-      max_data = x_data[offset] > max_data ? x_data[offset] : max_data;
-      offset += inner_num;
-    }
-    offset = start;
-    dtype sum_data = (dtype)0;
-    for (int j = 0; j < axis_size; j++) {
-      output_data[offset] = exp(x_data[offset] - max_data);
-      sum_data += output_data[offset];
-      offset += inner_num;
-    }
-    offset = start;
-    for (int j = 0; j < axis_size; j++) {
-      output_data[offset] /= sum_data;
-      offset += inner_num;
-    }
-  }
-}
+class SoftmaxTest : public ::testing::Test {
+ protected:
+  SoftmaxTest()
+      : n_(2),
+        c_(2),
+        h_(2),
+        w_(2),
+        axis_(1),
+        use_cudnn_(true),
+        shape_({n_, c_, h_, w_}) {
+    x_ref_.Resize(lite::DDim(shape_));
+    x_gpu_.Resize(lite::DDim(shape_));
+
+    auto x_ref_data = x_ref_.mutable_data<float>();
+    for (int64_t i = 0; i < x_ref_.numel(); i++) {
+      x_ref_data[i] = static_cast<float>(i % 10 * 0.2);
+    }
+
+    out_ref_.Resize(lite::DDim(shape_));
+    out_cpu_.Resize(out_ref_.dims());
+    out_gpu_.Resize(out_ref_.dims());
+    RunBaseLine();
+    InitParamAndContext();
+  }
+
+  void InitParamAndContext() {
+    ctx_.reset(new KernelContext);
+    cudaStreamCreate(&stream_);
+    auto& context = ctx_->As<CUDAContext>();
+    context.SetExecStream(stream_);
+    param_.x = &x_gpu_;
+    param_.axis = axis_;
+    param_.output = &out_gpu_;
+    param_.use_cudnn = use_cudnn_;
+  }
+
+  void InitFloatInput() {
+    x_gpu_.Assign<float, lite::DDim, TARGET(kCUDA)>(x_ref_.data<float>(),
+                                                    x_gpu_.dims());
+  }
+
+  void InitHalfInput() {
+    x_half_.Resize(x_ref_.dims());
+    auto x_half_data = x_half_.mutable_data<half>();
+    for (int64_t i = 0; i < x_half_.numel(); i++) {
+      x_half_data[i] = half(lite::float16(x_ref_.data<float>()[i]));
+    }
+    x_gpu_.Assign<half, lite::DDim, TARGET(kCUDA)>(x_half_data, x_gpu_.dims());
+  }
+
+  void RunBaseLine() {
+    const float* x_data = x_ref_.mutable_data<float>();
+    float* output_data = out_ref_.mutable_data<float>();
+    DDim x_dims = x_ref_.dims();
+    ASSERT_EQ(x_dims.data(), out_ref_.dims().data());
+    auto x_rank = x_dims.size();
+    int axis = axis_;
+    if (axis < 0) {
+      axis += x_rank;
+    }
+    int axis_size = x_dims[axis];
+    int outer_num = x_dims.Slice(0, axis).production();
+    int inner_num = x_dims.Slice(axis + 1, x_rank).production();
+    int compute_size = outer_num * inner_num;
+    for (int i = 0; i < compute_size; i++) {
+      int idx_inner = i % inner_num;
+      int idx_outer = (i / inner_num) * axis_size;
+      int start = idx_outer * inner_num + idx_inner;
+      int offset;
+      offset = start;
+      float max_data = std::numeric_limits<float>::lowest();
+      for (int j = 0; j < axis_size; j++) {
+        max_data = x_data[offset] > max_data ? x_data[offset] : max_data;
+        offset += inner_num;
+      }
+      offset = start;
+      float sum_data = 0.f;
+      for (int j = 0; j < axis_size; j++) {
+        output_data[offset] = exp(x_data[offset] - max_data);
+        sum_data += output_data[offset];
+        offset += inner_num;
+      }
+      offset = start;
+      for (int j = 0; j < axis_size; j++) {
+        output_data[offset] /= sum_data;
+        offset += inner_num;
+      }
+    }
+  }
+
+  int n_, c_, h_, w_, axis_;
+  bool use_cudnn_;
+  std::vector<int64_t> shape_;
+  lite::Tensor x_ref_, out_ref_;
+  lite::Tensor x_gpu_;
+  lite::Tensor x_half_;
+  lite::Tensor out_cpu_, out_gpu_;
+  operators::SoftmaxParam param_;
+  std::unique_ptr<KernelContext> ctx_;
+  cudaStream_t stream_;
+};

-TEST(softmax_cuda, compute) {
-  std::unique_ptr<KernelContext> ctx(new KernelContext);
-  auto& context = ctx->As<CUDAContext>();
-  cudaStream_t stream;
-  cudaStreamCreate(&stream);
-  context.SetExecStream(stream);
-  SoftmaxCompute softmax;
-  operators::SoftmaxParam param;
-  softmax.SetContext(std::move(ctx));
-  lite::Tensor x;
-  lite::Tensor x_cpu;
-  lite::Tensor output;
-  lite::Tensor output_cpu;
-  lite::Tensor output_ref;
-  for (auto n : {1, 3}) {
-    for (auto c : {1, 4}) {
-      for (auto h : {5, 1, 112}) {
-        for (auto w : {1, 6, 112}) {
-          for (auto axis : {-2, -1, 0, 1, 2}) {
-            x.Resize({n, c, h, w});
-            x_cpu.Resize({n, c, h, w});
-            output.Resize({n, c, h, w});
-            output_cpu.Resize({n, c, h, w});
-            output_ref.Resize({n, c, h, w});
-            auto* x_cpu_data = x_cpu.mutable_data<float>();
-            auto* output_data = output.mutable_data<float>(TARGET(kCUDA));
-            auto* output_cpu_data = output_ref.mutable_data<float>();
-            auto* output_ref_data = output_ref.mutable_data<float>();
-            for (int i = 0; i < x.dims().production(); i++) {
-              x_cpu_data[i] = i;
-            }
-            x.Assign<float, lite::DDim, TARGET(kCUDA)>(x_cpu_data,
-                                                       x_cpu.dims());
-            param.x = &x;
-            param.axis = axis;
-            param.output = &output;
-            softmax.SetParam(param);
-            softmax.Launch();
-            param.x = &x_cpu;
-            param.output = &output_ref;
-            softmax_compute_ref<float>(param);
-            cudaDeviceSynchronize();
-            CopySync<TARGET(kCUDA)>(output_cpu_data,
-                                    output_data,
-                                    sizeof(float) * output.numel(),
-                                    IoDirection::DtoH);
-            for (int i = 0; i < output.dims().production(); i++) {
-              EXPECT_NEAR(output_cpu_data[i], output_ref_data[i], 1e-5);
-            }
-          }
-        }
-      }
-    }
-  }
-}
+TEST_F(SoftmaxTest, TestFP32) {
+  InitFloatInput();
+  SoftmaxCompute<float, PRECISION(kFloat)> kernel;
+  kernel.SetParam(param_);
+  kernel.SetContext(std::move(ctx_));
+
+  for (int i = 0; i < FLAGS_warmup; ++i) {
+    kernel.Launch();
+    cudaDeviceSynchronize();
+  }
+
+  auto start = GetCurrentUS();
+  kernel.PrepareForRun();
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    kernel.Run();
+  }
+  cudaDeviceSynchronize();
+  auto duration = (GetCurrentUS() - start) / 1000.0;
+  LOG(INFO) << "fp32, warmup: " << FLAGS_warmup
+            << ", repeats: " << FLAGS_repeats << ", spend "
+            << duration / FLAGS_repeats << " ms in average.";
+
+  CopySync<TARGET(kCUDA)>(out_cpu_.mutable_data<float>(),
+                          out_gpu_.data<float>(),
+                          sizeof(float) * out_gpu_.numel(),
+                          IoDirection::DtoH);
+
+  for (int i = 0; i < out_gpu_.numel(); ++i) {
+    float res = out_cpu_.data<float>()[i];
+    float ref = out_ref_.data<float>()[i];
+    EXPECT_NEAR(fabs(res - ref) / ref, 0.f, 1e-5);
+  }
+}
+
+TEST_F(SoftmaxTest, TestFP16) {
+  InitHalfInput();
+  SoftmaxCompute<half, PRECISION(kFP16)> kernel;
+  kernel.SetParam(param_);
+  kernel.SetContext(std::move(ctx_));
+
+  for (int i = 0; i < FLAGS_warmup; ++i) {
+    kernel.Launch();
+    cudaDeviceSynchronize();
+  }
+
+  auto start = GetCurrentUS();
+  kernel.PrepareForRun();
+  for (int i = 0; i < FLAGS_repeats; ++i) {
+    kernel.Run();
+  }
+  cudaDeviceSynchronize();
+  auto duration = (GetCurrentUS() - start) / 1000.0;
+  LOG(INFO) << "fp16, warmup: " << FLAGS_warmup
+            << ", repeats: " << FLAGS_repeats << ", spend "
+            << duration / FLAGS_repeats << " ms in average.";
+
+  const half* out_gpu_data = out_gpu_.data<half>();
+  half* out_cpu_data = out_cpu_.mutable_data<half>();
+  CopySync<TARGET(kCUDA)>(out_cpu_data,
+                          out_gpu_data,
+                          sizeof(half) * out_gpu_.numel(),
+                          IoDirection::DtoH);
+
+  for (int i = 0; i < out_gpu_.numel(); ++i) {
+    float res = static_cast<float>(lite::float16(out_cpu_data[i]));
+    float ref = out_ref_.data<float>()[i];
+    EXPECT_NEAR(fabs(res - ref) / (ref + 1e-5), 0., 1e-2);
+  }
+}

 }  // namespace cuda
 }  // namespace kernels
 }  // namespace lite
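
Note: the timing loops read FLAGS_warmup and FLAGS_repeats, which lite/api/test_helper.h defines as gflags, so the iteration counts can be overridden on the command line, e.g. --warmup=10 --repeats=100 (the exact test-binary name follows from the build target and is not shown in this diff).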
lite/operators/op_params.h
@@ -268,6 +268,7 @@ struct SoftmaxParam : ParamBase {
   lite::Tensor* x{};
   lite::Tensor* output{};
   int axis{-1};
+  bool use_cudnn{true};
   ///////////////////////////////////////////////////////////////////////////////////
   // get a vector of input tensors
   const std::vector<const Tensor*>* input_tensor_ptrs() override {
lite/operators/softmax_op.cc
@@ -52,6 +52,9 @@ bool SoftmaxOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
   }
   CHECK(param_.x);
   CHECK(param_.output);
+  if (opdesc.HasAttr("use_cudnn")) {
+    param_.use_cudnn = opdesc.GetAttr<bool>("use_cudnn");
+  }
   return true;
 }
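
With the attribute optional and the param defaulting to true, the cuDNN path is used unless a model explicitly opts out. A hedged sketch of opting a softmax op out when building an op desc programmatically (assuming cpp::OpDesc::SetAttr, which this diff does not show):

// Sketch only; assumes the cpp::OpDesc attribute API.
cpp::OpDesc desc;
desc.SetType("softmax");
desc.SetAttr<bool>("use_cudnn", false);  // fall back to the hand-written kernels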