Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
weixin_44025039
mindspore
提交
fb405ee6
M
mindspore
项目概览
weixin_44025039
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
体验新版 GitCode,发现更多精彩内容 >>
提交
fb405ee6
编写于
8月 06, 2020
作者:
L
linqingke
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
broadcast, slice, scatter_nd ops optimizer.
上级
645f11fa
变更
14
隐藏空白更改
内联
并排
Showing
14 changed file
with
240 addition
and
123 deletion
+240
-123
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/array_reduce_gpu_kernel.h
...kend/kernel_compiler/gpu/arrays/array_reduce_gpu_kernel.h
+43
-14
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_nd_gpu_kernel.h
...ackend/kernel_compiler/gpu/arrays/scatter_nd_gpu_kernel.h
+4
-0
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu
...c/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu
+88
-65
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh
.../backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh
+4
-3
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/check_valid_impl.cu
...backend/kernel_compiler/gpu/cuda_impl/check_valid_impl.cu
+6
-4
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/iou_impl.cu
...e/ccsrc/backend/kernel_compiler/gpu/cuda_impl/iou_impl.cu
+19
-18
mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h
mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h
+34
-0
mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h
...c/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h
+11
-8
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_gpu_kernel.h
...rc/backend/kernel_compiler/gpu/nn/activation_gpu_kernel.h
+11
-4
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h
...c/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h
+10
-4
mindspore/ccsrc/backend/kernel_compiler/gpu/other/check_valid_gpu_kernel.cc
...ckend/kernel_compiler/gpu/other/check_valid_gpu_kernel.cc
+4
-0
mindspore/ccsrc/backend/kernel_compiler/gpu/other/iou_gpu_kernel.cc
...ccsrc/backend/kernel_compiler/gpu/other/iou_gpu_kernel.cc
+3
-0
tests/st/ops/gpu/test_floordiv_op.py
tests/st/ops/gpu/test_floordiv_op.py
+2
-2
tests/st/ops/gpu/test_reduce_mean_op.py
tests/st/ops/gpu/test_reduce_mean_op.py
+1
-1
未找到文件。
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/array_reduce_gpu_kernel.h
浏览文件 @
fb405ee6
...
...
@@ -182,30 +182,59 @@ class ArrayReduceGpuKernel : public GpuKernel {
void
InferInAndOutDesc
(
const
std
::
vector
<
size_t
>
&
input_shape
,
const
std
::
vector
<
size_t
>
&
output_shape
)
{
std
::
vector
<
int
>
inputA
;
std
::
vector
<
size_t
>
outputC_shape
=
output_shape
;
ShapeNdTo4d
(
input_shape
,
&
inputA
);
CHECK_CUDNN_RET_WITH_EXCEPT
(
cudnnSetTensor4dDescriptor
(
inputA_descriptor_
,
CUDNN_TENSOR_NCHW
,
data_type_
,
inputA
[
0
],
inputA
[
1
],
inputA
[
2
],
inputA
[
3
]),
"cudnnSetTensor4dDescriptor failed"
);
const
int
split_dim
=
4
;
if
(
input_shape
.
size
()
<=
split_dim
)
{
ShapeNdTo4d
(
input_shape
,
&
inputA
);
CHECK_CUDNN_RET_WITH_EXCEPT
(
cudnnSetTensor4dDescriptor
(
inputA_descriptor_
,
CUDNN_TENSOR_NCHW
,
data_type_
,
inputA
[
0
],
inputA
[
1
],
inputA
[
2
],
inputA
[
3
]),
"cudnnSetTensor4dDescriptor failed"
);
}
else
{
CudnnSetTensorNdDescriptor
(
input_shape
,
inputA_descriptor_
,
data_type_
);
for
(
auto
dim
:
input_shape
)
{
inputA
.
emplace_back
(
SizeToInt
(
dim
));
}
}
if
(
axis_
[
0
]
==
-
1
)
{
CHECK_CUDNN_RET_WITH_EXCEPT
(
cudnnSetTensor4dDescriptor
(
outputC_descriptor_
,
CUDNN_TENSOR_NCHW
,
data_type_
,
1
,
1
,
1
,
1
),
"cudnnSetTensor4dDescriptor failed"
);
if
(
inputA
[
0
]
==
1
&&
inputA
[
1
]
==
1
&&
inputA
[
2
]
==
1
&&
inputA
[
3
]
==
1
)
{
all_match_
=
true
;
outputC_shape
.
resize
(
input_shape
.
size
(),
1
);
if
(
outputC_shape
.
size
()
<=
split_dim
)
{
CHECK_CUDNN_RET_WITH_EXCEPT
(
cudnnSetTensor4dDescriptor
(
outputC_descriptor_
,
CUDNN_TENSOR_NCHW
,
data_type_
,
1
,
1
,
1
,
1
),
"cudnnSetTensor4dDescriptor failed"
);
}
else
{
CudnnSetTensorNdDescriptor
(
outputC_shape
,
outputC_descriptor_
,
data_type_
);
}
for
(
auto
dim
:
inputA
)
{
if
(
dim
!=
1
)
{
return
;
}
}
all_match_
=
true
;
return
;
}
std
::
vector
<
int
>
outputC
;
if
(
!
keep_dims_
)
{
for
(
auto
i
:
axis_
)
{
(
void
)(
outputC_shape
.
insert
(
outputC_shape
.
begin
()
+
i
,
1
));
}
}
std
::
vector
<
int
>
outputC
;
ShapeNdTo4d
(
outputC_shape
,
&
outputC
);
CHECK_CUDNN_RET_WITH_EXCEPT
(
cudnnSetTensor4dDescriptor
(
outputC_descriptor_
,
CUDNN_TENSOR_NCHW
,
data_type_
,
outputC
[
0
],
outputC
[
1
],
outputC
[
2
],
outputC
[
3
]),
"cudnnSetTensor4dDescriptor failed"
);
if
(
outputC_shape
.
size
()
<=
split_dim
)
{
ShapeNdTo4d
(
outputC_shape
,
&
outputC
);
CHECK_CUDNN_RET_WITH_EXCEPT
(
cudnnSetTensor4dDescriptor
(
outputC_descriptor_
,
CUDNN_TENSOR_NCHW
,
data_type_
,
outputC
[
0
],
outputC
[
1
],
outputC
[
2
],
outputC
[
3
]),
"cudnnSetTensor4dDescriptor failed"
);
}
else
{
CudnnSetTensorNdDescriptor
(
outputC_shape
,
outputC_descriptor_
,
data_type_
);
for
(
auto
dim
:
outputC_shape
)
{
outputC
.
emplace_back
(
SizeToInt
(
dim
));
}
}
if
(
inputA
==
outputC
)
{
all_match_
=
true
;
}
...
...
mindspore/ccsrc/backend/kernel_compiler/gpu/arrays/scatter_nd_gpu_kernel.h
浏览文件 @
fb405ee6
...
...
@@ -69,6 +69,10 @@ class ScatterNdGpuFwdKernel : public GpuKernel {
memcpy_flag_
=
true
;
}
CHECK_CUDA_RET_WITH_EXCEPT
(
cudaMemsetAsync
(
output
,
static_cast
<
T
>
(
0.0
),
output_size_
,
reinterpret_cast
<
cudaStream_t
>
(
stream_ptr
)),
"cudaMemSet failed in ScatterNdGpuFwdKernel::Launch."
);
const
size_t
input_size
=
input_size_
/
sizeof
(
T
);
const
size_t
output_size
=
output_size_
/
sizeof
(
T
);
...
...
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cu
浏览文件 @
fb405ee6
...
...
@@ -14,6 +14,7 @@
* limitations under the License.
*/
#include <vector>
#include "backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh"
#include "runtime/device/gpu/cuda_common.h"
...
...
@@ -107,69 +108,97 @@ __device__ __forceinline__ int Index(const int &index, const int &dim) { return
template
<
typename
T
,
typename
S
,
typename
Func
>
__device__
__forceinline__
void
BroadcastOperator
(
const
int
&
l0
,
const
int
&
l1
,
const
int
&
l2
,
const
int
&
l3
,
const
int
&
r0
,
const
int
&
r1
,
const
int
&
r2
,
const
int
&
r3
,
const
int
&
d0
,
const
int
&
d1
,
const
int
&
d2
,
const
int
&
d3
,
const
T
*
input0
,
const
T
*
input1
,
S
*
output
)
{
for
(
size_t
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
pos
<
d0
*
d1
*
d2
*
d3
;
pos
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
i
=
pos
/
(
d1
*
d2
*
d3
)
%
d0
;
int
j
=
pos
/
(
d2
*
d3
)
%
d1
;
int
k
=
pos
/
d3
%
d2
;
int
l
=
pos
%
d3
;
const
int
&
l4
,
const
int
&
l5
,
const
int
&
l6
,
const
int
&
r0
,
const
int
&
r1
,
const
int
&
r2
,
const
int
&
r3
,
const
int
&
r4
,
const
int
&
r5
,
const
int
&
r6
,
const
int
&
d0
,
const
int
&
d1
,
const
int
&
d2
,
const
int
&
d3
,
const
int
&
d4
,
const
int
&
d5
,
const
int
&
d6
,
const
T
*
input0
,
const
T
*
input1
,
S
*
output
)
{
for
(
size_t
pos
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
pos
<
d0
*
d1
*
d2
*
d3
*
d4
*
d5
*
d6
;
pos
+=
blockDim
.
x
*
gridDim
.
x
)
{
int
i
=
pos
/
(
d1
*
d2
*
d3
*
d4
*
d5
*
d6
)
%
d0
;
int
j
=
pos
/
(
d2
*
d3
*
d4
*
d5
*
d6
)
%
d1
;
int
k
=
pos
/
(
d3
*
d4
*
d5
*
d6
)
%
d2
;
int
l
=
pos
/
(
d4
*
d5
*
d6
)
%
d3
;
int
m
=
pos
/
(
d5
*
d6
)
%
d4
;
int
n
=
pos
/
d6
%
d5
;
int
o
=
pos
%
d6
;
int
l_index
=
Index
(
i
,
l0
)
*
l1
*
l2
*
l3
+
Index
(
j
,
l1
)
*
l2
*
l3
+
Index
(
k
,
l2
)
*
l3
+
Index
(
l
,
l3
);
int
r_index
=
Index
(
i
,
r0
)
*
r1
*
r2
*
r3
+
Index
(
j
,
r1
)
*
r2
*
r3
+
Index
(
k
,
r2
)
*
r3
+
Index
(
l
,
r3
);
int
l_index
=
Index
(
i
,
l0
)
*
l1
*
l2
*
l3
*
l4
*
l5
*
l6
;
l_index
+=
Index
(
j
,
l1
)
*
l2
*
l3
*
l4
*
l5
*
l6
;
l_index
+=
Index
(
k
,
l2
)
*
l3
*
l4
*
l5
*
l6
;
l_index
+=
Index
(
l
,
l3
)
*
l4
*
l5
*
l6
;
l_index
+=
Index
(
m
,
l4
)
*
l5
*
l6
;
l_index
+=
Index
(
n
,
l5
)
*
l6
;
l_index
+=
Index
(
o
,
l6
);
int
r_index
=
Index
(
i
,
r0
)
*
r1
*
r2
*
r3
*
r4
*
r5
*
r6
;
r_index
+=
Index
(
j
,
r1
)
*
r2
*
r3
*
r4
*
r5
*
r6
;
r_index
+=
Index
(
k
,
r2
)
*
r3
*
r4
*
r5
*
r6
;
r_index
+=
Index
(
l
,
r3
)
*
r4
*
r5
*
r6
;
r_index
+=
Index
(
m
,
r4
)
*
r5
*
r6
;
r_index
+=
Index
(
n
,
r5
)
*
r6
;
r_index
+=
Index
(
o
,
r6
);
output
[
pos
]
=
Func
()(
input0
[
l_index
],
input1
[
r_index
]);
}
}
template
<
typename
T
,
typename
S
>
__global__
void
BroadcastKernel
(
const
int
l0
,
const
int
l1
,
const
int
l2
,
const
int
l3
,
const
int
r0
,
const
int
r1
,
const
int
r2
,
const
int
r3
,
const
int
d0
,
const
int
d1
,
const
int
d2
,
const
int
d3
,
enum
BroadcastOpType
op
,
const
T
*
input0
,
const
T
*
input1
,
S
*
output
)
{
__global__
void
BroadcastKernel
(
const
int
l0
,
const
int
l1
,
const
int
l2
,
const
int
l3
,
const
int
l4
,
const
int
l5
,
const
int
l6
,
const
int
r0
,
const
int
r1
,
const
int
r2
,
const
int
r3
,
const
int
r4
,
const
int
r5
,
const
int
r6
,
const
int
d0
,
const
int
d1
,
const
int
d2
,
const
int
d3
,
const
int
d4
,
const
int
d5
,
const
int
d6
,
enum
BroadcastOpType
op
,
const
T
*
input0
,
const
T
*
input1
,
S
*
output
)
{
switch
(
op
)
{
case
BROADCAST_TYPE_GREATER
:
return
BroadcastOperator
<
T
,
S
,
GreaterFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
r0
,
r1
,
r2
,
r3
,
d0
,
d1
,
d2
,
d3
,
input0
,
input
1
,
output
);
return
BroadcastOperator
<
T
,
S
,
GreaterFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
l4
,
l5
,
l6
,
r0
,
r1
,
r2
,
r3
,
r4
,
r5
,
r6
,
d0
,
d
1
,
d2
,
d3
,
d4
,
d5
,
d6
,
input0
,
input1
,
output
);
case
BROADCAST_TYPE_LESS
:
return
BroadcastOperator
<
T
,
S
,
LessFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
r0
,
r1
,
r2
,
r3
,
d0
,
d1
,
d2
,
d3
,
input0
,
input1
,
output
);
return
BroadcastOperator
<
T
,
S
,
LessFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
l4
,
l5
,
l6
,
r0
,
r1
,
r2
,
r3
,
r4
,
r5
,
r6
,
d0
,
d1
,
d2
,
d3
,
d4
,
d5
,
d6
,
input0
,
input1
,
output
);
case
BROADCAST_TYPE_MINIMUM
:
return
BroadcastOperator
<
T
,
S
,
MinimumFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
r0
,
r1
,
r2
,
r3
,
d0
,
d1
,
d2
,
d3
,
input0
,
input
1
,
output
);
return
BroadcastOperator
<
T
,
S
,
MinimumFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
l4
,
l5
,
l6
,
r0
,
r1
,
r2
,
r3
,
r4
,
r5
,
r6
,
d0
,
d
1
,
d2
,
d3
,
d4
,
d5
,
d6
,
input0
,
input1
,
output
);
case
BROADCAST_TYPE_MAXIMUM
:
return
BroadcastOperator
<
T
,
S
,
MaximumFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
r0
,
r1
,
r2
,
r3
,
d0
,
d1
,
d2
,
d3
,
input0
,
input
1
,
output
);
return
BroadcastOperator
<
T
,
S
,
MaximumFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
l4
,
l5
,
l6
,
r0
,
r1
,
r2
,
r3
,
r4
,
r5
,
r6
,
d0
,
d
1
,
d2
,
d3
,
d4
,
d5
,
d6
,
input0
,
input1
,
output
);
case
BROADCAST_TYPE_POWER
:
return
BroadcastOperator
<
T
,
S
,
PowerFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
r0
,
r1
,
r2
,
r3
,
d0
,
d1
,
d2
,
d3
,
input0
,
input
1
,
output
);
return
BroadcastOperator
<
T
,
S
,
PowerFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
l4
,
l5
,
l6
,
r0
,
r1
,
r2
,
r3
,
r4
,
r5
,
r6
,
d0
,
d
1
,
d2
,
d3
,
d4
,
d5
,
d6
,
input0
,
input1
,
output
);
case
BROADCAST_TYPE_REALDIV
:
return
BroadcastOperator
<
T
,
S
,
RealDivFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
r0
,
r1
,
r2
,
r3
,
d0
,
d1
,
d2
,
d3
,
input0
,
input
1
,
output
);
return
BroadcastOperator
<
T
,
S
,
RealDivFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
l4
,
l5
,
l6
,
r0
,
r1
,
r2
,
r3
,
r4
,
r5
,
r6
,
d0
,
d
1
,
d2
,
d3
,
d4
,
d5
,
d6
,
input0
,
input1
,
output
);
case
BROADCAST_TYPE_MUL
:
return
BroadcastOperator
<
T
,
S
,
MulFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
r0
,
r1
,
r2
,
r3
,
d0
,
d1
,
d2
,
d3
,
input0
,
input1
,
output
);
return
BroadcastOperator
<
T
,
S
,
MulFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
l4
,
l5
,
l6
,
r0
,
r1
,
r2
,
r3
,
r4
,
r5
,
r6
,
d0
,
d1
,
d2
,
d3
,
d4
,
d5
,
d6
,
input0
,
input1
,
output
);
case
BROADCAST_TYPE_SUB
:
return
BroadcastOperator
<
T
,
S
,
SubFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
r0
,
r1
,
r2
,
r3
,
d0
,
d1
,
d2
,
d3
,
input0
,
input1
,
output
);
return
BroadcastOperator
<
T
,
S
,
SubFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
l4
,
l5
,
l6
,
r0
,
r1
,
r2
,
r3
,
r4
,
r5
,
r6
,
d0
,
d1
,
d2
,
d3
,
d4
,
d5
,
d6
,
input0
,
input1
,
output
);
case
BROADCAST_TYPE_ADD
:
return
BroadcastOperator
<
T
,
S
,
AddFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
r0
,
r1
,
r2
,
r3
,
d0
,
d1
,
d2
,
d3
,
input0
,
input1
,
output
);
return
BroadcastOperator
<
T
,
S
,
AddFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
l4
,
l5
,
l6
,
r0
,
r1
,
r2
,
r3
,
r4
,
r5
,
r6
,
d0
,
d1
,
d2
,
d3
,
d4
,
d5
,
d6
,
input0
,
input1
,
output
);
case
BROADCAST_TYPE_FLOORDIV
:
return
BroadcastOperator
<
T
,
S
,
FloorDivFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
r0
,
r1
,
r2
,
r3
,
d0
,
d1
,
d2
,
d3
,
input0
,
input
1
,
output
);
return
BroadcastOperator
<
T
,
S
,
FloorDivFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
l4
,
l5
,
l6
,
r0
,
r1
,
r2
,
r3
,
r4
,
r5
,
r6
,
d0
,
d
1
,
d2
,
d3
,
d4
,
d5
,
d6
,
input0
,
input1
,
output
);
case
BROADCAST_TYPE_ABSGRAD
:
return
BroadcastOperator
<
T
,
S
,
AbsGradFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
r0
,
r1
,
r2
,
r3
,
d0
,
d1
,
d2
,
d3
,
input0
,
input
1
,
output
);
return
BroadcastOperator
<
T
,
S
,
AbsGradFunc
<
T
,
S
>>
(
l0
,
l1
,
l2
,
l3
,
l4
,
l5
,
l6
,
r0
,
r1
,
r2
,
r3
,
r4
,
r5
,
r6
,
d0
,
d
1
,
d2
,
d3
,
d4
,
d5
,
d6
,
input0
,
input1
,
output
);
}
}
template
<
typename
T
,
typename
S
>
void
Broadcast
(
const
int
&
l0
,
const
int
&
l1
,
const
int
&
l2
,
const
int
&
l3
,
const
int
&
r0
,
const
int
&
r1
,
const
int
&
r2
,
const
int
&
r3
,
const
int
&
d0
,
const
int
&
d1
,
const
int
&
d2
,
const
int
&
d3
,
enum
BroadcastOpType
op
,
const
T
*
input0
,
const
T
*
input1
,
S
*
output
,
cudaStream_t
stream
)
{
int
size
=
d0
*
d1
*
d2
*
d3
;
BroadcastKernel
<<<
GET_BLOCKS
(
size
),
GET_THREADS
,
0
,
stream
>>>
(
l0
,
l1
,
l2
,
l3
,
r0
,
r1
,
r2
,
r3
,
d0
,
d1
,
d2
,
d3
,
op
,
input0
,
input1
,
output
);
void
Broadcast
(
const
std
::
vector
<
int
>
&
lhs_shape
,
const
std
::
vector
<
int
>
&
rhs_shape
,
const
std
::
vector
<
int
>
&
output_shape
,
enum
BroadcastOpType
op
,
const
T
*
input0
,
const
T
*
input1
,
S
*
output
,
cudaStream_t
stream
)
{
int
size
=
1
;
for
(
auto
d
:
output_shape
)
{
size
*=
d
;
}
BroadcastKernel
<<<
GET_BLOCKS
(
size
),
GET_THREADS
,
0
,
stream
>>>
(
lhs_shape
[
0
],
lhs_shape
[
1
],
lhs_shape
[
2
],
lhs_shape
[
3
],
lhs_shape
[
4
],
lhs_shape
[
5
],
lhs_shape
[
6
],
rhs_shape
[
0
],
rhs_shape
[
1
],
rhs_shape
[
2
],
rhs_shape
[
3
],
rhs_shape
[
4
],
rhs_shape
[
5
],
rhs_shape
[
6
],
output_shape
[
0
],
output_shape
[
1
],
output_shape
[
2
],
output_shape
[
3
],
output_shape
[
4
],
output_shape
[
5
],
output_shape
[
6
],
op
,
input0
,
input1
,
output
);
}
template
<
typename
T
,
typename
S
,
typename
Func
>
...
...
@@ -236,30 +265,24 @@ void BroadcastTo(const int &i0, const int &i1, const int &i2, const int &i3, con
output_addr
);
}
template
void
Broadcast
(
const
int
&
l0
,
const
int
&
l1
,
const
int
&
l2
,
const
int
&
l3
,
const
int
&
r0
,
const
int
&
r1
,
const
int
&
r2
,
const
int
&
r3
,
const
int
&
d0
,
const
int
&
d1
,
const
int
&
d2
,
const
int
&
d3
,
enum
BroadcastOpType
op
,
const
float
*
input0
,
const
float
*
input1
,
bool
*
output
,
cudaStream_t
stream
);
template
void
Broadcast
(
const
int
&
l0
,
const
int
&
l1
,
const
int
&
l2
,
const
int
&
l3
,
const
int
&
r0
,
const
int
&
r1
,
const
int
&
r2
,
const
int
&
r3
,
const
int
&
d0
,
const
int
&
d1
,
const
int
&
d2
,
const
int
&
d3
,
enum
BroadcastOpType
op
,
const
float
*
input0
,
const
float
*
input1
,
float
*
output
,
cudaStream_t
stream
);
template
void
Broadcast
(
const
int
&
l0
,
const
int
&
l1
,
const
int
&
l2
,
const
int
&
l3
,
const
int
&
r0
,
const
int
&
r1
,
const
int
&
r2
,
const
int
&
r3
,
const
int
&
d0
,
const
int
&
d1
,
const
int
&
d2
,
const
int
&
d3
,
enum
BroadcastOpType
op
,
const
half
*
input0
,
const
half
*
input1
,
bool
*
output
,
cudaStream_t
stream
);
template
void
Broadcast
(
const
int
&
l0
,
const
int
&
l1
,
const
int
&
l2
,
const
int
&
l3
,
const
int
&
r0
,
const
int
&
r1
,
const
int
&
r2
,
const
int
&
r3
,
const
int
&
d0
,
const
int
&
d1
,
const
int
&
d2
,
const
int
&
d3
,
enum
BroadcastOpType
op
,
const
half
*
input0
,
const
half
*
input1
,
half
*
output
,
cudaStream_t
stream
);
template
void
Broadcast
(
const
int
&
l0
,
const
int
&
l1
,
const
int
&
l2
,
const
int
&
l3
,
const
int
&
r0
,
const
int
&
r1
,
const
int
&
r2
,
const
int
&
r3
,
const
int
&
d0
,
const
int
&
d1
,
const
int
&
d2
,
const
int
&
d3
,
enum
BroadcastOpType
op
,
const
int
*
input0
,
const
int
*
input1
,
int
*
output
,
cudaStream_t
stream
);
template
void
Broadcast
(
const
int
&
l0
,
const
int
&
l1
,
const
int
&
l2
,
const
int
&
l3
,
const
int
&
r0
,
const
int
&
r1
,
const
int
&
r2
,
const
int
&
r3
,
const
int
&
d0
,
const
int
&
d1
,
const
int
&
d2
,
const
int
&
d3
,
enum
BroadcastOpType
op
,
const
int
*
input0
,
const
int
*
input1
,
bool
*
output
,
cudaStream_t
stream
);
template
void
Broadcast
(
const
std
::
vector
<
int
>
&
lhs_shape
,
const
std
::
vector
<
int
>
&
rhs_shape
,
const
std
::
vector
<
int
>
&
output_shape
,
enum
BroadcastOpType
op
,
const
float
*
input0
,
const
float
*
input1
,
bool
*
output
,
cudaStream_t
stream
);
template
void
Broadcast
(
const
std
::
vector
<
int
>
&
lhs_shape
,
const
std
::
vector
<
int
>
&
rhs_shape
,
const
std
::
vector
<
int
>
&
output_shape
,
enum
BroadcastOpType
op
,
const
float
*
input0
,
const
float
*
input1
,
float
*
output
,
cudaStream_t
stream
);
template
void
Broadcast
(
const
std
::
vector
<
int
>
&
lhs_shape
,
const
std
::
vector
<
int
>
&
rhs_shape
,
const
std
::
vector
<
int
>
&
output_shape
,
enum
BroadcastOpType
op
,
const
half
*
input0
,
const
half
*
input1
,
bool
*
output
,
cudaStream_t
stream
);
template
void
Broadcast
(
const
std
::
vector
<
int
>
&
lhs_shape
,
const
std
::
vector
<
int
>
&
rhs_shape
,
const
std
::
vector
<
int
>
&
output_shape
,
enum
BroadcastOpType
op
,
const
half
*
input0
,
const
half
*
input1
,
half
*
output
,
cudaStream_t
stream
);
template
void
Broadcast
(
const
std
::
vector
<
int
>
&
lhs_shape
,
const
std
::
vector
<
int
>
&
rhs_shape
,
const
std
::
vector
<
int
>
&
output_shape
,
enum
BroadcastOpType
op
,
const
int
*
input0
,
const
int
*
input1
,
int
*
output
,
cudaStream_t
stream
);
template
void
Broadcast
(
const
std
::
vector
<
int
>
&
lhs_shape
,
const
std
::
vector
<
int
>
&
rhs_shape
,
const
std
::
vector
<
int
>
&
output_shape
,
enum
BroadcastOpType
op
,
const
int
*
input0
,
const
int
*
input1
,
bool
*
output
,
cudaStream_t
stream
);
template
void
NoBroadcast
(
const
int
&
nums
,
enum
BroadcastOpType
op
,
const
float
*
input0
,
const
float
*
input1
,
bool
*
output
,
cudaStream_t
stream
);
template
void
NoBroadcast
(
const
int
&
nums
,
enum
BroadcastOpType
op
,
const
float
*
input0
,
const
float
*
input1
,
...
...
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/broadcast_impl.cuh
浏览文件 @
fb405ee6
...
...
@@ -17,6 +17,7 @@
#ifndef MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_
#define MINDSPORE_CCSRC_KERNEL_GPU_CUDA_IMPL_BROADCAST_H_
#include <vector>
#include "runtime/device/gpu/cuda_common.h"
enum
BroadcastOpType
{
...
...
@@ -35,9 +36,9 @@ enum BroadcastOpType {
};
template
<
typename
T
,
typename
S
>
void
Broadcast
(
const
int
&
l0
,
const
int
&
l1
,
const
int
&
l2
,
const
int
&
l3
,
const
int
&
r0
,
const
int
&
r1
,
const
int
&
r2
,
const
int
&
r3
,
const
int
&
d0
,
const
int
&
d1
,
const
int
&
d2
,
const
int
&
d3
,
enum
BroadcastOpType
op
,
const
T
*
input0
,
const
T
*
input1
,
S
*
output
,
cudaStream_t
stream
);
void
Broadcast
(
const
std
::
vector
<
int
>
&
lhs_shape
,
const
std
::
vector
<
int
>
&
rhs_shape
,
const
std
::
vector
<
int
>
&
output_shape
,
enum
BroadcastOpType
op
,
const
T
*
input0
,
const
T
*
input1
,
S
*
output
,
cudaStream_t
stream
);
template
<
typename
T
,
typename
S
>
void
NoBroadcast
(
const
int
&
size
,
enum
BroadcastOpType
op
,
const
T
*
input0
,
const
T
*
input1
,
S
*
output
,
...
...
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/check_valid_impl.cu
浏览文件 @
fb405ee6
...
...
@@ -25,10 +25,10 @@ __global__ void CheckValidKernel(const size_t size, const T *box, const T *img_m
const
size_t
right_y
=
i
*
4
+
3
;
S
valid_flag
=
false
;
valid_flag
|=
!
(
box
[
left_x
]
>=
0.
f
);
valid_flag
|=
!
(
box
[
left_y
]
>=
0.
f
);
valid_flag
|=
!
(
img_metas
[
0
]
*
img_metas
[
2
]
-
1.
f
>=
box
[
right_x
]);
valid_flag
|=
!
(
img_metas
[
1
]
*
img_metas
[
2
]
-
1.
f
>=
box
[
right_y
]);
valid_flag
|=
!
(
box
[
left_x
]
>=
static_cast
<
T
>
(
0.0
)
);
valid_flag
|=
!
(
box
[
left_y
]
>=
static_cast
<
T
>
(
0.0
)
);
valid_flag
|=
!
(
img_metas
[
1
]
*
img_metas
[
2
]
-
static_cast
<
T
>
(
1.0
)
>=
box
[
right_x
]);
valid_flag
|=
!
(
img_metas
[
0
]
*
img_metas
[
2
]
-
static_cast
<
T
>
(
1.0
)
>=
box
[
right_y
]);
valid
[
i
]
=
!
valid_flag
;
}
...
...
@@ -43,3 +43,5 @@ void CheckValid(const size_t &size, const T *box, const T *img_metas, S *valid,
template
void
CheckValid
(
const
size_t
&
size
,
const
float
*
box
,
const
float
*
img_metas
,
bool
*
valid
,
cudaStream_t
cuda_stream
);
template
void
CheckValid
(
const
size_t
&
size
,
const
half
*
box
,
const
half
*
img_metas
,
bool
*
valid
,
cudaStream_t
cuda_stream
);
mindspore/ccsrc/backend/kernel_compiler/gpu/cuda_impl/iou_impl.cu
浏览文件 @
fb405ee6
...
...
@@ -16,27 +16,26 @@
#include "backend/kernel_compiler/gpu/cuda_impl/iou_impl.cuh"
template
<
typename
T
>
__device__
T
CoordinateMax
(
const
T
a
,
const
T
b
)
{
__device__
float
CoordinateMax
(
const
float
a
,
const
float
b
)
{
return
(
a
>
b
?
a
:
b
);
}
template
<
typename
T
>
__device__
T
CoordinateMin
(
const
T
a
,
const
T
b
)
{
__device__
float
CoordinateMin
(
const
float
a
,
const
float
b
)
{
return
(
a
<
b
?
a
:
b
);
}
template
<
typename
T
>
__global__
void
IOUKernel
(
const
size_t
size
,
const
T
*
box1
,
const
T
*
box2
,
T
*
iou_results
,
const
size_t
mode
,
const
size_t
input_len_0
)
{
T
location_coordinate
[
IOU_LOCATION_NUM
][
IOU_DIMENSION
];
T
overlaps_coordinate
[
IOU_DIMENSION
];
const
T
epsilon
=
1e-10
;
float
location_coordinate
[
IOU_LOCATION_NUM
][
IOU_DIMENSION
];
float
overlaps_coordinate
[
IOU_DIMENSION
];
const
float
epsilon
=
1e-10
;
const
float
offset
=
1.0
;
for
(
size_t
i
=
blockIdx
.
x
*
blockDim
.
x
+
threadIdx
.
x
;
i
<
size
;
i
+=
gridDim
.
x
*
blockDim
.
x
)
{
for
(
size_t
j
=
0
;
j
<
IOU_DIMENSION
;
j
++
)
{
location_coordinate
[
0
][
j
]
=
box1
[(
i
%
input_len_0
)
*
IOU_DIMENSION
+
j
]
;
location_coordinate
[
1
][
j
]
=
box2
[(
i
/
input_len_0
)
*
IOU_DIMENSION
+
j
]
;
location_coordinate
[
0
][
j
]
=
static_cast
<
float
>
(
box1
[(
i
%
input_len_0
)
*
IOU_DIMENSION
+
j
])
;
location_coordinate
[
1
][
j
]
=
static_cast
<
float
>
(
box2
[(
i
/
input_len_0
)
*
IOU_DIMENSION
+
j
])
;
}
overlaps_coordinate
[
0
]
=
CoordinateMax
(
location_coordinate
[
0
][
0
],
location_coordinate
[
1
][
0
]);
...
...
@@ -44,18 +43,18 @@ __global__ void IOUKernel(const size_t size, const T *box1, const T *box2, T *io
overlaps_coordinate
[
2
]
=
CoordinateMin
(
location_coordinate
[
0
][
2
],
location_coordinate
[
1
][
2
]);
overlaps_coordinate
[
3
]
=
CoordinateMin
(
location_coordinate
[
0
][
3
],
location_coordinate
[
1
][
3
]);
T
overlaps_w
=
CoordinateMax
(
0.
f
,
overlaps_coordinate
[
2
]
-
overlaps_coordinate
[
0
]
+
1
);
T
overlaps_h
=
CoordinateMax
(
0.
f
,
overlaps_coordinate
[
3
]
-
overlaps_coordinate
[
1
]
+
1
);
T
overlaps
=
overlaps_w
*
overlaps_h
;
float
overlaps_w
=
CoordinateMax
(
0.0
,
overlaps_coordinate
[
2
]
-
overlaps_coordinate
[
0
]
+
offset
);
float
overlaps_h
=
CoordinateMax
(
0.0
,
overlaps_coordinate
[
3
]
-
overlaps_coordinate
[
1
]
+
offset
);
float
overlaps
=
overlaps_w
*
overlaps_h
;
T
area1
=
(
location_coordinate
[
0
][
2
]
-
location_coordinate
[
0
][
0
]
+
1
)
*
(
location_coordinate
[
0
][
3
]
-
location_coordinate
[
0
][
1
]
+
1
);
T
area2
=
(
location_coordinate
[
1
][
2
]
-
location_coordinate
[
1
][
0
]
+
1
)
*
(
location_coordinate
[
1
][
3
]
-
location_coordinate
[
1
][
1
]
+
1
);
float
area1
=
(
location_coordinate
[
0
][
2
]
-
location_coordinate
[
0
][
0
]
+
offset
)
*
(
location_coordinate
[
0
][
3
]
-
location_coordinate
[
0
][
1
]
+
offset
);
float
area2
=
(
location_coordinate
[
1
][
2
]
-
location_coordinate
[
1
][
0
]
+
offset
)
*
(
location_coordinate
[
1
][
3
]
-
location_coordinate
[
1
][
1
]
+
offset
);
if
(
mode
==
0
)
{
iou_results
[
i
]
=
overlaps
/
(
area1
+
area2
-
overlaps
+
epsilon
);
iou_results
[
i
]
=
static_cast
<
T
>
(
overlaps
/
(
area1
+
area2
-
overlaps
+
epsilon
)
);
}
else
{
iou_results
[
i
]
=
overlaps
/
(
area2
+
epsilon
);
iou_results
[
i
]
=
static_cast
<
T
>
(
overlaps
/
(
area2
+
epsilon
)
);
}
}
...
...
@@ -70,3 +69,5 @@ void IOU(const size_t &size, const T *box1, const T *box2, T *iou_results, const
template
void
IOU
(
const
size_t
&
size
,
const
float
*
box1
,
const
float
*
box2
,
float
*
iou_results
,
const
size_t
&
mode
,
const
size_t
&
input_len_0
,
cudaStream_t
cuda_stream
);
template
void
IOU
(
const
size_t
&
size
,
const
half
*
box1
,
const
half
*
box2
,
half
*
iou_results
,
const
size_t
&
mode
,
const
size_t
&
input_len_0
,
cudaStream_t
cuda_stream
);
mindspore/ccsrc/backend/kernel_compiler/gpu/gpu_kernel.h
浏览文件 @
fb405ee6
...
...
@@ -84,6 +84,40 @@ class GpuKernel : public KernelMod {
}
}
// set the tensor descriptor for cudnn/cublas
void
CudnnSetTensorNdDescriptor
(
const
std
::
vector
<
size_t
>
&
shape
,
cudnnTensorDescriptor_t
descriptor
,
cudnnDataType_t
data_type
)
{
if
(
shape
.
size
()
<
3
)
{
MS_EXCEPTION
(
ValueError
)
<<
"cudnnSetTensorNdDescriptor don't support"
<<
shape
.
size
()
<<
"D."
;
}
const
int
nbDims
=
shape
.
size
();
int
*
dim
=
new
(
std
::
nothrow
)
int
[
nbDims
];
if
(
dim
==
nullptr
)
{
MS_LOG
(
EXCEPTION
)
<<
"malloc dim failed."
;
}
int
*
stride
=
new
(
std
::
nothrow
)
int
[
nbDims
];
if
(
stride
==
nullptr
)
{
MS_LOG
(
EXCEPTION
)
<<
"malloc stride failed."
;
}
for
(
int
i
=
0
;
i
<
nbDims
;
i
++
)
{
dim
[
i
]
=
SizeToInt
(
shape
[
i
]);
stride
[
i
]
=
1
;
}
for
(
int
i
=
nbDims
-
2
;
i
>=
0
;
i
--
)
{
stride
[
i
]
=
stride
[
i
+
1
]
*
SizeToInt
(
shape
[
i
+
1
]);
}
CHECK_CUDNN_RET_WITH_EXCEPT
(
cudnnSetTensorNdDescriptor
(
descriptor
,
data_type
,
nbDims
,
dim
,
stride
),
"cudnnSetTensorNdDescriptor failed"
);
delete
[]
dim
;
dim
=
nullptr
;
delete
[]
stride
;
stride
=
nullptr
;
}
// choose the suitable datatype for cudnn/cublas
inline
cudnnDataType_t
GetCudnnDataType
(
const
std
::
string
&
Type
)
{
auto
type
=
kCudnnDtypeMap
.
find
(
Type
);
...
...
mindspore/ccsrc/backend/kernel_compiler/gpu/math/broadcast_gpu_kernel.h
浏览文件 @
fb405ee6
...
...
@@ -27,6 +27,7 @@
#include "backend/kernel_compiler/gpu/kernel_constants.h"
namespace
mindspore
{
namespace
kernel
{
constexpr
int
MAX_DIMS
=
7
;
template
<
typename
T
,
typename
S
>
class
BroadcastOpGpuKernel
:
public
GpuKernel
{
public:
...
...
@@ -45,9 +46,8 @@ class BroadcastOpGpuKernel : public GpuKernel {
S
*
output
=
GetDeviceAddress
<
S
>
(
outputs
,
0
);
if
(
need_broadcast_
)
{
Broadcast
(
lhs_shape_
[
0
],
lhs_shape_
[
1
],
lhs_shape_
[
2
],
lhs_shape_
[
3
],
rhs_shape_
[
0
],
rhs_shape_
[
1
],
rhs_shape_
[
2
],
rhs_shape_
[
3
],
output_shape_
[
0
],
output_shape_
[
1
],
output_shape_
[
2
],
output_shape_
[
3
],
op_type_
,
lhs
,
rhs
,
output
,
reinterpret_cast
<
cudaStream_t
>
(
stream_ptr
));
Broadcast
(
lhs_shape_
,
rhs_shape_
,
output_shape_
,
op_type_
,
lhs
,
rhs
,
output
,
reinterpret_cast
<
cudaStream_t
>
(
stream_ptr
));
}
else
{
NoBroadcast
(
output_num_
,
op_type_
,
lhs
,
rhs
,
output
,
reinterpret_cast
<
cudaStream_t
>
(
stream_ptr
));
}
...
...
@@ -60,10 +60,13 @@ class BroadcastOpGpuKernel : public GpuKernel {
auto
shape2
=
AnfAlgo
::
GetPrevNodeOutputInferShape
(
kernel_node
,
1
);
auto
shape3
=
AnfAlgo
::
GetOutputInferShape
(
kernel_node
,
0
);
need_broadcast_
=
IsBroadcast
(
shape1
,
shape2
);
if
(
need_broadcast_
&&
shape1
.
size
()
>
4
)
{
MS_LOG
(
EXCEPTION
)
<<
"Broadcast operation not support dim greater than
4
"
;
if
(
need_broadcast_
&&
shape1
.
size
()
>
7
)
{
MS_LOG
(
EXCEPTION
)
<<
"Broadcast operation not support dim greater than
7
"
;
}
lhs_shape_
.
resize
(
MAX_DIMS
,
1
);
rhs_shape_
.
resize
(
MAX_DIMS
,
1
);
output_shape_
.
resize
(
MAX_DIMS
,
1
);
for
(
size_t
i
=
0
;
i
<
shape3
.
size
();
i
++
)
{
output_shape_
[
i
]
=
shape3
[
i
];
output_num_
*=
shape3
[
i
];
...
...
@@ -127,9 +130,9 @@ class BroadcastOpGpuKernel : public GpuKernel {
int
input1_num_
;
int
input2_num_
;
int
output_num_
;
int
lhs_shape_
[
4
]
=
{
1
,
1
,
1
,
1
}
;
int
rhs_shape_
[
4
]
=
{
1
,
1
,
1
,
1
}
;
int
output_shape_
[
4
]
=
{
1
,
1
,
1
,
1
}
;
std
::
vector
<
int
>
lhs_shape_
;
std
::
vector
<
int
>
rhs_shape_
;
std
::
vector
<
int
>
output_shape_
;
std
::
vector
<
size_t
>
input_size_list_
;
std
::
vector
<
size_t
>
output_size_list_
;
...
...
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_gpu_kernel.h
浏览文件 @
fb405ee6
...
...
@@ -83,12 +83,19 @@ class ActivationGpuFwdKernel : public GpuKernel {
return
true
;
}
std
::
vector
<
int
>
shape
;
ShapeNdTo4d
(
input_shape
,
&
shape
);
CHECK_CUDNN_RET_WITH_EXCEPT
(
cudnnSetActivationDescriptor
(
activation_desc_
,
mode_
,
CUDNN_NOT_PROPAGATE_NAN
,
0.0
),
"cudnnSetActivationDescriptor failed"
);
CHECK_CUDNN_RET_WITH_EXCEPT
(
cudnnSetTensor4dDescriptor
(
data_descriptor_
,
CUDNN_TENSOR_NCHW
,
cudnn_data_type_
,
shape
[
0
],
shape
[
1
],
shape
[
2
],
shape
[
3
]),
"cudnnSetTensor4dDescriptor failed"
);
const
int
split_dim
=
4
;
if
(
input_shape
.
size
()
<=
split_dim
)
{
ShapeNdTo4d
(
input_shape
,
&
shape
);
CHECK_CUDNN_RET_WITH_EXCEPT
(
cudnnSetTensor4dDescriptor
(
data_descriptor_
,
CUDNN_TENSOR_NCHW
,
cudnn_data_type_
,
shape
[
0
],
shape
[
1
],
shape
[
2
],
shape
[
3
]),
"cudnnSetTensor4dDescriptor failed"
);
}
else
{
CudnnSetTensorNdDescriptor
(
input_shape
,
data_descriptor_
,
cudnn_data_type_
);
}
InitSizeLists
();
return
true
;
}
...
...
mindspore/ccsrc/backend/kernel_compiler/gpu/nn/activation_grad_kernel.h
浏览文件 @
fb405ee6
...
...
@@ -90,12 +90,18 @@ class ActivationGradGpuKernel : public GpuKernel {
return
true
;
}
std
::
vector
<
int
>
shape
;
ShapeNdTo4d
(
input_shape
,
&
shape
);
CHECK_CUDNN_RET_WITH_EXCEPT
(
cudnnSetActivationDescriptor
(
activation_desc_
,
mode_
,
CUDNN_PROPAGATE_NAN
,
0.0
),
"SetActivationDescriptor failed"
);
CHECK_CUDNN_RET_WITH_EXCEPT
(
cudnnSetTensor4dDescriptor
(
data_descriptor_
,
CUDNN_TENSOR_NCHW
,
cudnn_data_type_
,
shape
[
0
],
shape
[
1
],
shape
[
2
],
shape
[
3
]),
"SetTensor4dDescriptor failed"
);
const
int
split_dim
=
4
;
if
(
input_shape
.
size
()
<=
split_dim
)
{
ShapeNdTo4d
(
input_shape
,
&
shape
);
CHECK_CUDNN_RET_WITH_EXCEPT
(
cudnnSetTensor4dDescriptor
(
data_descriptor_
,
CUDNN_TENSOR_NCHW
,
cudnn_data_type_
,
shape
[
0
],
shape
[
1
],
shape
[
2
],
shape
[
3
]),
"SetTensor4dDescriptor failed"
);
}
else
{
CudnnSetTensorNdDescriptor
(
input_shape
,
data_descriptor_
,
cudnn_data_type_
);
}
InitSizeLists
();
return
true
;
...
...
mindspore/ccsrc/backend/kernel_compiler/gpu/other/check_valid_gpu_kernel.cc
浏览文件 @
fb405ee6
...
...
@@ -22,5 +22,9 @@ MS_REG_GPU_KERNEL_TWO(
CheckValid
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat32
).
AddInputAttr
(
kNumberTypeFloat32
).
AddOutputAttr
(
kNumberTypeBool
),
CheckValidGpuKernel
,
float
,
bool
)
MS_REG_GPU_KERNEL_TWO
(
CheckValid
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat16
).
AddInputAttr
(
kNumberTypeFloat16
).
AddOutputAttr
(
kNumberTypeBool
),
CheckValidGpuKernel
,
half
,
bool
)
}
// namespace kernel
}
// namespace mindspore
mindspore/ccsrc/backend/kernel_compiler/gpu/other/iou_gpu_kernel.cc
浏览文件 @
fb405ee6
...
...
@@ -21,5 +21,8 @@ namespace kernel {
MS_REG_GPU_KERNEL_ONE
(
IOU
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat32
).
AddInputAttr
(
kNumberTypeFloat32
).
AddOutputAttr
(
kNumberTypeFloat32
),
IOUGpuKernel
,
float
)
MS_REG_GPU_KERNEL_ONE
(
IOU
,
KernelAttr
().
AddInputAttr
(
kNumberTypeFloat16
).
AddInputAttr
(
kNumberTypeFloat16
).
AddOutputAttr
(
kNumberTypeFloat16
),
IOUGpuKernel
,
half
)
}
// namespace kernel
}
// namespace mindspore
tests/st/ops/gpu/test_floordiv_op.py
浏览文件 @
fb405ee6
...
...
@@ -37,8 +37,8 @@ def test_floor_div():
y0_np
=
np
.
random
.
randint
(
1
,
5
,
(
2
,
3
,
4
,
4
)).
astype
(
np
.
float32
)
x1_np
=
np
.
random
.
randint
(
1
,
5
,
(
2
,
3
,
4
,
4
)).
astype
(
np
.
float32
)
y1_np
=
np
.
random
.
randint
(
1
,
5
,
(
2
,
1
,
4
,
4
)).
astype
(
np
.
float32
)
x2_np
=
np
.
random
.
randint
(
1
,
5
,
(
2
,
1
,
1
,
4
)).
astype
(
np
.
float32
)
y2_np
=
np
.
random
.
randint
(
1
,
5
,
(
2
,
3
,
4
,
4
)).
astype
(
np
.
float32
)
x2_np
=
np
.
random
.
randint
(
1
,
5
,
(
2
,
1
,
1
,
4
,
9
)).
astype
(
np
.
float32
)
y2_np
=
np
.
random
.
randint
(
1
,
5
,
(
2
,
3
,
4
,
4
,
9
)).
astype
(
np
.
float32
)
x3_np
=
np
.
random
.
randint
(
1
,
5
,
1
).
astype
(
np
.
float32
)
y3_np
=
np
.
random
.
randint
(
1
,
5
,
1
).
astype
(
np
.
float32
)
x4_np
=
np
.
array
(
768
).
astype
(
np
.
float32
)
...
...
tests/st/ops/gpu/test_reduce_mean_op.py
浏览文件 @
fb405ee6
...
...
@@ -70,7 +70,7 @@ x11 = np.random.rand(1, 1, 1, 1).astype(np.float32)
axis11
=
(
0
,
1
,
2
,
3
)
keep_dims11
=
False
x12
=
np
.
random
.
rand
(
2
,
3
,
4
,
4
).
astype
(
np
.
float32
)
x12
=
np
.
random
.
rand
(
2
,
3
,
4
,
4
,
5
,
6
).
astype
(
np
.
float32
)
axis12
=
-
2
keep_dims12
=
False
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录