Commit 378a7122
Authored May 07, 2020 by mindspore-ci-bot; committed by Gitee on May 07, 2020

!372 Gpu support BatchMatMul kernel

Merge pull request !372 from chenweifeng/batchmatmul

Parent commits: 97d21ba0, 9a7702b8
Showing 3 changed files with 152 additions and 17 deletions (+152 / -17):

mindspore/ccsrc/kernel/gpu/math/matmul_gpu_kernel.cc  (+8 / -0)
mindspore/ccsrc/kernel/gpu/math/matmul_gpu_kernel.h   (+24 / -17)
tests/st/ops/gpu/test_batch_matmul.py                 (+120 / -0)
mindspore/ccsrc/kernel/gpu/math/matmul_gpu_kernel.cc

@@ -26,5 +26,13 @@ MS_REG_GPU_KERNEL_ONE(
   MatMul,
   KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
   MatMulGpuKernel, half)
+MS_REG_GPU_KERNEL_ONE(
+  BatchMatMul,
+  KernelAttr().AddInputAttr(kNumberTypeFloat32).AddInputAttr(kNumberTypeFloat32).AddOutputAttr(kNumberTypeFloat32),
+  MatMulGpuKernel, float)
+MS_REG_GPU_KERNEL_ONE(
+  BatchMatMul,
+  KernelAttr().AddInputAttr(kNumberTypeFloat16).AddInputAttr(kNumberTypeFloat16).AddOutputAttr(kNumberTypeFloat16),
+  MatMulGpuKernel, half)
 }  // namespace kernel
 }  // namespace mindspore
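The two new MS_REG_GPU_KERNEL_ONE entries reuse the existing MatMulGpuKernel class for BatchMatMul, instantiated once for float32 and once for float16. As a minimal sketch of how both registrations are exercised from Python (the Net cell here is hypothetical and mirrors the test added below; it assumes a GPU build of MindSpore that includes this commit):

import numpy as np
import mindspore.nn as nn
import mindspore.context as context
from mindspore import Tensor
from mindspore.ops import operations as P
from mindspore.common import dtype as mstype

class Net(nn.Cell):
    """Thin wrapper around the BatchMatMul primitive, mirroring the test below."""
    def __init__(self):
        super(Net, self).__init__()
        self.batch_matmul = P.BatchMatMul()

    def construct(self, x, y):
        return self.batch_matmul(x, y)

context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
x, y = np.ones((2, 4, 1, 3)), np.ones((2, 4, 3, 4))
# float32 inputs dispatch to the kNumberTypeFloat32 registration ...
out_fp32 = Net()(Tensor(x, mstype.float32), Tensor(y, mstype.float32))
# ... and float16 inputs dispatch to the kNumberTypeFloat16 one.
out_fp16 = Net()(Tensor(x, mstype.float16), Tensor(y, mstype.float16))
print(out_fp32.asnumpy().dtype, out_fp16.asnumpy().dtype)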
mindspore/ccsrc/kernel/gpu/math/matmul_gpu_kernel.h

@@ -38,7 +38,10 @@ class MatMulGpuKernel : public GpuKernel {
         transpose_x1_(CUBLAS_OP_N),
         transpose_x2_(CUBLAS_OP_N),
         handle_(nullptr),
-        cudaDataType_(CUDA_R_32F) {}
+        dtype_a_(CUDA_R_32F),
+        dtype_b_(CUDA_R_32F),
+        dtype_c_(CUDA_R_32F),
+        algo_(CUBLAS_GEMM_DEFAULT_TENSOR_OP) {}
   ~MatMulGpuKernel() = default;
   const std::vector<size_t> &GetInputSizeList() const override { return input_size_list_; }
   const std::vector<size_t> &GetOutputSizeList() const override { return output_size_list_; }

@@ -54,24 +57,25 @@ class MatMulGpuKernel : public GpuKernel {
     const float alpha = 1;
     const float beta = 0;
-    const int lda = (transpose_x2_ == CUBLAS_OP_T) ? SizeToInt(k_) : SizeToInt(n_);
-    const int ldb = (transpose_x1_ == CUBLAS_OP_T) ? SizeToInt(m_) : SizeToInt(k_);
-    for (size_t i = 0; i < batch_; i++) {
-      auto input1_slice = input1_addr + i * m_ * k_;
-      auto input2_slice = input2_addr + i * k_ * n_;
-      auto output_slice = output_addr + i * m_ * n_;
-      CHECK_CUBLAS_RET_WITH_EXCEPT(
-        cublasSgemmEx(handle_, transpose_x2_, transpose_x1_, SizeToInt(n_), SizeToInt(m_), SizeToInt(k_), &alpha,
-                      input2_slice, cudaDataType_, lda, input1_slice, cudaDataType_, ldb, &beta, output_slice,
-                      cudaDataType_, SizeToInt(n_)),
-        "cublasSgemm Call Fail");
-    }
+    const int lda = (transpose_x1_ == CUBLAS_OP_T) ? SizeToInt(m_) : SizeToInt(k_);
+    const int ldb = (transpose_x2_ == CUBLAS_OP_T) ? SizeToInt(k_) : SizeToInt(n_);
+    const int ldc = n_;
+    auto stride_a = SizeToInt(m_ * k_);
+    auto stride_b = SizeToInt(k_ * n_);
+    auto stride_c = SizeToInt(m_ * n_);
+    CHECK_CUBLAS_RET_WITH_EXCEPT(
+      cublasGemmStridedBatchedEx(handle_, transpose_x2_, transpose_x1_, SizeToInt(n_), SizeToInt(m_), SizeToInt(k_),
+                                 &alpha, input2_addr, dtype_b_, ldb, stride_b, input1_addr, dtype_a_, lda, stride_a,
+                                 &beta, output_addr, dtype_c_, ldc, stride_c, batch_, dtype_c_, algo_),
+      "cublasSgemm Call Fail");
     return true;
   }
   bool Init(const CNodePtr &kernel_node) override {
     handle_ = device::gpu::GPUDeviceManager::GetInstance().GetCublasHandle();
-    cudaDataType_ = kCudaDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))];
+    dtype_a_ = kCudaDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 0))];
+    dtype_b_ = kCudaDtypeMap[TypeIdLabel(AnfAlgo::GetInputDeviceDataType(kernel_node, 1))];
+    dtype_c_ = kCudaDtypeMap[TypeIdLabel(AnfAlgo::GetOutputDeviceDataType(kernel_node, 0))];
     auto output_shape = AnfAlgo::GetOutputInferShape(kernel_node, 0);
     auto dims = output_shape.size();
     if (dims < 2) {

@@ -119,9 +123,12 @@ class MatMulGpuKernel : public GpuKernel {
   cublasOperation_t transpose_x1_;
   cublasOperation_t transpose_x2_;
   cublasHandle_t handle_;
-  cudaDataType_t cudaDataType_;
+  cudaDataType_t dtype_a_;
+  cudaDataType_t dtype_b_;
+  cudaDataType_t dtype_c_;
+  cublasGemmAlgo_t algo_;
   std::vector<size_t> input_size_list_;
   std::vector<size_t> output_size_list_;
   std::vector<size_t> workspace_size_list_;
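The core of the header change replaces the per-slice cublasSgemmEx loop with a single cublasGemmStridedBatchedEx call. Two details are easy to miss: cuBLAS works in column-major order, so the kernel obtains the row-major product x1 @ x2 by handing cuBLAS the operands in swapped order (input2 first, problem size n, m, k), and the per-matrix strides m*k, k*n and m*n make the one batched call cover exactly the slices the removed loop iterated over. A small NumPy sketch of both equivalences (illustration only, not MindSpore or cuBLAS code):

import numpy as np

batch, m, k, n = 8, 2, 3, 4
a = np.random.rand(batch, m, k).astype(np.float32)
b = np.random.rand(batch, k, n).astype(np.float32)

# Per-slice loop, as the old cublasSgemmEx code path did.
looped = np.stack([a[i] @ b[i] for i in range(batch)])

# One batched multiply over contiguous buffers with strides m*k, k*n, m*n,
# as the new cublasGemmStridedBatchedEx call does.
batched = np.matmul(a, b)

# Row-major A @ B recovered through the column-major identity (A @ B)^T = B^T @ A^T,
# which is why the kernel passes the second operand first and swaps m and n.
swapped = np.stack([(b[i].T @ a[i].T).T for i in range(batch)])

assert np.allclose(looped, batched) and np.allclose(looped, swapped)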
tests/st/ops/gpu/test_batch_matmul.py (new file, mode 100644)

# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

import pytest
import numpy as np
from mindspore import Tensor
from mindspore.ops import operations as P
from mindspore.common.api import ms_function
from mindspore.common.initializer import initializer
from mindspore.common.parameter import Parameter
import mindspore.nn as nn
import mindspore.context as context
from mindspore.common import dtype as mstype


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
class BatchMatMulNet(nn.Cell):
    def __init__(self, transpose_a=False, transpose_b=False):
        super(BatchMatMulNet, self).__init__()
        self.batch_matmul = P.BatchMatMul(transpose_a, transpose_b)

    def construct(self, x, y):
        return self.batch_matmul(x, y)


def test_4D():
    input_x = Tensor(np.arange(2 * 4 * 1 * 3).reshape(2, 4, 1, 3), mstype.float32)
    input_y = Tensor(np.arange(2 * 4 * 3 * 4).reshape(2, 4, 3, 4), mstype.float32)

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    net = BatchMatMulNet()
    output = net(input_x, input_y)
    expect = [[[[20, 23, 26, 29]],
               [[200, 212, 224, 236]],
               [[596, 617, 638, 659]],
               [[1208, 1238, 1268, 1298]]],
              [[[2036, 2075, 2114, 2153]],
               [[3080, 3128, 3176, 3224]],
               [[4340, 4397, 4454, 4511]],
               [[5816, 5882, 5948, 6014]]]]
    assert (output.asnumpy() == expect).all()


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_4D_transpose_a():
    input_x = Tensor(np.arange(2 * 4 * 3 * 1).reshape(2, 4, 3, 1), mstype.float32)
    input_y = Tensor(np.arange(2 * 4 * 3 * 4).reshape(2, 4, 3, 4), mstype.float32)

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    net = BatchMatMulNet(transpose_a=True)
    output = net(input_x, input_y)
    expect = [[[[20, 23, 26, 29]],
               [[200, 212, 224, 236]],
               [[596, 617, 638, 659]],
               [[1208, 1238, 1268, 1298]]],
              [[[2036, 2075, 2114, 2153]],
               [[3080, 3128, 3176, 3224]],
               [[4340, 4397, 4454, 4511]],
               [[5816, 5882, 5948, 6014]]]]
    assert (output.asnumpy() == expect).all()


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_4D_transpose_b():
    input_x = Tensor(np.arange(2 * 4 * 1 * 3).reshape(2, 4, 1, 3), mstype.float32)
    input_y = Tensor(np.arange(2 * 4 * 4 * 3).reshape(2, 4, 4, 3), mstype.float32)

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    net = BatchMatMulNet(transpose_b=True)
    output = net(input_x, input_y)
    expect = [[[[5, 14, 23, 32]],
               [[158, 194, 230, 266]],
               [[527, 590, 653, 716]],
               [[1112, 1202, 1292, 1382]]],
              [[[1913, 2030, 2147, 2264]],
               [[2930, 3074, 3218, 3362]],
               [[4163, 4334, 4505, 4676]],
               [[5612, 5810, 6008, 6206]]]]
    assert (output.asnumpy() == expect).all()


@pytest.mark.level0
@pytest.mark.platform_x86_gpu_training
@pytest.mark.env_onecard
def test_4D_transpose_ab():
    input_x = Tensor(np.arange(2 * 4 * 3 * 1).reshape(2, 4, 3, 1), mstype.float32)
    input_y = Tensor(np.arange(2 * 4 * 4 * 3).reshape(2, 4, 4, 3), mstype.float32)

    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
    net = BatchMatMulNet(transpose_a=True, transpose_b=True)
    output = net(input_x, input_y)
    expect = [[[[5, 14, 23, 32]],
               [[158, 194, 230, 266]],
               [[527, 590, 653, 716]],
               [[1112, 1202, 1292, 1382]]],
              [[[1913, 2030, 2147, 2264]],
               [[2930, 3074, 3218, 3362]],
               [[4163, 4334, 4505, 4676]],
               [[5612, 5810, 6008, 6206]]]]
    assert (output.asnumpy() == expect).all()
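Since all inputs are plain arange tensors, the hard-coded expect arrays can be reproduced on the host with NumPy. A quick cross-check of the first and last cases (illustration only, not part of the committed file):

import numpy as np

# test_4D: x is (2, 4, 1, 3), y is (2, 4, 3, 4); BatchMatMul multiplies the last
# two axes for every leading (batch) index.
x = np.arange(2 * 4 * 1 * 3).reshape(2, 4, 1, 3).astype(np.float32)
y = np.arange(2 * 4 * 3 * 4).reshape(2, 4, 3, 4).astype(np.float32)
print(np.matmul(x, y))  # matches the expect array in test_4D

# test_4D_transpose_ab: transpose_a / transpose_b swap the last two axes of the
# corresponding operand before the multiply.
xt = np.arange(2 * 4 * 3 * 1).reshape(2, 4, 3, 1).astype(np.float32)
yt = np.arange(2 * 4 * 4 * 3).reshape(2, 4, 4, 3).astype(np.float32)
print(np.matmul(xt.transpose(0, 1, 3, 2), yt.transpose(0, 1, 3, 2)))  # matches test_4D_transpose_ab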