magicwindyyd / mindspore (forked from MindSpore / mindspore)
Commit 2e002ab6
Authored on Jun 18, 2020 by mindspore-ci-bot; committed via Gitee on Jun 18, 2020

!2292 gpu fix all nop node graph execute

Merge pull request !2292 from limingqi107/master

Parents: 9b90d89f, 0f4397ce
Showing 5 changed files with 126 additions and 22 deletions (+126 −22)
mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc      +9  −8
mindspore/ccsrc/pre_activate/mem_reuse/mem_reuse.cc   +4  −2
mindspore/ccsrc/session/anf_runtime_algorithm.cc      +12 −8
mindspore/ccsrc/session/anf_runtime_algorithm.h       +6  −4
tests/st/ops/gpu/test_flatten_op.py                   +95 −0
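The common thread of the patch is a new visit_nop_node flag on the output-address getters in AnfRuntimeAlgorithm. By default these getters look through a nop node and return the device address of its real input; when a graph consists only of nop nodes, the nop nodes are not removed and are launched as real kernels, so the GPU runtime must be able to ask for a nop node's own address instead. The following standalone toy is only a sketch of that idea, not MindSpore code: ToyNode, the string "addresses", and the simplified GetMutableOutputAddr signature (no output index) are invented stand-ins for AnfNode, DeviceAddress, and the real getter.

// Standalone toy model (not MindSpore code) of the visit_nop_node flag added in this commit.
#include <iostream>
#include <memory>
#include <string>

struct ToyNode {
  std::string name;
  bool is_nop = false;
  std::shared_ptr<ToyNode> input;            // the single real input of a nop node
  std::shared_ptr<std::string> output_addr;  // stand-in for the node's own DeviceAddress
};

// Default (visit_nop_node = true) mirrors the old behaviour: a nop node forwards the
// lookup to its producer. Passing false returns the nop node's own address, which is
// what an all-nop-node graph needs, because those nodes are launched as real kernels.
std::shared_ptr<std::string> GetMutableOutputAddr(const std::shared_ptr<ToyNode> &node,
                                                  bool visit_nop_node = true) {
  if (node->is_nop && visit_nop_node && node->input != nullptr) {
    return GetMutableOutputAddr(node->input, visit_nop_node);
  }
  return node->output_addr;
}

int main() {
  auto param = std::make_shared<ToyNode>();
  param->name = "param";
  param->output_addr = std::make_shared<std::string>("addr(param)");

  auto flatten = std::make_shared<ToyNode>();  // think of P.Flatten() acting as a nop node
  flatten->name = "flatten";
  flatten->is_nop = true;
  flatten->input = param;
  flatten->output_addr = std::make_shared<std::string>("addr(flatten)");

  std::cout << *GetMutableOutputAddr(flatten) << "\n";         // addr(param): forwarded through the nop node
  std::cout << *GetMutableOutputAddr(flatten, false) << "\n";  // addr(flatten): the nop node's own buffer
  return 0;
}

In the real getters the same guard appears as opt::IsNopNode(node) && visit_nop_node (see the anf_runtime_algorithm.cc diff below), and the GPU-runtime call sites pass false so that each nop kernel's own buffer is allocated and freed.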
mindspore/ccsrc/device/gpu/gpu_kernel_runtime.cc

@@ -228,7 +228,7 @@ void GPUKernelRuntime::ClearKernelOutputAddress(const session::KernelGraph *grap
       continue;
     }
-    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i);
+    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
     if (device_address->ptr_) {
       mem_manager_->FreeMemFromMemPool(device_address);
     }
@@ -289,7 +289,7 @@ bool GPUKernelRuntime::AddMemSwapTask(const AnfNodePtr &kernel) {
   for (auto &mem_swap_info : mem_swap_info_list) {
     auto &kernel_exec_info = mem_swap_manager_->SearchKernelExecutionInfo(mem_swap_info.kernel_);
     const HostAddress &host_address = kernel_exec_info.host_addrs_[mem_swap_info.output_idx_];
-    auto device_address = AnfAlgo::GetMutableOutputAddr(mem_swap_info.kernel_, mem_swap_info.output_idx_);
+    auto device_address = AnfAlgo::GetMutableOutputAddr(mem_swap_info.kernel_, mem_swap_info.output_idx_, false);
     if (mem_swap_info.swap_kind_ == SwapKind::kDeviceToHost) {
       mem_swap_manager_->AddMemSwapTask(SwapKind::kDeviceToHost, device_address, host_address);
@@ -379,7 +379,8 @@ bool GPUKernelRuntime::AllocKernelInputDynamicRes(const mindspore::AnfNodePtr &k
   MS_EXCEPTION_IF_NULL(kernel_inputs);
   MS_EXCEPTION_IF_NULL(mem_swap_manager_);
   for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
-    auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i);
+    // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
+    auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
     MS_EXCEPTION_IF_NULL(device_address);
     if (mem_swap_manager_->trigger_swap()) {
       while (auto device_address_swap_in = mem_swap_manager_->UpdateSwapQueue(SwapKind::kHostToDevice)) {
@@ -437,7 +438,7 @@ bool GPUKernelRuntime::AllocKernelOutputDynamicRes(const mindspore::kernel::Kern
   }
   auto output_sizes = kernel_mod.GetOutputSizeList();
   for (size_t i = 0; i < output_sizes.size(); ++i) {
-    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i);
+    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
     MS_EXCEPTION_IF_NULL(device_address);
     if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, output_sizes[i])) {
       return false;
@@ -495,7 +496,7 @@ void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfN
   std::vector<size_t> size_list;
   DeviceAddressPtrList addr_list;
   for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
-    auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i);
+    auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
     MS_EXCEPTION_IF_NULL(device_address);
     if (device_address->ptr_ == nullptr) {
       is_need_alloc_memory = true;
@@ -520,7 +521,7 @@ void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::Anf
   MS_EXCEPTION_IF_NULL(kernel_mod);
   auto output_sizes = kernel_mod->GetOutputSizeList();
   for (size_t i = 0; i < output_sizes.size(); ++i) {
-    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i);
+    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
     MS_EXCEPTION_IF_NULL(device_address);
     if (device_address->ptr_ == nullptr) {
       is_need_alloc_memory = true;
@@ -578,7 +579,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
     MS_LOG(EXCEPTION) << "Check dynamic reference count failed.";
   }
   if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
-    auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i);
+    auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i, false);
     mem_manager_->FreeMemFromMemPool(device_address);
     device_address->set_status(DeviceAddressStatus::kInDevice);
   }
@@ -590,7 +591,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
     continue;
   }
   if (kernel_ref_count_ptr->ref_count_dynamic_use_ == 0) {
-    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i);
+    auto device_address = AnfAlgo::GetMutableOutputAddr(kernel, i, false);
     mem_manager_->FreeMemFromMemPool(device_address);
     device_address->set_status(DeviceAddressStatus::kInDevice);
   }
mindspore/ccsrc/pre_activate/mem_reuse/mem_reuse.cc

@@ -228,7 +228,8 @@ KernelRefCountPtr MemReuseUtil::GetKernelInputRef(const CNodePtr &kernel, size_t
                       << AnfAlgo::GetInputTensorNum(kernel);
   }
   auto input_node = kernel->input(input_idx + 1);
-  auto kernel_input = AnfAlgo::VisitKernelWithReturnType(input_node, 0, true);
+  // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
+  auto kernel_input = AnfAlgo::VisitKernelWithReturnType(input_node, 0, false);
   if (IsPrimitive(kernel_input.first, prim::kPrimMakeTuple)) {
     MS_LOG(EXCEPTION) << "Input node [" << input_node->DebugString() << "]'s input " << input_idx << " is MakeTuple";
   }
@@ -269,7 +270,8 @@ void MemReuseUtil::SetKernelDefInputs() {
     if (ref_ptr != nullptr) {
       // set the inputs of this kernel_def
       auto input_node = AnfAlgo::GetInputNode(kernel, i);
-      auto input = AnfAlgo::VisitKernelWithReturnType(input_node, 0, true);
+      // Graph may be all nop nodes and not remove nop node, so this can not skip nop node.
+      auto input = AnfAlgo::VisitKernelWithReturnType(input_node, 0, false);
       if (IsPrimitive(input.first, prim::kPrimMakeTuple)) {
         MS_LOG(EXCEPTION) << "Input node [" << input_node->DebugString() << "]'s input " << i << " is MakeTuple";
       }
mindspore/ccsrc/session/anf_runtime_algorithm.cc

@@ -544,9 +544,10 @@ TypeId AnfRuntimeAlgorithm::GetPrevNodeOutputDeviceDataType(const AnfNodePtr &an
 }
 // get output device addr of anf_node
-const DeviceAddress *AnfRuntimeAlgorithm::GetOutputAddr(const AnfNodePtr &node, size_t output_idx) {
+const DeviceAddress *AnfRuntimeAlgorithm::GetOutputAddr(const AnfNodePtr &node, size_t output_idx, bool visit_nop_node) {
   MS_EXCEPTION_IF_NULL(node);
-  if (opt::IsNopNode(node)) {
+  if (opt::IsNopNode(node) && visit_nop_node) {
     auto cnode = node->cast<CNodePtr>();
     MS_EXCEPTION_IF_NULL(cnode);
     if (cnode->inputs().size() == 2) {
@@ -565,9 +566,10 @@ const DeviceAddress *AnfRuntimeAlgorithm::GetOutputAddr(const AnfNodePtr &node,
   return addr;
 }
-DeviceAddressPtr AnfRuntimeAlgorithm::GetMutableOutputAddr(const AnfNodePtr &node, size_t output_idx) {
+DeviceAddressPtr AnfRuntimeAlgorithm::GetMutableOutputAddr(const AnfNodePtr &node, size_t output_idx, bool visit_nop_node) {
   MS_EXCEPTION_IF_NULL(node);
-  if (opt::IsNopNode(node)) {
+  if (opt::IsNopNode(node) && visit_nop_node) {
     auto cnode = node->cast<CNodePtr>();
     MS_EXCEPTION_IF_NULL(cnode);
     if (cnode->inputs().size() == 2) {
@@ -598,14 +600,16 @@ bool AnfRuntimeAlgorithm::OutputAddrExist(const AnfNodePtr &node, size_t output_
   return kernel_info->OutputAddrExist(output_idx);
 }
-const DeviceAddress *AnfRuntimeAlgorithm::GetPrevNodeOutputAddr(const AnfNodePtr &anf_node, size_t input_idx) {
+const DeviceAddress *AnfRuntimeAlgorithm::GetPrevNodeOutputAddr(const AnfNodePtr &anf_node, size_t input_idx, bool visit_nop_node) {
   KernelWithIndex kernel_with_index = AnfAlgo::GetPrevNodeOutput(anf_node, input_idx);
-  return AnfRuntimeAlgorithm::GetOutputAddr(kernel_with_index.first, kernel_with_index.second);
+  return AnfRuntimeAlgorithm::GetOutputAddr(kernel_with_index.first, kernel_with_index.second, visit_nop_node);
 }
-DeviceAddressPtr AnfRuntimeAlgorithm::GetPrevNodeMutableOutputAddr(const AnfNodePtr &anf_node, size_t input_idx) {
+DeviceAddressPtr AnfRuntimeAlgorithm::GetPrevNodeMutableOutputAddr(const AnfNodePtr &anf_node, size_t input_idx, bool visit_nop_node) {
   KernelWithIndex kernel_with_index = AnfAlgo::GetPrevNodeOutput(anf_node, input_idx);
-  return AnfRuntimeAlgorithm::GetMutableOutputAddr(kernel_with_index.first, kernel_with_index.second);
+  return AnfRuntimeAlgorithm::GetMutableOutputAddr(kernel_with_index.first, kernel_with_index.second, visit_nop_node);
 }
 // set output device addr of anf_node
mindspore/ccsrc/session/anf_runtime_algorithm.h

@@ -121,14 +121,16 @@ class AnfRuntimeAlgorithm {
   // get output select data type from prev node,input_index is the input index of current node related to prev node
   static TypeId GetPrevNodeOutputDeviceDataType(const AnfNodePtr &node, size_t input_idx);
   // get output device addr of anf_node
-  static const DeviceAddress *GetOutputAddr(const AnfNodePtr &node, size_t output_idx);
+  static const DeviceAddress *GetOutputAddr(const AnfNodePtr &node, size_t output_idx, bool visit_nop_node = true);
   // get mutable output device addr of anf_node
-  static DeviceAddressPtr GetMutableOutputAddr(const AnfNodePtr &node, size_t output_idx);
+  static DeviceAddressPtr GetMutableOutputAddr(const AnfNodePtr &node, size_t output_idx, bool visit_nop_node = true);
   // check whether output addr is exist or not
  static bool OutputAddrExist(const AnfNodePtr &node, size_t output_idx);
   // get address from prev node,input_index is the input index of current node related to prev node
-  static const DeviceAddress *GetPrevNodeOutputAddr(const AnfNodePtr &node, size_t input_idx);
-  static DeviceAddressPtr GetPrevNodeMutableOutputAddr(const AnfNodePtr &anf_node, size_t input_idx);
+  static const DeviceAddress *GetPrevNodeOutputAddr(const AnfNodePtr &node, size_t input_idx, bool visit_nop_node = true);
+  static DeviceAddressPtr GetPrevNodeMutableOutputAddr(const AnfNodePtr &anf_node, size_t input_idx, bool visit_nop_node = true);
   // set output device addr of anf_node
   static void SetOutputAddr(const DeviceAddressPtr &addr, size_t output_idx, AnfNode *node);
   // set workspace device addr of anf_node
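Because visit_nop_node defaults to true in these declarations, existing call sites keep the old look-through behaviour unchanged; only the GPU runtime and the memory-reuse pass opt out explicitly. A minimal illustration of the two call forms, assuming a node and an index idx already in scope (these placeholders are not part of the patch):

  auto forwarded = AnfAlgo::GetMutableOutputAddr(node, idx);         // default: look through nop nodes (old behaviour)
  auto own_addr  = AnfAlgo::GetMutableOutputAddr(node, idx, false);  // keep the nop node's own DeviceAddress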
tests/st/ops/gpu/test_flatten_op.py

@@ -31,6 +31,49 @@ class NetFlatten(nn.Cell):
         return self.flatten(x)
 
 
+class NetAllFlatten(nn.Cell):
+    def __init__(self):
+        super(NetAllFlatten, self).__init__()
+        self.flatten = P.Flatten()
+
+    def construct(self, x):
+        loop_count = 4
+        while loop_count > 0:
+            x = self.flatten(x)
+            loop_count = loop_count - 1
+        return x
+
+
+class NetFirstFlatten(nn.Cell):
+    def __init__(self):
+        super(NetFirstFlatten, self).__init__()
+        self.flatten = P.Flatten()
+        self.relu = P.ReLU()
+
+    def construct(self, x):
+        loop_count = 4
+        while loop_count > 0:
+            x = self.flatten(x)
+            loop_count = loop_count - 1
+        x = self.relu(x)
+        return x
+
+
+class NetLastFlatten(nn.Cell):
+    def __init__(self):
+        super(NetLastFlatten, self).__init__()
+        self.flatten = P.Flatten()
+        self.relu = P.ReLU()
+
+    def construct(self, x):
+        loop_count = 4
+        x = self.relu(x)
+        while loop_count > 0:
+            x = self.flatten(x)
+            loop_count = loop_count - 1
+        return x
+
+
 @pytest.mark.level0
 @pytest.mark.platform_x86_gpu_training
 @pytest.mark.env_onecard
@@ -46,3 +89,55 @@ def test_flatten():
     flatten = NetFlatten()
     output = flatten(x)
     assert (output.asnumpy() == expect).all()
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_all_flatten():
+    x = Tensor(np.array([[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]).astype(np.float32))
+    expect = np.array([[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]).astype(np.float32)
+    context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU")
+    flatten = NetAllFlatten()
+    output = flatten(x)
+    assert (output.asnumpy() == expect).all()
+
+    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
+    flatten = NetAllFlatten()
+    output = flatten(x)
+    assert (output.asnumpy() == expect).all()
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_first_flatten():
+    x = Tensor(np.array([[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]).astype(np.float32))
+    expect = np.array([[0, 0.3, 3.6], [0.4, 0.5, 0]]).astype(np.float32)
+    context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU")
+    flatten = NetFirstFlatten()
+    output = flatten(x)
+    assert (output.asnumpy() == expect).all()
+
+    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
+    flatten = NetFirstFlatten()
+    output = flatten(x)
+    assert (output.asnumpy() == expect).all()
+
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+def test_last_flatten():
+    x = Tensor(np.array([[-0.1, 0.3, 3.6], [0.4, 0.5, -3.2]]).astype(np.float32))
+    expect = np.array([[0, 0.3, 3.6], [0.4, 0.5, 0]]).astype(np.float32)
+    context.set_context(mode=context.PYNATIVE_MODE, device_target="GPU")
+    flatten = NetLastFlatten()
+    output = flatten(x)
+    assert (output.asnumpy() == expect).all()
+
+    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
+    flatten = NetLastFlatten()
+    output = flatten(x)
+    assert (output.asnumpy() == expect).all()
\ No newline at end of file