Commit 7c358ca4

!2460 optimize cpu reduce gradient

Merge pull request !2460 from kisnwang/optimize-cpu-reduce-gradient

Authored by mindspore-ci-bot on Jun 22, 2020; committed via Gitee on Jun 22, 2020.

Parents: f975963a, b867d6d6
Showing 3 changed files with 86 additions and 45 deletions (+86 −45):

- mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc (+0 −6)
- mindspore/ccsrc/device/cpu/cpu_kernel_runtime.cc (+9 −11)
- mindspore/ccsrc/kernel/common_utils.cc (+77 −28)
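Taken together, the diff below does three things: it drops six lines from the Ascend runtime's GenTask, folds the MakeTuple handling in CPUKernelRuntime::CreatTensorForOutput into the existing CNode branch, and, most substantially, rewrites ReduceSparseGradient in common_utils.cc so that merging duplicate indices of a sparse gradient runs on up to 8 std::threads instead of a single sequential pass. The sketch below illustrates that parallel reduce-by-index pattern in isolation; the simplified `Grad` type and the `reduce_by_index`/`worker` names are stand-ins for this example only, not MindSpore API:

```cpp
#include <algorithm>
#include <cstddef>
#include <thread>
#include <utility>
#include <vector>

// Hypothetical, simplified sparse gradient: indices_[i] selects a row of
// outer_dim floats starting at value_[i * outer_dim].
struct Grad {
  std::vector<int> indices_;
  std::vector<float> value_;
};

// Merge rows that share an index, fanning the unique-index slices out over a
// small thread pool -- the same sort / slice_positions / worker flow as the patch.
Grad reduce_by_index(const Grad &src, size_t outer_dim) {
  // Pair each index with the offset of its row, then sort by index.
  std::vector<std::pair<int, size_t>> sorted;
  for (size_t i = 0; i < src.indices_.size(); ++i) {
    sorted.emplace_back(src.indices_[i], i * outer_dim);
  }
  std::sort(sorted.begin(), sorted.end());

  // slice_positions[k] = where the k-th unique index starts in `sorted`.
  std::vector<size_t> slice_positions;
  for (size_t i = 0; i < sorted.size(); ++i) {
    if (i == 0 || sorted[i - 1].first != sorted[i].first) slice_positions.push_back(i);
  }

  Grad dst;
  dst.indices_.resize(slice_positions.size());
  dst.value_.assign(slice_positions.size() * outer_dim, 0.0f);
  if (slice_positions.empty()) return dst;

  // Each worker owns a contiguous range of unique-index slices, so distinct
  // threads write distinct output rows and need no locking.
  auto worker = [&](size_t begin, size_t end) {
    for (size_t s = begin; s < end; ++s) {
      size_t stop = (s + 1 < slice_positions.size()) ? slice_positions[s + 1] : sorted.size();
      dst.indices_[s] = sorted[slice_positions[s]].first;
      for (size_t p = slice_positions[s]; p < stop; ++p) {
        for (size_t j = 0; j < outer_dim; ++j) {
          dst.value_[s * outer_dim + j] += src.value_[sorted[p].second + j];
        }
      }
    }
  };

  size_t thread_num = std::min<size_t>(8, slice_positions.size());
  size_t stride = (slice_positions.size() + thread_num - 1) / thread_num;
  std::vector<std::thread> threads;
  for (size_t t = 0; t * stride < slice_positions.size(); ++t) {
    size_t begin = t * stride;
    threads.emplace_back(worker, begin, std::min(begin + stride, slice_positions.size()));
  }
  for (auto &th : threads) th.join();
  return dst;
}

int main() {
  Grad g{{2, 0, 2}, {1, 1, 2, 2, 3, 3}};  // outer_dim = 2; index 2 appears twice
  Grad r = reduce_by_index(g, 2);         // r.indices_ = {0, 2}; r.value_ = {2, 2, 4, 4}
}
```

Because each worker owns a contiguous range of unique-index slices, the threads write disjoint output rows and need no synchronization beyond the final join, which is the same property the patch relies on.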
mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc
```diff
@@ -327,19 +327,16 @@ bool AscendKernelRuntime::GenTask(const session::KernelGraph *graph) {
   vector<std::shared_ptr<TaskInfo>> task_info_list;
   auto anf_node_list = graph->execution_order();
   TaskGenerator::GenTasks(anf_node_list, &task_info_list, graph->graph_id());
   // Store the task_info_list
   auto insert_ret = task_map_.insert(std::make_pair(graph->graph_id(), task_info_list));
   if (!insert_ret.second) {
     MS_LOG(EXCEPTION) << "Duplicate GraphId! Please check in ascend_session.";
   }
   // Graph may have no compute node, such TensorAddGrad.
   if (task_info_list.empty()) {
     MS_LOG(WARNING) << "graph " << graph->graph_id() << " have no compute node";
     return true;
   }
   AscendStreamAssign &assign_instance = AscendStreamAssign::GetInstance();
   AscendStreamMng &stream_manager = AscendStreamMng::GetInstance();
   AscendLabelAssign &label_assign_instance = AscendLabelAssign::GetInstance();
@@ -348,19 +345,16 @@ bool AscendKernelRuntime::GenTask(const session::KernelGraph *graph) {
   assign_instance.GetWaitStreams(&wait_active_stream_list);
   std::vector<uint32_t> force_copy_stream_list;
   assign_instance.GetHcomStreams(&force_copy_stream_list);
   MS_LOG(INFO) << "call DavinciModel total stream num:" << stream_manager.GetCurAllocStreamNum()
                << ", total event num:" << assign_instance.total_event_num()
                << ", total label num:" << label_assign_instance.GetLabelNum(NOT_NULL(graph))
                << ", wait_active_stream_list size:" << wait_active_stream_list.size()
                << ", force_copy_stream_list size:" << force_copy_stream_list.size();
   std::vector<std::shared_ptr<ge::model_runner::OpInfo>> empty_list;
   std::shared_ptr<ge::model_runner::DavinciModel> model = std::make_shared<ge::model_runner::DavinciModel>(
     task_info_list, empty_list, empty_list, empty_list, empty_list, wait_active_stream_list, force_copy_stream_list,
     0, 0, 0, 0, 0, 0, stream_manager.GetCurAllocStreamNum(), label_assign_instance.GetLabelNum(NOT_NULL(graph)),
     assign_instance.total_event_num(), 0);
   auto ret = graph_model_map_.insert(std::make_pair(graph->graph_id(), model));
   if (!ret.second) {
     MS_LOG(EXCEPTION) << "Duplicate GraphId! Please check in ascend_session.";
```
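Both hunks keep the `insert(std::make_pair(...))` guard: `std::map::insert` returns an `{iterator, bool}` pair whose bool is `false` when the key is already present, and GenTask turns that into the "Duplicate GraphId" exception. A minimal illustration of the idiom (toy map, not the runtime's types):

```cpp
#include <cstdint>
#include <iostream>
#include <map>
#include <string>

int main() {
  std::map<uint32_t, std::string> task_map;
  task_map.insert(std::make_pair(1u, "graph-1 tasks"));
  // insert() returns {iterator, bool}; the bool is false when the key exists,
  // which is the condition GenTask reports as "Duplicate GraphId".
  auto ret = task_map.insert(std::make_pair(1u, "graph-1 again"));
  if (!ret.second) {
    std::cout << "duplicate graph id, kept: " << ret.first->second << "\n";
  }
}
```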
mindspore/ccsrc/device/cpu/cpu_kernel_runtime.cc
```diff
@@ -147,20 +147,18 @@ BaseRef CPUKernelRuntime::CreatTensorForOutput(const session::KernelWithIndex &k
   auto &input_node = kernel_with_index.first;
   auto index = kernel_with_index.second;
   MS_EXCEPTION_IF_NULL(input_node);
-  if (input_node->isa<CNode>() && AnfAlgo::GetCNodeName(input_node) == prim::kPrimMakeTuple->name()) {
-    auto cnode = input_node->cast<CNodePtr>();
-    MS_EXCEPTION_IF_NULL(cnode);
-    VectorRef ret;
-    for (size_t i = 1; i < cnode->inputs().size(); i++) {
-      auto item_with_index = AnfAlgo::VisitKernelWithReturnType(cnode->input(i), 0);
-      auto out = CreatTensorForOutput(item_with_index, input_map, bound_addresses, need_sync_outputs);
-      ret.push_back(out);
-    }
-    return ret;
-  }
   if (input_node->isa<CNode>()) {
     auto node = input_node->cast<CNodePtr>();
     MS_EXCEPTION_IF_NULL(node);
+    if (AnfAlgo::GetCNodeName(input_node) == prim::kPrimMakeTuple->name()) {
+      VectorRef ret;
+      for (size_t i = 1; i < node->inputs().size(); i++) {
+        auto item_with_index = AnfAlgo::VisitKernelWithReturnType(node->input(i), 0);
+        auto out = CreatTensorForOutput(item_with_index, input_map, bound_addresses, need_sync_outputs);
+        ret.push_back(out);
+      }
+      return ret;
+    }
     size_t output_size = AnfAlgo::GetOutputTensorNum(node);
     if (index >= output_size) {
       MS_LOG(EXCEPTION) << "Invalid input index " << index;
```
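The refactor hoists the `isa<CNode>()` test so the node is cast to CNodePtr exactly once, and the MakeTuple case becomes an inner branch that recurses over inputs 1..n−1 (input 0 of a CNode names the primitive, not an operand). A stripped-down sketch of that recursion, with hypothetical `Node`/`Value` types standing in for AnfNodePtr/VectorRef:

```cpp
#include <memory>
#include <string>
#include <vector>

// Hypothetical stand-ins for AnfNodePtr / VectorRef in the real code.
struct Node {
  std::string name;                           // e.g. "MakeTuple" or a kernel name
  std::vector<std::shared_ptr<Node>> inputs;  // inputs[0] is the primitive
};

struct Value {
  std::string leaf;          // set for a single kernel output
  std::vector<Value> tuple;  // set for a MakeTuple result
};

Value CreateValueForOutput(const std::shared_ptr<Node> &node) {
  if (node->name == "MakeTuple") {
    Value ret;
    // Skip inputs[0]: for a CNode it names the operation, not an operand.
    for (size_t i = 1; i < node->inputs.size(); ++i) {
      ret.tuple.push_back(CreateValueForOutput(node->inputs[i]));
    }
    return ret;
  }
  return Value{node->name, {}};
}

int main() {
  auto prim = std::make_shared<Node>(Node{"MakeTuple", {}});
  auto a = std::make_shared<Node>(Node{"AddN", {}});
  auto b = std::make_shared<Node>(Node{"MatMul", {}});
  auto tuple = std::make_shared<Node>(Node{"MakeTuple", {prim, a, b}});
  Value v = CreateValueForOutput(tuple);  // v.tuple holds the values for a and b
}
```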
mindspore/ccsrc/kernel/common_utils.cc
```diff
@@ -577,6 +577,52 @@ void DeduplicateIndexedSlices(const SparseGradient &origin_sparse_grad, SparseGr
   unique_grad->indices_size_ = unique_indices_size;
 }
 
+struct WorkerParamsForReduceSparseGradient {
+  size_t slice_start_{0};
+  size_t slice_end_{0};
+  size_t max_length_{0};
+  size_t outer_dim_{0};
+  std::vector<std::pair<int, size_t>> *sorted_indices_{nullptr};
+  std::vector<size_t> *slice_positions_{nullptr};
+  float *src_value_{nullptr};
+  SparseGradient *unique_grad_{nullptr};
+};
+
+void WorkerForReduceSparseGradient(WorkerParamsForReduceSparseGradient param) {
+  MS_EXCEPTION_IF_NULL(param.sorted_indices_);
+  MS_EXCEPTION_IF_NULL(param.slice_positions_);
+  MS_EXCEPTION_IF_NULL(param.src_value_);
+  MS_EXCEPTION_IF_NULL(param.unique_grad_);
+  auto outer_dim = param.outer_dim_;
+  auto &sorted_indices = *(param.sorted_indices_);
+  auto &slice_positions = *(param.slice_positions_);
+  auto unique_grad = param.unique_grad_;
+  for (size_t slice_id = param.slice_start_; slice_id < param.slice_end_; ++slice_id) {
+    size_t cur_pos = slice_positions[slice_id];
+    int index = sorted_indices[cur_pos].first;
+    unique_grad->indices_[slice_id] = index;
+    size_t start_index = slice_id * outer_dim;
+    auto ret_code = memcpy_s(unique_grad->value_ + start_index, (param.max_length_ - start_index) * sizeof(float),
+                             param.src_value_ + sorted_indices[cur_pos].second, outer_dim * sizeof(float));
+    if (ret_code != EOK) {
+      MS_LOG(EXCEPTION) << "Failed to copy data!";
+    }
+    cur_pos++;
+    size_t end_pos;
+    if (slice_id + 1 < slice_positions.size()) {
+      end_pos = slice_positions[slice_id + 1];
+    } else {
+      end_pos = sorted_indices.size();
+    }
+    while (cur_pos < end_pos) {
+      for (size_t i = 0; i < outer_dim; ++i) {
+        unique_grad->value_[start_index + i] += param.src_value_[sorted_indices[cur_pos].second + i];
+      }
+      cur_pos++;
+    }
+  }
+}
+
 void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradient *unique_grad, size_t first_dim,
                           size_t outer_dim) {
   MS_EXCEPTION_IF_NULL(origin_sparse_grad.value_);
```
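The worker added above handles slices [slice_start_, slice_end_): for each slice it copies the first occurrence's row into the output with memcpy_s, then accumulates every remaining duplicate of that index up to the next entry of slice_positions (or the end of sorted_indices for the last slice). The snippet below traces how slice_positions marks those boundaries, using a toy index set (assumed values, outer_dim = 1); the hunk that follows shows the rewritten ReduceSparseGradient that builds these boundaries and drives the workers:

```cpp
#include <iostream>
#include <utility>
#include <vector>

int main() {
  // Toy sorted (index, src_offset) pairs as built by ReduceSparseGradient.
  std::vector<std::pair<int, size_t>> sorted_indices = {{0, 3}, {1, 1}, {1, 4}, {3, 0}, {3, 2}};
  std::vector<size_t> slice_positions;  // first occurrence of each unique index
  int last_index = 0;
  for (size_t i = 0; i < sorted_indices.size(); ++i) {
    if (i == 0 || last_index != sorted_indices[i].first) slice_positions.emplace_back(i);
    last_index = sorted_indices[i].first;
  }
  // Prints "0 1 3": index 0 has one row, index 1 two rows, index 3 two rows.
  for (size_t p : slice_positions) std::cout << p << ' ';
  std::cout << '\n';
}
```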
```diff
@@ -584,47 +630,50 @@ void ReduceSparseGradient(const SparseGradient &origin_sparse_grad, SparseGradie
   MS_EXCEPTION_IF_NULL(unique_grad);
   MS_EXCEPTION_IF_NULL(unique_grad->value_);
   MS_EXCEPTION_IF_NULL(unique_grad->indices_);
-  size_t unique_indices_size = 0;
   std::vector<std::pair<int, size_t>> sorted_indices;
   sorted_indices.reserve(origin_sparse_grad.indices_size_);
   for (size_t i = 0; i < origin_sparse_grad.indices_size_; ++i) {
     int index = origin_sparse_grad.indices_[i];
-    if (index < 0 || IntToSize(index) >= first_dim) {
-      continue;
+    if (index >= 0 && IntToSize(index) < first_dim) {
+      sorted_indices.emplace_back(std::pair<int, size_t>(index, i * outer_dim));
     }
-    sorted_indices.emplace_back(std::pair<int, size_t>(index, i * outer_dim));
   }
   std::sort(sorted_indices.begin(), sorted_indices.end(),
             [](const std::pair<int, size_t> &left, const std::pair<int, size_t> &right) { return left.first < right.first; });
   int last_index = 0;
-  size_t indices_size = sorted_indices.size();
-  size_t start_index = 0;
-  size_t end_index = outer_dim;
-  size_t dst_len = indices_size * outer_dim;
-  for (size_t i = 0; i < indices_size; ++i) {
-    int index = sorted_indices[i].first;
-    if (i == 0 || last_index != index) {
-      if (i > 0 && last_index != index) {
-        unique_indices_size++;
-        start_index += outer_dim;
-        end_index += outer_dim;
-      }
-      unique_grad->indices_[unique_indices_size] = index;
-      auto ret_code = memcpy_s(unique_grad->value_ + start_index, dst_len - start_index,
-                               origin_sparse_grad.value_ + sorted_indices[i].second, outer_dim);
-      if (ret_code != EOK) {
-        MS_LOG(EXCEPTION) << "Failed to copy data!";
-      }
+  std::vector<size_t> slice_positions;
+  for (size_t i = 0; i < sorted_indices.size(); ++i) {
+    if (i == 0 || last_index != sorted_indices[i].first) {
+      slice_positions.emplace_back(i);
+    }
+    last_index = sorted_indices[i].first;
+  }
+  size_t thread_num = 8;
+  if (slice_positions.size() < thread_num) {
+    thread_num = slice_positions.size();
+  }
+  size_t stride = (slice_positions.size() + thread_num - 1) / thread_num;
+  thread_num = (slice_positions.size() + stride - 1) / stride;
+  std::vector<std::thread> threads;
+  size_t max_length = sorted_indices.size() * outer_dim;
+  for (size_t i = 0; i < thread_num; ++i) {
+    size_t slice_start = i * stride;
+    size_t slice_end = 0;
+    if (i == thread_num - 1) {
+      slice_end = slice_positions.size();
     } else {
-      for (size_t j = start_index, k = sorted_indices[i].second; j < end_index; ++j, ++k) {
-        unique_grad->value_[j] += origin_sparse_grad.value_[k];
-      }
+      slice_end = slice_start + stride;
     }
-    last_index = index;
+    WorkerParamsForReduceSparseGradient params{slice_start, slice_end, max_length, outer_dim,
+                                               &sorted_indices, &slice_positions, origin_sparse_grad.value_,
+                                               unique_grad};
+    threads.emplace_back(std::thread(WorkerForReduceSparseGradient, params));
   }
+  for (size_t i = 0; i < thread_num; ++i) {
+    threads[i].join();
+  }
-  unique_grad->indices_size_ = unique_indices_size + 1;
+  unique_grad->indices_size_ = slice_positions.size();
 }
 
 std::pair<AnfNodePtr, size_t> GetKernelInput(const AnfNodePtr &anf_node, size_t index) {
```
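The launch logic above caps the pool at 8 threads, computes stride = ceil(slices / thread_num), then recomputes thread_num = ceil(slices / stride) so that no fully idle thread is launched; the last thread's slice_end is clamped to slice_positions.size() to absorb any remainder. A quick standalone trace of that arithmetic (not MindSpore code): with 20 slices it yields stride 3 and 7 threads, the last covering only 2 slices.

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  // Same ceiling-division schedule as ReduceSparseGradient, for a few slice counts.
  for (std::size_t s : {3u, 8u, 20u, 100u}) {
    std::size_t thread_num = s < 8 ? s : 8;
    std::size_t stride = (s + thread_num - 1) / thread_num;  // slices per thread
    thread_num = (s + stride - 1) / stride;                  // drop fully idle threads
    std::printf("slices=%zu -> stride=%zu, threads=%zu\n", s, stride, thread_num);
  }
}
```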