Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
magicwindyyd
mindspore
提交
241e980f
M
mindspore
项目概览
magicwindyyd
/
mindspore
与 Fork 源项目一致
Fork自
MindSpore / mindspore
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
M
mindspore
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
241e980f
编写于
7月 24, 2020
作者:
C
cristoval
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
graceful shutdown in ps mode
上级
7be664fa
变更
5
隐藏空白更改
内联
并排
Showing
5 changed files
with
33 additions
and
10 deletions
+33
-10
mindspore/ccsrc/frontend/parallel/ps/parameter_server.h
mindspore/ccsrc/frontend/parallel/ps/parameter_server.h
+16
-2
mindspore/ccsrc/frontend/parallel/ps/scheduler.cc
mindspore/ccsrc/frontend/parallel/ps/scheduler.cc
+2
-3
mindspore/ccsrc/frontend/parallel/ps/worker.h
mindspore/ccsrc/frontend/parallel/ps/worker.h
+6
-3
mindspore/ccsrc/frontend/parallel/ps/worker_proxy.h
mindspore/ccsrc/frontend/parallel/ps/worker_proxy.h
+1
-1
mindspore/ccsrc/pipeline/jit/pipeline.cc
mindspore/ccsrc/pipeline/jit/pipeline.cc
+8
-1
未找到文件。
mindspore/ccsrc/frontend/parallel/ps/parameter_server.h
浏览文件 @
241e980f
...
...
@@ -70,6 +70,7 @@ class ParameterServer {
handler_
(
nullptr
),
func_graph_
(
nullptr
),
sess_
(
nullptr
),
running_
(
true
),
thread_
(
nullptr
)
{}
~
ParameterServer
()
=
default
;
ParameterServer
(
const
ParameterServer
&
)
=
delete
;
...
...
@@ -106,6 +107,7 @@ class ParameterServer {
void
InitGrad
(
const
Key
&
key
,
const
GradPtr
&
grad
);
void
InitEmbeddingTable
(
const
Key
&
key
,
const
std
::
shared_ptr
<
std
::
vector
<
std
::
shared_ptr
<
std
::
vector
<
size_t
>>>>
&
shapes
);
void
Finalize
();
void
UpdateWeights
();
void
AccumGrad
(
const
Keys
&
key
,
const
Values
&
values
,
const
Lengths
&
lengths
);
WeightPtr
weight
(
const
Key
&
key
);
...
...
@@ -123,6 +125,7 @@ class ParameterServer {
std
::
unique_ptr
<
ServerHandler
>
handler_
;
FuncGraphPtr
func_graph_
;
std
::
shared_ptr
<
session
::
SessionBasic
>
sess_
;
bool
running_
;
std
::
unordered_map
<
Key
,
std
::
shared_ptr
<
PServerKernel
>>
optimizers_
;
std
::
unordered_map
<
Key
,
InputsShapePtr
>
optim_inputs_shape_
;
...
...
@@ -261,7 +264,7 @@ void ParameterServer<T>::ServerHandler::HandleEmbeddingLookup(const ::ps::KVMeta
template
<
typename
T
>
void
ParameterServer
<
T
>::
ServerHandler
::
HandleFinalize
(
const
::
ps
::
KVMeta
&
req_meta
,
const
::
ps
::
KVPairs
<
T
>
&
req_data
,
::
ps
::
KVPairs
<
T
>
*
res
)
{
::
ps
::
Finalize
(
0
,
false
);
ps_
->
Finalize
(
);
}
template
<
typename
T
>
...
...
@@ -381,11 +384,20 @@ void ParameterServer<T>::InitEmbeddingTable(
grads_accum_counter_
[
key
]
=
0
;
}
// Requests shutdown of the weight-update loop.
//
// UpdateWeights() blocks in apply_grads_cv_.wait(lock, pred) while holding
// mutex_, and its predicate reads running_. The flag is therefore cleared
// while holding the same mutex: otherwise the store and the notification could
// land between the waiter's predicate check and its transition to the blocked
// state, the wakeup would be lost, and the update thread would never exit.
// The lock is released before notifying so the woken thread can reacquire
// mutex_ immediately.
//
// NOTE(review): assumes no caller invokes Finalize() while already holding
// mutex_ (std::mutex is not recursive) — confirm against the call sites.
template <typename T>
void ParameterServer<T>::Finalize() {
  {
    std::lock_guard<std::mutex> lock(mutex_);
    running_ = false;
  }
  apply_grads_cv_.notify_one();
}
template
<
typename
T
>
void
ParameterServer
<
T
>::
UpdateWeights
()
{
while
(
true
)
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
apply_grads_cv_
.
wait
(
lock
,
[
this
]
{
return
this
->
ReadyForUpdateWeights
();
});
apply_grads_cv_
.
wait
(
lock
,
[
this
]
{
return
this
->
ReadyForUpdateWeights
()
||
!
running_
;
});
if
(
!
running_
)
{
break
;
}
for
(
auto
iter
=
weights_
.
begin
();
iter
!=
weights_
.
end
();
iter
++
)
{
Key
key
=
iter
->
first
;
...
...
@@ -550,6 +562,8 @@ void ParameterServer<T>::Run(const FuncGraphPtr &func_graph) {
}
Init
(
func_graph
);
thread_
->
join
();
::
ps
::
Finalize
(
0
,
true
);
exit
(
1
);
}
}
// namespace ps
}
// namespace parallel
...
...
mindspore/ccsrc/frontend/parallel/ps/scheduler.cc
浏览文件 @
241e980f
...
...
@@ -23,9 +23,8 @@ namespace parallel {
namespace
ps
{
void
Scheduler
::
Run
()
{
::
ps
::
Start
(
0
);
while
(
true
)
{
sleep
(
1
);
}
::
ps
::
Finalize
(
0
,
true
);
exit
(
1
);
}
}
// namespace ps
}
// namespace parallel
...
...
mindspore/ccsrc/frontend/parallel/ps/worker.h
浏览文件 @
241e980f
...
...
@@ -54,7 +54,7 @@ class Worker {
private:
Worker
()
:
kv_worker_
(
nullptr
),
running_
(
false
),
key_cnt_
(
0
)
{}
~
Worker
()
{
::
ps
::
Finalize
(
0
,
true
);
}
~
Worker
()
=
default
;
Worker
(
const
Worker
&
)
=
delete
;
Worker
&
operator
=
(
const
Worker
&
)
=
delete
;
...
...
@@ -81,7 +81,6 @@ void Worker<T>::Run() {
MS_LOG
(
INFO
)
<<
"'Worker is already running."
;
return
;
}
::
ps
::
Start
(
0
);
if
(
!::
ps
::
IsWorker
())
{
MS_LOG
(
EXCEPTION
)
<<
"The role is not worker."
;
...
...
@@ -121,7 +120,11 @@ void Worker<T>::DoPSEmbeddingLookup(const ::ps::SArray<::ps::Key> &keys, const :
template
<
typename
T
>
void
Worker
<
T
>::
Finalize
()
{
kv_worker_
->
Finalize
();
if
(
running_
)
{
kv_worker_
->
Finalize
();
kv_worker_
.
reset
();
running_
=
false
;
}
}
template
<
typename
T
>
...
...
mindspore/ccsrc/frontend/parallel/ps/worker_proxy.h
浏览文件 @
241e980f
...
...
@@ -155,7 +155,7 @@ void WorkerProxy<T>::Finalize() {
kvs
.
vals
.
push_back
(
0.0
f
);
Send
(
obj_
,
ts
,
true
,
false
,
kFinalizeCmd
,
kvs
,
broadcast_slicer_
);
obj_
->
WaitRequest
(
ts
);
::
ps
::
Finalize
(
0
,
false
);
::
ps
::
Finalize
(
0
,
true
);
}
template
<
typename
T
>
...
...
mindspore/ccsrc/pipeline/jit/pipeline.cc
浏览文件 @
241e980f
...
...
@@ -45,6 +45,7 @@
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
#include "frontend/parallel/ps/common.h"
#include "frontend/parallel/ps/util.h"
#include "frontend/parallel/ps/worker.h"
#endif
#if (ENABLE_GE || ENABLE_D)
...
...
@@ -949,7 +950,13 @@ void ClearResAtexit() {
pynative
::
ClearPyNativeSession
();
session
::
ClearPythonParasMap
();
device
::
KernelRuntimeManager
::
Instance
().
ClearRuntimeResource
();
#if (ENABLE_CPU && (ENABLE_D || ENABLE_GPU))
if
(
mindspore
::
parallel
::
ps
::
Util
::
IsParamServerMode
())
{
if
(
parallel
::
ps
::
Util
::
IsRoleOfWorker
())
{
parallel
::
ps
::
Worker
<
float
>::
GetInstance
().
Finalize
();
}
}
#endif
ad
::
g_k_prims
.
clear
();
abstract
::
ClearPrimEvaluatorMap
();
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录