Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
23bbd912
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
23bbd912
编写于
2月 24, 2022
作者:
Z
zmxdream
提交者:
GitHub
2月 24, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
config fleet optimize. test=develop (#39849)
上级
2ec943a7
变更
2
隐藏空白更改
内联
并排
Showing
2 changed files
with
43 additions
and
39 deletions
+43
-39
paddle/fluid/framework/ps_gpu_trainer.cc
paddle/fluid/framework/ps_gpu_trainer.cc
+42
-39
paddle/fluid/framework/trainer.h
paddle/fluid/framework/trainer.h
+1
-0
未找到文件。
paddle/fluid/framework/ps_gpu_trainer.cc
浏览文件 @
23bbd912
...
...
@@ -46,6 +46,48 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc,
dense_grad_names_
[
table_id
][
j
]
=
table
.
dense_grad_name
(
j
);
}
}
InitializeGPUServer
(
trainer_desc
);
scale_datanorm_
=
trainer_desc
.
scale_datanorm
();
int
place_num
=
trainer_desc
.
worker_places_size
();
const
std
::
vector
<
paddle
::
framework
::
DataFeed
*>
readers
=
dataset
->
GetReaders
();
dump_file_num_
=
trainer_desc
.
dump_file_num
();
user_define_dump_filename_
=
trainer_desc
.
user_define_dump_filename
();
std
::
vector
<
int
>
dev_ids
;
for
(
int
i
=
0
;
i
<
place_num
;
++
i
)
{
int
num
=
trainer_desc
.
worker_places
(
i
);
platform
::
CUDAPlace
place
=
platform
::
CUDAPlace
(
num
);
places_
.
push_back
(
place
);
dev_ids
.
push_back
(
num
);
}
for
(
int
i
=
0
;
i
<
trainer_desc
.
downpour_param
().
stat_var_names_size
();
i
++
)
{
need_merge_var_names_
.
push_back
(
trainer_desc
.
downpour_param
().
stat_var_names
(
i
));
}
VLOG
(
3
)
<<
"going to initialize pull dense worker"
;
SetDebug
(
trainer_desc
.
debug
());
trainer_desc_
=
trainer_desc
;
workers_
.
resize
(
place_num
);
for
(
int
i
=
0
;
i
<
place_num
;
++
i
)
{
workers_
[
i
]
=
DeviceWorkerFactory
::
CreateDeviceWorker
(
trainer_desc
.
device_worker_name
());
workers_
[
i
]
->
SetDeviceIndex
(
i
);
workers_
[
i
]
->
SetNeedDumpField
(
need_dump_field_
);
workers_
[
i
]
->
SetNeedDumpParam
(
need_dump_param_
);
workers_
[
i
]
->
SetDumpFieldVector
(
dump_fields_
);
workers_
[
i
]
->
SetDumpParamVector
(
dump_param_
);
workers_
[
i
]
->
InitRandomDumpConfig
(
trainer_desc
);
workers_
[
i
]
->
SetDataFeed
(
readers
[
i
]);
workers_
[
i
]
->
SetPlace
(
places_
[
i
]);
workers_
[
i
]
->
SetReaderPlace
(
places_
[
i
]);
workers_
[
i
]
->
Initialize
(
trainer_desc
);
workers_
[
i
]
->
SetWorkerNum
(
place_num
);
}
return
;
}
void
PSGPUTrainer
::
InitializeGPUServer
(
const
TrainerDesc
&
trainer_desc
)
{
// add for hbmps optimizer config
auto
fleet_desc_str
=
trainer_desc
.
fleet_desc
();
google
::
protobuf
::
TextFormat
::
ParseFromString
(
fleet_desc_str
,
&
_ps_param
);
...
...
@@ -203,45 +245,6 @@ void PSGPUTrainer::Initialize(const TrainerDesc& trainer_desc,
auto
ps_gpu_wrapper
=
paddle
::
framework
::
PSGPUWrapper
::
GetInstance
();
ps_gpu_wrapper
->
InitializeGPUServer
(
config
);
scale_datanorm_
=
trainer_desc
.
scale_datanorm
();
int
place_num
=
trainer_desc
.
worker_places_size
();
const
std
::
vector
<
paddle
::
framework
::
DataFeed
*>
readers
=
dataset
->
GetReaders
();
dump_file_num_
=
trainer_desc
.
dump_file_num
();
user_define_dump_filename_
=
trainer_desc
.
user_define_dump_filename
();
std
::
vector
<
int
>
dev_ids
;
for
(
int
i
=
0
;
i
<
place_num
;
++
i
)
{
int
num
=
trainer_desc
.
worker_places
(
i
);
platform
::
CUDAPlace
place
=
platform
::
CUDAPlace
(
num
);
places_
.
push_back
(
place
);
dev_ids
.
push_back
(
num
);
}
for
(
int
i
=
0
;
i
<
trainer_desc
.
downpour_param
().
stat_var_names_size
();
i
++
)
{
need_merge_var_names_
.
push_back
(
trainer_desc
.
downpour_param
().
stat_var_names
(
i
));
}
VLOG
(
3
)
<<
"going to initialize pull dense worker"
;
SetDebug
(
trainer_desc
.
debug
());
trainer_desc_
=
trainer_desc
;
workers_
.
resize
(
place_num
);
for
(
int
i
=
0
;
i
<
place_num
;
++
i
)
{
workers_
[
i
]
=
DeviceWorkerFactory
::
CreateDeviceWorker
(
trainer_desc
.
device_worker_name
());
workers_
[
i
]
->
SetDeviceIndex
(
i
);
workers_
[
i
]
->
SetNeedDumpField
(
need_dump_field_
);
workers_
[
i
]
->
SetNeedDumpParam
(
need_dump_param_
);
workers_
[
i
]
->
SetDumpFieldVector
(
dump_fields_
);
workers_
[
i
]
->
SetDumpParamVector
(
dump_param_
);
workers_
[
i
]
->
InitRandomDumpConfig
(
trainer_desc
);
workers_
[
i
]
->
SetDataFeed
(
readers
[
i
]);
workers_
[
i
]
->
SetPlace
(
places_
[
i
]);
workers_
[
i
]
->
SetReaderPlace
(
places_
[
i
]);
workers_
[
i
]
->
Initialize
(
trainer_desc
);
workers_
[
i
]
->
SetWorkerNum
(
place_num
);
}
return
;
}
std
::
string
PSGPUTrainer
::
GetDumpPath
(
int
tid
)
{
...
...
paddle/fluid/framework/trainer.h
浏览文件 @
23bbd912
...
...
@@ -271,6 +271,7 @@ class PSGPUTrainer : public TrainerBase {
template
<
typename
T
>
void
MergeToRootScope
(
LoDTensor
*
root_tensor
,
LoDTensor
*
thread_tensor
);
void
InitializeGPUServer
(
const
TrainerDesc
&
trainer_desc
);
protected:
Dataset
*
dataset_
;
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录