机器未来 / Paddle · commit a839ddca (forked from PaddlePaddle / Paddle)
Commit a839ddca (unverified)
Authored Dec 29, 2020 by Thunderbrook; committed by GitHub on Dec 29, 2020
cherry pick heter ps (#29955)

* cherry pick heter ps
* CMakeList

Parent: fae406ae
Showing 48 changed files with 4,179 additions and 60 deletions.
paddle/fluid/distributed/CMakeLists.txt  +3 -0
paddle/fluid/framework/CMakeLists.txt  +19 -18
paddle/fluid/framework/data_feed.cc  +4 -4
paddle/fluid/framework/data_feed.h  +12 -9
paddle/fluid/framework/data_set.h  +14 -0
paddle/fluid/framework/device_worker.h  +96 -0
paddle/fluid/framework/device_worker_factory.cc  +8 -0
paddle/fluid/framework/fleet/CMakeLists.txt  +9 -0
paddle/fluid/framework/fleet/fleet_wrapper.cc  +4 -17
paddle/fluid/framework/fleet/heter_context.h  +47 -0
paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt  +6 -0
paddle/fluid/framework/fleet/heter_ps/cudf/LICENSE  +201 -0
paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h  +830 -0
paddle/fluid/framework/fleet/heter_ps/cudf/hash_functions.cuh  +121 -0
paddle/fluid/framework/fleet/heter_ps/cudf/managed.cuh  +33 -0
paddle/fluid/framework/fleet/heter_ps/cudf/managed_allocator.cuh  +54 -0
paddle/fluid/framework/fleet/heter_ps/feature_value.h  +76 -0
paddle/fluid/framework/fleet/heter_ps/hashtable.h  +64 -0
paddle/fluid/framework/fleet/heter_ps/hashtable.tpp  +126 -0
paddle/fluid/framework/fleet/heter_ps/heter_comm.h  +84 -0
paddle/fluid/framework/fleet/heter_ps/heter_comm.tpp  +494 -0
paddle/fluid/framework/fleet/heter_ps/heter_ps.cu  +62 -0
paddle/fluid/framework/fleet/heter_ps/heter_ps.h  +51 -0
paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h  +48 -0
paddle/fluid/framework/fleet/heter_ps/heter_resource.cc  +91 -0
paddle/fluid/framework/fleet/heter_ps/heter_resource.h  +66 -0
paddle/fluid/framework/fleet/heter_ps/optimizer.cuh  +122 -0
paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h  +32 -0
paddle/fluid/framework/fleet/heter_ps/test_comm.cu  +112 -0
paddle/fluid/framework/fleet/ps_gpu_wrapper.cc  +194 -0
paddle/fluid/framework/fleet/ps_gpu_wrapper.cu  +182 -0
paddle/fluid/framework/fleet/ps_gpu_wrapper.h  +118 -0
paddle/fluid/framework/ps_gpu_trainer.cc  +404 -0
paddle/fluid/framework/ps_gpu_worker.cc  +196 -0
paddle/fluid/framework/trainer.h  +49 -0
paddle/fluid/framework/trainer_factory.cc  +3 -0
paddle/fluid/operators/pull_box_sparse_op.cc  +11 -0
paddle/fluid/operators/pull_box_sparse_op.h  +13 -0
paddle/fluid/pybind/CMakeLists.txt  +2 -1
paddle/fluid/pybind/fleet_wrapper_py.cc  +0 -4
paddle/fluid/pybind/ps_gpu_wrapper_py.cc  +44 -0
paddle/fluid/pybind/ps_gpu_wrapper_py.h  +29 -0
paddle/fluid/pybind/pybind.cc  +5 -0
python/paddle/fluid/executor.py  +0 -2
python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py  +1 -1
python/paddle/fluid/layers/nn.py  +14 -3
python/paddle/fluid/trainer_desc.py  +24 -0
python/paddle/fluid/trainer_factory.py  +1 -1
paddle/fluid/distributed/CMakeLists.txt

if(WITH_PSLIB)
  return()
endif()

if(NOT WITH_DISTRIBUTE)
  return()
endif()
...
paddle/fluid/framework/CMakeLists.txt

@@ -204,11 +204,11 @@ if(WITH_DISTRIBUTE)
   cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
     dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
     heterxpu_trainer.cc
-    data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc
-    heterbox_worker.cc heterbox_trainer.cc downpour_worker.cc downpour_worker_opt.cc
+    data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc
+    heterbox_worker.cc heterbox_trainer.cc ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc
     pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
     device_context scope framework_proto trainer_desc_proto glog fs shell
-    fleet_wrapper heter_wrapper box_wrapper lodtensor_printer
+    fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer
     lod_rank_table feed_fetch_method sendrecvop_rpc communicator collective_helper ${GLOB_DISTRIBUTE_DEPS}
     graph_to_program_pass variable_helper data_feed_proto timer monitor
     heter_service_proto pslib_brpc)
@@ -216,15 +216,16 @@ if(WITH_DISTRIBUTE)
   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 else()
   cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
-    dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
-    heterxpu_trainer.cc
-    data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc
-    heterbox_worker.cc heterbox_trainer.cc downpour_worker.cc downpour_worker_opt.cc
-    pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
-    device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
-    lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
-    graph_to_program_pass variable_helper timer monitor heter_service_proto fleet)
+    dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
+    heterxpu_trainer.cc
+    data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc
+    heterbox_worker.cc heterbox_trainer.cc ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc
+    pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
+    device_context scope framework_proto trainer_desc_proto glog fs shell
+    fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer
+    lod_rank_table feed_fetch_method collective_helper ${GLOB_DISTRIBUTE_DEPS}
+    graph_to_program_pass variable_helper data_feed_proto timer monitor
+    heter_service_proto fleet)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
   set_source_files_properties(multi_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
@@ -234,22 +235,22 @@ elseif(WITH_PSLIB)
   cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
     dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
     heterxpu_trainer.cc
-    data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc
-    heterbox_worker.cc heterbox_trainer.cc downpour_worker.cc downpour_worker_opt.cc
+    data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc
+    heterbox_worker.cc heterbox_trainer.cc ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc
     pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
     device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
-    lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
+    lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
     graph_to_program_pass variable_helper timer monitor pslib_brpc)
 else()
   cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
     dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
     heterxpu_trainer.cc
-    data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc
-    heterbox_worker.cc heterbox_trainer.cc downpour_worker.cc downpour_worker_opt.cc
+    data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc ps_gpu_worker.cc
+    heterbox_worker.cc heterbox_trainer.cc ps_gpu_trainer.cc downpour_worker.cc downpour_worker_opt.cc
     pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
     device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
-    lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
+    lod_rank_table fs shell fleet_wrapper heter_wrapper ps_gpu_wrapper box_wrapper lodtensor_printer feed_fetch_method
     graph_to_program_pass variable_helper timer monitor)
 endif()
...
paddle/fluid/framework/data_feed.cc

@@ -968,7 +968,7 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
         if (fabs(feasign) < 1e-6 && !use_slots_is_dense_[i]) {
           continue;
         }
-        FeatureKey f;
+        FeatureFeasign f;
         f.float_feasign_ = feasign;
         instance->float_feasigns_.push_back(FeatureItem(f, idx));
       }
@@ -980,7 +980,7 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstanceFromPipe(Record* instance) {
         if (feasign == 0 && !use_slots_is_dense_[i]) {
           continue;
         }
-        FeatureKey f;
+        FeatureFeasign f;
         f.uint64_feasign_ = feasign;
         instance->uint64_feasigns_.push_back(FeatureItem(f, idx));
       }
@@ -1038,7 +1038,7 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstance(Record* instance) {
         if (fabs(feasign) < 1e-6) {
           continue;
         }
-        FeatureKey f;
+        FeatureFeasign f;
         f.float_feasign_ = feasign;
         instance->float_feasigns_.push_back(FeatureItem(f, idx));
       }
@@ -1048,7 +1048,7 @@ bool MultiSlotInMemoryDataFeed::ParseOneInstance(Record* instance) {
         if (feasign == 0) {
          continue;
         }
-        FeatureKey f;
+        FeatureFeasign f;
         f.uint64_feasign_ = feasign;
         instance->uint64_feasigns_.push_back(FeatureItem(f, idx));
       }
paddle/fluid/framework/data_feed.h

@@ -69,20 +69,23 @@ namespace framework {
 // while (reader->Next()) {
 //   // trainer do something
 // }
-union FeatureKey {
+union FeatureFeasign {
   uint64_t uint64_feasign_;
   float float_feasign_;
 };

 struct FeatureItem {
   FeatureItem() {}
-  FeatureItem(FeatureKey sign, uint16_t slot) {
+  FeatureItem(FeatureFeasign sign, uint16_t slot) {
     this->sign() = sign;
     this->slot() = slot;
   }
-  FeatureKey& sign() { return *(reinterpret_cast<FeatureKey*>(sign_buffer())); }
-  const FeatureKey& sign() const {
-    const FeatureKey* ret = reinterpret_cast<FeatureKey*>(sign_buffer());
+  FeatureFeasign& sign() {
+    return *(reinterpret_cast<FeatureFeasign*>(sign_buffer()));
+  }
+  const FeatureFeasign& sign() const {
+    const FeatureFeasign* ret = reinterpret_cast<FeatureFeasign*>(sign_buffer());
     return *ret;
   }
   uint16_t& slot() { return slot_; }
@@ -90,7 +93,7 @@ struct FeatureItem {
  private:
   char* sign_buffer() const { return const_cast<char*>(sign_); }
-  char sign_[sizeof(FeatureKey)];
+  char sign_[sizeof(FeatureFeasign)];
   uint16_t slot_;
 };
@@ -514,7 +517,7 @@ paddle::framework::Archive<AR>& operator>>(paddle::framework::Archive<AR>& ar,
 struct RecordCandidate {
   std::string ins_id_;
-  std::unordered_multimap<uint16_t, FeatureKey> feas_;
+  std::unordered_multimap<uint16_t, FeatureFeasign> feas_;
   size_t shadow_index_ = -1;  // Optimization for Reservoir Sample

   RecordCandidate() {}
@@ -606,7 +609,7 @@ class RecordCandidateList {
 template <class AR>
 paddle::framework::Archive<AR>& operator<<(paddle::framework::Archive<AR>& ar,
-                                           const FeatureKey& fk) {
+                                           const FeatureFeasign& fk) {
   ar << fk.uint64_feasign_;
   ar << fk.float_feasign_;
   return ar;
@@ -614,7 +617,7 @@ paddle::framework::Archive<AR>& operator<<(paddle::framework::Archive<AR>& ar,
 template <class AR>
 paddle::framework::Archive<AR>& operator>>(paddle::framework::Archive<AR>& ar,
-                                           FeatureKey& fk) {
+                                           FeatureFeasign& fk) {
   ar >> fk.uint64_feasign_;
   ar >> fk.float_feasign_;
   return ar;
...
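As a quick illustration of the rename (a hedged sketch, not part of the commit): code that previously built a FeatureItem from the FeatureKey union now does exactly the same with FeatureFeasign. The helper below is hypothetical.

#include <cstdint>
#include "paddle/fluid/framework/data_feed.h"

// Hypothetical helper, only to show the renamed union in use.
paddle::framework::FeatureItem make_uint64_item(uint64_t key, uint16_t slot) {
  paddle::framework::FeatureFeasign f;
  f.uint64_feasign_ = key;                         // union: uint64 or float feasign
  return paddle::framework::FeatureItem(f, slot);  // copies f into sign_, slot into slot_
}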
paddle/fluid/framework/data_set.h

@@ -229,6 +229,20 @@ class DatasetImpl : public Dataset {
   virtual void DynamicAdjustReadersNum(int thread_num);
   virtual void SetFleetSendSleepSeconds(int seconds);
   std::vector<paddle::framework::Channel<T>>& GetMultiOutputChannel() {
     return multi_output_channel_;
   }
   std::vector<paddle::framework::Channel<T>>& GetCurOutputChannel() {
     if (cur_channel_ == 0) {
       return multi_output_channel_;
     } else {
       return multi_consume_channel_;
     }
   }
   Channel<T>& GetInputChannelRef() { return input_channel_; }

  protected:
   virtual int ReceiveFromClient(int msg_type, int client_id,
                                 const std::string& msg);
...
paddle/fluid/framework/device_worker.h

@@ -537,6 +537,102 @@ class HeterBoxWorker : public HogwildWorker {
};
#endif

#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
class PSGPUWorker : public HogwildWorker {
 public:
  PSGPUWorker() {}
  virtual ~PSGPUWorker() {}
  virtual void Initialize(const TrainerDesc& desc);
  virtual void TrainFiles();
  virtual void SetNeedDump(bool need_dump_field);
  virtual void SetChannelWriter(ChannelObject<std::string>* queue);
  virtual void SetWorkerNum(int num) { worker_num_ = num; }
  virtual void CacheProgram(const ProgramDesc& main_program) {
    new (&program_) ProgramDesc(main_program);
  }
  virtual void ProduceTasks() override;
  virtual void SetStream(const cudaStream_t stream) { copy_stream_ = stream; }
  virtual void SetEvent(const cudaEvent_t event) { event_ = event; }
  virtual void TrainFilesWithProfiler() {}
  void ResetStat();

 protected:
  std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
  void PushGradients();
  void DumpParam();
  void CopySparseTable();
  void CopyDenseTable();
  void CopyDenseVars();

 private:
  int mpi_rank_;
  std::mutex mutex_;
  std::vector<std::string> send_var_list_;
  int worker_num_;
  ProgramDesc program_;
  HeterObjectPool<HeterTask> object_pool_;
  bool need_dump_param_;
  std::vector<std::string> dump_param_;
  bool need_to_push_dense_;
  bool need_dump_field_;
  bool dump_slot_;
  bool need_to_push_sparse_;
  std::vector<std::string> dump_fields_;
  ChannelWriter<std::string> writer_;
  DownpourWorkerParameter param_;
  float scale_datanorm_;
  // just save the value in param_ for easy access
  std::map<uint64_t, std::string> label_var_name_;
  std::map<uint64_t, std::vector<std::string>> sparse_key_names_;
  std::map<uint64_t, std::vector<std::string>> sparse_value_names_;
  std::map<uint64_t, std::vector<std::string>> sparse_grad_names_;
  std::map<uint64_t, std::vector<std::string>> dense_value_names_;
  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
  platform::Place root_place_;
  // actually pushed feasign of each table
  std::map<uint64_t, std::vector<uint64_t>> sparse_push_keys_;
  // skipped ops
  std::vector<std::string> skip_ops_;
  std::vector<::std::future<int32_t>> push_sparse_status_;
  std::vector<::std::future<int32_t>> push_dense_status_;
  // adjust ins weight
  AdjustInsWeightConfig adjust_ins_weight_config_;
  std::vector<float> nid_show_;
  // check nan and inf during training
  std::vector<std::string> check_nan_var_names_;
  // copy table
  CopyTableConfig copy_table_config_;
  std::map<uint64_t, uint64_t> table_dependency_;
  std::vector<std::pair<uint64_t, uint64_t>> copy_sparse_tables_;
  std::vector<std::pair<uint64_t, uint64_t>> copy_dense_tables_;
  std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_;
  paddle::framework::Channel<std::shared_ptr<HeterTask>> pull_queue_;
  paddle::framework::Channel<std::shared_ptr<HeterTask>> push_queue_;
  cudaEvent_t event_;
  cudaStream_t copy_stream_;
  int batch_cnt_{0};
  std::atomic<int> done_cnt_{0};

  double total_time_;
  double read_time_;
  double pack_time_;
  double pull_sparse_local_time_;
  double op_all_time_;
  double xpu_op_time_;
  double xpu_wait_time_;
  double cpu_op_time_;
  double collect_label_time_;
  double fill_sparse_time_;
  double push_sparse_time_;
  double gpu_2_cpu_time_;
  double cpu_2_gpu_time_;
  uint64_t total_inst_;
};
#endif

#if defined(PADDLE_WITH_NCCL)
class SectionWorker : public DeviceWorker {
 public:
...
paddle/fluid/framework/device_worker_factory.cc

@@ -66,8 +66,16 @@ REGISTER_DEVICE_WORKER_CLASS(DownpourWorker);
REGISTER_DEVICE_WORKER_CLASS(DownpourWorkerOpt);
#ifdef PADDLE_WITH_PSLIB
REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker);
#endif

#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
REGISTER_DEVICE_WORKER_CLASS(HeterBoxWorker);
#endif

#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
REGISTER_DEVICE_WORKER_CLASS(PSGPUWorker);
#endif

#if defined(PADDLE_WITH_NCCL)
REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
#endif
...
paddle/fluid/framework/fleet/CMakeLists.txt

if(WITH_PSLIB)
  cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope pslib_brpc pslib)
  if(WITH_NCCL)
    nv_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cu ps_gpu_wrapper.cc
      DEPS heter_ps)
    add_subdirectory(heter_ps)
  else()
    cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc)
  endif(WITH_NCCL)
else()
  cc_library(fleet_wrapper SRCS fleet_wrapper.cc DEPS framework_proto variable_helper scope)
  cc_library(ps_gpu_wrapper SRCS ps_gpu_wrapper.cc)
endif(WITH_PSLIB)

if(WITH_NCCL)
...
@@ -13,6 +21,7 @@ else()
  cc_library(box_wrapper SRCS box_wrapper.cc DEPS framework_proto lod_tensor)
endif(WITH_BOX_PS)

if(WITH_GLOO)
  cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope gloo)
else()
...
paddle/fluid/framework/fleet/fleet_wrapper.cc

@@ -198,6 +198,7 @@ void FleetWrapper::HeterPullSparseVars(
   for (auto& t : fea_values) {
     pull_result_ptr.push_back(t.data());
   }
+  /*
   auto status = pslib_ptr_->_worker_ptr->heter_pull_sparse(
       workerid, pull_result_ptr.data(), table_id, fea_keys.data(),
       fea_keys.size(), task->taskid_);
@@ -211,6 +212,7 @@ void FleetWrapper::HeterPullSparseVars(
       exit(-1);
     }
   }
+  */
 }

 void FleetWrapper::HeterPushSparseVars(
...
@@ -359,6 +361,7 @@ int FleetWrapper::RegisterHeterCallback(HeterCallBackFunc handler) {
   VLOG(3) << "pslib_ptr_=" << pslib_ptr_;
   VLOG(3) << "_worker_ptr=" << pslib_ptr_->_worker_ptr;
   return pslib_ptr_->_worker_ptr->registe_heter_callback(handler);
 #else
   VLOG(0) << "FleetWrapper::RegisterHeterCallback"
           << " does nothing when no pslib";
...
@@ -1222,13 +1225,6 @@ void FleetWrapper::LoadModelOneTable(const uint64_t table_id,
 void FleetWrapper::LoadWithWhitelist(const uint64_t table_id,
                                      const std::string& path, const int mode) {
 #ifdef PADDLE_WITH_PSLIB
-  auto ret = pslib_ptr_->_worker_ptr->load_with_whitelist(table_id, path,
-                                                          std::to_string(mode));
-  ret.wait();
-  if (ret.get() != 0) {
-    LOG(ERROR) << "load model of table id: " << table_id
-               << ", from path: " << path << " failed";
-  }
 #else
   VLOG(0) << "FleetWrapper::LoadWhitelist does nothing when no pslib";
 #endif
...
@@ -1353,16 +1349,7 @@ int32_t FleetWrapper::SaveWithWhitelist(int table_id, const std::string& path,
                                         const int mode,
                                         const std::string& whitelist_path) {
 #ifdef PADDLE_WITH_PSLIB
-  auto ret = pslib_ptr_->_worker_ptr->save_with_whitelist(
-      table_id, path, std::to_string(mode), whitelist_path);
-  ret.wait();
-  int32_t feasign_cnt = ret.get();
-  if (feasign_cnt == -1) {
-    LOG(ERROR) << "table save cache failed";
-    sleep(sleep_seconds_before_fail_exit_);
-    exit(-1);
-  }
-  return feasign_cnt;
+  return 0;
 #else
   VLOG(0) << "FleetWrapper::SaveCache does nothing when no pslib";
   return -1;
...
paddle/fluid/framework/fleet/heter_context.h (new file, mode 100644)

/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)

#include <map>
#include <unordered_map>
#include <vector>

#include "common_value.h"  // NOLINT
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/scope.h"

namespace paddle {
namespace framework {

class HeterContext {
 public:
  Scope* scope_{nullptr};
  std::vector<std::vector<FeatureKey>> feature_keys_;
  std::vector<std::vector<paddle::ps::DownpourFixedFeatureValue*>> value_ptr_;
  std::vector<std::vector<FeatureValue>> feature_values_;
  uint64_t size() {
    uint64_t total_size = 0;
    for (auto& keys : feature_keys_) {
      total_size += keys.size();
    }
    return total_size;
  }
};

}  // end namespace framework
}  // end namespace paddle
#endif
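A hedged usage sketch (not part of the diff; assumes a PADDLE_WITH_NCCL + PADDLE_WITH_PSLIB build): HeterContext is a plain holder of per-shard key/value vectors, and size() simply sums the key counts. The helper below is hypothetical.

#include "paddle/fluid/framework/fleet/heter_context.h"

// Hypothetical helper showing how the context might be sized and filled.
void fill_keys(paddle::framework::HeterContext* ctx, int shard_num) {
  ctx->feature_keys_.resize(shard_num);
  ctx->feature_keys_[0].push_back(10001);  // FeatureKey is a uint64_t
  uint64_t total = ctx->size();            // sums keys over all shards
  (void)total;
}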
paddle/fluid/framework/fleet/heter_ps/CMakeLists.txt (new file, mode 100644)

nv_library(heter_comm SRCS heter_comm.h feature_value.h heter_resource.cc
  heter_resource.h hashtable.h DEPS cub device_context)
nv_test(test_heter_comm SRCS test_heter_comm.cu feature_value.h DEPS heter_comm)
nv_library(heter_ps SRCS heter_ps.cu DEPS heter_comm)
paddle/fluid/framework/fleet/heter_ps/cudf/LICENSE (new file, mode 100644)

Apache License
Version 2.0, January 2004
http://www.apache.org/licenses/
TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
1. Definitions.
"License" shall mean the terms and conditions for use, reproduction,
and distribution as defined by Sections 1 through 9 of this document.
"Licensor" shall mean the copyright owner or entity authorized by
the copyright owner that is granting the License.
"Legal Entity" shall mean the union of the acting entity and all
other entities that control, are controlled by, or are under common
control with that entity. For the purposes of this definition,
"control" means (i) the power, direct or indirect, to cause the
direction or management of such entity, whether by contract or
otherwise, or (ii) ownership of fifty percent (50%) or more of the
outstanding shares, or (iii) beneficial ownership of such entity.
"You" (or "Your") shall mean an individual or Legal Entity
exercising permissions granted by this License.
"Source" form shall mean the preferred form for making modifications,
including but not limited to software source code, documentation
source, and configuration files.
"Object" form shall mean any form resulting from mechanical
transformation or translation of a Source form, including but
not limited to compiled object code, generated documentation,
and conversions to other media types.
"Work" shall mean the work of authorship, whether in Source or
Object form, made available under the License, as indicated by a
copyright notice that is included in or attached to the work
(an example is provided in the Appendix below).
"Derivative Works" shall mean any work, whether in Source or Object
form, that is based on (or derived from) the Work and for which the
editorial revisions, annotations, elaborations, or other modifications
represent, as a whole, an original work of authorship. For the purposes
of this License, Derivative Works shall not include works that remain
separable from, or merely link (or bind by name) to the interfaces of,
the Work and Derivative Works thereof.
"Contribution" shall mean any work of authorship, including
the original version of the Work and any modifications or additions
to that Work or Derivative Works thereof, that is intentionally
submitted to Licensor for inclusion in the Work by the copyright owner
or by an individual or Legal Entity authorized to submit on behalf of
the copyright owner. For the purposes of this definition, "submitted"
means any form of electronic, verbal, or written communication sent
to the Licensor or its representatives, including but not limited to
communication on electronic mailing lists, source code control systems,
and issue tracking systems that are managed by, or on behalf of, the
Licensor for the purpose of discussing and improving the Work, but
excluding communication that is conspicuously marked or otherwise
designated in writing by the copyright owner as "Not a Contribution."
"Contributor" shall mean Licensor and any individual or Legal Entity
on behalf of whom a Contribution has been received by Licensor and
subsequently incorporated within the Work.
2. Grant of Copyright License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
copyright license to reproduce, prepare Derivative Works of,
publicly display, publicly perform, sublicense, and distribute the
Work and such Derivative Works in Source or Object form.
3. Grant of Patent License. Subject to the terms and conditions of
this License, each Contributor hereby grants to You a perpetual,
worldwide, non-exclusive, no-charge, royalty-free, irrevocable
(except as stated in this section) patent license to make, have made,
use, offer to sell, sell, import, and otherwise transfer the Work,
where such license applies only to those patent claims licensable
by such Contributor that are necessarily infringed by their
Contribution(s) alone or by combination of their Contribution(s)
with the Work to which such Contribution(s) was submitted. If You
institute patent litigation against any entity (including a
cross-claim or counterclaim in a lawsuit) alleging that the Work
or a Contribution incorporated within the Work constitutes direct
or contributory patent infringement, then any patent licenses
granted to You under this License for that Work shall terminate
as of the date such litigation is filed.
4. Redistribution. You may reproduce and distribute copies of the
Work or Derivative Works thereof in any medium, with or without
modifications, and in Source or Object form, provided that You
meet the following conditions:
(a) You must give any other recipients of the Work or
Derivative Works a copy of this License; and
(b) You must cause any modified files to carry prominent notices
stating that You changed the files; and
(c) You must retain, in the Source form of any Derivative Works
that You distribute, all copyright, patent, trademark, and
attribution notices from the Source form of the Work,
excluding those notices that do not pertain to any part of
the Derivative Works; and
(d) If the Work includes a "NOTICE" text file as part of its
distribution, then any Derivative Works that You distribute must
include a readable copy of the attribution notices contained
within such NOTICE file, excluding those notices that do not
pertain to any part of the Derivative Works, in at least one
of the following places: within a NOTICE text file distributed
as part of the Derivative Works; within the Source form or
documentation, if provided along with the Derivative Works; or,
within a display generated by the Derivative Works, if and
wherever such third-party notices normally appear. The contents
of the NOTICE file are for informational purposes only and
do not modify the License. You may add Your own attribution
notices within Derivative Works that You distribute, alongside
or as an addendum to the NOTICE text from the Work, provided
that such additional attribution notices cannot be construed
as modifying the License.
You may add Your own copyright statement to Your modifications and
may provide additional or different license terms and conditions
for use, reproduction, or distribution of Your modifications, or
for any such Derivative Works as a whole, provided Your use,
reproduction, and distribution of the Work otherwise complies with
the conditions stated in this License.
5. Submission of Contributions. Unless You explicitly state otherwise,
any Contribution intentionally submitted for inclusion in the Work
by You to the Licensor shall be under the terms and conditions of
this License, without any additional terms or conditions.
Notwithstanding the above, nothing herein shall supersede or modify
the terms of any separate license agreement you may have executed
with Licensor regarding such Contributions.
6. Trademarks. This License does not grant permission to use the trade
names, trademarks, service marks, or product names of the Licensor,
except as required for reasonable and customary use in describing the
origin of the Work and reproducing the content of the NOTICE file.
7. Disclaimer of Warranty. Unless required by applicable law or
agreed to in writing, Licensor provides the Work (and each
Contributor provides its Contributions) on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
implied, including, without limitation, any warranties or conditions
of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
PARTICULAR PURPOSE. You are solely responsible for determining the
appropriateness of using or redistributing the Work and assume any
risks associated with Your exercise of permissions under this License.
8. Limitation of Liability. In no event and under no legal theory,
whether in tort (including negligence), contract, or otherwise,
unless required by applicable law (such as deliberate and grossly
negligent acts) or agreed to in writing, shall any Contributor be
liable to You for damages, including any direct, indirect, special,
incidental, or consequential damages of any character arising as a
result of this License or out of the use or inability to use the
Work (including but not limited to damages for loss of goodwill,
work stoppage, computer failure or malfunction, or any and all
other commercial damages or losses), even if such Contributor
has been advised of the possibility of such damages.
9. Accepting Warranty or Additional Liability. While redistributing
the Work or Derivative Works thereof, You may choose to offer,
and charge a fee for, acceptance of support, warranty, indemnity,
or other liability obligations and/or rights consistent with this
License. However, in accepting such obligations, You may act only
on Your own behalf and on Your sole responsibility, not on behalf
of any other Contributor, and only if You agree to indemnify,
defend, and hold each Contributor harmless for any liability
incurred by, or claims asserted against, such Contributor by reason
of your accepting any such warranty or additional liability.
END OF TERMS AND CONDITIONS
APPENDIX: How to apply the Apache License to your work.
To apply the Apache License to your work, attach the following
boilerplate notice, with the fields enclosed by brackets "{}"
replaced with your own identifying information. (Don't include
the brackets!) The text should be enclosed in the appropriate
comment syntax for the file format. We also recommend that a
file or class name and description of purpose be included on the
same "printed page" as the copyright notice for easier
identification within third-party archives.
Copyright 2018 NVIDIA Corporation
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h (new file, mode 100644)
This diff is collapsed and not shown here.
paddle/fluid/framework/fleet/heter_ps/cudf/hash_functions.cuh (new file, mode 100644)

/*
* Copyright (c) 2017, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef HASH_FUNCTIONS_CUH
#define HASH_FUNCTIONS_CUH

using hash_value_type = uint32_t;

// MurmurHash3_32 implementation from
// https://github.com/aappleby/smhasher/blob/master/src/MurmurHash3.cpp
//-----------------------------------------------------------------------------
// MurmurHash3 was written by Austin Appleby, and is placed in the public
// domain. The author hereby disclaims copyright to this source code.
// Note - The x86 and x64 versions do _not_ produce the same results, as the
// algorithms are optimized for their respective platforms. You can still
// compile and run any of them on any platform, but your performance with the
// non-native version will be less than optimal.
template <typename Key>
struct MurmurHash3_32 {
  using argument_type = Key;
  using result_type = hash_value_type;

  __forceinline__ __host__ __device__ MurmurHash3_32() : m_seed(0) {}

  __forceinline__ __host__ __device__ uint32_t rotl32(uint32_t x, int8_t r) const {
    return (x << r) | (x >> (32 - r));
  }

  __forceinline__ __host__ __device__ uint32_t fmix32(uint32_t h) const {
    h ^= h >> 16;
    h *= 0x85ebca6b;
    h ^= h >> 13;
    h *= 0xc2b2ae35;
    h ^= h >> 16;
    return h;
  }

  /* --------------------------------------------------------------------------*/
  /**
   * @Synopsis  Combines two hash values into a new single hash value. Called
   * repeatedly to create a hash value from several variables.
   * Taken from the Boost hash_combine function
   * https://www.boost.org/doc/libs/1_35_0/doc/html/boost/hash_combine_id241013.html
   *
   * @Param lhs The first hash value to combine
   * @Param rhs The second hash value to combine
   *
   * @Returns A hash value that intelligently combines the lhs and rhs hash values
   */
  /* ----------------------------------------------------------------------------*/
  __host__ __device__ result_type hash_combine(result_type lhs, result_type rhs) {
    result_type combined{lhs};
    combined ^= rhs + 0x9e3779b9 + (combined << 6) + (combined >> 2);
    return combined;
  }

  __forceinline__ __host__ __device__ result_type operator()(const Key& key) const {
    constexpr int len = sizeof(argument_type);
    const uint8_t* const data = (const uint8_t*)&key;
    constexpr int nblocks = len / 4;

    uint32_t h1 = m_seed;
    constexpr uint32_t c1 = 0xcc9e2d51;
    constexpr uint32_t c2 = 0x1b873593;
    //----------
    // body
    const uint32_t* const blocks = (const uint32_t*)(data + nblocks * 4);
    for (int i = -nblocks; i; i++) {
      uint32_t k1 = blocks[i];  // getblock32(blocks,i);
      k1 *= c1;
      k1 = rotl32(k1, 15);
      k1 *= c2;
      h1 ^= k1;
      h1 = rotl32(h1, 13);
      h1 = h1 * 5 + 0xe6546b64;
    }
    //----------
    // tail
    const uint8_t* tail = (const uint8_t*)(data + nblocks * 4);
    uint32_t k1 = 0;
    switch (len & 3) {
      case 3:
        k1 ^= tail[2] << 16;
      case 2:
        k1 ^= tail[1] << 8;
      case 1:
        k1 ^= tail[0];
        k1 *= c1;
        k1 = rotl32(k1, 15);
        k1 *= c2;
        h1 ^= k1;
    };
    //----------
    // finalization
    h1 ^= len;
    h1 = fmix32(h1);
    return h1;
  }

 private:
  const uint32_t m_seed;
};

template <typename Key>
using default_hash = MurmurHash3_32<Key>;

#endif  // HASH_FUNCTIONS_CUH
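A hedged sketch of the hasher in host code (not part of the commit; assumes compilation with nvcc so the __host__ __device__ qualifiers expand): MurmurHash3_32 is a functor, and hash_combine folds two hashes together the Boost way.

#include <cstdint>
#include <iostream>
#include "paddle/fluid/framework/fleet/heter_ps/cudf/hash_functions.cuh"

int main() {
  MurmurHash3_32<uint64_t> hasher;            // default_hash<uint64_t> is the same type
  hash_value_type h1 = hasher(123456789ULL);  // hash one key
  hash_value_type h2 = hasher(987654321ULL);  // hash another key
  std::cout << hasher.hash_combine(h1, h2) << std::endl;  // combine the two values
  return 0;
}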
paddle/fluid/framework/fleet/heter_ps/cudf/managed.cuh (new file, mode 100644)

/*
* Copyright (c) 2017, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MANAGED_CUH
#define MANAGED_CUH

#include <new>

struct managed {
  static void* operator new(size_t n) {
    void* ptr = 0;
    cudaError_t result = cudaMallocManaged(&ptr, n);
    if (cudaSuccess != result || 0 == ptr) throw std::bad_alloc();
    return ptr;
  }

  static void operator delete(void* ptr) noexcept { cudaFree(ptr); }
};

#endif  // MANAGED_CUH
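A hedged sketch (not from the commit; needs nvcc and a CUDA-capable device): any type that derives from managed gets its storage from cudaMallocManaged, so the same object is addressable from host and device code. The example type below is hypothetical.

#include "paddle/fluid/framework/fleet/heter_ps/cudf/managed.cuh"

struct Point : public managed {  // hypothetical example type
  float x;
  float y;
};

int main() {
  Point* p = new Point();  // operator new -> cudaMallocManaged
  p->x = 1.0f;             // host write to unified memory
  p->y = 2.0f;
  delete p;                // operator delete -> cudaFree
  return 0;
}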
paddle/fluid/framework/fleet/heter_ps/cudf/managed_allocator.cuh (new file, mode 100644)

/*
* Copyright (c) 2017, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#ifndef MANAGED_ALLOCATOR_CUH
#define MANAGED_ALLOCATOR_CUH

#include <new>

template <class T>
struct managed_allocator {
  typedef T value_type;

  managed_allocator() = default;

  template <class U>
  constexpr managed_allocator(const managed_allocator<U>&) noexcept {}

  T* allocate(std::size_t n) const {
    T* ptr = 0;
    cudaError_t result = cudaMallocManaged(&ptr, n * sizeof(T));
    if (cudaSuccess != result || nullptr == ptr) {
      std::cerr << "ERROR: CUDA Runtime call in line " << __LINE__ << "of file "
                << __FILE__ << " failed with " << cudaGetErrorString(result)
                << " (" << result << ") "
                << " Attempted to allocate: " << n * sizeof(T) << " bytes.\n";
      throw std::bad_alloc();
    }
    return ptr;
  }

  void deallocate(T* p, std::size_t) const { cudaFree(p); }
};

template <class T, class U>
bool operator==(const managed_allocator<T>&, const managed_allocator<U>&) {
  return true;
}

template <class T, class U>
bool operator!=(const managed_allocator<T>&, const managed_allocator<U>&) {
  return false;
}

#endif
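A hedged sketch (not from the commit; nvcc build assumed): because managed_allocator models the standard Allocator requirements, it can back an ordinary container whose storage then lives in unified memory.

#include <iostream>  // the allocator's error path writes to std::cerr
#include <vector>
#include "paddle/fluid/framework/fleet/heter_ps/cudf/managed_allocator.cuh"

int main() {
  std::vector<float, managed_allocator<float>> vals(16, 0.5f);  // storage from cudaMallocManaged
  vals[0] = 1.0f;  // host write; a kernel could read the same buffer via vals.data()
  return 0;
}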
paddle/fluid/framework/fleet/heter_ps/feature_value.h (new file, mode 100644)

/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once

#ifdef PADDLE_WITH_PSLIB

#include <iostream>

namespace paddle {
namespace framework {

#define MF_DIM 8

typedef uint64_t FeatureKey;

struct FeatureValue {
  float delta_score;
  float show;
  float clk;
  int slot;
  float lr;
  float lr_g2sum;
  int mf_size;
  float mf[MF_DIM + 1];

  friend std::ostream& operator<<(std::ostream& out, FeatureValue& val) {
    out << "show: " << val.show << " clk: " << val.clk << " slot: " << val.slot
        << " lr: " << val.lr << " mf_size: " << val.mf_size << " mf:";
    for (int i = 0; i < val.mf_size; ++i) {
      out << " " << val.mf[i];
    }
    return out;
  }
};

struct FeaturePushValue {
  float show;
  float clk;
  int slot;
  float lr_g;
  float mf_g[MF_DIM];
};

// class DownpourFixedFeatureValue {
//  public:
//   DownpourFixedFeatureValue() {}
//   ~DownpourFixedFeatureValue() {}
//   float* data() {
//     return _data.data();
//   }
//   size_t size() {
//     return _data.size();
//   }
//   void resize(size_t size) {
//     _data.resize(size);
//   }
//   void shrink_to_fit() {
//     _data.shrink_to_fit();
//   }
//  private:
//   std::vector<float> _data;
// };

}  // end namespace framework
}  // end namespace paddle
#endif
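A hedged sketch (not part of the diff; assumes PADDLE_WITH_PSLIB is defined): FeatureValue carries the CTR statistics plus an mf embedding of at most MF_DIM + 1 floats, and the friend operator<< makes it easy to dump.

#include <iostream>
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"

int main() {
  paddle::framework::FeatureValue val;
  val.delta_score = 0.0f;
  val.show = 10.0f;
  val.clk = 2.0f;
  val.slot = 5;
  val.lr = 0.01f;
  val.lr_g2sum = 0.0f;
  val.mf_size = 0;                // no matrix-factorization part yet
  std::cout << val << std::endl;  // prints show/clk/slot/lr/mf_size
  return 0;
}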
paddle/fluid/framework/fleet/heter_ps/hashtable.h (new file, mode 100644)

/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <limits>
#include <memory>
#include <vector>
#include "thrust/pair.h"
//#include "cudf/concurrent_unordered_map.cuh.h"
#include "paddle/fluid/framework/fleet/heter_ps/cudf/concurrent_unordered_map.cuh.h"
#ifdef PADDLE_WITH_PSLIB

namespace paddle {
namespace framework {

template <typename KeyType, typename ValType>
class TableContainer
    : public concurrent_unordered_map<KeyType, ValType,
                                      std::numeric_limits<KeyType>::max()> {
 public:
  TableContainer(size_t capacity)
      : concurrent_unordered_map<KeyType, ValType,
                                 std::numeric_limits<KeyType>::max()>(
            capacity, ValType()) {}
};

template <typename KeyType, typename ValType>
class HashTable {
 public:
  HashTable(size_t capacity);
  virtual ~HashTable();
  HashTable(const HashTable&) = delete;
  HashTable& operator=(const HashTable&) = delete;
  void insert(const KeyType* d_keys, const ValType* d_vals, size_t len,
              cudaStream_t stream);
  void get(const KeyType* d_keys, ValType* d_vals, size_t len,
           cudaStream_t stream);
  void show();

  template <typename GradType, typename Sgd>
  void update(const KeyType* d_keys, const GradType* d_grads, size_t len,
              Sgd sgd, cudaStream_t stream);

 private:
  TableContainer<KeyType, ValType>* container_;
  int BLOCK_SIZE_{256};
  float LOAD_FACTOR{0.75f};
  size_t capacity_;
};
}  // end namespace framework
}  // end namespace paddle
#include "hashtable.tpp"
#endif
paddle/fluid/framework/fleet/heter_ps/hashtable.tpp (new file, mode 100644)

/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_PSLIB
namespace paddle {
namespace framework {

template <typename value_type>
struct ReplaceOp {
  __host__ __device__ value_type operator()(value_type new_value,
                                            value_type old_value) {
    return new_value;
  }
};

template <typename Table>
__global__ void insert_kernel(Table* table,
                              const typename Table::key_type* const keys,
                              const typename Table::mapped_type* const vals,
                              size_t len) {
  ReplaceOp<typename Table::mapped_type> op;
  thrust::pair<typename Table::key_type, typename Table::mapped_type> kv;

  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len) {
    kv.first = keys[i];
    kv.second = vals[i];
    auto it = table->insert(kv, op);
    assert(it != table->end() && "error: insert fails: table is full");
  }
}

template <typename Table>
__global__ void search_kernel(Table* table,
                              const typename Table::key_type* const keys,
                              typename Table::mapped_type* const vals,
                              size_t len) {
  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len) {
    auto it = table->find(keys[i]);
    if (it != table->end()) {
      vals[i] = it->second;
    }
  }
}

template <typename Table, typename GradType, typename Sgd>
__global__ void update_kernel(Table* table,
                              const typename Table::key_type* const keys,
                              const GradType* const grads, size_t len,
                              Sgd sgd) {
  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len) {
    auto it = table->find(keys[i]);
    if (it != table->end()) {
      sgd.update_value((it.getter())->second, grads[i]);
    }
  }
}

template <typename KeyType, typename ValType>
HashTable<KeyType, ValType>::HashTable(size_t capacity) {
  container_ = new TableContainer<KeyType, ValType>(capacity);
}

template <typename KeyType, typename ValType>
HashTable<KeyType, ValType>::~HashTable() {
  delete container_;
}

template <typename KeyType, typename ValType>
void HashTable<KeyType, ValType>::show() {
  container_->print();
}

template <typename KeyType, typename ValType>
void HashTable<KeyType, ValType>::get(const KeyType* d_keys, ValType* d_vals,
                                      size_t len, cudaStream_t stream) {
  if (len == 0) {
    return;
  }
  const int grid_size = (len - 1) / BLOCK_SIZE_ + 1;
  search_kernel<<<grid_size, BLOCK_SIZE_, 0, stream>>>(container_, d_keys,
                                                       d_vals, len);
}

template <typename KeyType, typename ValType>
void HashTable<KeyType, ValType>::insert(const KeyType* d_keys,
                                         const ValType* d_vals, size_t len,
                                         cudaStream_t stream) {
  if (len == 0) {
    return;
  }
  const int grid_size = (len - 1) / BLOCK_SIZE_ + 1;
  insert_kernel<<<grid_size, BLOCK_SIZE_, 0, stream>>>(container_, d_keys,
                                                       d_vals, len);
}

template <typename KeyType, typename ValType>
template <typename GradType, typename Sgd>
void HashTable<KeyType, ValType>::update(const KeyType* d_keys,
                                         const GradType* d_grads, size_t len,
                                         Sgd sgd, cudaStream_t stream) {
  if (len == 0) {
    return;
  }
  const int grid_size = (len - 1) / BLOCK_SIZE_ + 1;
  update_kernel<<<grid_size, BLOCK_SIZE_, 0, stream>>>(container_, d_keys,
                                                       d_grads, len, sgd);
}

}  // end namespace framework
}  // end namespace paddle
#endif
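A hedged sketch of the device-side API (not from the commit; assumes a .cu translation unit built with PADDLE_WITH_PSLIB): the table is driven with device pointers and an explicit stream, and insert/get launch one thread per key. The helper is hypothetical.

#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/fleet/heter_ps/hashtable.h"

// Hypothetical helper: build a table from keys/values already resident on the GPU.
void build_small_table(const uint64_t* d_keys,
                       const paddle::framework::FeatureValue* d_vals,
                       size_t len, cudaStream_t stream) {
  // capacity is the slot count of the underlying concurrent_unordered_map
  paddle::framework::HashTable<uint64_t, paddle::framework::FeatureValue> table(2 * len);
  table.insert(d_keys, d_vals, len, stream);  // async insert on `stream`
  cudaStreamSynchronize(stream);
  table.show();                               // debug print of the container
}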
paddle/fluid/framework/fleet/heter_ps/heter_comm.h (new file, mode 100644)

/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "cub/cub.cuh"
#include "hashtable.h"
#include "heter_resource.h"
#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh"
#include "paddle/fluid/memory/memory.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/place.h"
#include "thrust/pair.h"

#ifdef PADDLE_WITH_PSLIB

namespace paddle {
namespace framework {

struct CustomGradMerger {
  template <typename T>
  CUB_RUNTIME_FUNCTION __forceinline__ __device__ T
  operator()(const T& a, const T& b) const {
    T out;
    out.slot = a.slot;
    out.show = a.show + b.show;
    out.clk = a.clk + b.clk;
    out.lr_g = a.lr_g + b.lr_g;
    for (int i = 0; i < MF_DIM; ++i) {
      out.mf_g[i] = a.mf_g[i] + b.mf_g[i];
    }
    return out;
  }
};

template <typename KeyType, typename ValType, typename GradType>
class HeterComm {
 public:
  HeterComm(size_t capacity, std::shared_ptr<HeterPsResource> resource);
  virtual ~HeterComm();
  HeterComm(const HeterComm&) = delete;
  HeterComm& operator=(const HeterComm&) = delete;

  void split_input_to_shard(KeyType* d_keys, int* d_idx_ptr, size_t len,
                            int* left, int* right, int gpu_num);
  void merge_grad(int gpu_num, KeyType* d_keys, GradType* d_grads, size_t len,
                  int& uniq_len);
  void pull_sparse(int num, KeyType* d_keys, ValType* d_vals, size_t len);
  void build_ps(int num, KeyType* h_keys, ValType* h_vals, size_t len,
                size_t chunk_size, int stream_num);
  void dump();
  void show_one_table(int gpu_num);
  int get_index_by_devid(int devid);

  template <typename Sgd>
  void push_sparse(int num, KeyType* d_keys, GradType* d_grads, size_t len,
                   Sgd& sgd);

  int log2i(int x);

 private:
  using Table = HashTable<KeyType, ValType>;
  int block_size_{256};
  float load_factor_{0.75};
  std::vector<Table*> tables_;
  std::shared_ptr<HeterPsResource> resource_;
  CustomGradMerger merger_;
};

}  // end namespace framework
}  // end namespace paddle
#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.tpp"
#endif
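A hedged sketch (not part of the diff; HeterPsResource comes from heter_resource.h, whose diff is not shown in this excerpt): HeterComm keeps one HashTable per GPU and is fed host key/value arrays in chunks over several streams via build_ps. The helper below is hypothetical.

#include <memory>
#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h"

using paddle::framework::FeaturePushValue;
using paddle::framework::FeatureValue;
using paddle::framework::HeterComm;
using paddle::framework::HeterPsResource;

// Hypothetical helper: build the sharded tables on GPU 0 from host data.
void build_comm(std::shared_ptr<HeterPsResource> resource,
                uint64_t* h_keys, FeatureValue* h_vals, size_t len) {
  HeterComm<uint64_t, FeatureValue, FeaturePushValue> comm(len, resource);
  comm.build_ps(0, h_keys, h_vals, len, /*chunk_size=*/500000, /*stream_num=*/4);
  comm.show_one_table(0);  // dump GPU 0's table for inspection
}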
paddle/fluid/framework/fleet/heter_ps/heter_comm.tpp (new file, mode 100644)

/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_WITH_PSLIB
namespace paddle {
namespace framework {

template <typename T>
__global__ void fill_idx(T* idx, size_t len) {
  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len) {
    idx[i] = i;
  }
}

template <typename T>
void show_tensor(T* input, size_t len, cudaStream_t stream, std::string name) {
  T tmp[len];
  cudaMemcpyAsync(&tmp, input, sizeof(T) * len, cudaMemcpyDeviceToHost, stream);
  cudaStreamSynchronize(stream);
  std::cout << name;
  for (int i = 0; i < len; ++i) {
    std::cout << ":" << tmp[i];
  }
  std::cout << std::endl;
}

template <typename T>
__global__ void calc_shard_offset(T* idx, T* left, T* right, size_t len) {
  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len - 1) {
    if (idx[i] != idx[i + 1]) {
      right[idx[i]] = i;
      left[idx[i + 1]] = i + 1;
    }
  }
  if (i == 0) {
    left[idx[i]] = i;
  }
  if (i == (len - 1)) {
    right[idx[i]] = i;
  }
}

template <typename KeyType, typename T>
__global__ void calc_shard_index(KeyType* d_keys, size_t len, T* shard_index,
                                 int total_gpu) {
  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len) {
    shard_index[i] = d_keys[i] % total_gpu;
  }
}

template <typename KeyType, typename T>
__global__ void fill_shard_key(KeyType* d_shard_keys, KeyType* d_keys, T* idx,
                               size_t len) {
  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len) {
    d_shard_keys[i] = d_keys[idx[i]];
  }
}

template <typename KeyType, typename GradType, typename T>
__global__ void fill_shard_grads(KeyType* d_shard_keys, KeyType* d_keys,
                                 GradType* d_shard_grads, GradType* d_grads,
                                 T* idx, size_t len) {
  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len) {
    d_shard_keys[i] = d_keys[idx[i]];
    d_shard_grads[i] = d_grads[idx[i]];
  }
}

template <typename ValType, typename T>
__global__ void fill_dvals(ValType* d_shard_vals, ValType* d_vals, T* idx,
                           size_t len) {
  const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i < len) {
    d_vals[idx[i]] = d_shard_vals[i];
  }
}

template <typename KeyType, typename ValType, typename GradType>
HeterComm<KeyType, ValType, GradType>::HeterComm(
    size_t capacity, std::shared_ptr<HeterPsResource> resource) {
  resource_ = resource;
  for (int i = 0; i < resource_->total_gpu(); ++i) {
    platform::CUDADeviceGuard guard(resource_->dev_id(i));
    auto table = new Table(capacity / load_factor_);
    tables_.push_back(table);
  }
}

template <typename KeyType, typename ValType, typename GradType>
HeterComm<KeyType, ValType, GradType>::~HeterComm() {
  for (auto& table : tables_) {
    delete table;
    table = nullptr;
  }
}

template <typename KeyType, typename ValType, typename GradType>
void HeterComm<KeyType, ValType, GradType>::show_one_table(int gpu_num) {
  tables_[gpu_num]->show();
}

template <typename KeyType, typename ValType, typename GradType>
int HeterComm<KeyType, ValType, GradType>::log2i(int x) {
  unsigned res = 0;
  while (x >>= 1) {
    ++res;
  }
  return res;
}

template <typename KeyType, typename ValType, typename GradType>
int HeterComm<KeyType, ValType, GradType>::get_index_by_devid(int devid) {
  return resource_->get_index_by_devid(devid);
}

template <typename KeyType, typename ValType, typename GradType>
void HeterComm<KeyType, ValType, GradType>::build_ps(int num, KeyType* h_keys,
                                                     ValType* h_vals,
                                                     size_t len,
                                                     size_t chunk_size,
                                                     int stream_num) {
  if (len <= 0) {
    return;
  }
  int dev_id = resource_->dev_id(num);
  platform::CUDAPlace place = platform::CUDAPlace(dev_id);
  platform::CUDADeviceGuard guard(dev_id);

  std::vector<std::shared_ptr<memory::Allocation>> d_key_bufs;
  std::vector<std::shared_ptr<memory::Allocation>> d_val_bufs;

  cudaStream_t streams[stream_num];
  for (int i = 0; i < stream_num; ++i) {
    PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&(streams[i])));
    auto d_k_buf = memory::AllocShared(place, chunk_size * sizeof(KeyType));
    auto d_v_buf = memory::AllocShared(place, chunk_size * sizeof(ValType));
    d_key_bufs.push_back(d_k_buf);
    d_val_bufs.push_back(d_v_buf);
  }

  int cur_len = 0;
  int cur_stream = 0;

  while (cur_len < len) {
    cur_stream = cur_stream % stream_num;
    int tmp_len = cur_len + chunk_size > len ? len - cur_len : chunk_size;
    PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaMemcpyAsync(d_key_bufs[cur_stream]->ptr(), h_keys + cur_len,
                        sizeof(KeyType) * tmp_len, cudaMemcpyHostToDevice,
                        streams[cur_stream]));
    PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaMemcpyAsync(d_val_bufs[cur_stream]->ptr(), h_vals + cur_len,
                        sizeof(ValType) * tmp_len, cudaMemcpyHostToDevice,
                        streams[cur_stream]));
    tables_[num]->insert(
        reinterpret_cast<KeyType*>(d_key_bufs[cur_stream]->ptr()),
        reinterpret_cast<ValType*>(d_val_bufs[cur_stream]->ptr()), tmp_len,
        streams[cur_stream]);
    cur_stream += 1;
    cur_len += tmp_len;
  }

  for (int i = 0; i < stream_num; ++i) {
    cudaStreamSynchronize(streams[i]);
    PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(streams[i]));
  }
}

template <typename KeyType, typename ValType, typename GradType>
void HeterComm<KeyType, ValType, GradType>::merge_grad(int gpu_num,
                                                       KeyType* d_keys,
                                                       GradType* d_grads,
                                                       size_t len,
                                                       int& uniq_len) {
  int dev_id = resource_->dev_id(gpu_num);
  platform::CUDAPlace place = platform::CUDAPlace(dev_id);
  platform::CUDADeviceGuard guard(dev_id);
  auto stream = resource_->stream(gpu_num);

  size_t temp_storage_bytes;

  auto d_merge_keys = memory::AllocShared(place, len * sizeof(KeyType));
  KeyType* d_merge_keys_ptr = reinterpret_cast<KeyType*>(d_merge_keys->ptr());

  auto d_merge_grads = memory::AllocShared(place, len * sizeof(GradType));
  GradType* d_merge_grads_ptr =
      reinterpret_cast<GradType*>(d_merge_grads->ptr());

  PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceRadixSort::SortPairs(
      NULL, temp_storage_bytes, d_keys, d_merge_keys_ptr, d_grads,
      d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false));

  void* d_buff = NULL;
  auto d_temp_storage = memory::AllocShared(place, temp_storage_bytes);

  PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceRadixSort::SortPairs(
      d_temp_storage->ptr(), temp_storage_bytes, d_keys, d_merge_keys_ptr,
      d_grads, d_merge_grads_ptr, len, 0, 8 * sizeof(KeyType), stream, false));
  temp_storage_bytes = 0;

  auto d_num_runs_out_mem = memory::AllocShared(place, sizeof(int));
  int* d_num_runs_out = reinterpret_cast<int*>(d_num_runs_out_mem->ptr());

  PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceReduce::ReduceByKey(
      NULL, temp_storage_bytes, d_merge_keys_ptr, d_keys, d_merge_grads_ptr,
      d_grads, d_num_runs_out, merger_, len, stream, false));

  if (d_temp_storage->size() < temp_storage_bytes) {
    d_temp_storage = NULL;
    d_temp_storage = memory::AllocShared(place, temp_storage_bytes);
  }

  PADDLE_ENFORCE_CUDA_SUCCESS(cub::DeviceReduce::ReduceByKey(
      d_temp_storage->ptr(), temp_storage_bytes, d_merge_keys_ptr, d_keys,
      d_merge_grads_ptr, d_grads, d_num_runs_out, merger_, len, stream, false));

  cudaMemcpyAsync(&uniq_len, d_num_runs_out, sizeof(int),
                  cudaMemcpyDeviceToHost, stream);
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamSynchronize(stream));
}

template <typename KeyType, typename ValType, typename GradType>
void HeterComm<KeyType, ValType, GradType>::split_input_to_shard(
    KeyType* d_keys, int* d_idx_ptr, size_t len, int* left, int* right,
    int gpu_num) {
  int total_gpu = resource_->total_gpu();
  int dev_id = resource_->dev_id(gpu_num);
  platform::CUDAPlace place = platform::CUDAPlace(dev_id);
  platform::CUDADeviceGuard guard(dev_id);
  auto stream = resource_->stream(gpu_num);

  auto d_idx_tmp = memory::AllocShared(place, len * sizeof(int));
  int* d_idx_tmp_ptr = reinterpret_cast<int*>(d_idx_tmp->ptr());

  auto d_shard_index = memory::AllocShared(place, len * sizeof(int));
  int* d_shard_index_ptr = reinterpret_cast<int*>(d_shard_index->ptr());

  auto
d_shard_index_tmp
=
memory
::
AllocShared
(
place
,
len
*
sizeof
(
int
));
int
*
d_shard_index_tmp_ptr
=
reinterpret_cast
<
int
*>
(
d_shard_index_tmp
->
ptr
());
int
grid_size
=
(
len
-
1
)
/
block_size_
+
1
;
fill_idx
<<<
grid_size
,
block_size_
,
0
,
stream
>>>
(
d_idx_tmp_ptr
,
len
);
calc_shard_index
<<<
grid_size
,
block_size_
,
0
,
stream
>>>
(
d_keys
,
len
,
d_shard_index_tmp_ptr
,
total_gpu
);
size_t
temp_storage_bytes
;
const
int
num_bits
=
1
+
log2i
(
total_gpu
);
PADDLE_ENFORCE_CUDA_SUCCESS
(
cub
::
DeviceRadixSort
::
SortPairs
(
NULL
,
temp_storage_bytes
,
d_shard_index_tmp_ptr
,
d_shard_index_ptr
,
d_idx_tmp_ptr
,
d_idx_ptr
,
len
,
0
,
num_bits
,
stream
));
auto
d_temp_storage
=
memory
::
AllocShared
(
place
,
temp_storage_bytes
);
PADDLE_ENFORCE_CUDA_SUCCESS
(
cub
::
DeviceRadixSort
::
SortPairs
(
d_temp_storage
->
ptr
(),
temp_storage_bytes
,
d_shard_index_tmp_ptr
,
d_shard_index_ptr
,
d_idx_tmp_ptr
,
d_idx_ptr
,
len
,
0
,
num_bits
,
stream
));
calc_shard_offset
<<<
grid_size
,
block_size_
,
0
,
stream
>>>
(
d_shard_index_ptr
,
left
,
right
,
len
);
cudaStreamSynchronize
(
stream
);
}
template
<
typename
KeyType
,
typename
ValType
,
typename
GradType
>
void
HeterComm
<
KeyType
,
ValType
,
GradType
>::
pull_sparse
(
int
num
,
KeyType
*
d_keys
,
ValType
*
d_vals
,
size_t
len
)
{
if
(
len
==
0
)
{
return
;
}
int
total_gpu
=
resource_
->
total_gpu
();
int
dev_id
=
resource_
->
dev_id
(
num
);
platform
::
CUDAPlace
place
=
platform
::
CUDAPlace
(
dev_id
);
platform
::
CUDADeviceGuard
guard
(
dev_id
);
auto
stream
=
resource_
->
stream
(
num
);
int
grid_size
=
(
len
-
1
)
/
block_size_
+
1
;
int
h_left
[
total_gpu
];
int
h_right
[
total_gpu
];
auto
d_left
=
memory
::
AllocShared
(
place
,
total_gpu
*
sizeof
(
int
));
auto
d_right
=
memory
::
AllocShared
(
place
,
total_gpu
*
sizeof
(
int
));
int
*
d_left_ptr
=
reinterpret_cast
<
int
*>
(
d_left
->
ptr
());
int
*
d_right_ptr
=
reinterpret_cast
<
int
*>
(
d_right
->
ptr
());
cudaMemset
(
d_left_ptr
,
-
1
,
total_gpu
*
sizeof
(
int
));
cudaMemset
(
d_right_ptr
,
-
1
,
total_gpu
*
sizeof
(
int
));
//
auto
d_idx
=
memory
::
AllocShared
(
place
,
len
*
sizeof
(
int
));
int
*
d_idx_ptr
=
reinterpret_cast
<
int
*>
(
d_idx
->
ptr
());
auto
d_shard_keys
=
memory
::
AllocShared
(
place
,
len
*
sizeof
(
KeyType
));
KeyType
*
d_shard_keys_ptr
=
reinterpret_cast
<
KeyType
*>
(
d_shard_keys
->
ptr
());
auto
d_shard_vals
=
memory
::
AllocShared
(
place
,
len
*
sizeof
(
ValType
));
ValType
*
d_shard_vals_ptr
=
reinterpret_cast
<
ValType
*>
(
d_shard_vals
->
ptr
());
split_input_to_shard
(
d_keys
,
d_idx_ptr
,
len
,
d_left_ptr
,
d_right_ptr
,
num
);
fill_shard_key
<<<
grid_size
,
block_size_
,
0
,
stream
>>>
(
d_shard_keys_ptr
,
d_keys
,
d_idx_ptr
,
len
);
cudaStreamSynchronize
(
stream
);
cudaMemcpy
(
h_left
,
d_left_ptr
,
total_gpu
*
sizeof
(
int
),
cudaMemcpyDeviceToHost
);
cudaMemcpy
(
h_right
,
d_right_ptr
,
total_gpu
*
sizeof
(
int
),
cudaMemcpyDeviceToHost
);
std
::
vector
<
KeyType
*>
d_remote_shard_keys_ptr
;
std
::
vector
<
ValType
*>
d_remote_shard_vals_ptr
;
std
::
vector
<
std
::
shared_ptr
<
memory
::
Allocation
>>
d_remote_shard_keys
;
std
::
vector
<
std
::
shared_ptr
<
memory
::
Allocation
>>
d_remote_shard_vals
;
for
(
int
i
=
0
;
i
<
total_gpu
;
++
i
)
{
int
shard_len
=
h_right
[
i
]
-
h_left
[
i
]
+
1
;
if
(
shard_len
==
0
)
{
continue
;
}
platform
::
CUDADeviceGuard
guard
(
resource_
->
dev_id
(
i
));
platform
::
CUDAPlace
remote_place
=
platform
::
CUDAPlace
(
resource_
->
dev_id
(
i
));
d_remote_shard_keys
.
push_back
(
memory
::
AllocShared
(
remote_place
,
shard_len
*
sizeof
(
KeyType
)));
d_remote_shard_keys_ptr
.
push_back
(
reinterpret_cast
<
KeyType
*>
(
d_remote_shard_keys
[
i
]
->
ptr
()));
d_remote_shard_vals
.
push_back
(
memory
::
AllocShared
(
remote_place
,
shard_len
*
sizeof
(
ValType
)));
d_remote_shard_vals_ptr
.
push_back
(
reinterpret_cast
<
ValType
*>
(
d_remote_shard_vals
[
i
]
->
ptr
()));
}
for
(
int
i
=
0
;
i
<
total_gpu
;
++
i
)
{
int
shard_len
=
h_right
[
i
]
-
h_left
[
i
]
+
1
;
if
(
h_left
[
i
]
==
-
1
||
h_right
[
i
]
==
-
1
)
{
continue
;
}
cudaMemcpyAsync
(
d_remote_shard_keys_ptr
[
i
],
d_shard_keys_ptr
+
h_left
[
i
],
shard_len
*
sizeof
(
KeyType
),
cudaMemcpyDefault
,
stream
);
}
cudaStreamSynchronize
(
stream
);
for
(
int
i
=
0
;
i
<
total_gpu
;
++
i
)
{
if
(
h_left
[
i
]
==
-
1
)
{
continue
;
}
platform
::
CUDADeviceGuard
guard
(
resource_
->
dev_id
(
i
));
tables_
[
i
]
->
get
(
d_remote_shard_keys_ptr
[
i
],
d_remote_shard_vals_ptr
[
i
],
h_right
[
i
]
-
h_left
[
i
]
+
1
,
resource_
->
stream
(
i
));
}
for
(
int
i
=
0
;
i
<
total_gpu
;
++
i
)
{
cudaStreamSynchronize
(
resource_
->
stream
(
i
));
}
for
(
int
i
=
0
;
i
<
total_gpu
;
++
i
)
{
int
shard_len
=
h_right
[
i
]
-
h_left
[
i
]
+
1
;
if
(
h_left
[
i
]
==
-
1
||
h_right
[
i
]
==
-
1
)
{
continue
;
}
platform
::
CUDADeviceGuard
guard
(
resource_
->
dev_id
(
i
));
cudaMemcpyAsync
(
d_shard_vals_ptr
+
h_left
[
i
],
d_remote_shard_vals_ptr
[
i
],
shard_len
*
sizeof
(
ValType
),
cudaMemcpyDefault
,
resource_
->
stream
(
i
));
}
for
(
int
i
=
0
;
i
<
total_gpu
;
++
i
)
{
cudaStreamSynchronize
(
resource_
->
stream
(
i
));
}
fill_dvals
<<<
grid_size
,
block_size_
,
0
,
stream
>>>
(
d_shard_vals_ptr
,
d_vals
,
d_idx_ptr
,
len
);
cudaStreamSynchronize
(
stream
);
}
template
<
typename
KeyType
,
typename
ValType
,
typename
GradType
>
template
<
typename
Sgd
>
void
HeterComm
<
KeyType
,
ValType
,
GradType
>::
push_sparse
(
int
gpu_num
,
KeyType
*
d_keys
,
GradType
*
d_grads
,
size_t
len
,
Sgd
&
sgd
)
{
if
(
len
==
0
)
{
return
;
}
int
total_gpu
=
resource_
->
total_gpu
();
int
dev_id
=
resource_
->
dev_id
(
gpu_num
);
platform
::
CUDAPlace
place
=
platform
::
CUDAPlace
(
dev_id
);
platform
::
CUDADeviceGuard
guard
(
dev_id
);
auto
stream
=
resource_
->
stream
(
gpu_num
);
int
h_left
[
total_gpu
];
int
h_right
[
total_gpu
];
auto
d_left
=
memory
::
AllocShared
(
place
,
total_gpu
*
sizeof
(
int
));
auto
d_right
=
memory
::
AllocShared
(
place
,
total_gpu
*
sizeof
(
int
));
int
*
d_left_ptr
=
reinterpret_cast
<
int
*>
(
d_left
->
ptr
());
int
*
d_right_ptr
=
reinterpret_cast
<
int
*>
(
d_right
->
ptr
());
cudaMemset
(
d_left_ptr
,
-
1
,
total_gpu
*
sizeof
(
int
));
cudaMemset
(
d_right_ptr
,
-
1
,
total_gpu
*
sizeof
(
int
));
//
auto
d_idx
=
memory
::
AllocShared
(
place
,
len
*
sizeof
(
int
));
int
*
d_idx_ptr
=
reinterpret_cast
<
int
*>
(
d_idx
->
ptr
());
auto
d_shard_keys
=
memory
::
AllocShared
(
place
,
len
*
sizeof
(
KeyType
));
KeyType
*
d_shard_keys_ptr
=
reinterpret_cast
<
KeyType
*>
(
d_shard_keys
->
ptr
());
auto
d_shard_grads
=
memory
::
AllocShared
(
place
,
len
*
sizeof
(
GradType
));
GradType
*
d_shard_grads_ptr
=
reinterpret_cast
<
GradType
*>
(
d_shard_grads
->
ptr
());
int
uniq_len
=
len
;
merge_grad
(
gpu_num
,
d_keys
,
d_grads
,
len
,
uniq_len
);
int
grid_size
=
(
uniq_len
-
1
)
/
block_size_
+
1
;
split_input_to_shard
(
d_keys
,
d_idx_ptr
,
uniq_len
,
d_left_ptr
,
d_right_ptr
,
gpu_num
);
fill_shard_grads
<<<
grid_size
,
block_size_
,
0
,
stream
>>>
(
d_shard_keys_ptr
,
d_keys
,
d_shard_grads_ptr
,
d_grads
,
d_idx_ptr
,
uniq_len
);
cudaStreamSynchronize
(
stream
);
cudaMemcpy
(
h_left
,
d_left_ptr
,
total_gpu
*
sizeof
(
int
),
cudaMemcpyDeviceToHost
);
cudaMemcpy
(
h_right
,
d_right_ptr
,
total_gpu
*
sizeof
(
int
),
cudaMemcpyDeviceToHost
);
std
::
vector
<
KeyType
*>
d_remote_shard_keys_ptr
;
std
::
vector
<
GradType
*>
d_remote_shard_grads_ptr
;
std
::
vector
<
std
::
shared_ptr
<
memory
::
Allocation
>>
d_remote_shard_keys
;
std
::
vector
<
std
::
shared_ptr
<
memory
::
Allocation
>>
d_remote_shard_grads
;
for
(
int
i
=
0
;
i
<
total_gpu
;
++
i
)
{
int
shard_len
=
h_right
[
i
]
-
h_left
[
i
]
+
1
;
if
(
h_left
[
i
]
==
-
1
||
h_right
[
i
]
==
-
1
)
{
continue
;
}
platform
::
CUDADeviceGuard
guard
(
resource_
->
dev_id
(
i
));
platform
::
CUDAPlace
remote_place
=
platform
::
CUDAPlace
(
resource_
->
dev_id
(
i
));
d_remote_shard_keys
.
push_back
(
memory
::
AllocShared
(
remote_place
,
shard_len
*
sizeof
(
KeyType
)));
d_remote_shard_keys_ptr
.
push_back
(
reinterpret_cast
<
KeyType
*>
(
d_remote_shard_keys
[
i
]
->
ptr
()));
d_remote_shard_grads
.
push_back
(
memory
::
AllocShared
(
remote_place
,
shard_len
*
sizeof
(
GradType
)));
d_remote_shard_grads_ptr
.
push_back
(
reinterpret_cast
<
GradType
*>
(
d_remote_shard_grads
[
i
]
->
ptr
()));
}
for
(
int
i
=
0
;
i
<
total_gpu
;
++
i
)
{
int
shard_len
=
h_right
[
i
]
-
h_left
[
i
]
+
1
;
if
(
h_left
[
i
]
==
-
1
||
h_right
[
i
]
==
-
1
)
{
continue
;
}
cudaMemcpyAsync
(
d_remote_shard_keys_ptr
[
i
],
d_shard_keys_ptr
+
h_left
[
i
],
shard_len
*
sizeof
(
KeyType
),
cudaMemcpyDefault
,
stream
);
cudaMemcpyAsync
(
d_remote_shard_grads_ptr
[
i
],
d_shard_grads_ptr
+
h_left
[
i
],
shard_len
*
sizeof
(
GradType
),
cudaMemcpyDefault
,
stream
);
}
cudaStreamSynchronize
(
stream
);
for
(
int
i
=
0
;
i
<
total_gpu
;
++
i
)
{
if
(
h_left
[
i
]
==
-
1
||
h_right
[
i
]
==
-
1
)
{
continue
;
}
platform
::
CUDADeviceGuard
guard
(
resource_
->
dev_id
(
i
));
tables_
[
i
]
->
update
(
d_remote_shard_keys_ptr
[
i
],
d_remote_shard_grads_ptr
[
i
],
h_right
[
i
]
-
h_left
[
i
]
+
1
,
sgd
,
resource_
->
stream
(
i
));
}
for
(
int
i
=
0
;
i
<
total_gpu
;
++
i
)
{
cudaStreamSynchronize
(
resource_
->
stream
(
i
));
}
}
}
// end namespace framework
}
// end namespace paddle
#endif
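The routing logic above (calc_shard_index, split_input_to_shard, calc_shard_offset) assigns each key to a GPU table by `key % total_gpu`, sorts the batch by that shard id, and records per-GPU [left, right] ranges (-1 for empty shards) that the later peer-to-peer copies use. The following is only a minimal host-side sketch of that partitioning for illustration; it is plain C++, not part of this commit, and the names are hypothetical.

#include <algorithm>
#include <cstdint>
#include <cstdio>
#include <numeric>
#include <vector>

int main() {
  const int total_gpu = 4;
  std::vector<uint64_t> keys = {17, 3, 42, 8, 21, 12, 7, 30};

  // Same rule as calc_shard_index: shard = key % total_gpu.
  std::vector<int> idx(keys.size());
  std::iota(idx.begin(), idx.end(), 0);
  std::stable_sort(idx.begin(), idx.end(), [&](int a, int b) {
    return keys[a] % total_gpu < keys[b] % total_gpu;
  });

  // left/right mirror what calc_shard_offset produces: the first and last
  // position of each shard in the sorted order (-1 when a shard is empty).
  std::vector<int> left(total_gpu, -1), right(total_gpu, -1);
  for (size_t i = 0; i < idx.size(); ++i) {
    int shard = keys[idx[i]] % total_gpu;
    if (left[shard] == -1) left[shard] = static_cast<int>(i);
    right[shard] = static_cast<int>(i);
  }

  for (int g = 0; g < total_gpu; ++g) {
    printf("gpu %d owns sorted positions [%d, %d]\n", g, left[g], right[g]);
  }
  return 0;
}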
paddle/fluid/framework/fleet/heter_ps/heter_ps.cu
0 → 100644
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <vector>
#include "paddle/fluid/framework/fleet/heter_ps/heter_ps.h"
#ifdef PADDLE_WITH_PSLIB
namespace paddle {
namespace framework {

HeterPsBase* HeterPsBase::get_instance(
    size_t capacity, std::shared_ptr<HeterPsResource> resource) {
  return new HeterPs(capacity, resource);
}

HeterPs::HeterPs(size_t capacity, std::shared_ptr<HeterPsResource> resource) {
  comm_ =
      std::make_shared<HeterComm<FeatureKey, FeatureValue, FeaturePushValue>>(
          capacity, resource);
  opt_ = Optimizer<FeatureValue, FeaturePushValue>();
}

HeterPs::~HeterPs() {}

void HeterPs::pull_sparse(int num, FeatureKey* d_keys, FeatureValue* d_vals,
                          size_t len) {
  comm_->pull_sparse(num, d_keys, d_vals, len);
}

void HeterPs::build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals,
                       size_t len, size_t chunk_size, int stream_num) {
  comm_->build_ps(num, h_keys, h_vals, len, chunk_size, stream_num);
}

int HeterPs::get_index_by_devid(int devid) {
  return comm_->get_index_by_devid(devid);
}

void HeterPs::dump() {}

void HeterPs::show_one_table(int gpu_num) { comm_->show_one_table(gpu_num); }

void HeterPs::push_sparse(int num, FeatureKey* d_keys,
                          FeaturePushValue* d_grads, size_t len) {
  comm_->push_sparse(num, d_keys, d_grads, len, opt_);
}

}  // end namespace framework
}  // end namespace paddle
#endif
paddle/fluid/framework/fleet/heter_ps/heter_ps.h
0 → 100644
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h"
#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh"
#ifdef PADDLE_WITH_PSLIB
namespace paddle {
namespace framework {

class HeterPs : public HeterPsBase {
 public:
  HeterPs() {}
  HeterPs(size_t capacity, std::shared_ptr<HeterPsResource> resource);
  virtual ~HeterPs();
  HeterPs(const HeterPs&) = delete;
  HeterPs& operator=(const HeterPs&) = delete;

  virtual void pull_sparse(int num, FeatureKey* d_keys, FeatureValue* d_vals,
                           size_t len) override;
  virtual void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals,
                        size_t len, size_t chunk_size,
                        int stream_num) override;
  virtual void dump() override;
  virtual int get_index_by_devid(int devid) override;
  virtual void show_one_table(int gpu_num) override;
  virtual void push_sparse(int num, FeatureKey* d_keys,
                           FeaturePushValue* d_grads, size_t len) override;

 private:
  std::shared_ptr<HeterComm<FeatureKey, FeatureValue, FeaturePushValue>> comm_;
  Optimizer<FeatureValue, FeaturePushValue> opt_;
};

}  // end namespace framework
}  // end namespace paddle
#endif
paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h
0 → 100644
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h"
#ifdef PADDLE_WITH_PSLIB
namespace paddle {
namespace framework {

class HeterPsBase {
 public:
  HeterPsBase(){};
  HeterPsBase(size_t capacity, std::shared_ptr<HeterPsResource> resource){};
  virtual ~HeterPsBase(){};
  HeterPsBase(const HeterPsBase&) = delete;
  HeterPsBase& operator=(const HeterPsBase&) = delete;

  virtual void pull_sparse(int num, FeatureKey* d_keys, FeatureValue* d_vals,
                           size_t len) = 0;
  virtual void build_ps(int num, FeatureKey* h_keys, FeatureValue* h_vals,
                        size_t len, size_t chunk_size, int stream_num) = 0;
  virtual int get_index_by_devid(int devid) = 0;
  virtual void dump() = 0;
  virtual void show_one_table(int gpu_num) = 0;
  virtual void push_sparse(int num, FeatureKey* d_keys,
                           FeaturePushValue* d_grads, size_t len) = 0;

  static HeterPsBase* get_instance(size_t capacity,
                                   std::shared_ptr<HeterPsResource> resource);
};

}  // end namespace framework
}  // end namespace paddle
#endif
paddle/fluid/framework/fleet/heter_ps/heter_resource.cc
0 → 100644
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_PSLIB
#include "heter_resource.h"
#include "paddle/fluid/platform/cuda_device_guard.h"
namespace paddle {
namespace framework {

GPUResource::GPUResource(int dev_id, int index) {
  index_ = index;
  dev_id_ = dev_id;

  platform::CUDADeviceGuard guard(dev_id_);

  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaStreamCreateWithFlags(&stream_, cudaStreamNonBlocking));
  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaStreamCreateWithFlags(&copy_stream_, cudaStreamNonBlocking));
}

GPUResource::~GPUResource() {
  platform::CUDADeviceGuard guard(dev_id_);
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(stream_));
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamDestroy(copy_stream_));
}

void HeterPsResource::enable_p2p() {
  for (size_t i = 0; i < dev_ids_.size(); ++i) {
    platform::CUDADeviceGuard guard(dev_ids_[i]);
    for (size_t j = 0; j < dev_ids_.size(); ++j) {
      if (i != j) {
        int p2p_flag;
        PADDLE_ENFORCE_CUDA_SUCCESS(
            cudaDeviceCanAccessPeer(&p2p_flag, dev_ids_[i], dev_ids_[j]));
        if (p2p_flag == 1) {
          cudaError_t ret = cudaDeviceEnablePeerAccess(dev_ids_[j], 0);
          if (ret != cudaSuccess && ret != cudaErrorPeerAccessAlreadyEnabled) {
            VLOG(0) << " Cuda error(" << ret << "), "
                    << cudaGetErrorString(ret) << ".";
          } else {
            cudaGetLastError();
          }
        }
      }
    }
  }
}

HeterPsResource::HeterPsResource(const std::vector<int>& dev_ids) {
  dev_ids_ = dev_ids;
  for (size_t i = 0; i < dev_ids_.size(); ++i) {
    std::shared_ptr<GPUResource> resource =
        std::make_shared<GPUResource>(dev_ids_[i], i);
    resources_.push_back(resource);
    devid_2_index_[dev_ids_[i]] = i;
  }
}

cudaStream_t HeterPsResource::copy_stream(int num) {
  return resources_[num]->copy_stream();
}

cudaStream_t HeterPsResource::stream(int num) {
  return resources_[num]->stream();
}

int HeterPsResource::dev_id(int num) { return dev_ids_[num]; }

int HeterPsResource::get_index_by_devid(int devid) {
  return devid_2_index_[devid];
}

int HeterPsResource::total_gpu() { return dev_ids_.size(); }

}  // end namespace framework
}  // end namespace paddle
#endif
paddle/fluid/framework/fleet/heter_ps/heter_resource.h
0 → 100644
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstddef>
#include <map>
#include <memory>
#include <vector>
#include "paddle/fluid/platform/cuda_device_guard.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_PSLIB
namespace paddle {
namespace framework {

class GPUResource {
 public:
  GPUResource(int device_id, int index);
  virtual ~GPUResource();
  GPUResource(const GPUResource&) = delete;
  GPUResource& operator=(const GPUResource&) = delete;

  int dev_id() const { return dev_id_; }
  int index() const { return index_; }
  cudaStream_t stream() { return stream_; }
  cudaStream_t copy_stream() { return copy_stream_; }

  int dev_id_;
  int index_;
  cudaStream_t stream_;
  cudaStream_t copy_stream_;
};

class HeterPsResource {
 public:
  HeterPsResource(const std::vector<int>& dev_ids);
  HeterPsResource(const HeterPsResource&) = delete;
  HeterPsResource& operator=(const HeterPsResource&) = delete;
  virtual ~HeterPsResource() {}

  void enable_p2p();
  int total_gpu();
  int get_index_by_devid(int devid);
  cudaStream_t stream(int num);
  cudaStream_t copy_stream(int num);
  int dev_id(int num);

  std::vector<std::shared_ptr<GPUResource>> resources_;
  std::vector<int> dev_ids_;
  std::map<int, int> devid_2_index_;
};

}  // end namespace framework
}  // end namespace paddle
#endif
paddle/fluid/framework/fleet/heter_ps/optimizer.cuh
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <vector>
#include "optimizer_conf.h"
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#ifdef PADDLE_WITH_PSLIB
namespace paddle {
namespace framework {

__device__ double cuda_double_random(unsigned long long seed) {
  // copy from MurmurHash3
  seed ^= seed >> 33;
  seed *= 0xff51afd7ed558ccd;
  seed ^= seed >> 33;
  seed *= 0xc4ceb9fe1a85ec53;
  seed ^= seed >> 33;
  return ((double)seed / 18446744073709551615.0);
}

__device__ float cuda_normal_random(unsigned long long idx) {
  static double pi = 3.1415926897932384;
  unsigned long long x = clock64() + idx;
  double x1, x2, res;
  while (1) {
    x1 = cuda_double_random(x);
    x2 = cuda_double_random(x + 33);
    res = sqrt(-2.0 * log(x1)) * cos(2.0 * pi * x2);
    if (-10 < res && res < 10) break;
    x += 207;
  }
  return res;
}

template <typename ValType, typename GradType>
class Optimizer {
 public:
  Optimizer() {}

  ~Optimizer() {}

  void initialize() {}

  __device__ void update_lr(float& w, float& g2sum, float g, float scale) {
    double add_g2sum = 0;
    double ratio = optimizer_config::learning_rate *
                   sqrt(optimizer_config::initial_g2sum /
                        (optimizer_config::initial_g2sum + g2sum));
    double scaled_grad = g / scale;

    w += scaled_grad * ratio;

    if (w < optimizer_config::min_bound) w = optimizer_config::min_bound;
    if (w > optimizer_config::max_bound) w = optimizer_config::max_bound;

    add_g2sum = scaled_grad * scaled_grad;

    g2sum += add_g2sum;
  }

  __device__ void update_mf(int n, float* w, float& g2sum, const float* g,
                            float scale) {
    double add_g2sum = 0;
    double ratio = optimizer_config::mf_learning_rate *
                   sqrt(optimizer_config::mf_initial_g2sum /
                        (optimizer_config::mf_initial_g2sum + g2sum));
    for (int i = 0; i < n; ++i) {
      double scaled_grad = g[i] / scale;

      w[i] += scaled_grad * ratio;

      if (w[i] < optimizer_config::mf_min_bound)
        w[i] = optimizer_config::mf_min_bound;
      if (w[i] > optimizer_config::mf_max_bound)
        w[i] = optimizer_config::mf_max_bound;
      add_g2sum = scaled_grad * scaled_grad;
    }

    g2sum += add_g2sum / n;
  }

  __device__ void update_value(ValType& val, const GradType& grad) {
    val.slot = grad.slot;
    val.show += grad.show;
    val.clk += grad.clk;

    update_lr(val.lr, val.lr_g2sum, grad.lr_g, 1.0);

    if (val.mf_size == 0) {
      if (optimizer_config::mf_create_thresholds <=
          optimizer_config::nonclk_coeff * (val.show - val.clk) +
              optimizer_config::clk_coeff * val.clk) {
        val.mf_size = MF_DIM + 1;
        val.mf[0] = 0;
        for (int i = 0; i < MF_DIM; ++i) {
          val.mf[i + 1] = (cuda_normal_random((int)grad.show) * 2 - 1) *
                          optimizer_config::mf_initial_range;
        }
      }
    } else {
      update_mf(MF_DIM, &val.mf[1], val.mf[0], grad.mf_g, 1.0);
    }
  }
};

}  // end namespace framework
}  // end namespace paddle
#endif
paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace optimizer_config {

__constant__ float mf_create_thresholds = 1;
__constant__ float nonclk_coeff = 1;
__constant__ float clk_coeff = 1;

__constant__ float min_bound = -10000;
__constant__ float max_bound = 10000;
__constant__ float learning_rate = 1;
__constant__ float initial_g2sum = 1;
__constant__ float initial_range = 1;

__constant__ float mf_learning_rate = 1;
__constant__ float mf_initial_g2sum = 1;
__constant__ float mf_initial_range = 1;
__constant__ float mf_min_bound = 1;
__constant__ float mf_max_bound = 1;
}  // namespace optimizer_config
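These optimizer hyper-parameters live in device __constant__ memory and are initialized to placeholder values (mostly 1). A hedged sketch of how such symbols could be overwritten from the host before launching kernels that read them, assuming the caller is compiled in a translation unit that sees the definitions above; the helper name is hypothetical and not part of this commit.

// Hypothetical host-side helper, shown only to illustrate __constant__ updates.
#include <cuda_runtime.h>
#include "paddle/fluid/framework/fleet/heter_ps/optimizer_conf.h"

void set_learning_rate(float lr) {
  // cudaMemcpyToSymbol copies host data into the device __constant__ symbol;
  // it must run before any kernel that reads optimizer_config::learning_rate.
  cudaMemcpyToSymbol(optimizer_config::learning_rate, &lr, sizeof(float));
}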
paddle/fluid/framework/fleet/heter_ps/test_comm.cu
0 → 100644
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_comm.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h"
#include "paddle/fluid/framework/fleet/heter_ps/optimizer.cuh"
#include "paddle/fluid/platform/cuda_device_guard.h"
using namespace paddle::framework;

TEST(TEST_FLEET, heter_comm) {
  int gpu_count = 3;
  std::vector<int> dev_ids;
  dev_ids.push_back(0);
  dev_ids.push_back(1);
  dev_ids.push_back(2);
  std::shared_ptr<HeterPsResource> resource =
      std::make_shared<HeterPsResource>(dev_ids);
  resource->enable_p2p();
  std::vector<size_t> count;
  std::vector<std::vector<FeatureKey>> keys;
  std::vector<std::vector<FeatureValue>> vals;
  count.resize(dev_ids.size(), 0);
  keys.resize(dev_ids.size());
  vals.resize(dev_ids.size());

  for (int i = 0; i < 10; i++) {
    FeatureKey key;
    FeatureValue val;
    int gpu_num = i % gpu_count;
    key = i;
    val.lr = i;
    val.lr_g2sum = val.mf_size = val.show = val.clk = val.slot = 0;
    keys[gpu_num].push_back(key);
    vals[gpu_num].push_back(val);
    count[gpu_num] += 1;
  }

  size_t size = 0;
  for (size_t i = 0; i < count.size(); ++i) {
    size = std::max(size, count[i]);
  }

  auto heter_comm =
      std::make_shared<HeterComm<FeatureKey, FeatureValue, FeaturePushValue>>(
          size, resource);
  for (int i = 0; i < gpu_count; ++i) {
    std::cout << "building table: " << i << std::endl;
    heter_comm->build_ps(i, keys[i].data(), vals[i].data(), count[i], 10, 1);
    heter_comm->show_one_table(i);
  }

  std::cout << "testing pull sparse:" << std::endl;
  paddle::platform::CUDADeviceGuard guard(0);
  FeatureKey* pull_keys;
  FeatureValue* pull_vals;
  cudaMallocManaged(&pull_keys, 5 * sizeof(FeatureKey));
  cudaMallocManaged(&pull_vals, 5 * sizeof(FeatureValue));

  pull_keys[0] = 2;
  pull_keys[1] = 3;
  pull_keys[2] = 9;
  pull_keys[3] = 1;
  pull_keys[4] = 6;

  heter_comm->pull_sparse(0, pull_keys, pull_vals, 5);
  for (int i = 0; i < 5; i++) {
    std::cout << pull_keys[i] << ": " << pull_vals[i] << std::endl;
  }
  cudaFree(pull_keys);
  cudaFree(pull_vals);

  std::cout << "testing push sparse:" << std::endl;
  Optimizer<FeatureValue, FeaturePushValue> opt;
  FeatureKey* push_keys;
  FeaturePushValue* push_vals;
  cudaMallocManaged(&push_keys, 5 * sizeof(FeatureKey));
  cudaMallocManaged(&push_vals, 5 * sizeof(FeaturePushValue));
  push_keys[0] = 2;
  push_keys[1] = 3;
  push_keys[2] = 9;
  push_keys[3] = 1;
  push_keys[4] = 3;

  for (int i = 0; i < 5; ++i) {
    push_vals[i].lr_g = push_keys[i] * 100;
    push_vals[i].slot = push_keys[i];
    push_vals[i].show = push_keys[i];
    push_vals[i].clk = push_keys[i];
  }

  heter_comm->push_sparse(0, push_keys, push_vals, 5, opt);

  for (int i = 0; i < gpu_count; ++i) {
    std::cout << "table " << i << ";" << std::endl;
    heter_comm->show_one_table(i);
  }

  cudaFree(push_keys);
  cudaFree(push_vals);
}
paddle/fluid/framework/fleet/ps_gpu_wrapper.cc
0 → 100644
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
/*
#include <algorithm>
#include <utility>
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
*/
#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
#include "paddle/fluid/platform/timer.h"
namespace paddle {
namespace framework {

std::shared_ptr<PSGPUWrapper> PSGPUWrapper::s_instance_ = NULL;
bool PSGPUWrapper::is_initialized_ = false;

void PSGPUWrapper::BuildGPUPS(uint64_t table_id, int feature_dim,
                              std::shared_ptr<HeterContext> gpu_task) {
  platform::Timer timeline;
  timeline.Start();
  int shard_num = gpu_task->feature_keys_.size();
  if (shard_num == 0) {
    return;
  }

  std::vector<size_t> feature_keys_count(shard_num);
  size_t size_max = 0;
  for (int i = 0; i < shard_num; i++) {
    feature_keys_count[i] = gpu_task->feature_keys_[i].size();
    size_max = std::max(size_max, feature_keys_count[i]);
  }
  if (HeterPs_) {
    HeterPs_->show_one_table(0);
    return;
  }
  HeterPs_ = HeterPsBase::get_instance(size_max, resource_);
  for (int i = 0; i < shard_num; ++i) {
    std::cout << "building table: " << i << std::endl;
    HeterPs_->build_ps(i, gpu_task->feature_keys_[i].data(),
                       gpu_task->feature_values_[i].data(),
                       feature_keys_count[i], 10000, 2);
    HeterPs_->show_one_table(i);
  }
  timeline.Pause();
  VLOG(0) << "GpuPs build table total costs: " << timeline.ElapsedSec()
          << " s.";
}

void PSGPUWrapper::PullSparse(const paddle::platform::Place& place,
                              const int table_id,
                              const std::vector<const uint64_t*>& keys,
                              const std::vector<float*>& values,
                              const std::vector<int64_t>& slot_lengths,
                              const int hidden_size) {
  VLOG(3) << "Begine Gpu Ps PullSparse";
  platform::Timer all_timer;
  platform::Timer pull_gpups_timer;
  all_timer.Start();
  int64_t total_length =
      std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
  auto buf = memory::AllocShared(place, total_length * sizeof(FeatureValue));
  FeatureValue* total_values_gpu = reinterpret_cast<FeatureValue*>(buf->ptr());
  if (platform::is_cpu_place(place)) {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Warning:: CPUPlace is not supported in GpuPs now."));
  } else if (platform::is_gpu_place(place)) {
    VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
    int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId();
    int devid_2_index = HeterPs_->get_index_by_devid(device_id);
    LoDTensor& total_keys_tensor = keys_tensor[devid_2_index];
    uint64_t* total_keys = reinterpret_cast<uint64_t*>(
        total_keys_tensor.mutable_data<int64_t>({total_length, 1}, place));

    // construct slot_level lod info
    auto slot_lengths_lod = slot_lengths;
    for (size_t i = 1; i < slot_lengths_lod.size(); i++) {
      slot_lengths_lod[i] += slot_lengths_lod[i - 1];
    }
    auto buf_key = memory::AllocShared(place, keys.size() * sizeof(uint64_t*));
    auto buf_length =
        memory::AllocShared(place, slot_lengths.size() * sizeof(int64_t));
    uint64_t** gpu_keys = reinterpret_cast<uint64_t**>(buf_key->ptr());
    int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
    cudaMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*),
               cudaMemcpyHostToDevice);
    cudaMemcpy(gpu_len, slot_lengths_lod.data(),
               slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice);

    this->CopyKeys(place, gpu_keys, total_keys, gpu_len,
                   static_cast<int>(slot_lengths.size()),
                   static_cast<int>(total_length));
    VLOG(3) << "Begin call PullSparseGPU in GPUPS, dev: " << devid_2_index
            << " len: " << total_length;
    pull_gpups_timer.Start();
    HeterPs_->pull_sparse(devid_2_index, total_keys, total_values_gpu,
                          static_cast<int>(total_length));
    // PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
    //                              "PullSparseGPU failed in GPUPS."));
    pull_gpups_timer.Pause();

    VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length
            << "]";
    this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len,
                      static_cast<int>(slot_lengths.size()), hidden_size,
                      total_length);
  } else {
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "GpuPs: PullSparse Only Support CUDAPlace Now."));
  }
  all_timer.Pause();
  VLOG(1) << "GpuPs PullSparse total costs: " << all_timer.ElapsedSec()
          << " s, of which GPUPS costs: " << pull_gpups_timer.ElapsedSec()
          << " s";
  VLOG(3) << "End PullSparse";
}

void PSGPUWrapper::PushSparseGrad(const paddle::platform::Place& place,
                                  const int table_id,
                                  const std::vector<const uint64_t*>& keys,
                                  const std::vector<const float*>& grad_values,
                                  const std::vector<int64_t>& slot_lengths,
                                  const int hidden_size,
                                  const int batch_size) {
  VLOG(3) << "Begin GPUPS PushSparseGrad";
  platform::Timer all_timer;
  platform::Timer push_gpups_timer;
  all_timer.Start();
  int64_t total_length =
      std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
  auto buf =
      memory::AllocShared(place, total_length * sizeof(FeaturePushValue));
  FeaturePushValue* total_grad_values_gpu =
      reinterpret_cast<FeaturePushValue*>(buf->ptr());
  if (platform::is_cpu_place(place)) {
    PADDLE_THROW(platform::errors::Unimplemented(
        "Warning:: CPUPlace is not supported in GPUPS now."));
  } else if (platform::is_gpu_place(place)) {
    int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId();
    int devid_2_index = HeterPs_->get_index_by_devid(device_id);
    LoDTensor& cached_total_keys_tensor = keys_tensor[devid_2_index];
    uint64_t* total_keys =
        reinterpret_cast<uint64_t*>(cached_total_keys_tensor.data<int64_t>());
    VLOG(3) << "Begin copy grad tensor to gpups struct";
    this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths,
                      hidden_size, total_length, batch_size);

    VLOG(3) << "Begin call PushSparseGPU in GPUPS, dev: " << devid_2_index
            << " len: " << total_length;
    push_gpups_timer.Start();
    HeterPs_->push_sparse(devid_2_index, total_keys, total_grad_values_gpu,
                          static_cast<int>(total_length));
    push_gpups_timer.Pause();
  } else {
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "GPUPS: PushSparseGrad Only Support CUDAPlace Now."));
  }
  all_timer.Pause();
  VLOG(1) << "PushSparseGrad total cost: " << all_timer.ElapsedSec()
          << " s, of which GPUPS cost: " << push_gpups_timer.ElapsedSec()
          << " s";
  VLOG(3) << "End PushSparseGrad";
}

}  // end namespace framework
}  // end namespace paddle
#endif
paddle/fluid/framework/fleet/ps_gpu_wrapper.cu
0 → 100644
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_PSLIB
#include <algorithm>
#include <ctime>
#include <memory>
#include <numeric>
#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/platform/gpu_info.h"
namespace paddle {
namespace framework {

__global__ void PullCopy(float** dest, const FeatureValue* src,
                         const int64_t* len, int hidden, int slot_num,
                         int total_len, uint64_t** keys) {
  CUDA_KERNEL_LOOP(i, total_len) {
    int low = 0;
    int high = slot_num - 1;
    while (low < high) {
      int mid = (low + high) / 2;
      if (i < len[mid])
        high = mid;
      else
        low = mid + 1;
    }
    int x = low;
    int y = i - (x ? len[x - 1] : 0);
    if (*(keys[x] + y) == 0) {
      *(dest[x] + y * hidden) = 0;
      *(dest[x] + y * hidden + 1) = 0;
      *(dest[x] + y * hidden + 2) = 0;
    } else {
      *(dest[x] + y * hidden) = (src + i)->show;
      *(dest[x] + y * hidden + 1) = (src + i)->clk;
      *(dest[x] + y * hidden + 2) = (src + i)->lr;
    }
    if ((src + i)->mf_size == 0 || *(keys[x] + y) == 0) {
      for (int j = 0; j < 8; j++) {
        *(dest[x] + y * hidden + 3 + j) = 0;
      }
    } else {
      for (int j = 0; j < 8; j++) {
        *(dest[x] + y * hidden + 3 + j) = (src + i)->mf[1 + j];
      }
    }
  }
}

__global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys,
                               const int64_t* len, int slot_num,
                               int total_len) {
  CUDA_KERNEL_LOOP(i, total_len) {
    int low = 0;
    int high = slot_num - 1;
    while (low < high) {
      int mid = (low + high) / 2;
      if (i < len[mid])
        high = mid;
      else
        low = mid + 1;
    }
    int x = low;
    int y = i - (x ? len[x - 1] : 0);
    dest_total_keys[i] = src_keys[x][y];
  }
}

__global__ void PushCopy(FeaturePushValue* dest, float** src, int64_t* len,
                         int hidden, int slot_num, int total_len, int bs,
                         int* slot_vector) {
  CUDA_KERNEL_LOOP(i, total_len) {
    int low = 0;
    int high = slot_num - 1;
    while (low < high) {
      int mid = (low + high) / 2;
      if (i < len[mid])
        high = mid;
      else
        low = mid + 1;
    }
    int x = low;
    int y = i - (x ? len[low - 1] : 0);
    (dest + i)->slot = slot_vector[x];
    (dest + i)->show = *(src[x] + y * hidden);
    (dest + i)->clk = *(src[x] + y * hidden + 1);
    (dest + i)->lr_g = *(src[x] + y * hidden + 2) * -1. * bs;
    for (int j = 0; j < 8; j++) {
      (dest + i)->mf_g[j] = *(src[x] + y * hidden + 3 + j) * -1. * bs;
    }
  }
}

void PSGPUWrapper::CopyForPull(const paddle::platform::Place& place,
                               uint64_t** gpu_keys,
                               const std::vector<float*>& values,
                               const FeatureValue* total_values_gpu,
                               const int64_t* gpu_len, const int slot_num,
                               const int hidden_size,
                               const int64_t total_length) {
  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
                    platform::DeviceContextPool::Instance().Get(
                        BOOST_GET_CONST(platform::CUDAPlace, place)))
                    ->stream();
  auto buf_value = memory::AllocShared(place, values.size() * sizeof(float*));
  float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
  cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*),
             cudaMemcpyHostToDevice);

  PullCopy<<<(total_length + 512 - 1) / 512, 512, 0, stream>>>(
      gpu_values, total_values_gpu, gpu_len, hidden_size, slot_num,
      total_length, gpu_keys);
  cudaStreamSynchronize(stream);
}

void PSGPUWrapper::CopyKeys(const paddle::platform::Place& place,
                            uint64_t** origin_keys, uint64_t* total_keys,
                            const int64_t* gpu_len, int slot_num,
                            int total_len) {
  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
                    platform::DeviceContextPool::Instance().Get(
                        BOOST_GET_CONST(platform::CUDAPlace, place)))
                    ->stream();
  CopyKeysKernel<<<(total_len + 512 - 1) / 512, 512, 0, stream>>>(
      origin_keys, total_keys, gpu_len, slot_num, total_len);
  cudaStreamSynchronize(stream);
}

void PSGPUWrapper::CopyForPush(const paddle::platform::Place& place,
                               const std::vector<const float*>& grad_values,
                               FeaturePushValue* total_grad_values_gpu,
                               const std::vector<int64_t>& slot_lengths,
                               const int hidden_size,
                               const int64_t total_length,
                               const int batch_size) {
  auto stream = dynamic_cast<platform::CUDADeviceContext*>(
                    platform::DeviceContextPool::Instance().Get(
                        BOOST_GET_CONST(platform::CUDAPlace, place)))
                    ->stream();
  auto slot_lengths_lod = slot_lengths;
  for (int i = 1; i < slot_lengths_lod.size(); i++) {
    slot_lengths_lod[i] += slot_lengths_lod[i - 1];
  }
  auto buf_grad_value =
      memory::AllocShared(place, grad_values.size() * sizeof(float*));
  auto buf_length =
      memory::AllocShared(place, slot_lengths.size() * sizeof(int64_t));
  auto buf_slot_vector =
      memory::AllocShared(place, slot_lengths_lod.size() * sizeof(int));

  float** gpu_values = reinterpret_cast<float**>(buf_grad_value->ptr());
  int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
  int* d_slot_vector = reinterpret_cast<int*>(buf_slot_vector->ptr());

  cudaMemcpy(gpu_values, grad_values.data(),
             grad_values.size() * sizeof(float*), cudaMemcpyHostToDevice);
  cudaMemcpy(gpu_len, slot_lengths_lod.data(),
             slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice);
  cudaMemcpy(d_slot_vector, slot_vector_.data(),
             slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice);

  PushCopy<<<(total_length + 512 - 1) / 512, 512, 0, stream>>>(
      total_grad_values_gpu, gpu_values, gpu_len, hidden_size,
      slot_lengths.size(), total_length, batch_size, d_slot_vector);
  cudaStreamSynchronize(stream);
}

}  // end namespace framework
}  // end namespace paddle
#endif
paddle/fluid/framework/fleet/ps_gpu_wrapper.h
0 → 100644
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#include <atomic>
#include <ctime>
#include <map>
#include <memory>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/fleet/heter_context.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_ps_base.h"
#include "paddle/fluid/framework/fleet/heter_ps/heter_resource.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/gpu_info.h"
#include "paddle/fluid/platform/macros.h" // for DISABLE_COPY_AND_ASSIGN
#include "paddle/fluid/platform/place.h"
namespace paddle {
namespace framework {

class PSGPUWrapper {
 public:
  virtual ~PSGPUWrapper() { delete HeterPs_; }

  PSGPUWrapper() {
    HeterPs_ = NULL;
    sleep_seconds_before_fail_exit_ = 300;
  }

  void PullSparse(const paddle::platform::Place& place, const int table_id,
                  const std::vector<const uint64_t*>& keys,
                  const std::vector<float*>& values,
                  const std::vector<int64_t>& slot_lengths,
                  const int hidden_size);
  void PushSparseGrad(const paddle::platform::Place& place,
                      const int table_id,
                      const std::vector<const uint64_t*>& keys,
                      const std::vector<const float*>& grad_values,
                      const std::vector<int64_t>& slot_lengths,
                      const int hidden_size, const int batch_size);
  void CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys,
                uint64_t* total_keys, const int64_t* gpu_len, int slot_num,
                int total_len);
  void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys,
                   const std::vector<float*>& values,
                   const FeatureValue* total_values_gpu,
                   const int64_t* gpu_len, const int slot_num,
                   const int hidden_size, const int64_t total_length);
  void CopyForPush(const paddle::platform::Place& place,
                   const std::vector<const float*>& grad_values,
                   FeaturePushValue* total_grad_values_gpu,
                   const std::vector<int64_t>& slot_lengths,
                   const int hidden_size, const int64_t total_length,
                   const int batch_size);

  void BuildGPUPS(const uint64_t table_id, int feature_dim,
                  std::shared_ptr<HeterContext> context);
  void InitializeGPU(const std::vector<int>& dev_ids) {
    if (s_instance_ != NULL) {
      VLOG(3) << "PSGPUWrapper Begin InitializeGPU";
      resource_ = std::make_shared<HeterPsResource>(dev_ids);
      resource_->enable_p2p();
      keys_tensor.resize(resource_->total_gpu());
    }
  }
  // PSGPUWrapper singleton
  static std::shared_ptr<PSGPUWrapper> GetInstance() {
    if (NULL == s_instance_) {
      s_instance_.reset(new paddle::framework::PSGPUWrapper());
    }
    return s_instance_;
  }
  std::vector<std::unordered_map<uint64_t, std::vector<float>>>&
  GetLocalTable(int table_id) {
    return local_tables_[table_id];
  }
  void SetSlotVector(const std::vector<int>& slot_vector) {
    slot_vector_ = slot_vector;
  }

 private:
  static std::shared_ptr<PSGPUWrapper> s_instance_;
  std::unordered_map<
      uint64_t, std::vector<std::unordered_map<uint64_t, std::vector<float>>>>
      local_tables_;
  HeterPsBase* HeterPs_;
  std::vector<LoDTensor> keys_tensor;  // Cache for pull_sparse
  std::shared_ptr<HeterPsResource> resource_;
  int32_t sleep_seconds_before_fail_exit_;
  std::vector<int> slot_vector_;

 protected:
  static bool is_initialized_;
};

}  // end namespace framework
}  // end namespace paddle
#endif
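Taken together, the wrapper is meant to be driven as a process-wide singleton: GetInstance, InitializeGPU with the worker device ids, then BuildGPUPS once the per-device keys have been gathered. A rough usage sketch only; the slot ids, device ids, and dimensions below are made up, and `gpu_task` is assumed to be a HeterContext already filled with per-device feature_keys_/feature_values_ (as PSGPUTrainer::BuildGPUPSTask does in the next file).

// Hedged sketch, not part of this commit; mirrors how PSGPUTrainer drives the wrapper.
#include <memory>
#include "paddle/fluid/framework/fleet/heter_context.h"
#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"

void build_gpups_example(
    std::shared_ptr<paddle::framework::HeterContext> gpu_task) {
  auto wrapper = paddle::framework::PSGPUWrapper::GetInstance();
  wrapper->SetSlotVector({1, 2, 3});     // slot ids, illustrative only
  wrapper->InitializeGPU({0, 1, 2, 3});  // one table shard per visible GPU
  // gpu_task is assumed to already hold the de-duplicated per-device keys.
  wrapper->BuildGPUPS(/*table_id=*/0, /*feature_dim=*/8, gpu_task);
  // PullSparse / PushSparseGrad can then be called per CUDAPlace during training.
}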
paddle/fluid/framework/ps_gpu_trainer.cc
0 → 100644
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstdlib>
#include <string>
#include <vector>
#include "io/fs.h"
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/framework/fleet/heter_context.h"
#include "paddle/fluid/framework/fleet/heter_ps/feature_value.h"
#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
#include "paddle/fluid/framework/trainer.h"
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#include "paddle/fluid/platform/cuda_device_guard.h"
namespace
paddle
{
namespace
framework
{
void
PSGPUTrainer
::
Initialize
(
const
TrainerDesc
&
trainer_desc
,
Dataset
*
dataset
)
{
dataset_
=
dataset
;
thread_num_
=
trainer_desc
.
thread_num
();
param_
=
trainer_desc
.
downpour_param
();
for
(
int
i
=
0
;
i
<
param_
.
dense_table_size
();
++
i
)
{
uint64_t
table_id
=
static_cast
<
uint64_t
>
(
param_
.
dense_table
(
i
).
table_id
());
auto
table
=
param_
.
dense_table
(
i
);
dense_grad_names_
[
table_id
].
resize
(
table
.
dense_grad_name_size
());
for
(
int
j
=
0
;
j
<
table
.
dense_grad_name_size
();
++
j
)
{
dense_grad_names_
[
table_id
][
j
]
=
table
.
dense_grad_name
(
j
);
}
}
scale_datanorm_
=
trainer_desc
.
scale_datanorm
();
int
place_num
=
trainer_desc
.
worker_places_size
();
const
std
::
vector
<
paddle
::
framework
::
DataFeed
*>
readers
=
dataset
->
GetReaders
();
std
::
vector
<
int
>
dev_ids
;
for
(
int
i
=
0
;
i
<
place_num
;
++
i
)
{
int
num
=
trainer_desc
.
worker_places
(
i
);
platform
::
CUDAPlace
place
=
platform
::
CUDAPlace
(
num
);
places_
.
push_back
(
place
);
dev_ids
.
push_back
(
num
);
}
for
(
int
i
=
0
;
i
<
trainer_desc
.
downpour_param
().
stat_var_names_size
();
i
++
)
{
need_merge_var_names_
.
push_back
(
trainer_desc
.
downpour_param
().
stat_var_names
(
i
));
}
VLOG
(
3
)
<<
"going to initialize pull dense worker"
;
pull_dense_worker_
=
PullDenseWorker
::
GetInstance
();
pull_dense_worker_
->
Initialize
(
trainer_desc
);
SetDebug
(
trainer_desc
.
debug
());
fleet_ptr_
=
FleetWrapper
::
GetInstance
();
trainer_desc_
=
trainer_desc
;
workers_
.
resize
(
place_num
);
for
(
int
i
=
0
;
i
<
place_num
;
++
i
)
{
workers_
[
i
]
=
DeviceWorkerFactory
::
CreateDeviceWorker
(
trainer_desc
.
device_worker_name
());
workers_
[
i
]
->
SetDeviceIndex
(
i
);
workers_
[
i
]
->
SetDataFeed
(
readers
[
i
]);
workers_
[
i
]
->
Initialize
(
trainer_desc
);
workers_
[
i
]
->
SetWorkerNum
(
place_num
);
}
auto
gpu_ps_wrapper
=
PSGPUWrapper
::
GetInstance
();
gpu_ps_wrapper
->
InitializeGPU
(
dev_ids
);
return
;
}
void
PSGPUTrainer
::
DumpWork
(
int
tid
)
{}
void
PSGPUTrainer
::
RegisterHeterCallback
()
{
/*
auto fleet_ptr = FleetWrapper::GetInstance();
fleet_ptr->RegisterHeterCallback([this](int worker, int taskid) {
// workers_[worker]->Schedule(taskid);
});
*/
}
void
PSGPUTrainer
::
InitTrainerEnv
(
const
ProgramDesc
&
main_program
,
const
platform
::
Place
&
place
)
{
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
workers_
[
i
]
->
SetPlace
(
places_
[
i
]);
workers_
[
i
]
->
SetReaderPlace
(
places_
[
i
]);
workers_
[
i
]
->
SetRootScope
(
root_scope_
);
workers_
[
i
]
->
CreateDeviceResource
(
main_program
);
// Program
workers_
[
i
]
->
BindingDataFeedMemory
();
}
for
(
size_t
num
=
0
;
num
<
places_
.
size
();
++
num
)
{
auto
place
=
places_
[
num
];
Scope
*
scope
=
workers_
[
num
]
->
GetThreadScope
();
auto
&
block
=
main_program
.
Block
(
0
);
for
(
auto
&
var
:
block
.
AllVars
())
{
if
(
var
->
Persistable
())
{
auto
name
=
var
->
Name
();
Variable
*
root_var
=
root_scope_
->
FindVar
(
name
);
if
(
!
root_var
)
{
continue
;
}
LoDTensor
*
root_tensor
=
root_var
->
GetMutable
<
LoDTensor
>
();
auto
*
ptr
=
scope
->
Var
(
name
);
InitializeVariable
(
ptr
,
proto
::
VarType
::
LOD_TENSOR
);
LoDTensor
*
thread_tensor
=
ptr
->
GetMutable
<
LoDTensor
>
();
TensorCopy
(
*
root_tensor
,
place
,
thread_tensor
);
}
}
}
place_
=
place
;
return
;
}
void
PSGPUTrainer
::
InitOtherEnv
(
const
ProgramDesc
&
main_program
)
{
pull_dense_worker_
->
SetRootScope
(
root_scope_
);
for
(
size_t
i
=
0
;
i
<
places_
.
size
();
++
i
)
{
pull_dense_worker_
->
AddThreadScope
(
workers_
[
i
]
->
GetThreadScope
());
}
VLOG
(
3
)
<<
"init other env done."
;
}
void
PSGPUTrainer
::
Run
()
{
BuildGPUPSTask
(
0
,
8
);
for
(
size_t
thidx
=
0
;
thidx
<
places_
.
size
();
++
thidx
)
{
threads_
.
push_back
(
std
::
thread
(
&
DeviceWorker
::
TrainFiles
,
workers_
[
thidx
].
get
()));
}
}
void PSGPUTrainer::BuildGPUPSTask(int table_id, int feadim) {
  VLOG(3) << "PSGPUTrainer::BuildGPUPSTask begin";
  platform::Timer timeline;
  timeline.Start();
  MultiSlotDataset* dataset = dynamic_cast<MultiSlotDataset*>(dataset_);
  auto fleet_ptr = FleetWrapper::GetInstance();
  std::shared_ptr<HeterContext> heter_context =
      std::make_shared<HeterContext>();
  auto& multi_output_channel = dataset->GetCurOutputChannel();
  auto& input_channel = dataset->GetInputChannelRef();
  int gen_shard_num = multi_output_channel.size();
  int device_num = places_.size();
  auto gpu_ps_wrapper = PSGPUWrapper::GetInstance();
  auto& local_keys = heter_context->feature_keys_;
  local_keys.resize(device_num);
  auto& local_values = heter_context->feature_values_;
  local_values.resize(device_num);
  auto& local_ptr = heter_context->value_ptr_;
  local_ptr.resize(device_num);
  for (auto& ks : local_keys) {
    ks.reserve(100000);
  }
  // read thread
  std::vector<std::thread> threads(gen_shard_num);
  std::vector<std::shared_ptr<ThreadPool>> consume_task_pool(device_num);
  for (size_t i = 0; i < consume_task_pool.size(); i++) {
    consume_task_pool[i].reset(new ::ThreadPool(1));
  }
  auto consume_func = [&local_keys](int shard_id, int feadim,
                                    std::vector<uint64_t>& keys) {
    local_keys[shard_id].insert(local_keys[shard_id].end(), keys.begin(),
                                keys.end());
  };
  if (input_channel->Size() == 0) {
    // output_channel_ should hold one pass instances now
    uint64_t output_channels_data_size = 0;
    for (size_t i = 0; i < multi_output_channel.size(); i++) {
      int cur_channel_size = multi_output_channel[i]->Size();
      output_channels_data_size += cur_channel_size;
    }
    CHECK(output_channels_data_size > 0);
    for (auto& ks : local_keys) {
      ks.reserve(output_channels_data_size * 10);  // magic number
    }
    auto gen_func = [&dataset, &device_num, &feadim, &consume_task_pool,
                     &multi_output_channel, &consume_func](int i) {
      const std::deque<Record>& vec_data = multi_output_channel[i]->GetData();
      std::vector<std::vector<uint64_t>> task_keys(device_num);
      std::vector<std::future<void>> task_futures;
      for (size_t j = 0; j < vec_data.size(); j++) {
        for (auto& feature : vec_data[j].uint64_feasigns_) {
          int shard = feature.sign().uint64_feasign_ % device_num;
          task_keys[shard].push_back(feature.sign().uint64_feasign_);
        }
      }
      for (int shard_id = 0; shard_id < device_num; shard_id++) {
        task_futures.emplace_back(consume_task_pool[shard_id]->enqueue(
            consume_func, shard_id, feadim, task_keys[shard_id]));
      }
      for (auto& tf : task_futures) {
        tf.wait();
      }
      for (auto& tk : task_keys) {
        tk.clear();
        std::vector<uint64_t>().swap(tk);
      }
      task_keys.clear();
      std::vector<std::vector<uint64_t>>().swap(task_keys);
    };
    for (size_t i = 0; i < threads.size(); i++) {
      threads[i] = std::thread(gen_func, i);
    }
    for (std::thread& t : threads) {
      t.join();
    }
  } else {
    int input_channel_size = input_channel->Size();
    CHECK(input_channel_size > 0);
    CHECK(gen_shard_num > 0);
    for (auto& ks : local_keys) {
      ks.reserve(input_channel_size * 10);  // magic number
    }
    const std::deque<Record>& vec_data = input_channel->GetData();
    auto gen_func = [&dataset, &vec_data, &device_num, &gen_shard_num,
                     &input_channel_size, &feadim, &consume_task_pool,
                     multi_output_channel, &consume_func](int i) {
      std::vector<std::vector<uint64_t>> task_keys(device_num);
      std::vector<std::future<void>> task_futures;
      size_t per_shard_num = input_channel_size / gen_shard_num + 1;
      size_t total_size = vec_data.size();
      size_t start_index = i * per_shard_num;
      size_t end_index =
          std::min(start_index + per_shard_num - 1, total_size - 1);
      for (size_t j = start_index; j <= end_index; j++) {
        for (auto& feature : vec_data[j].uint64_feasigns_) {
          int shard = feature.sign().uint64_feasign_ % device_num;
          task_keys[shard].push_back(feature.sign().uint64_feasign_);
        }
      }
      for (int shard_id = 0; shard_id < device_num; shard_id++) {
        task_futures.emplace_back(consume_task_pool[shard_id]->enqueue(
            consume_func, shard_id, feadim, task_keys[shard_id]));
      }
      for (auto& tf : task_futures) {
        tf.wait();
      }
      for (auto& tk : task_keys) {
        tk.clear();
        std::vector<uint64_t>().swap(tk);
      }
      task_keys.clear();
      std::vector<std::vector<uint64_t>>().swap(task_keys);
    };
    for (size_t i = 0; i < threads.size(); i++) {
      threads[i] = std::thread(gen_func, i);
    }
    for (std::thread& t : threads) {
      t.join();
    }
  }
  timeline.Pause();
  VLOG(0) << "GpuPs build task cost " << timeline.ElapsedSec() << " seconds.";
  timeline.Start();
  auto unique_func = [&local_keys](int i) {
    auto& cur_keys = local_keys[i];
    std::sort(cur_keys.begin(), cur_keys.end());
    cur_keys.erase(std::unique(cur_keys.begin(), cur_keys.end()),
                   cur_keys.end());
  };
  for (size_t i = 0; i < threads.size(); i++) {
    threads[i] = std::thread(unique_func, i);
  }
  for (std::thread& t : threads) {
    t.join();
  }
  timeline.Pause();
  VLOG(0) << "GpuPs task unique cost " << timeline.ElapsedSec() << " seconds.";
  timeline.Start();
  for (size_t i = 0; i < consume_task_pool.size(); i++) {
    consume_task_pool[i].reset();
  }
  consume_task_pool.clear();
  for (int i = 0; i < device_num; i++) {
    local_values[i].resize(local_keys[i].size());
    local_ptr[i].resize(local_keys[i].size());
  }
  auto ptl_func = [this, &local_keys, &local_values, &local_ptr, &table_id,
                   &fleet_ptr](int i) {
    size_t key_size = local_keys[i].size();
    auto tt = fleet_ptr->pslib_ptr_->_worker_ptr->pull_sparse_ptr(
        (char**)(local_ptr[i].data()), table_id, local_keys[i].data(),
        key_size);
    tt.wait();
    auto status = tt.get();
    // auto status = 0;
    if (status != 0) {
      LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]";
      sleep(300);
      exit(-1);
    } else {
      VLOG(3) << "FleetWrapper Pull sparse to local done with table size: "
              << local_keys[i].size();
    }
    for (size_t num = 0; num < local_ptr[i].size(); ++num) {
      float* ptr_val = local_ptr[i][num]->data();
      FeatureValue& val = local_values[i][num];
      size_t dim = local_ptr[i][num]->size();
      val.delta_score = ptr_val[1];
      val.show = ptr_val[2];
      val.clk = ptr_val[3];
      val.slot = ptr_val[6];
      val.lr = ptr_val[4];
      val.lr_g2sum = ptr_val[5];
      if (dim > 7) {
        val.mf_size = MF_DIM + 1;
        for (int x = 0; x < val.mf_size; x++) {
          val.mf[x] = ptr_val[x + 7];
        }
      } else {
        val.mf_size = 0;
        for (int x = 0; x < MF_DIM + 1; x++) {
          val.mf[x] = 0;
        }
      }
    }
  };
  for (size_t i = 0; i < threads.size(); i++) {
    threads[i] = std::thread(ptl_func, i);
  }
  for (std::thread& t : threads) {
    t.join();
  }
  timeline.Pause();
  VLOG(0) << "GpuPs pull sparse cost " << timeline.ElapsedSec() << " seconds.";
  gpu_ps_wrapper->BuildGPUPS(table_id, feadim, heter_context);
}
Scope* PSGPUTrainer::GetWorkerScope(int thread_id) { return nullptr; }
template <typename T>
void PSGPUTrainer::MergeToRootScope(LoDTensor* root_tensor, LoDTensor* tensor) {
  LoDTensor tmp_root;
  TensorCopy(*root_tensor, platform::CPUPlace(), &tmp_root);
  T* tmp_root_data = tmp_root.data<T>();
  LoDTensor tmp_tensor;
  TensorCopy(*tensor, platform::CPUPlace(), &tmp_tensor);
  T* data = tmp_tensor.data<T>();
  for (int i = 0; i < tmp_tensor.numel(); i++) {
    tmp_root_data[i] += data[i];
  }
  TensorCopy(tmp_root, platform::CPUPlace(), root_tensor);
}
void PSGPUTrainer::Finalize() {
  for (auto& th : threads_) {
    th.join();
  }
  for (size_t i = 0; i < need_merge_var_names_.size(); i++) {
    Variable* root_var = root_scope_->FindVar(need_merge_var_names_[i]);
    if (root_var == nullptr) {
      continue;
    }
    LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>();
    for (size_t j = 0; j < places_.size(); j++) {
      Scope* cur_thread_scope = workers_[j]->GetThreadScope();
      Variable* thread_var =
          cur_thread_scope->FindVar(need_merge_var_names_[i]);
      if (thread_var == nullptr) {
        continue;
      }
      LoDTensor* thread_tensor = thread_var->GetMutable<LoDTensor>();
#define MergeCallback(cpp_type, proto_type)                                    \
  do {                                                                         \
    if (root_tensor->type() == proto_type) {                                   \
      if (thread_tensor->type() != proto_type) {                               \
        VLOG(0) << "Error: thread id=" << j << ", need_merge_var_names_[" << i \
                << "] " << need_merge_var_names_[i]                            \
                << ", root tensor type=" << root_tensor->type()                \
                << ", thread tensor type=" << thread_tensor->type();           \
        exit(-1);                                                              \
      }                                                                        \
      MergeToRootScope<cpp_type>(root_tensor, thread_tensor);                  \
    }                                                                          \
  } while (0)
      _ForEachDataType_(MergeCallback);
    }
  }
  pull_dense_worker_->MergeDenseParam();
  root_scope_->DropKids();
}
}  // namespace framework
}  // namespace paddle
#endif
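The task-building path above shards every uint64 feasign onto a card by key % device_num, merges the per-shard key lists through single-thread pools, and then sorts and de-duplicates each card's keys before the GPU table is built. A minimal standalone sketch of that scheme follows; it is illustrative only and not part of this commit, and device_num and the sample keys are made up.

# Shard keys by (key % device_num), then sort and de-duplicate per card,
# mirroring the consume_func / unique_func steps of BuildGPUPSTask.
device_num = 4                               # assumed number of GPU cards
feasigns = [12, 7, 12, 91, 8, 7, 40, 91]     # made-up example keys

local_keys = [[] for _ in range(device_num)]
for key in feasigns:
    local_keys[key % device_num].append(key)

local_keys = [sorted(set(keys)) for keys in local_keys]
for i, keys in enumerate(local_keys):
    print("card", i, "holds", len(keys), "unique keys")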
paddle/fluid/framework/ps_gpu_worker.cc
0 → 100644
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/framework/fleet/heter_wrapper.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/string/string_helper.h"
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
#include "paddle/fluid/platform/cuda_device_guard.h"
#if defined _WIN32 || defined __APPLE__
#else
#define _LINUX
#endif
namespace paddle {
namespace framework {
void PSGPUWorker::Initialize(const TrainerDesc& desc) {
  param_ = desc.downpour_param();
  mpi_rank_ = desc.mpi_rank();
  trainer_desc_ = desc;
  /*
  for (int i = 0; i < trainer_desc_.xpu_recv_list_size(); ++i) {
    send_var_list_.push_back(trainer_desc_.xpu_recv_list(i));
  }
  */
  for (int i = 0; i < param_.sparse_table_size(); ++i) {
    uint64_t table_id =
        static_cast<uint64_t>(param_.sparse_table(i).table_id());
    TableParameter table = param_.sparse_table(i);
    sparse_key_names_[table_id].resize(table.sparse_key_name_size());
    for (int j = 0; j < table.sparse_key_name_size(); ++j) {
      sparse_key_names_[table_id][j] = table.sparse_key_name(j);
    }
    sparse_value_names_[table_id].resize(table.sparse_value_name_size());
    for (int j = 0; j < table.sparse_value_name_size(); ++j) {
      sparse_value_names_[table_id][j] = table.sparse_value_name(j);
    }
    sparse_grad_names_[table_id].resize(table.sparse_grad_name_size());
    for (int j = 0; j < table.sparse_grad_name_size(); ++j) {
      sparse_grad_names_[table_id][j] = table.sparse_grad_name(j);
    }
    label_var_name_[table_id] = table.label_var_name();
    sparse_push_keys_[table_id] = std::vector<uint64_t>();
  }
  for (int i = 0; i < param_.dense_table_size(); ++i) {
    uint64_t table_id = static_cast<uint64_t>(param_.dense_table(i).table_id());
    auto table = param_.dense_table(i);
    dense_value_names_[table_id].resize(table.dense_value_name_size());
    for (int j = 0; j < table.dense_value_name_size(); ++j) {
      dense_value_names_[table_id][j] = table.dense_value_name(j);
    }
    dense_grad_names_[table_id].resize(table.dense_grad_name_size());
    for (int j = 0; j < table.dense_grad_name_size(); ++j) {
      dense_grad_names_[table_id][j] = table.dense_grad_name(j);
    }
  }
  skip_ops_.resize(param_.skip_ops_size());
  for (int i = 0; i < param_.skip_ops_size(); ++i) {
    skip_ops_[i] = param_.skip_ops(i);
  }
  for (int i = 0; i < param_.stat_var_names_size(); ++i) {
    stat_var_name_map_[param_.stat_var_names(i)] = 1;
  }
  need_to_push_sparse_ = param_.push_sparse();
  need_to_push_dense_ = param_.push_dense();
  fetch_config_ = desc.fetch_config();
  use_cvm_ = desc.use_cvm();
  // for sparse value accessor, embedding only
  no_cvm_ = desc.no_cvm();
  scale_datanorm_ = desc.scale_datanorm();
  dump_slot_ = desc.dump_slot();
  dump_fields_.resize(desc.dump_fields_size());
  for (int i = 0; i < desc.dump_fields_size(); ++i) {
    dump_fields_[i] = desc.dump_fields(i);
  }
  adjust_ins_weight_config_ = desc.adjust_ins_weight_config();
  need_dump_param_ = false;
  dump_param_.resize(desc.dump_param_size());
  for (int i = 0; i < desc.dump_param_size(); ++i) {
    dump_param_[i] = desc.dump_param(i);
  }
  if (desc.dump_param_size() != 0) {
    need_dump_param_ = true;
  }
  for (int i = 0; i < desc.check_nan_var_names_size(); ++i) {
    check_nan_var_names_.push_back(desc.check_nan_var_names(i));
  }
  copy_table_config_ = desc.copy_table_config();
  for (int i = 0; i < copy_table_config_.src_sparse_tables_size(); ++i) {
    uint64_t src_table = copy_table_config_.src_sparse_tables(i);
    uint64_t dest_table = copy_table_config_.dest_sparse_tables(i);
    VLOG(3) << "copy_sparse_tables_ push back " << src_table << "->"
            << dest_table;
    copy_sparse_tables_.push_back(std::make_pair(src_table, dest_table));
  }
  for (int i = 0; i < copy_table_config_.src_dense_tables_size(); ++i) {
    uint64_t src_table = copy_table_config_.src_dense_tables(i);
    uint64_t dest_table = copy_table_config_.dest_dense_tables(i);
    VLOG(3) << "copy_dense_tables_ push back " << src_table << "->"
            << dest_table;
    copy_dense_tables_.push_back(std::make_pair(src_table, dest_table));
  }
  for (auto& m : copy_table_config_.table_denpendency_map()) {
    if (sparse_key_names_.find(m.key()) != sparse_key_names_.end()) {
      // currently only support one dependency
      for (auto& value : m.values()) {
        table_dependency_[m.key()] = value;
      }
    }
  }
  // pull_queue_ = paddle::framework::MakeChannel<std::shared_ptr<HeterTask>>();
  // push_queue_ = paddle::framework::MakeChannel<std::shared_ptr<HeterTask>>();
}
void PSGPUWorker::SetChannelWriter(ChannelObject<std::string>* queue) {
  writer_.Reset(queue);
}
void PSGPUWorker::SetNeedDump(bool need_dump_field) {
  need_dump_field_ = need_dump_field;
}
void PSGPUWorker::DumpParam() {}
void PSGPUWorker::TrainFiles() {
  VLOG(3) << "train file A";
  platform::SetNumThreads(1);
  VLOG(3) << "train file B";
  // how to accumulate fetched values here
  device_reader_->Start();
  VLOG(3) << "train file C";
  int cur_batch;
  while ((cur_batch = device_reader_->Next()) > 0) {
    VLOG(3) << "train file D";
    for (auto& op : ops_) {
      bool need_skip = false;
      for (auto t = 0u; t < skip_ops_.size(); ++t) {
        if (op->Type().find(skip_ops_[t]) != std::string::npos) {
          need_skip = true;
          break;
        }
      }
      if (!need_skip) {
        op->Run(*thread_scope_, place_);
      }
    }
    PrintFetchVars();
    thread_scope_->DropKids();
  }
  return;
}
void PSGPUWorker::ResetStat() {
  total_time_ = 0;
  read_time_ = 0;
  pack_time_ = 0;
  pull_sparse_local_time_ = 0;
  op_all_time_ = 0;
  xpu_op_time_ = 0;
  xpu_wait_time_ = 0;
  cpu_op_time_ = 0;
  collect_label_time_ = 0;
  fill_sparse_time_ = 0;
  push_sparse_time_ = 0;
  gpu_2_cpu_time_ = 0;
  cpu_2_gpu_time_ = 0;
  total_inst_ = 0;
}
void PSGPUWorker::ProduceTasks() { return; }
}  // end namespace framework
}  // end namespace paddle
#endif
paddle/fluid/framework/trainer.h
...
...
@@ -277,6 +277,55 @@ class HeterBoxTrainer : public TrainerBase {
};
#endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
class PSGPUTrainer : public TrainerBase {
 public:
  PSGPUTrainer() {}
  virtual ~PSGPUTrainer() {}
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
  virtual void InitOtherEnv(const ProgramDesc& main_program);
  virtual void Run();
  virtual void Finalize();
  virtual void RegisterHeterCallback();
  virtual void DumpWork(int tid);
  virtual Scope* GetWorkerScope(int thread_id);
  virtual void CacheProgram(const ProgramDesc& main_program) {
    new (&program_) ProgramDesc(main_program);
  }
  virtual std::string GetDumpPath(int tid) { return ""; }
  virtual void InitDumpEnv() {}
  void BuildGPUPSTask(int table_id, int feadim);
  /*
  template <typename T>
  void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
                   const paddle::platform::Place& thread_place,
                   cudaStream_t stream);
  */
  template <typename T>
  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);

 protected:
  Dataset* dataset_;
  DownpourWorkerParameter param_;
  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
  std::vector<std::string> need_merge_var_names_;
  float scale_datanorm_;
  paddle::platform::Place place_;
  ProgramDesc program_;
  std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
  std::vector<std::shared_ptr<DeviceWorker>> workers_;
  std::vector<platform::Place> places_;
  // ps-gpu
  std::vector<std::thread> threads_;
  int use_ps_gpu_;
  int thread_num_;
};
#endif
#if defined(PADDLE_WITH_NCCL)
class PipelineTrainer : public TrainerBase {
 public:
...
...
paddle/fluid/framework/trainer_factory.cc
...
...
@@ -68,6 +68,9 @@ REGISTER_TRAINER_CLASS(DistMultiTrainer);
REGISTER_TRAINER_CLASS(HeterXpuTrainer);
REGISTER_TRAINER_CLASS(HeterBoxTrainer);
#endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
REGISTER_TRAINER_CLASS(PSGPUTrainer);
#endif
#if defined(PADDLE_WITH_NCCL)
REGISTER_TRAINER_CLASS(PipelineTrainer);
#endif
...
...
paddle/fluid/operators/pull_box_sparse_op.cc
...
...
@@ -64,12 +64,23 @@ class PullBoxSparseOp : public framework::OperatorWithKernel {
class PullBoxSparseOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("W",
             "(Tensor) The input represents embedding tensors, "
             "which is a learnable parameter.")
        .AsDispensable();
    AddInput("Ids",
             "Input tensors with type int32 or int64 "
             "contains the ids to be looked up in BoxPS. "
             "The last dimension size must be 1.")
        .AsDuplicable();
    AddOutput("Out", "The lookup results tensors.").AsDuplicable();
    AddAttr<bool>("is_sparse",
                  "(boolean, default false) "
                  "Sparse update.")
        .SetDefault(false);
    AddAttr<bool>("is_distributed",
                  "(boolean, default false) distributed lookup table.")
        .SetDefault(false);
    AddAttr<int>("size", "(int, the embedding hidden size").SetDefault(1);
    AddComment(R"DOC(
Pull Box Sparse Operator.
...
...
paddle/fluid/operators/pull_box_sparse_op.h
...
...
@@ -16,6 +16,7 @@
#include <memory>
#include <vector>
#include "paddle/fluid/framework/fleet/box_wrapper.h"
#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor.h"
...
...
@@ -46,6 +47,12 @@ static void PullBoxSparseFunctor(const framework::ExecutionContext &ctx) {
  box_ptr->PullSparse(ctx.GetPlace(), all_keys, all_values, slot_lengths,
                      hidden_size, 0);
#endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
  auto hidden_size = ctx.Attr<int>("size");
  auto gpu_ps_ptr = paddle::framework::PSGPUWrapper::GetInstance();
  gpu_ps_ptr->PullSparse(ctx.GetPlace(), 0, all_keys, all_values, slot_lengths,
                         hidden_size);
#endif
}

template <typename T>
...
...
@@ -83,6 +90,12 @@ static void PushBoxSparseFunctor(const framework::ExecutionContext &ctx) {
  box_ptr->PushSparseGrad(ctx.GetPlace(), all_keys, all_grad_values,
                          slot_lengths, hidden_size, 0, batch_size);
#endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
  auto hidden_size = ctx.Attr<int>("size");
  auto gpu_ps_ptr = paddle::framework::PSGPUWrapper::GetInstance();
  gpu_ps_ptr->PushSparseGrad(ctx.GetPlace(), 0, all_keys, all_grad_values,
                             slot_lengths, hidden_size, batch_size);
#endif
}

using LoDTensor = framework::LoDTensor;
...
...
paddle/fluid/pybind/CMakeLists.txt
set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune
  feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool
  analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
  gloo_wrapper infer_io_utils heter_wrapper generator op_version_registry ps_gpu_wrapper)

if (WITH_NCCL)
  set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
...
...
@@ -29,6 +29,7 @@ set(PYBIND_SRCS
reader_py.cc
fleet_wrapper_py.cc
heter_wrapper_py.cc
ps_gpu_wrapper_py.cc
gloo_wrapper_py.cc
box_helper_py.cc
data_set_py.cc
...
...
paddle/fluid/pybind/fleet_wrapper_py.cc
...
...
@@ -57,11 +57,7 @@ void BindFleetWrapper(py::module* m) {
      .def("get_cache_threshold", &framework::FleetWrapper::GetCacheThreshold)
      .def("cache_shuffle", &framework::FleetWrapper::CacheShuffle)
      .def("save_cache", &framework::FleetWrapper::SaveCache)
      .def("save_model_with_whitelist",
           &framework::FleetWrapper::SaveWithWhitelist)
      .def("load_model", &framework::FleetWrapper::LoadModel)
      .def("load_table_with_whitelist",
           &framework::FleetWrapper::LoadWithWhitelist)
      .def("clear_model", &framework::FleetWrapper::ClearModel)
      .def("clear_one_table", &framework::FleetWrapper::ClearOneTable)
      .def("stop_server", &framework::FleetWrapper::StopServer)
...
...
paddle/fluid/pybind/ps_gpu_wrapper_py.cc
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fcntl.h>
#ifdef _POSIX_C_SOURCE
#undef _POSIX_C_SOURCE
#endif
#ifdef _XOPEN_SOURCE
#undef _XOPEN_SOURCE
#endif
#include <string>
#include <vector>
#include "paddle/fluid/framework/fleet/ps_gpu_wrapper.h"
#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
void BindPSGPUWrapper(py::module* m) {
  py::class_<framework::PSGPUWrapper, std::shared_ptr<framework::PSGPUWrapper>>(
      *m, "PSGPU")
      .def(py::init([]() { return framework::PSGPUWrapper::GetInstance(); }))
      .def("set_slot_vector", &framework::PSGPUWrapper::SetSlotVector,
           py::call_guard<py::gil_scoped_release>());
}  // end PSGPUWrapper
#endif
}  // end namespace pybind
}  // end namespace paddle
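Illustrative only, not part of this commit: once paddle.fluid.core is built with both NCCL and PSLIB, the new binding could be driven from Python roughly as below. The slot names are hypothetical, and set_slot_vector is assumed to take the sparse slot names used by the dataset.

# Minimal sketch of using the PSGPU binding registered above.
from paddle.fluid import core

psgpu = core.PSGPU()
psgpu.set_slot_vector(["click", "slot1", "slot2"])  # hypothetical slot names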
paddle/fluid/pybind/ps_gpu_wrapper_py.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace py = pybind11;
namespace paddle {
namespace pybind {
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
void BindPSGPUWrapper(py::module* m);
#endif
}  // namespace pybind
}  // namespace paddle
paddle/fluid/pybind/pybind.cc
...
...
@@ -78,6 +78,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/inference_api.h"
#include "paddle/fluid/pybind/ir.h"
#include "paddle/fluid/pybind/ps_gpu_wrapper_py.h"
#include "paddle/fluid/pybind/pybind_boost_headers.h"
#ifdef PADDLE_WITH_NCCL
...
...
@@ -2798,8 +2799,12 @@ All parameter, weight, gradient are variables in Paddle.
      .def("device_count", &ParallelExecutor::DeviceCount);
  BindFleetWrapper(&m);
#ifdef PADDLE_WITH_PSLIB
  BindHeterWrapper(&m);
#endif
#if (defined PADDLE_WITH_NCCL) && (defined PADDLE_WITH_PSLIB)
  BindPSGPUWrapper(&m);
#endif
  BindGlooWrapper(&m);
  BindBoxHelper(&m);
...
...
python/paddle/fluid/executor.py
...
...
@@ -1375,8 +1375,6 @@ class Executor(object):
            is_heter = 1
        if program._fleet_opt.get("trainer", "") == "HeterXpuTrainer":
            is_heter = 1
        if program._fleet_opt.get("use_ps_gpu", ""):
            is_heter = 1
        if scope is None:
            scope = global_scope()
        if fetch_list is None:
...
...
python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py
...
...
@@ -85,7 +85,7 @@ class DistributedAdam(DistributedOptimizerImplBase):
            ".batch_size@GRAD", ".batch_square_sum@GRAD", ".batch_sum@GRAD"
        ]
        self.supported_embedding_types = [
            "lookup_table", "pull_sparse", "pull_sparse_v2", "pull_box_sparse"
        ]
        self.supported_embedding_grad_types = [
            "lookup_table_grad", "push_sparse", "push_sparse_v2"
...
...
python/paddle/fluid/layers/nn.py
...
...
@@ -663,7 +663,11 @@ def _pull_sparse_v2(input,
    return outs


def _pull_box_sparse(input,
                     size,
                     dtype='float32',
                     is_distributed=False,
                     is_sparse=False):
    r"""
    **Pull Box Sparse Layer**
...
...
@@ -701,11 +705,18 @@ def _pull_box_sparse(input, size, dtype='float32'):
        helper.create_variable_for_type_inference(dtype)
        for i in range(len(inputs))
    ]
    w = helper.create_parameter(
        attr=helper.param_attr, shape=[size], dtype=dtype, is_bias=False)
    helper.append_op(
        type='pull_box_sparse',
        inputs={'Ids': inputs,
                'W': w},
        outputs={'Out': outs},
        attrs={
            'size': size,
            'is_distributed': is_distributed,
            'is_sparse': is_sparse
        })
    if len(outs) == 1:
        return outs[0]
    return outs
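Illustrative usage sketch of the extended signature above (not part of the commit). It assumes a static-graph program; _pull_box_sparse is a private helper, so the exact entry point may differ across versions, and the feature name and embedding size here are made up.

import paddle.fluid as fluid

ids = fluid.data(name='ids', shape=[-1, 1], dtype='int64')
emb = fluid.layers.nn._pull_box_sparse(
    input=ids, size=11, is_distributed=True, is_sparse=True)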
...
...
python/paddle/fluid/trainer_desc.py
...
...
@@ -370,6 +370,30 @@ class HeterBoxTrainer(TrainerDesc):
        self._device_worker._gen_worker_desc(self.proto_desc)


class PSGPUTrainer(TrainerDesc):
    """
    Implement of PSGPUTrainer.
    It's for Distributed training.
    """

    def __init__(self):
        super(PSGPUTrainer, self).__init__()
        pass

    def _set_program(self, program):
        super(PSGPUTrainer, self)._set_program(program)
        self._program = program

    def _gen_trainer_desc(self):
        super(PSGPUTrainer, self)._gen_trainer_desc()
        self.proto_desc.class_name = "PSGPUTrainer"
        if self._program == None:
            raise RuntimeError("None Program")
        self._device_worker._set_infer(self._infer)
        self._device_worker._set_program(self._program)
        self._device_worker._gen_worker_desc(self.proto_desc)


class PipelineTrainer(TrainerDesc):
    """
    Implement of PipelineTrainer.
...
...
python/paddle/fluid/trainer_factory.py
...
...
@@ -22,7 +22,7 @@ from paddle.fluid.log_helper import get_logger
local_logger = get_logger(
    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')

from .trainer_desc import MultiTrainer, DistMultiTrainer, PipelineTrainer, HeterXpuTrainer, HeterBoxTrainer, PSGPUTrainer
from .device_worker import Hogwild, DownpourSGD, Section, DownpourSGDOPT
from .framework import Variable
from multiprocessing import Process, Manager
...
...