BaiXuePrincess / Paddle (forked from PaddlePaddle / Paddle)

Commit 862dde5e
Authored on Aug 07, 2020 by zlsh80826

    Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into trt_stack_op

Parents: 4a15797e, 6e7f0bb2

Showing 68 changed files with 6,338 additions and 231 deletions (+6338 -231)
cmake/operators.cmake  +1 -1
paddle/fluid/framework/CMakeLists.txt  +24 -6
paddle/fluid/framework/device_worker.h  +105 -0
paddle/fluid/framework/device_worker_factory.cc  +3 -0
paddle/fluid/framework/dist_multi_trainer.cc  +17 -1
paddle/fluid/framework/downpour_worker.cc  +1 -1
paddle/fluid/framework/fleet/CMakeLists.txt  +2 -0
paddle/fluid/framework/fleet/fleet_wrapper.cc  +272 -4
paddle/fluid/framework/fleet/fleet_wrapper.h  +32 -4
paddle/fluid/framework/fleet/gloo_wrapper.cc  +7 -7
paddle/fluid/framework/fleet/heter_wrapper.cc  +308 -0
paddle/fluid/framework/fleet/heter_wrapper.h  +123 -0
paddle/fluid/framework/heter_service.h  +368 -0
paddle/fluid/framework/heter_service.proto  +69 -0
paddle/fluid/framework/hetercpu_worker.cc  +1166 -0
paddle/fluid/framework/heterxpu_trainer.cc  +469 -0
paddle/fluid/framework/multi_trainer.cc  +1 -0
paddle/fluid/framework/pull_dense_worker.cc  +60 -1
paddle/fluid/framework/trainer.h  +80 -0
paddle/fluid/framework/trainer_desc.proto  +6 -0
paddle/fluid/framework/trainer_factory.cc  +3 -0
paddle/fluid/operators/flatten_op.cc  +183 -0
paddle/fluid/operators/flatten_op.cu.cc  +23 -0
paddle/fluid/operators/flatten_op.h  +68 -0
paddle/fluid/operators/fused/CMakeLists.txt  +6 -1
paddle/fluid/operators/fused/fusion_gru_op.cc  +16 -1
paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc  +439 -0
paddle/fluid/platform/mkldnn_helper.h  +4 -0
paddle/fluid/pybind/CMakeLists.txt  +2 -1
paddle/fluid/pybind/heter_wrapper_py.cc  +50 -0
paddle/fluid/pybind/heter_wrapper_py.h  +29 -0
paddle/fluid/pybind/pybind.cc  +4 -0
python/paddle/fleet/__init__.py  +5 -2
python/paddle/fleet/dataset/__init__.py  +2 -0
python/paddle/fleet/dataset/dataset.py  +1103 -0
python/paddle/fluid/annotations.py  +0 -39
python/paddle/fluid/device_worker.py  +2 -1
python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py  +0 -1
python/paddle/fluid/dygraph/nn.py  +60 -1
python/paddle/fluid/executor.py  +67 -0
python/paddle/fluid/incubate/fleet/base/fleet_base.py  +10 -0
python/paddle/fluid/incubate/fleet/base/role_maker.py  +142 -0
python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py  +83 -12
python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py  +4 -1
python/paddle/fluid/incubate/fleet/utils/fleet_util.py  +121 -0
python/paddle/fluid/layers/detection.py  +46 -46
python/paddle/fluid/layers/device.py  +2 -2
python/paddle/fluid/layers/nn.py  +9 -6
python/paddle/fluid/reader.py  +2 -3
python/paddle/fluid/tests/unittests/dist_fleet_ctr.py  +1 -1
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_mkldnn_op.py  +78 -0
python/paddle/fluid/tests/unittests/test_dataset.py  +89 -22
python/paddle/fluid/tests/unittests/test_dataset_dataloader.py  +3 -1
python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py  +204 -0
python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py  +3 -1
python/paddle/fluid/tests/unittests/test_fusion_gru_op.py  +9 -3
python/paddle/fluid/tests/unittests/test_layers.py  +45 -0
python/paddle/fluid/tests/unittests/test_monitor.py  +3 -1
python/paddle/fluid/tests/unittests/test_pipeline.py  +2 -1
python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py  +1 -0
python/paddle/fluid/trainer_desc.py  +69 -1
python/paddle/fluid/trainer_factory.py  +3 -1
python/paddle/nn/__init__.py  +1 -0
python/paddle/nn/layer/__init__.py  +1 -0
python/paddle/nn/layer/common.py  +1 -0
python/paddle/tensor/manipulation.py  +142 -56
python/paddle/utils/__init__.py  +2 -1
python/paddle/utils/deprecated.py  +82 -0
cmake/operators.cmake

@@ -118,7 +118,7 @@ function(op_library TARGET)
     "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
     "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
     "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
-    "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op")
+    "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op")
   if ("${TARGET}" STREQUAL "${manual_pybind_op}")
     set(pybind_flag 1)
   endif()
paddle/fluid/framework/CMakeLists.txt

@@ -27,6 +27,7 @@ add_subdirectory(fleet)
 add_subdirectory(io)
 #ddim lib
 proto_library(framework_proto SRCS framework.proto)
+proto_library(heter_service_proto SRCS heter_service.proto)
 proto_library(data_feed_proto SRCS data_feed.proto)
 proto_library(trainer_desc_proto SRCS trainer_desc.proto DEPS framework_proto data_feed_proto)
@@ -195,20 +196,37 @@ cc_library(executor_gc_helper SRCS executor_gc_helper.cc DEPS scope proto_desc o
 if(WITH_DISTRIBUTE)
   cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
               dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
-              data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc downpour_worker_opt.cc
+              heterxpu_trainer.cc
+              data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc downpour_worker.cc downpour_worker_opt.cc
               pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
-              device_context scope framework_proto trainer_desc_proto glog fs shell fleet_wrapper box_wrapper lodtensor_printer
+              device_context scope framework_proto trainer_desc_proto glog fs shell
+              fleet_wrapper heter_wrapper box_wrapper lodtensor_printer
               lod_rank_table feed_fetch_method sendrecvop_rpc communicator collective_helper ${GLOB_DISTRIBUTE_DEPS}
-              graph_to_program_pass variable_helper data_feed_proto timer monitor)
+              graph_to_program_pass variable_helper data_feed_proto timer monitor
+              heter_service_proto)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(executor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+elseif(WITH_PSLIB)
+  cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
+              dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
+              heterxpu_trainer.cc
+              data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc downpour_worker.cc downpour_worker_opt.cc
+              pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
+              device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
+              lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
+              graph_to_program_pass variable_helper timer monitor pslib_brpc)
+  # TODO: Fix these unittest failed on Windows
+  if(NOT WIN32)
+    cc_test(test_naive_executor SRCS naive_executor_test.cc DEPS naive_executor elementwise_add_op)
+  endif()
 else()
   cc_library(executor SRCS executor.cc multi_trainer.cc pipeline_trainer.cc dataset_factory.cc
               dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
-              data_feed.cc device_worker.cc hogwild_worker.cc downpour_worker.cc downpour_worker_opt.cc
+              heterxpu_trainer.cc
+              data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc downpour_worker.cc downpour_worker_opt.cc
               pull_dense_worker.cc section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
-              device_context scope framework_proto data_feed_proto trainer_desc_proto glog
-              lod_rank_table fs shell fleet_wrapper box_wrapper lodtensor_printer feed_fetch_method
+              device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
+              lod_rank_table fs shell fleet_wrapper heter_wrapper box_wrapper lodtensor_printer feed_fetch_method
               graph_to_program_pass variable_helper timer monitor)
   # TODO: Fix these unittest failed on Windows
   if(NOT WIN32)
paddle/fluid/framework/device_worker.h

@@ -27,6 +27,7 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/framework/data_feed.h"
+#include "paddle/fluid/framework/heter_service.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/framework/program_desc.h"
@@ -51,10 +52,23 @@ bool CheckValidOutput(LoDTensor* tensor, size_t batch_size);
 class FleetWrapper;

+#ifdef PADDLE_WITH_PSLIB
+class HeterWrapper;
+#endif
+
 class PullDenseWorker {
  public:
   virtual ~PullDenseWorker() {}
   virtual void Initialize(const TrainerDesc& param);
+#ifdef PADDLE_WITH_CUDA
+  void AddStream(const cudaStream_t stream) { copy_streams_.push_back(stream); }
+  void AddPlace(const paddle::platform::Place place) { places_.push_back(place); }
+  void AddThreadScope(Scope* scope) { thread_scopes_.push_back(scope); }
+#endif
   int Start();
   void Stop();
   void SetRootScope(Scope* scope) { root_scope_ = scope; }
@@ -62,6 +76,7 @@ class PullDenseWorker {
   void ResetThreadVersion(uint64_t table_id);
   void Wait(std::vector<::std::future<int32_t>>* status_vec);
   void PullDense(bool force_update = false);
+  void CreatePinVar();
   int GetThreadIdByScope(const Scope* scope);
   void SetThreadIdByScope(const Scope* scope, int tid);
   static std::shared_ptr<PullDenseWorker> GetInstance() {
@@ -105,6 +120,12 @@ class PullDenseWorker {
   std::mutex mutex_for_mean_scale_;
   float total_batch_num_ = 0;
   std::unordered_map<const Scope*, int> scope_to_thread_id_;
+
+#ifdef PADDLE_WITH_CUDA
+  std::vector<cudaStream_t> copy_streams_;
+  std::vector<paddle::platform::Place> places_;
+  std::vector<Scope*> thread_scopes_;
+#endif
 };

 // should incorporate different type of device
@@ -126,6 +147,8 @@ class DeviceWorker {
   virtual void BindingDataFeedMemory() = 0;
   virtual void SetRootScope(Scope* root_scope);
   virtual void SetDataFeed(DataFeed* data_feed);
+  virtual void SetWorkerNum(int num) {}
+  virtual void CacheProgram(const ProgramDesc& main_program) {}
   virtual void SetNeedDumpField(bool need_dump_field) {
     need_dump_field_ = need_dump_field;
   }
@@ -161,6 +184,7 @@ class DeviceWorker {
   FetchConfig fetch_config_;
   bool use_cvm_;
   bool no_cvm_;
+  TrainerDesc trainer_desc_;
   // dump params or grads for debug
   bool need_dump_param_;
@@ -306,6 +330,87 @@ class DownpourWorkerOpt : public DownpourWorker {
   uint64_t async_tid_ = 0;
 };

+#ifdef PADDLE_WITH_PSLIB
+class HeterCpuWorker : public HogwildWorker {
+ public:
+  HeterCpuWorker() {}
+  virtual ~HeterCpuWorker() {}
+  virtual void Initialize(const TrainerDesc& desc);
+  virtual void TrainFiles();
+  virtual void TrainFilesWithProfiler();
+  virtual void SetNeedDump(bool need_dump_field);
+  virtual void SetChannelWriter(ChannelObject<std::string>* queue);
+  virtual void SetWorkerNum(int num) { worker_num_ = num; }
+  virtual void Schedule(int taskid);
+  virtual void JumpContext(std::shared_ptr<HeterTask> task);
+  virtual void CacheProgram(const ProgramDesc& main_program) {
+    new (&program_) ProgramDesc(main_program);
+  }
+  virtual void GetXpuOpIndex();
+
+ protected:
+  std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
+  std::shared_ptr<paddle::framework::HeterWrapper> heter_ptr_;
+  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
+  void FillSparseValue(std::shared_ptr<HeterTask> task, size_t table_id);
+  void PushGradients();
+  void CollectLabelInfo(std::shared_ptr<HeterTask> task, size_t table_id);
+  void AdjustInsWeight(std::shared_ptr<HeterTask> task);
+  void DumpParam();
+  void CopySparseTable();
+  void CopyDenseTable();
+  void CopyDenseVars();
+
+ private:
+  int mpi_rank_;
+  int worker_num_;
+  int xpu_begin_op_index_;
+  int xpu_end_op_index_;
+  ProgramDesc program_;
+  HeterObjectPool<HeterTask> object_pool_;
+  HeterList<int, std::shared_ptr<HeterTask>> run_queue_;
+  HeterList<int, std::shared_ptr<HeterTask>> wait_queue_;
+  bool need_dump_param_;
+  std::vector<std::string> dump_param_;
+  bool need_to_push_dense_;
+  bool need_dump_field_;
+  bool dump_slot_;
+  bool need_to_push_sparse_;
+  std::vector<std::string> dump_fields_;
+  ChannelWriter<std::string> writer_;
+  DownpourWorkerParameter param_;
+  float scale_datanorm_;
+  // just save the value in param_ for easy access
+  std::map<uint64_t, std::string> label_var_name_;
+  std::map<uint64_t, std::vector<std::string>> sparse_key_names_;
+  std::map<uint64_t, std::vector<std::string>> sparse_value_names_;
+  std::map<uint64_t, std::vector<std::string>> sparse_grad_names_;
+  std::map<uint64_t, std::vector<std::string>> dense_value_names_;
+  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
+  platform::Place root_place_;
+  // actually pushed feasign of each table
+  std::map<uint64_t, std::vector<uint64_t>> sparse_push_keys_;
+  // skipped ops
+  std::vector<std::string> skip_ops_;
+  std::vector<::std::future<int32_t>> push_sparse_status_;
+  std::vector<::std::future<int32_t>> push_dense_status_;
+  // adjust ins weight
+  AdjustInsWeightConfig adjust_ins_weight_config_;
+  std::vector<float> nid_show_;
+  // check nan and inf during training
+  std::vector<std::string> check_nan_var_names_;
+  // copy table
+  CopyTableConfig copy_table_config_;
+  std::map<uint64_t, uint64_t> table_dependency_;
+  std::vector<std::pair<uint64_t, uint64_t>> copy_sparse_tables_;
+  std::vector<std::pair<uint64_t, uint64_t>> copy_dense_tables_;
+  std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_;
+};
+#endif
+
 #if defined(PADDLE_WITH_NCCL)
 class SectionWorker : public DeviceWorker {
  public:
paddle/fluid/framework/device_worker_factory.cc

@@ -62,6 +62,9 @@ std::shared_ptr<DeviceWorker> DeviceWorkerFactory::CreateDeviceWorker(
 REGISTER_DEVICE_WORKER_CLASS(HogwildWorker);
 REGISTER_DEVICE_WORKER_CLASS(DownpourWorker);
 REGISTER_DEVICE_WORKER_CLASS(DownpourWorkerOpt);
+#ifdef PADDLE_WITH_PSLIB
+REGISTER_DEVICE_WORKER_CLASS(HeterCpuWorker);
+#endif
 #if defined(PADDLE_WITH_NCCL)
 REGISTER_DEVICE_WORKER_CLASS(SectionWorker);
 #endif
paddle/fluid/framework/dist_multi_trainer.cc

@@ -35,7 +35,7 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
   dump_file_num_ = trainer_desc.dump_file_num();
   const std::vector<paddle::framework::DataFeed *> readers =
       dataset->GetReaders();
+  RegisterHeterCallback();
   thread_num_ = readers.size();
   workers_.resize(thread_num_);
   for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size();
@@ -55,6 +55,7 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
     workers_[i]->SetDumpParamVector(dump_param_);
     workers_[i]->InitRandomDumpConfig(trainer_desc);
     workers_[i]->Initialize(trainer_desc);
+    workers_[i]->SetWorkerNum(thread_num_);
   }
   VLOG(3) << "going to initialize pull dense worker";
@@ -64,6 +65,13 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
   SetDebug(trainer_desc.debug());
 }

+void DistMultiTrainer::RegisterHeterCallback() {
+  auto fleet_ptr = FleetWrapper::GetInstance();
+  fleet_ptr->RegisterHeterCallback([this](int worker, int taskid) {
+    // workers_[worker]->Schedule(taskid);
+  });
+}
+
 void DistMultiTrainer::InitDumpEnv() {
   queue_ = paddle::framework::MakeChannel<std::string>();
   for (int i = 0; i < thread_num_; ++i) {
@@ -90,6 +98,9 @@ void DistMultiTrainer::InitTrainerEnv(const ProgramDesc &main_program,
     workers_[i]->SetRootScope(root_scope_);
     workers_[i]->CreateDeviceResource(main_program);  // Program
     workers_[i]->BindingDataFeedMemory();
+#ifdef PADDLE_WITH_PSLIB
+    workers_[i]->CacheProgram(main_program);
+#endif
   }
   // Scope* -> thread id, it will be used in push_dense op
   for (int i = 0; i < thread_num_; ++i) {
@@ -104,6 +115,11 @@ void DistMultiTrainer::InitOtherEnv(const ProgramDesc &main_program) {
   }
   pull_dense_worker_->SetRootScope(root_scope_);
   pull_dense_worker_->Start();
+#ifdef PADDLE_WITH_PSLIB
+  for (int i = 0; i < thread_num_; ++i) {
+    workers_[i]->GetXpuOpIndex();
+  }
+#endif
   VLOG(3) << "init other env done.";
 }
paddle/fluid/framework/downpour_worker.cc

@@ -379,7 +379,7 @@ void DownpourWorker::CopyDenseTable() {
     pull_dense_status.resize(0);
     fleet_ptr_->PullDenseVarsAsync(*root_scope_, dest_table,
                                    dense_value_names_[dest_table],
-                                   &pull_dense_status);
+                                   &pull_dense_status, true);
     for (auto& t : pull_dense_status) {
       t.wait();
       auto status = t.get();
paddle/fluid/framework/fleet/CMakeLists.txt

@@ -19,4 +19,6 @@ else()
   cc_library(gloo_wrapper SRCS gloo_wrapper.cc DEPS framework_proto variable_helper scope)
 endif(WITH_GLOO)

+cc_library(heter_wrapper SRCS heter_wrapper.cc DEPS framework_proto)
+
 cc_test(test_fleet SRCS test_fleet.cc DEPS fleet_wrapper gloo_wrapper fs shell)
paddle/fluid/framework/fleet/fleet_wrapper.cc

@@ -154,6 +154,219 @@ void FleetWrapper::CreateClient2ClientConnection() {
 #endif
 }

+#ifdef PADDLE_WITH_PSLIB
+void FleetWrapper::HeterPullSparseVars(
+    int workerid, std::shared_ptr<HeterTask> task, const uint64_t table_id,
+    const std::vector<std::string>& var_names, int fea_value_dim,
+    const std::vector<std::string>& var_emb_names) {
+  std::vector<::std::future<int32_t>> pull_sparse_status;
+  pull_sparse_status.resize(0);
+  auto& scope = *(task->scope_);
+  auto& fea_keys = (task->features_)[table_id];
+  auto& fea_values = (task->feature_values_)[table_id];
+  fea_keys.clear();
+  for (size_t var_index = 0; var_index < var_names.size(); ++var_index) {
+    const std::string& name = var_names[var_index];
+    Variable* var = scope.FindVar(name);
+    if (var == nullptr) {
+      continue;
+    }
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    CHECK(tensor != nullptr) << "tensor of var " << name << " is null";
+    int64_t* ids = tensor->data<int64_t>();
+    size_t len = tensor->numel();
+    // skip slots which do not have embedding
+    const std::string& emb_name = var_emb_names[var_index];
+    Variable* emb_var = scope.FindVar(emb_name);
+    if (emb_var == nullptr) {
+      continue;
+    }
+    for (auto i = 0u; i < len; ++i) {
+      if (ids[i] == 0u) {
+        continue;
+      }
+      fea_keys.push_back(static_cast<uint64_t>(ids[i]));
+    }
+  }
+  fea_values.resize(fea_keys.size() + 1);
+  for (auto& t : fea_values) {
+    t.resize(fea_value_dim);
+  }
+  std::vector<float*> pull_result_ptr;
+  for (auto& t : fea_values) {
+    pull_result_ptr.push_back(t.data());
+  }
+  auto status = pslib_ptr_->_worker_ptr->heter_pull_sparse(
+      workerid, pull_result_ptr.data(), table_id, fea_keys.data(),
+      fea_keys.size(), task->taskid_);
+  pull_sparse_status.push_back(std::move(status));
+  for (auto& t : pull_sparse_status) {
+    t.wait();
+    auto status = t.get();
+    if (status != 0) {
+      LOG(ERROR) << "fleet pull sparse failed, status[" << status << "]";
+      sleep(sleep_seconds_before_fail_exit_);
+      exit(-1);
+    }
+  }
+}
+
+void FleetWrapper::HeterPushSparseVars(
+    std::shared_ptr<HeterTask> task, const uint64_t table_id,
+    const std::vector<std::string>& sparse_key_names,
+    const std::vector<std::string>& sparse_grad_names, const int emb_dim,
+    std::vector<::std::future<int32_t>>* push_sparse_status,
+    const bool use_cvm, const bool dump_slot, const bool no_cvm) {
+  auto& scope = *(task->scope_);
+  int batch_size = task->cur_batch_;
+  int offset = 2;
+  int slot_offset = 0;
+  int grad_dim = emb_dim;
+  int show_index = 0;
+  int click_index = 1;
+  auto& fea_keys = (task->features_)[table_id];
+  auto& fea_labels = (task->feature_labels_)[table_id];
+  auto& push_values = (task->feature_grads_)[table_id];
+  auto& sparse_push_keys = (task->sparse_push_keys_)[table_id];
+  if (use_cvm) {
+    offset = 0;
+    grad_dim = emb_dim - 2;
+  }
+  if (no_cvm) {
+    offset = 0;
+    grad_dim = emb_dim;
+  }
+  if (dump_slot) {
+    slot_offset = 1;
+    show_index = 1;
+    click_index = 2;
+  }
+  CHECK_GE(grad_dim, 0);
+  sparse_push_keys.clear();
+  sparse_push_keys.reserve(fea_keys.size() + 1);
+  push_values.resize(fea_keys.size() + 1);
+  for (auto& t : push_values) {
+    t.resize(emb_dim + offset + slot_offset);
+  }
+  uint64_t fea_idx = 0u;
+  for (size_t i = 0;
+       i < sparse_key_names.size() && i < sparse_grad_names.size(); ++i) {
+    Variable* var = scope.FindVar(sparse_key_names[i]);
+    if (var == nullptr) {
+      continue;
+    }
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    if (tensor == nullptr) {
+      LOG(ERROR) << "tensor of var[" << sparse_key_names[i] << "] is null";
+      exit(-1);
+    }
+    size_t len = tensor->numel();
+    int64_t* ids = tensor->data<int64_t>();
+    int slot = 0;
+    if (dump_slot) {
+      slot = boost::lexical_cast<int>(sparse_key_names[i]);
+    }
+    Variable* g_var = scope.FindVar(sparse_grad_names[i]);
+    if (g_var == nullptr) {
+      continue;
+    }
+    LoDTensor* g_tensor = g_var->GetMutable<LoDTensor>();
+    if (g_tensor == nullptr) {
+      LOG(ERROR) << "tensor of var[" << sparse_key_names[i] << "] is null";
+      exit(-1);
+    }
+    float* g = g_tensor->data<float>();
+
+    if (scale_sparse_gradient_with_batch_size_ && grad_dim > 0) {
+      int dim = emb_dim + offset;
+      Eigen::Map<
+          Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
+          g_mat(g, g_tensor->numel() / dim, dim);
+      g_mat.rightCols(grad_dim) *= batch_size;
+    }
+    for (auto id_idx = 0u; id_idx < len; ++id_idx) {
+      if (ids[id_idx] == 0) {
+        g += emb_dim;
+        continue;
+      }
+      sparse_push_keys.push_back(ids[id_idx]);
+      CHECK(fea_idx < push_values.size());
+      if (use_cvm || no_cvm) {
+        memcpy(push_values[fea_idx].data() + offset + slot_offset, g,
+               sizeof(float) * emb_dim);
+      } else {
+        CHECK(fea_idx < fea_labels.size());
+        memcpy(push_values[fea_idx].data() + offset + slot_offset, g,
+               sizeof(float) * emb_dim);
+        push_values[fea_idx][show_index] = 1.0f;
+        push_values[fea_idx][click_index] =
+            static_cast<float>(fea_labels[fea_idx]);
+      }
+      if (dump_slot) {
+        push_values[fea_idx][0] = static_cast<float>(slot);
+      }
+      g += emb_dim;
+      fea_idx++;
+    }
+  }
+  // slots whose embedding has been stop gradient or
+  // not involved in forward-backward
+  uint64_t no_grad_fea_num = 0u;
+  for (size_t i = sparse_grad_names.size(); i < sparse_key_names.size(); ++i) {
+    Variable* var = scope.FindVar(sparse_key_names[i]);
+    if (var == nullptr) {
+      continue;
+    }
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    if (tensor == nullptr) {
+      LOG(ERROR) << "tensor of var[" << sparse_key_names[i] << "] is null";
+      exit(-1);
+    }
+    size_t len = tensor->numel();
+    int64_t* ids = tensor->data<int64_t>();
+    for (auto id_idx = 0u; id_idx < len; ++id_idx) {
+      if (ids[id_idx] == 0) {
+        continue;
+      }
+      ++no_grad_fea_num;
+    }
+  }
+  CHECK(fea_idx + no_grad_fea_num == fea_keys.size())
+      << "fea_idx: " << fea_idx << " no_grad_fea_num: " << no_grad_fea_num
+      << " features size: " << fea_keys.size();
+  CHECK(fea_idx == sparse_push_keys.size());
+  if (fea_idx == 0) {
+    return;
+  }
+  std::vector<float*> push_g_vec;
+  for (auto i = 0u; i < sparse_push_keys.size(); ++i) {
+    push_g_vec.push_back(push_values[i].data());
+  }
+  auto status = pslib_ptr_->_worker_ptr->push_sparse(
+      table_id, sparse_push_keys.data(), (const float**)push_g_vec.data(),
+      sparse_push_keys.size());
+  push_sparse_status->push_back(std::move(status));
+}
+#endif
+
+int FleetWrapper::RegisterHeterCallback(HeterCallBackFunc handler) {
+#ifdef PADDLE_WITH_PSLIB
+  VLOG(3) << "calling FleetWrapper::RegisterHeterCallback";
+  VLOG(3) << "pslib_ptr_=" << pslib_ptr_;
+  VLOG(3) << "_worker_ptr=" << pslib_ptr_->_worker_ptr;
+  return pslib_ptr_->_worker_ptr->registe_heter_callback(handler);
+#else
+  VLOG(0) << "FleetWrapper::RegisterHeterCallback"
+          << " does nothing when no pslib";
+#endif
+  return 0;
+}
+
 void FleetWrapper::PullSparseToLocal(const uint64_t table_id,
                                      int fea_value_dim) {
 #ifdef PADDLE_WITH_PSLIB
@@ -421,13 +634,17 @@ void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim,
 void FleetWrapper::PullDenseVarsAsync(
     const Scope& scope, const uint64_t tid,
     const std::vector<std::string>& var_names,
-    std::vector<::std::future<int32_t>>* pull_dense_status) {
+    std::vector<::std::future<int32_t>>* pull_dense_status, bool in_cpu) {
 #ifdef PADDLE_WITH_PSLIB
   auto& regions = _regions[tid];
   regions.clear();
   regions.resize(var_names.size());
   for (auto i = 0u; i < var_names.size(); ++i) {
-    Variable* var = scope.FindVar(var_names[i]);
+    std::string varname = var_names[i];
+    if (!in_cpu) {
+      varname = var_names[i] + "pin";
+    }
+    Variable* var = scope.FindVar(varname);
     LoDTensor* tensor = var->GetMutable<LoDTensor>();
     float* w = tensor->data<float>();
     paddle::ps::Region reg(w, tensor->numel());
@@ -485,6 +702,57 @@ void FleetWrapper::PushDenseVarsSync(
     Scope* scope, const uint64_t table_id,
     const std::vector<std::string>& var_names) {}

+#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
+void FleetWrapper::PushDenseVarsAsync(
+    const Scope& scope, const uint64_t table_id,
+    const std::vector<std::string>& var_names,
+    std::vector<::std::future<int32_t>>* push_sparse_status,
+    float scale_datanorm, int batch_size,
+    const paddle::platform::Place& place, cudaStream_t stream,
+    cudaEvent_t event) {
+  std::vector<paddle::ps::Region> regions;
+  for (auto& t : var_names) {
+    Variable* var = scope.FindVar(t);
+    LoDTensor* tensor = var->GetMutable<LoDTensor>();
+    int count = tensor->numel();
+    float* g_data = tensor->data<float>();
+
+    Variable* pin_var = scope.FindVar(t + "pin");
+    LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
+    float* pin_g = pin_tensor->mutable_data<float>(tensor->dims(),
+                                                   platform::CUDAPinnedPlace());
+    memory::Copy(platform::CUDAPinnedPlace(), pin_g,
+                 BOOST_GET_CONST(platform::CUDAPlace, place), g_data,
+                 sizeof(float) * count, stream);
+    PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
+    cudaEventSynchronize(event);
+
+    float* g = pin_g;
+    if (scale_datanorm >= 0) {
+      if (t.find(".batch_size@GRAD") != std::string::npos ||
+          t.find(".batch_sum@GRAD") != std::string::npos) {
+        Eigen::Map<Eigen::MatrixXf> mat(g, 1, count);
+        float scale = 1.0 / batch_size;
+        mat *= scale;
+      } else if (t.find(".batch_square_sum@GRAD") != std::string::npos) {
+        VLOG(3) << "epsilon: " << scale_datanorm;
+        for (int i = 0; i < count; ++i) {
+          g[i] = (g[i] - batch_size * scale_datanorm) / batch_size +
+                 batch_size * scale_datanorm;
+        }
+      }
+    }
+    paddle::ps::Region reg(g, count);
+    regions.emplace_back(std::move(reg));
+  }
+  auto status = pslib_ptr_->_worker_ptr->push_dense(regions.data(),
+                                                    regions.size(), table_id);
+  if (push_sparse_status) {
+    push_sparse_status->push_back(std::move(status));
+  }
+}
+#endif
+
 void FleetWrapper::PushDenseVarsAsync(
     const Scope& scope, const uint64_t table_id,
     const std::vector<std::string>& var_names,
@@ -1085,8 +1353,8 @@ void FleetWrapper::ShrinkDenseTable(int table_id, Scope* scope,
   push_status.wait();
   auto status = push_status.get();
   if (status != 0) {
-    PADDLE_THORW(platform::errors::Fatal(
-        "push shrink dense param failed, status is [%d].", status));
+    // PADDLE_THORW(platform::errors::Fatal(
+    //     "push shrink dense param failed, status is [%d].", status));
     sleep(sleep_seconds_before_fail_exit_);
     exit(-1);
   }
paddle/fluid/framework/fleet/fleet_wrapper.h

@@ -28,6 +28,7 @@ limitations under the License. */
 #include <unordered_map>
 #include <vector>
+#include "paddle/fluid/framework/heter_service.h"
 #include "paddle/fluid/framework/program_desc.h"
 #include "paddle/fluid/framework/scope.h"
 #include "paddle/fluid/framework/tensor.h"
@@ -80,6 +81,24 @@ class FleetWrapper {
     pull_local_thread_num_ = thread_num;
   }

+#ifdef PADDLE_WITH_PSLIB
+  void HeterPullSparseVars(int workerid, std::shared_ptr<HeterTask> task,
+                           const uint64_t table_id,
+                           const std::vector<std::string>& var_names,
+                           int fea_dim,
+                           const std::vector<std::string>& var_emb_names);
+
+  void HeterPushSparseVars(
+      std::shared_ptr<HeterTask> task, const uint64_t table_id,
+      const std::vector<std::string>& sparse_key_names,
+      const std::vector<std::string>& sparse_grad_names, const int emb_dim,
+      std::vector<::std::future<int32_t>>* push_sparse_status,
+      const bool use_cvm, const bool dump_slot, const bool no_cvm);
+#endif
+
+  typedef std::function<void(int, int)> HeterCallBackFunc;
+  int RegisterHeterCallback(HeterCallBackFunc handler);
+
   // Pull sparse variables from server in sync mode
   // Param<in>: scope, table_id, var_names, fea_keys, fea_dim, var_emb_names
   // Param<out>: fea_values
@@ -118,15 +137,24 @@ class FleetWrapper {
   void PullDenseVarsAsync(
       const Scope& scope, const uint64_t table_id,
       const std::vector<std::string>& var_names,
-      std::vector<::std::future<int32_t>>* pull_dense_status);
+      std::vector<::std::future<int32_t>>* pull_dense_status, bool in_cpu);

   // push dense parameters(not gradients) to server in sync mode
   void PushDenseParamSync(const Scope& scope, const uint64_t table_id,
                           const std::vector<std::string>& var_names);

   // Push dense variables to server in async mode
   // Param<in>: scope, table_id, var_names, scale_datanorm, batch_size
   // Param<out>: push_sparse_status
+#ifdef PADDLE_WITH_CUDA
+  void PushDenseVarsAsync(
+      const Scope& scope, const uint64_t table_id,
+      const std::vector<std::string>& var_names,
+      std::vector<::std::future<int32_t>>* push_sparse_status,
+      float scale_datanorm, int batch_size,
+      const paddle::platform::Place& place, cudaStream_t stream,
+      cudaEvent_t event);
+#endif
   void PushDenseVarsAsync(
       const Scope& scope, const uint64_t table_id,
       const std::vector<std::string>& var_names,
paddle/fluid/framework/fleet/gloo_wrapper.cc

@@ -54,10 +54,10 @@ void HdfsStore::set(const std::string& key, const std::vector<char>& data) {
       paddle::framework::fs_remove(tmp);
       if (i == retry_times_) {
         VLOG(0) << "fs_open_write failed, retry times reaches limit";
-        PADDLE_THROW(platform::errors::PreconditionNotMet(
-            "fs_open_write failed, retry times reaches"
-            " limit ",
-            retry_times_));
+        // PADDLE_THROW(platform::errors::PreconditionNotMet(
+        //     "fs_open_write failed, retry times reaches"
+        //     " limit ",
+        //     retry_times_));
       }
     } else {
       break;
@@ -143,9 +143,9 @@ void HdfsStore::wait(const std::vector<std::string>& keys,
           break;
         }
       }
-      PADDLE_THROW(platform::errors::ExecutionTimeout(
-          "TIMEOUT self_rank = %d pair_rank = %d", self_rank_,
-          last_check_rank));
+      // PADDLE_THROW(platform::errors::ExecutionTimeout(
+      VLOG(0) << "TIMEOUT self_rank = " << self_rank_
+              << " pair_rank = " << last_check_rank;
     }
     std::this_thread::sleep_for(std::chrono::milliseconds(wait_sleep_ms_));
   }
paddle/fluid/framework/fleet/heter_wrapper.cc (new file, mode 100644)

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include "paddle/fluid/framework/fleet/heter_wrapper.h"
#include <algorithm>
#include <utility>
#include "paddle/fluid/framework/channel.h"
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/io/fs.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/timer.h"

#ifdef PADDLE_WITH_PSLIB
namespace paddle {
namespace framework {

std::shared_ptr<HeterWrapper> HeterWrapper::s_instance_ = NULL;
bool HeterWrapper::is_initialized_ = false;

void HeterWrapper::CreateClient2XpuConnection() {
  brpc::ChannelOptions options;
  options.protocol = "baidu_std";
  options.connection_type = "single";
  options.timeout_ms = 2000000;

  xpu_channels_.resize(xpu_list_.size());
  for (size_t i = 0; i < xpu_list_.size(); ++i) {
    VLOG(3) << "channel init: " << xpu_list_[i];
    xpu_channels_[i].reset(new brpc::Channel());
    if (xpu_channels_[i]->Init(xpu_list_[i].c_str(), "", &options) != 0) {
      VLOG(0) << "server channel init fail";
    }
  }
}

void HeterWrapper::RegisterServiceHandler(int cmd, HeterServiceHandler func) {
  service_.RegisterServiceHandler(cmd, func);
}

void HeterWrapper::SetXpuList(const std::vector<std::string>& xpu_list) {
#ifdef PADDLE_WITH_PSLIB
  VLOG(3) << "Going to set xpu list";
  for (auto& x : xpu_list) {
    xpu_list_.push_back(x);
    VLOG(3) << "set xpu list: " << x << " size: " << xpu_list_.size();
  }
#endif
}

void HeterWrapper::StartXpuService(const std::string& ip, uint32_t port) {
  std::string ip_port = ip + ":" + std::to_string(port);
  VLOG(3) << "xpu server starts at " << ip_port;

  server_.AddService(&service_, brpc::SERVER_DOESNT_OWN_SERVICE);
  brpc::ServerOptions options;
  if (server_.Start(ip_port.c_str(), &options) != 0) {
    VLOG(0) << "xpu server start fail";
  }
}

// void HeterWrapper::SerializeToReq(const std::string& varname,
//                                   Scope* scope, HeterRequest& request) {
//   auto* req_var = request.mutable_vars();
void HeterWrapper::SerializeToReq(const std::string& varname, Scope* scope,
                                  VariableMessage* req_var) {
  Variable* var = scope->FindVar(varname);
  if (var == nullptr) {
    return;
  }
  LoDTensor* tensor = var->GetMutable<LoDTensor>();
  req_var->set_varname(varname);
  req_var->set_type(LOD_TENSOR);
  req_var->set_data_type(static_cast<VariableMessage::Type>(tensor->type()));

  for (auto& dim : framework::vectorize(tensor->dims())) {
    req_var->add_dims(dim);
  }
  const framework::LoD lod = tensor->lod();
  if (lod.size() > 0) {
    req_var->set_lod_level(lod.size());
    for (auto& each : lod) {
      VariableMessage::LodData* lod_inner = req_var->add_lod();
      for (auto& d : each) {
        lod_inner->add_lod_data(d);
      }
    }
  }

  auto* req_data = req_var->mutable_data();
  req_data->clear();
  req_data->resize(tensor->numel() * SizeOfType(tensor->type()));
  char* data_ptr = const_cast<char*>(req_data->data());

  if (platform::is_cpu_place(tensor->place())) {
    memcpy(data_ptr, tensor->data<void>(),
           tensor->numel() * SizeOfType(tensor->type()));
  }
#ifdef PADDLE_WITH_CUDA
  else {
    memory::Copy(platform::CPUPlace(), data_ptr,
                 BOOST_GET_CONST(platform::CUDAPlace, tensor->place()),
                 tensor->data<void>(),
                 tensor->numel() * SizeOfType(tensor->type()), nullptr);
  }
#endif
}

// void HeterWrapper::DeSerializeToTensor(Scope* scope,
//                                        const HeterRequest* request) {
#ifdef PADDLE_WITH_CUDA
void HeterWrapper::DeSerializeToTensor(Scope* scope,
                                       const VariableMessage& req_var,
                                       platform::Place place,
                                       cudaStream_t stream) {
#else
void HeterWrapper::DeSerializeToTensor(Scope* scope,
                                       const VariableMessage& req_var,
                                       platform::Place place) {
#endif
  // const VariableMessage& req_var = request->vars();
  auto* var = scope->FindVar(req_var.varname());
  auto* tensor = var->GetMutable<LoDTensor>();

  std::vector<int> vec_dim;
  for (auto& x : req_var.dims()) {
    vec_dim.push_back(x);
  }
  tensor->Resize(make_ddim(vec_dim));

  LoD lod;
  for (int i = 0; i < req_var.lod_level(); ++i) {
    framework::Vector<size_t> v;
    for (int j = 0; j < req_var.lod(i).lod_data_size(); ++j) {
      v.push_back(req_var.lod(i).lod_data(j));
    }
    lod.push_back(v);
  }
  tensor->set_lod(lod);

  void* tensor_data =
      tensor->mutable_data(place, ToVarType(req_var.data_type()));

#ifdef PADDLE_WITH_CUDA
  memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, place), tensor_data,
               platform::CPUPlace(), req_var.data().data(),
               tensor->numel() * SizeOfType(tensor->type()), stream);
#else
  memcpy(tensor_data, req_var.data().data(),
         tensor->numel() * SizeOfType(tensor->type()));
#endif
}

framework::proto::VarType::Type HeterWrapper::ToVarType(
    VariableMessage::Type type) {
  switch (type) {
    case VariableMessage::FP32:
      return framework::proto::VarType::FP32;  // NOLINT
    case VariableMessage::FP64:
      return framework::proto::VarType::FP64;  // NOLINT
    case VariableMessage::INT32:
      return framework::proto::VarType::INT32;  // NOLINT
    case VariableMessage::INT64:
      return framework::proto::VarType::INT64;  // NOLINT
    case VariableMessage::BOOL:
      return framework::proto::VarType::BOOL;  // NOLINT
    default:
      VLOG(0) << "Not support type " << type;
  }
}

void HeterWrapper::StopXpuService(int num) {
  HeterRequest request;
  HeterResponse response;
  brpc::Controller cntl;
  request.set_cmd(2);
  // for (size_t i = 0; i < xpu_channels_.size(); ++i) {
  HeterService_Stub stub(xpu_channels_[num].get());
  stub.service(&cntl, &request, &response, NULL);
  if (cntl.Failed()) {
    VLOG(0) << "call stop xpu service fail: " << cntl.ErrorText();
  } else {
    VLOG(3) << "call stop xpu service success";
  }
  // }
}

void HeterWrapper::EndPass(Scope* scope, int num) {
  HeterRequest request;
  HeterResponse response;
  brpc::Controller cntl;
  request.set_cmd(1);
  // for (size_t i = 0; i < xpu_channels_.size(); ++i) {
  HeterService_Stub stub(xpu_channels_[num].get());
  stub.service(&cntl, &request, &response, NULL);
  if (cntl.Failed()) {
    VLOG(0) << "call end pass fail: " << cntl.ErrorText();
  } else {
    VLOG(3) << "call end pass success";
    for (int j = 0; j < response.vars_size(); ++j) {
      DeSerializeToTensor(scope, response.vars(j), platform::CPUPlace());
    }
  }
  // }
}

void HeterWrapper::CallRemoteXpu(std::shared_ptr<HeterTask> task,
                                 HeterCpuWorker* worker, int mpi_rank,
                                 std::vector<std::string>& send_vars) {
  HeterRequest request;
  request.set_cmd(0);
  request.set_cur_batch(task->cur_batch_);

  OnHeterRpcDone* done = new OnHeterRpcDone([this, task, worker](void* done) {
    auto* closure = (OnHeterRpcDone*)done;
    if (closure->cntl.Failed()) {
      VLOG(0) << "call xpu fail: " << closure->cntl.ErrorText();
    } else {
      VLOG(3) << "call xpu success";
    }
    // DeSerializeToTensor(task->scope_,
    //                     closure->response.vars(), platform::CPUPlace());
    for (int i = 0; i < closure->response.vars_size(); ++i) {
      DeSerializeToTensor(task->scope_, closure->response.vars(i),
                          platform::CPUPlace());
    }
    worker->Schedule(task->taskid_);
  });

  // std::vector<std::string> varnames = {"click", "12345"};
  // //varnames.push_back(send_var);
  // //if (send_var == "_generated_var_412") {
  //   varnames.push_back("filter_by_instag_0.tmp_0");
  //   varnames.push_back("filter_by_instag_2.tmp_0");
  //   varnames.push_back("filter_by_instag_0.tmp_1");
  //   varnames.push_back("concat_1.tmp_0");
  // }
  for (auto& varname : send_vars) {
    auto* req_var = request.add_vars();
    SerializeToReq(varname, task->scope_, req_var);
  }

  int num = mpi_rank % xpu_channels_.size();
  HeterService_Stub stub(xpu_channels_[num].get());
  // stub.service(&cntl, &request, &response,
  //              brpc::NewCallback(&HeterWrapper::RpcCallBack,
  //                                response, cntl, worker, task));
  stub.service(&done->cntl, &request, &done->response, done);
}

void HeterWrapper::CallRemoteXpuSync(std::shared_ptr<HeterTask> task,
                                     HeterCpuWorker* worker, int mpi_rank,
                                     std::vector<std::string>& send_vars) {
  HeterRequest request;
  HeterResponse response;
  brpc::Controller cntl;
  request.set_cmd(0);
  request.set_cur_batch(task->cur_batch_);

  // std::vector<std::string> varnames = {"concat_1.tmp_0", "click", "12345"};
  for (auto& varname : send_vars) {
    auto* req_var = request.add_vars();
    SerializeToReq(varname, task->scope_, req_var);
  }

  HeterService_Stub stub(xpu_channels_[0].get());
  stub.service(&cntl, &request, &response, NULL);
  if (cntl.Failed()) {
    VLOG(0) << "call xpu fail: " << cntl.ErrorText();
  } else {
    VLOG(3) << "call xpu success";
    for (int i = 0; i < response.vars_size(); ++i) {
      DeSerializeToTensor(task->scope_, response.vars(i), platform::CPUPlace());
    }
  }
}

}  // end namespace framework
}  // end namespace paddle
#endif
paddle/fluid/framework/fleet/heter_wrapper.h (new file, mode 100644)

/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once

#include <atomic>
#include <ctime>
#include <map>
#include <memory>
#include <random>
#include <string>
#include <unordered_map>
#include <vector>

#ifdef PADDLE_WITH_PSLIB
#include "paddle/fluid/framework/heter_service.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/variable_helper.h"
#include "paddle/fluid/platform/macros.h"  // for DISABLE_COPY_AND_ASSIGN

namespace paddle {
namespace framework {

class HeterCpuWorker;

typedef std::function<void(void*)> HeterRpcCallbackFunc;

class OnHeterRpcDone : public google::protobuf::Closure {
 public:
  OnHeterRpcDone(HeterRpcCallbackFunc func) : handler_(func) {}
  virtual ~OnHeterRpcDone() {}
  void Run() {
    std::unique_ptr<OnHeterRpcDone> self_guard(this);
    handler_(this);
  }

  HeterRpcCallbackFunc handler_;
  HeterResponse response;
  brpc::Controller cntl;
};

class HeterWrapper {
 public:
  virtual ~HeterWrapper() {
    server_.Stop(1000);
    server_.Join();
  }

  HeterWrapper() {}

  static void HeterRpcCallBack(HeterResponse* response, brpc::Controller* cntl,
                               HeterCpuWorker* worker,
                               std::shared_ptr<HeterTask> task);

  void CreateClient2XpuConnection();

  void RegisterServiceHandler(int cmd, HeterServiceHandler func);

  void StartXpuService(const std::string& ip, uint32_t port);

  void CallRemoteXpu(std::shared_ptr<HeterTask> task, HeterCpuWorker* worker,
                     int mpi_rank, std::vector<std::string>& send_vars);

  void CallRemoteXpuSync(std::shared_ptr<HeterTask> task,
                         HeterCpuWorker* worker, int mpi_rank,
                         std::vector<std::string>& send_vars);

  void StopXpuService(int num);

  void EndPass(Scope* scope, int num);

  void SerializeToReq(const std::string& varname, Scope* scope,
                      VariableMessage* req_var);

  framework::proto::VarType::Type ToVarType(VariableMessage::Type type);

#ifdef PADDLE_WITH_CUDA
  void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var,
                           platform::Place place, cudaStream_t stream = nullptr);
#else
  void DeSerializeToTensor(Scope* scope, const VariableMessage& req_var,
                           platform::Place place);
#endif
  // HeterWrapper singleton
  static std::shared_ptr<HeterWrapper> GetInstance() {
    if (NULL == s_instance_) {
      s_instance_.reset(new paddle::framework::HeterWrapper());
    }
    return s_instance_;
  }

  std::vector<std::string>& GetXpuList() { return xpu_list_; }

  void SetXpuList(const std::vector<std::string>& xpu_list);

 private:
  static std::shared_ptr<HeterWrapper> s_instance_;

 protected:
  std::vector<std::shared_ptr<brpc::Channel>> xpu_channels_;
  brpc::Server server_;
  HeterXpuService service_;
  static bool is_initialized_;
  DISABLE_COPY_AND_ASSIGN(HeterWrapper);
  std::vector<std::string> xpu_list_;
};

}  // end namespace framework
}  // end namespace paddle
#endif
paddle/fluid/framework/heter_service.h
0 → 100644
浏览文件 @
862dde5e
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <fstream>
#include <memory>
#include <mutex> // NOLINT
#include <string>
#include <thread> // NOLINT
#include <unordered_map> // NOLINT
#include <unordered_set> // NOLINT
#include <vector>
#include "paddle/fluid/framework/heter_service.pb.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/scope.h"
#ifdef PADDLE_WITH_PSLIB
#include "brpc/channel.h"
#include "brpc/controller.h"
#include "brpc/server.h"
namespace
paddle
{
namespace
framework
{
typedef
std
::
function
<
int
(
const
HeterRequest
*
,
HeterResponse
*
)
>
HeterServiceHandler
;
class
DataFeed
;
class
HeterXpuService
:
public
HeterService
{
public:
HeterXpuService
()
{}
virtual
~
HeterXpuService
()
{}
void
service
(
::
google
::
protobuf
::
RpcController
*
controller
,
const
HeterRequest
*
request
,
HeterResponse
*
response
,
::
google
::
protobuf
::
Closure
*
done
)
{
brpc
::
ClosureGuard
done_guard
(
done
);
int
ret
=
0
;
int
cmd
=
request
->
cmd
();
auto
itr
=
handler_map_
.
find
(
cmd
);
if
(
itr
==
handler_map_
.
end
())
{
}
else
{
ret
=
itr
->
second
(
request
,
response
);
}
// response->set_err_code(0);
// response->set_err_msg("");
if
(
ret
!=
0
)
{
// response->set_err_code(-1);
// response->set_err_msg("xpu service error");
}
}
void
RegisterServiceHandler
(
int
cmd
,
HeterServiceHandler
func
)
{
VLOG
(
0
)
<<
"register heter service"
;
handler_map_
[
cmd
]
=
func
;
}
private:
std
::
unordered_map
<
int
,
HeterServiceHandler
>
handler_map_
;
};
enum
HeterTaskState
{
PULL_SPARSE
,
OP_RUN
,
XPU
,
OP_RUN_END
,
PUSH_GRAD
,
DONE
};
class
HeterTask
{
public:
void
Update
()
{
if
(
state_
==
PULL_SPARSE
)
{
state_
=
OP_RUN
;
}
else
if
(
state_
==
OP_RUN
)
{
state_
=
XPU
;
// state_ = PUSH_GRAD;
// state_ = PUSH_GRAD;
}
else
if
(
state_
==
XPU
)
{
state_
=
OP_RUN_END
;
}
else
if
(
state_
==
OP_RUN_END
)
{
state_
=
PUSH_GRAD
;
}
else
if
(
state_
==
PUSH_GRAD
)
{
state_
=
DONE
;
}
}
void
Reset
()
{
total_time
=
0
;
read_time
=
0
;
pack_time
=
0
;
pull_sparse_local_time
=
0
;
op_all_time
=
0
;
xpu_op_time
=
0
;
xpu_wait_time
=
0
;
cpu_op_time
=
0
;
collect_label_time
=
0
;
fill_sparse_time
=
0
;
push_sparse_time
=
0
;
}
void
Show
()
{
std
::
cout
<<
"features size "
<<
features_
.
size
()
<<
std
::
endl
;
for
(
size_t
i
=
0
;
i
<
features_
.
size
();
++
i
)
{
std
::
cout
<<
"features["
<<
i
<<
"] size "
<<
features_
[
i
].
size
()
<<
std
::
endl
;
}
}
void
PackTask
(
Scope
*
scope
,
int
taskid
,
DataFeed
*
reader
,
int
cur_batch
,
const
ProgramDesc
&
program
);
Scope
*
scope_
{
nullptr
};
int
taskid_
;
int
cur_batch_
;
HeterTaskState
state_
;
// cache
std
::
map
<
uint64_t
,
std
::
vector
<
uint64_t
>>
features_
;
std
::
map
<
uint64_t
,
std
::
vector
<
float
>>
feature_labels_
;
std
::
map
<
uint64_t
,
std
::
vector
<
std
::
vector
<
float
>>>
feature_values_
;
std
::
map
<
uint64_t
,
std
::
vector
<
std
::
vector
<
float
>>>
feature_grads_
;
std
::
map
<
uint64_t
,
std
::
vector
<
uint64_t
>>
sparse_push_keys_
;
double
total_time
{
0
};
double
read_time
{
0
};
double
pack_time
{
0
};
double
pull_sparse_local_time
{
0
};
double
op_all_time
{
0
};
double
xpu_op_time
{
0
};
double
xpu_wait_time
{
0
};
double
cpu_op_time
{
0
};
double
collect_label_time
{
0
};
double
fill_sparse_time
{
0
};
double
push_sparse_time
{
0
};
};
template
<
class
T
>
class
HeterObjectPool
{
public:
HeterObjectPool
()
{}
virtual
~
HeterObjectPool
(){};
std
::
shared_ptr
<
T
>
Get
()
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
if
(
pool_
.
empty
())
{
num_
+=
1
;
#ifdef PADDLE_WITH_CUDA
VLOG
(
0
)
<<
"pool construct size: "
<<
num_
;
#endif
return
std
::
make_shared
<
T
>
();
}
else
{
auto
ret
=
pool_
.
back
();
pool_
.
pop_back
();
return
ret
;
}
}
void
Push
(
std
::
shared_ptr
<
T
>
data
)
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
pool_
.
push_back
(
std
::
move
(
data
));
}
int
Size
()
{
std
::
lock_guard
<
std
::
mutex
>
lock
(
mutex_
);
return
pool_
.
size
();
}
std
::
shared_ptr
<
T
>&
GetElement
(
int
i
)
{
return
pool_
[
i
];
}
private:
std
::
vector
<
std
::
shared_ptr
<
T
>>
pool_
;
std
::
mutex
mutex_
;
int
num_
{
0
};
};
struct
BthreadMutextGuard
{
BthreadMutextGuard
(
bthread_mutex_t
*
rho
)
{
mutex_
=
rho
;
bthread_mutex_lock
(
mutex_
);
}
~
BthreadMutextGuard
()
{
bthread_mutex_unlock
(
mutex_
);
}
bthread_mutex_t
*
mutex_
;
};
template
<
class
T
>
class
BtObjectPool
{
public:
BtObjectPool
()
{
bthread_mutex_init
(
&
mutex_
,
NULL
);
bthread_cond_init
(
&
cond_
,
NULL
);
}
virtual
~
BtObjectPool
()
{
bthread_cond_destroy
(
&
cond_
);
bthread_mutex_destroy
(
&
mutex_
);
};
std
::
shared_ptr
<
T
>
Get
()
{
BthreadMutextGuard
guard
(
&
mutex_
);
while
(
pool_
.
empty
())
{
bthread_cond_wait
(
&
cond_
,
&
mutex_
);
}
auto
ret
=
pool_
.
back
();
pool_
.
pop_back
();
return
ret
;
}
void
Push
(
std
::
shared_ptr
<
T
>
data
)
{
BthreadMutextGuard
guard
(
&
mutex_
);
pool_
.
push_back
(
std
::
move
(
data
));
bthread_cond_signal
(
&
cond_
);
}
int
Size
()
{
return
pool_
.
size
();
}
std
::
shared_ptr
<
T
>&
GetElement
(
int
i
)
{
return
pool_
[
i
];
}
private:
std
::
vector
<
std
::
shared_ptr
<
T
>>
pool_
;
bthread_mutex_t
mutex_
;
bthread_cond_t
cond_
;
int
num_
{
0
};
};
template
<
class
K
,
class
T
>
struct
HeterNode
{
K
key
;
T
value
;
HeterNode
*
prev
;
HeterNode
*
next
;
};
template <class K, class T>
class HeterList {
 public:
  HeterList() : head_(new HeterNode<K, T>), tail_(new HeterNode<K, T>) {
    head_->prev = NULL;
    head_->next = tail_;
    tail_->prev = head_;
    tail_->next = NULL;
    size = 0;
    cap_ = 1e9;
  }

  ~HeterList() {
    delete head_;
    delete tail_;
  }

  void SetCap(int num) { cap_ = num; }

  bool TryPut(K& key, T& value) {
    std::unique_lock<std::mutex> lock(mutex_);
    cond_.wait(lock, [this] { return size < cap_; });
    if (task_map_.find(key) != task_map_.end()) {
      // std::cout << "try put key=" << key << " false" << std::endl;
      task_map_.erase(key);
      return false;
    } else {
      HeterNode<K, T>* node = new HeterNode<K, T>;
      node->key = key;
      node->value = value;
      map_[node->key] = node;
      attach(node);
      // std::cout << "try put key=" << key << " true" << std::endl;
      return true;
    }
  }

  bool Put(K& key, T& value) {
    std::unique_lock<std::mutex> lock(mutex_);
    cond_.wait(lock, [this] { return size < cap_; });
    HeterNode<K, T>* node = new HeterNode<K, T>;
    // std::cout << "put key=" << key << " true" << std::endl;
    node->key = key;
    node->value = value;
    map_[node->key] = node;
    attach(node);
    return true;
  }

  T TryGet(const K& key) {
    std::lock_guard<std::mutex> lock(mutex_);
    auto iter = map_.find(key);
    if (iter != map_.end()) {
      // std::cout << "try get key=" << key << " true" << std::endl;
      HeterNode<K, T>* node = iter->second;
      detach(node);
      cond_.notify_one();
      T ret = std::move(node->value);
      map_.erase(key);
      delete node;
      return ret;
    }
    task_map_.insert(key);
    // std::cout << "try get key=" << key << " false" << std::endl;
    return nullptr;
  }

  T Get(const K& key) {
    std::lock_guard<std::mutex> lock(mutex_);
    auto iter = map_.find(key);
    if (iter != map_.end()) {
      // std::cout << "get key=" << key << " true" << std::endl;
      HeterNode<K, T>* node = iter->second;
      detach(node);
      cond_.notify_one();
      T ret = std::move(node->value);
      map_.erase(key);
      delete node;
      return ret;
    }
    // std::cout << "get key=" << key << " false" << std::endl;
    return nullptr;
  }

  T Get() {
    std::lock_guard<std::mutex> lock(mutex_);
    HeterNode<K, T>* node = head_->next;
    if (node == tail_) {
      // std::cout << "get2 false" << std::endl;
      return nullptr;
    } else {
      detach(node);
      cond_.notify_one();
      T ret = std::move(node->value);
      map_.erase(node->key);
      // std::cout << "get2 key=" << node->key << " true" << std::endl;
      delete node;
      return ret;
    }
  }

  bool Empty() {
    std::lock_guard<std::mutex> lock(mutex_);
    return head_->next == tail_;
  }

  int Size() {
    std::lock_guard<std::mutex> lock(mutex_);
    return size;
  }

 private:
  void detach(HeterNode<K, T>* node) {
    node->prev->next = node->next;
    node->next->prev = node->prev;
    size--;
  }

  void attach(HeterNode<K, T>* node) {
    node->prev = head_;
    node->next = head_->next;
    head_->next->prev = node;
    head_->next = node;
    size++;
  }

 private:
  HeterNode<K, T>* head_;
  HeterNode<K, T>* tail_;
  std::unordered_map<K, HeterNode<K, T>*> map_;
  std::unordered_set<K> task_map_;
  std::mutex mutex_;
  std::condition_variable cond_;
  int cap_;
  int size;
};
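HeterList acts as a capacity-bounded, keyed task queue: Put()/TryPut() block while size >= cap_, TryGet() either removes the node for a key or records the key in task_map_ so a later TryPut() of that key is rejected, and the parameterless Get() pops the most recently attached node. A usage sketch (the int/shared_ptr instantiation is an illustrative assumption; the value type must be assignable from nullptr, which is why the worker stores std::shared_ptr<HeterTask> in it):

// Sketch only: illustrates the keyed blocking list above; not part of the patch.
void HeterListRoundTrip(HeterList<int, std::shared_ptr<int>>& list) {
  int taskid = 7;
  auto payload = std::make_shared<int>(123);
  list.SetCap(4);                  // Put/TryPut block once 4 nodes are queued
  list.Put(taskid, payload);       // always inserts (after waiting for room)
  auto hit = list.TryGet(taskid);  // returns the value, or nullptr and
                                   // remembers taskid in task_map_
  auto newest = list.Get();        // pops the most recently attached node,
                                   // nullptr if the list is empty
  (void)hit;
  (void)newest;
}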
}  // namespace framework
}  // namespace paddle
#endif
paddle/fluid/framework/heter_service.proto
0 → 100644
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
syntax = "proto2";
package paddle.framework;
option cc_generic_services = true;

// It can be: LoDTensor, SelectedRows or NCCL_ID
enum VarType {
  LOD_TENSOR = 0;
  SELECTED_ROWS = 1;
  NCCL_ID = 2;
}

// VariableMessage is serialized paddle variable message.
// NOTICE(gongwb): don't modify this proto if you are not
// familiar with how we serialize in sendrecvop_utils.h
// and deserialize it in variable_response.h.
message VariableMessage {
  enum Type {
    // Pod Types
    BOOL = 0;
    INT16 = 1;
    INT32 = 2;
    INT64 = 3;
    FP16 = 4;
    FP32 = 5;
    FP64 = 6;
  }

  message LodData { repeated int64 lod_data = 1; }

  optional string varname = 1;
  // TODO(Yancey1989): reference framework::proto::VarDesc::VarType
  optional VarType type = 2;
  // bool persistable is not needed for sending.
  // tensor info:
  optional Type data_type = 3;
  repeated int64 dims = 4;

  // lod details:
  optional int64 lod_level = 5;
  repeated LodData lod = 6;
  // selected_rows height, aka. original dim0
  optional int64 slr_height = 7;
  // tensor data
  optional bytes data = 8;
}

message HeterRequest {
  required int32 cmd = 1;
  optional int32 cur_batch = 2;
  repeated VariableMessage vars = 3;
};

message HeterResponse {
  // optional VariableMessage vars = 1;
  repeated VariableMessage vars = 1;
};

service HeterService { rpc service(HeterRequest) returns (HeterResponse); };
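As a rough illustration of how a caller would populate these messages with the C++ classes that protoc generates for this proto2 file (the generated header path and every field value below are assumptions made for the example):

// Sketch only: uses standard proto2-generated setters; values are made up.
#include "paddle/fluid/framework/heter_service.pb.h"

paddle::framework::HeterRequest MakeDemoRequest() {
  paddle::framework::HeterRequest req;
  req.set_cmd(0);         // e.g. the cmd id a service handler is registered under
  req.set_cur_batch(32);
  auto* var = req.add_vars();
  var->set_varname("click");
  var->set_type(paddle::framework::LOD_TENSOR);
  var->set_data_type(paddle::framework::VariableMessage::INT64);
  var->add_dims(32);
  var->add_dims(1);
  return req;
}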
paddle/fluid/framework/hetercpu_worker.cc
0 → 100644
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/framework/fleet/heter_wrapper.h"
#include "paddle/fluid/platform/cpu_helper.h"
#include "paddle/fluid/string/string_helper.h"
#ifdef PADDLE_WITH_PSLIB
#if defined _WIN32 || defined __APPLE__
#else
#define _LINUX
#endif
namespace paddle {
namespace framework {

void HeterTask::PackTask(Scope* thread_scope, int taskid, DataFeed* reader,
                         int cur_batch, const ProgramDesc& program) {
  // total_time = 0;
  // read_time = 0;
  // pack_time = 0;
  // pull_sparse_local_time = 0;
  taskid_ = taskid;
  auto& block = program.Block(0);
  if (!scope_) {
    scope_ = &(thread_scope->NewScope());
    for (auto& var : block.AllVars()) {
      if (!var->Persistable()) {
        auto* ptr = scope_->Var(var->Name());
        InitializeVariable(ptr, var->GetType());
      }
    }
  }
  state_ = PULL_SPARSE;
  cur_batch_ = cur_batch;
  auto& use_slots = reader->GetUseSlotAlias();
  for (size_t i = 0; i < use_slots.size(); ++i) {
    Variable* thread_var = thread_scope->FindVar(use_slots[i]);
    LoDTensor* thread_tensor = thread_var->GetMutable<LoDTensor>();
    Variable* task_var = scope_->FindVar(use_slots[i]);
    LoDTensor* task_tensor = task_var->GetMutable<LoDTensor>();
    TensorCopy(*thread_tensor, platform::CPUPlace(), task_tensor);
    auto& tensor_lod = thread_tensor->lod()[0];
    LoD thread_lod{tensor_lod};
    task_tensor->set_lod(thread_lod);
  }
}

void HeterCpuWorker::GetXpuOpIndex() {
  xpu_begin_op_index_ = trainer_desc_.xpu_start_idx();
  xpu_end_op_index_ = trainer_desc_.xpu_end_idx();
  VLOG(0) << "xpu begin: " << xpu_begin_op_index_
          << " xpu end: " << xpu_end_op_index_;
  // CHECK(xpu_begin_op_index_ == trainer_desc_.xpu_start_idx());
  // CHECK(xpu_end_op_index_ == trainer_desc_.xpu_end_idx());
  // CHECK(trainer_desc_.op_run_start_idx() == 0);
  // CHECK(trainer_desc_.op_run_end_idx() == xpu_begin_op_index_ - 1);
  // CHECK(trainer_desc_.op_run_end_start_idx() == xpu_end_op_index_ + 1);
  // CHECK(trainer_desc_.op_run_end_end_idx() == ops_.size() - 1);
}

void HeterCpuWorker::Schedule(int taskid) {
  VLOG(3) << "schedule " << taskid;
  auto task = wait_queue_.TryGet(taskid);
  if (task) {
    run_queue_.Put(task->taskid_, task);
  }
}

void HeterCpuWorker::JumpContext(std::shared_ptr<HeterTask> task) {
  VLOG(3) << "jump context " << task->taskid_;
  if (!(wait_queue_.TryPut(task->taskid_, task))) {
    run_queue_.Put(task->taskid_, task);
  }
}

void HeterCpuWorker::Initialize(const TrainerDesc& desc) {
  param_ = desc.downpour_param();
  mpi_rank_ = desc.mpi_rank();
  trainer_desc_ = desc;
  for (int i = 0; i < param_.sparse_table_size(); ++i) {
    uint64_t table_id =
        static_cast<uint64_t>(param_.sparse_table(i).table_id());
    TableParameter table = param_.sparse_table(i);
    sparse_key_names_[table_id].resize(table.sparse_key_name_size());
    for (int j = 0; j < table.sparse_key_name_size(); ++j) {
      sparse_key_names_[table_id][j] = table.sparse_key_name(j);
    }
    sparse_value_names_[table_id].resize(table.sparse_value_name_size());
    for (int j = 0; j < table.sparse_value_name_size(); ++j) {
      sparse_value_names_[table_id][j] = table.sparse_value_name(j);
    }
    sparse_grad_names_[table_id].resize(table.sparse_grad_name_size());
    for (int j = 0; j < table.sparse_grad_name_size(); ++j) {
      sparse_grad_names_[table_id][j] = table.sparse_grad_name(j);
    }
    label_var_name_[table_id] = table.label_var_name();
    sparse_push_keys_[table_id] = std::vector<uint64_t>();
  }
  for (int i = 0; i < param_.dense_table_size(); ++i) {
    uint64_t table_id =
        static_cast<uint64_t>(param_.dense_table(i).table_id());
    auto table = param_.dense_table(i);
    dense_value_names_[table_id].resize(table.dense_value_name_size());
    for (int j = 0; j < table.dense_value_name_size(); ++j) {
      dense_value_names_[table_id][j] = table.dense_value_name(j);
    }
    dense_grad_names_[table_id].resize(table.dense_grad_name_size());
    for (int j = 0; j < table.dense_grad_name_size(); ++j) {
      dense_grad_names_[table_id][j] = table.dense_grad_name(j);
    }
  }
  skip_ops_.resize(param_.skip_ops_size());
  for (int i = 0; i < param_.skip_ops_size(); ++i) {
    skip_ops_[i] = param_.skip_ops(i);
  }
  for (int i = 0; i < param_.stat_var_names_size(); ++i) {
    stat_var_name_map_[param_.stat_var_names(i)] = 1;
  }
  need_to_push_sparse_ = param_.push_sparse();
  need_to_push_dense_ = param_.push_dense();
  fleet_ptr_ = FleetWrapper::GetInstance();
  heter_ptr_ = HeterWrapper::GetInstance();
  fetch_config_ = desc.fetch_config();
  use_cvm_ = desc.use_cvm();
  // for sparse value accessor, embedding only
  no_cvm_ = desc.no_cvm();
  scale_datanorm_ = desc.scale_datanorm();
  dump_slot_ = desc.dump_slot();
  dump_fields_.resize(desc.dump_fields_size());
  for (int i = 0; i < desc.dump_fields_size(); ++i) {
    dump_fields_[i] = desc.dump_fields(i);
  }
  adjust_ins_weight_config_ = desc.adjust_ins_weight_config();
  need_dump_param_ = false;
  dump_param_.resize(desc.dump_param_size());
  for (int i = 0; i < desc.dump_param_size(); ++i) {
    dump_param_[i] = desc.dump_param(i);
  }
  if (desc.dump_param_size() != 0) {
    need_dump_param_ = true;
  }
  for (int i = 0; i < desc.check_nan_var_names_size(); ++i) {
    check_nan_var_names_.push_back(desc.check_nan_var_names(i));
  }
  copy_table_config_ = desc.copy_table_config();
  for (int i = 0; i < copy_table_config_.src_sparse_tables_size(); ++i) {
    uint64_t src_table = copy_table_config_.src_sparse_tables(i);
    uint64_t dest_table = copy_table_config_.dest_sparse_tables(i);
    VLOG(3) << "copy_sparse_tables_ push back " << src_table << "->"
            << dest_table;
    copy_sparse_tables_.push_back(std::make_pair(src_table, dest_table));
  }
  for (int i = 0; i < copy_table_config_.src_dense_tables_size(); ++i) {
    uint64_t src_table = copy_table_config_.src_dense_tables(i);
    uint64_t dest_table = copy_table_config_.dest_dense_tables(i);
    VLOG(3) << "copy_dense_tables_ push back " << src_table << "->"
            << dest_table;
    copy_dense_tables_.push_back(std::make_pair(src_table, dest_table));
  }
  for (auto& m : copy_table_config_.table_denpendency_map()) {
    if (sparse_key_names_.find(m.key()) != sparse_key_names_.end()) {
      // currently only support one dependency
      for (auto& value : m.values()) {
        table_dependency_[m.key()] = value;
      }
    }
  }
}

void HeterCpuWorker::SetChannelWriter(ChannelObject<std::string>* queue) {
  writer_.Reset(queue);
}

void HeterCpuWorker::SetNeedDump(bool need_dump_field) {
  need_dump_field_ = need_dump_field;
}
// template <typename T>
// std::string PrintLodTensorType(LoDTensor* tensor,
// int64_t start, int64_t end) {
// auto count = tensor->numel();
// if (start < 0 || end > count) {
// VLOG(3) << "access violation";
// return "access violation";
// }
// std::ostringstream os;
// for (int64_t i = start; i < end; i++) {
// os << ":" << tensor->data<T>()[i];
// }
// return os.str();
// }
//
// std::string PrintLodTensorIntType(LoDTensor* tensor, int64_t start,
// int64_t end) {
// auto count = tensor->numel();
// if (start < 0 || end > count) {
// VLOG(3) << "access violation";
// return "access violation";
// }
// std::ostringstream os;
// for (int64_t i = start; i < end; i++) {
// os << ":" << static_cast<uint64_t>(tensor->data<int64_t>()[i]);
// }
// return os.str();
// }
//
// std::string PrintLodTensor(LoDTensor* tensor, int64_t start, int64_t end) {
// std::string out_val;
// if (tensor->type() == proto::VarType::FP32) {
// out_val = PrintLodTensorType<float>(tensor, start, end);
// } else if (tensor->type() == proto::VarType::INT64) {
// out_val = PrintLodTensorIntType(tensor, start, end);
// } else if (tensor->type() == proto::VarType::FP64) {
// out_val = PrintLodTensorType<double>(tensor, start, end);
// } else {
// out_val = "unsupported type";
// }
// return out_val;
// }
//
// std::pair<int64_t, int64_t> GetTensorBound(LoDTensor* tensor, int index) {
// auto& dims = tensor->dims();
// if (tensor->lod().size() != 0) {
// auto& lod = tensor->lod()[0];
// return {lod[index] * dims[1], lod[index + 1] * dims[1]};
// } else {
// return {index * dims[1], (index + 1) * dims[1]};
// }
// }
//
// bool CheckValidOutput(LoDTensor* tensor, size_t batch_size) {
// auto& dims = tensor->dims();
// if (dims.size() != 2) return false;
// if (tensor->lod().size() != 0) {
// auto& lod = tensor->lod()[0];
// if (lod.size() != batch_size + 1) {
// return false;
// }
// } else {
// if (dims[0] != static_cast<int>(batch_size)) {
// return false;
// }
// }
// return true;
// }
void HeterCpuWorker::DumpParam() {
// std::string os;
// for (auto& param : dump_param_) {
// os.clear();
// os = param;
// Variable* var = thread_scope_->FindVar(param);
// if (var == nullptr) {
// continue;
// }
// LoDTensor* tensor = var->GetMutable<LoDTensor>();
// int64_t len = tensor->numel();
// os += PrintLodTensor(tensor, 0, len);
// writer_ << os;
// }
}
void HeterCpuWorker::CollectLabelInfo(std::shared_ptr<HeterTask> task,
                                      size_t table_idx) {
  if (no_cvm_) {
    return;
  }
  uint64_t table_id = static_cast<uint64_t>(
      param_.program_config(0).pull_sparse_table_id(table_idx));

  TableParameter table;
  for (auto i : param_.sparse_table()) {
    if (i.table_id() == table_id) {
      table = i;
      break;
    }
  }
  auto& feature = (task->features_)[table_id];
  auto& feature_label = (task->feature_labels_)[table_id];
  Scope* scope = task->scope_;
  feature_label.resize(feature.size());
  Variable* var = scope->FindVar(label_var_name_[table_id]);
  LoDTensor* tensor = var->GetMutable<LoDTensor>();
  int64_t* label_ptr = tensor->data<int64_t>();

  size_t global_index = 0;
  for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) {
    VLOG(3) << "sparse_key_names_[" << i
            << "]: " << sparse_key_names_[table_id][i];
    Variable* fea_var = scope->FindVar(sparse_key_names_[table_id][i]);
    if (fea_var == nullptr) {
      continue;
    }
    LoDTensor* tensor = fea_var->GetMutable<LoDTensor>();
    CHECK(tensor != nullptr) << "tensor of var "
                             << sparse_key_names_[table_id][i] << " is null";

    // skip slots which do not have embedding
    Variable* emb_var = scope->FindVar(sparse_value_names_[table_id][i]);
    if (emb_var == nullptr) {
      continue;
    }
    int64_t* ids = tensor->data<int64_t>();
    size_t fea_idx = 0;
    // tensor->lod()[0].size() == batch_size + 1
    for (auto lod_idx = 1u; lod_idx < tensor->lod()[0].size(); ++lod_idx) {
      for (; fea_idx < tensor->lod()[0][lod_idx]; ++fea_idx) {
        // should be skipped feasign defined in protobuf
        if (ids[fea_idx] == 0u) {
          continue;
        }
        feature_label[global_index++] =
            static_cast<float>(label_ptr[lod_idx - 1]);
      }
    }
  }
  CHECK(global_index == feature.size())
      << "expect fea info size:" << feature.size() << " real:" << global_index;
}

void HeterCpuWorker::FillSparseValue(std::shared_ptr<HeterTask> task,
                                     size_t table_idx) {
  uint64_t table_id = static_cast<uint64_t>(
      param_.program_config(0).pull_sparse_table_id(table_idx));

  TableParameter table;
  for (auto i : param_.sparse_table()) {
    if (i.table_id() == table_id) {
      table = i;
      break;
    }
  }

  auto& fea_value = (task->feature_values_)[table_id];
  Scope* scope = task->scope_;
  auto fea_idx = 0u;

  std::vector<float> init_value(table.fea_dim());
  for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) {
    std::string slot_name = sparse_key_names_[table_id][i];
    std::string emb_slot_name = sparse_value_names_[table_id][i];
    Variable* var = scope->FindVar(slot_name);
    if (var == nullptr) {
      continue;
    }
    LoDTensor* tensor = var->GetMutable<LoDTensor>();
    CHECK(tensor != nullptr) << "tensor of var " << slot_name << " is null";
    int64_t* ids = tensor->data<int64_t>();
    int len = tensor->numel();
    Variable* var_emb = scope->FindVar(emb_slot_name);
    if (var_emb == nullptr) {
      continue;
    }
    LoDTensor* tensor_emb = var_emb->GetMutable<LoDTensor>();
    float* ptr =
        tensor_emb->mutable_data<float>({len, table.emb_dim()}, place_);
    // memset(ptr, 0, sizeof(float) * len * table.emb_dim());
    auto& tensor_lod = tensor->lod()[0];
    LoD data_lod{tensor_lod};
    tensor_emb->set_lod(data_lod);

    bool is_nid = (adjust_ins_weight_config_.need_adjust() &&
                   adjust_ins_weight_config_.nid_slot() == emb_slot_name);
    if (is_nid) {
      nid_show_.clear();
    }
    int nid_ins_index = 0;

    for (int index = 0; index < len; ++index) {
      if (use_cvm_ || no_cvm_) {
        if (ids[index] == 0u) {
          memcpy(ptr + table.emb_dim() * index, init_value.data(),
                 sizeof(float) * table.emb_dim());
          if (is_nid) {
            nid_show_.push_back(-1);
            ++nid_ins_index;
          }
          continue;
        }
        memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data(),
               sizeof(float) * table.emb_dim());
        if (is_nid &&
            static_cast<size_t>(index) == tensor->lod()[0][nid_ins_index]) {
          nid_show_.push_back(fea_value[fea_idx][0]);
          ++nid_ins_index;
        }
        fea_idx++;
      } else {
        if (ids[index] == 0u) {
          memcpy(ptr + table.emb_dim() * index, init_value.data() + 2,
                 sizeof(float) * table.emb_dim());
          if (is_nid) {
            nid_show_.push_back(-1);
            ++nid_ins_index;
          }
          continue;
        }
        memcpy(ptr + table.emb_dim() * index, fea_value[fea_idx].data() + 2,
               sizeof(float) * table.emb_dim());
        if (is_nid &&
            static_cast<size_t>(index) == tensor->lod()[0][nid_ins_index]) {
          nid_show_.push_back(fea_value[fea_idx][0]);
          ++nid_ins_index;
        }
        fea_idx++;
      }
    }
  }
}

void HeterCpuWorker::AdjustInsWeight(std::shared_ptr<HeterTask> task) {
#ifdef _LINUX
  // check var and tensor not null
  Scope* scope = task->scope_;
  if (!adjust_ins_weight_config_.need_adjust()) {
    VLOG(0) << "need_adjust=false, skip adjust ins weight";
    return;
  }
  Variable* nid_var = scope->FindVar(adjust_ins_weight_config_.nid_slot());
  if (nid_var == nullptr) {
    VLOG(0) << "nid slot var " << adjust_ins_weight_config_.nid_slot()
            << " is nullptr, skip adjust ins weight";
    return;
  }
  LoDTensor* nid_tensor = nid_var->GetMutable<LoDTensor>();
  if (nid_tensor == nullptr) {
    VLOG(0) << "tensor of nid slot var " << adjust_ins_weight_config_.nid_slot()
            << " is nullptr, skip adjust ins weight";
    return;
  }
  Variable* ins_weight_var =
      scope->FindVar(adjust_ins_weight_config_.ins_weight_slot());
  if (ins_weight_var == nullptr) {
    VLOG(0) << "ins weight var " << adjust_ins_weight_config_.ins_weight_slot()
            << " is nullptr, skip adjust ins weight";
    return;
  }
  LoDTensor* ins_weight_tensor = ins_weight_var->GetMutable<LoDTensor>();
  if (ins_weight_tensor == nullptr) {
    VLOG(0) << "tensor of ins weight tensor "
            << adjust_ins_weight_config_.ins_weight_slot()
            << " is nullptr, skip adjust ins weight";
    return;
  }

  float* ins_weights = ins_weight_tensor->data<float>();
  size_t len = ins_weight_tensor->numel();
  // len = batch size
  // here we assume nid_show slot only has one feasign in each instance
  CHECK(len == nid_show_.size()) << "ins_weight size should be equal to "
                                 << "nid_show size, " << len << " vs "
                                 << nid_show_.size();
  float nid_adjw_threshold = adjust_ins_weight_config_.nid_adjw_threshold();
  float nid_adjw_ratio = adjust_ins_weight_config_.nid_adjw_ratio();
  int64_t nid_adjw_num = 0;
  double nid_adjw_weight = 0.0;
  size_t ins_index = 0;
  for (size_t i = 0; i < len; ++i) {
    float nid_show = nid_show_[i];
    VLOG(3) << "nid_show " << nid_show;
    if (nid_show < 0) {
      VLOG(3) << "nid_show < 0, continue";
      continue;
    }
    float ins_weight = 1.0;
    if (nid_show >= 0 && nid_show < nid_adjw_threshold) {
      ins_weight = log(M_E +
                       (nid_adjw_threshold - nid_show) / nid_adjw_threshold *
                           nid_adjw_ratio);
      // count nid adjw insnum and weight
      ++nid_adjw_num;
      nid_adjw_weight += ins_weight;
      // choose large ins weight
      VLOG(3) << "ins weight new " << ins_weight << ", ins weight origin "
              << ins_weights[ins_index];
      if (ins_weight > ins_weights[ins_index]) {
        VLOG(3) << "ins " << ins_index << " weight changes to " << ins_weight;
        ins_weights[ins_index] = ins_weight;
      }
      ++ins_index;
    }
  }
  VLOG(3) << "nid adjw info: total_adjw_num: " << nid_adjw_num
          << ", avg_adjw_weight: " << nid_adjw_weight;
#endif
}

void HeterCpuWorker::CopySparseTable() {
  for (size_t i = 0; i < copy_sparse_tables_.size(); ++i) {
    int64_t src_table = copy_sparse_tables_[i].first;
    int64_t dest_table = copy_sparse_tables_[i].second;
    int32_t feanum = 0;
    if (src_table == dest_table) {
      continue;
    } else if (!copy_table_config_.sparse_copy_by_feasign()) {
      if (feasign_set_.find(src_table) == feasign_set_.end()) {
        continue;
      } else if (feasign_set_[src_table].size() == 0) {
        continue;
      }
      feanum = fleet_ptr_->CopyTable(src_table, dest_table);
    } else {
      std::vector<uint64_t> fea_vec(feasign_set_[src_table].begin(),
                                    feasign_set_[src_table].end());
      feanum = fleet_ptr_->CopyTableByFeasign(src_table, dest_table, fea_vec);
      fea_vec.clear();
      std::vector<uint64_t>().swap(fea_vec);
    }
    VLOG(3) << "copy feasign from table " << src_table << " to table "
            << dest_table << ", feasign num=" << feanum;
    feasign_set_[src_table].clear();
    std::unordered_set<uint64_t>().swap(feasign_set_[src_table]);
  }
  feasign_set_.clear();
}

void HeterCpuWorker::CopyDenseTable() {
  if (thread_id_ != 0) {
    return;
  }
  thread_local std::vector<std::future<int32_t>> pull_dense_status;
  for (size_t i = 0; i < copy_dense_tables_.size(); ++i) {
    uint64_t src_table = copy_dense_tables_[i].first;
    uint64_t dest_table = copy_dense_tables_[i].second;
    if (src_table == dest_table) {
      continue;
    }
    int32_t dim = fleet_ptr_->CopyTable(src_table, dest_table);
    VLOG(3) << "copy param from table " << src_table << " to table "
            << dest_table << ", dim=" << dim;
    if (copy_table_config_.dense_pull_after_copy()) {
      VLOG(3) << "dense pull after copy, table=" << dest_table;
      pull_dense_status.resize(0);
      // fleet_ptr_->PullDenseVarsAsync(*root_scope_, dest_table,
      //                                dense_value_names_[dest_table],
      //                                &pull_dense_status);
      for (auto& t : pull_dense_status) {
        t.wait();
        auto status = t.get();
        if (status != 0) {
          LOG(WARNING) << "pull dense after copy table failed,"
                       << " table=" << dest_table;
        }
      }
    }
  }
}

void HeterCpuWorker::CopyDenseVars() {
  if (thread_id_ != 0) {
    return;
  }
  for (int i = 0; i < copy_table_config_.src_var_list_size(); ++i) {
    auto& src_var_name = copy_table_config_.src_var_list(i);
    auto& dest_var_name = copy_table_config_.dest_var_list(i);
    if (src_var_name == dest_var_name) {
      continue;
    }
    VLOG(3) << "copy dense var from " << src_var_name << " to "
            << dest_var_name;
    Variable* src_var = thread_scope_->FindVar(src_var_name);
    CHECK(src_var != nullptr) << src_var_name << " not found";  // NOLINT
    LoDTensor* src_tensor = src_var->GetMutable<LoDTensor>();
    CHECK(src_tensor != nullptr) << src_var_name << " tensor is null";  // NOLINT
    float* src_data = src_tensor->data<float>();

    Variable* dest_var = thread_scope_->FindVar(dest_var_name);
    CHECK(dest_var != nullptr) << dest_var_name << " not found";  // NOLINT
    LoDTensor* dest_tensor = dest_var->GetMutable<LoDTensor>();
    CHECK(dest_tensor != nullptr) << dest_var_name << " tensor is null";  // NOLINT
    float* dest_data = dest_tensor->data<float>();

    CHECK(src_tensor->numel() == dest_tensor->numel())
        << "tensor numel not equal," << src_tensor->numel() << " vs "
        << dest_tensor->numel();
    for (int i = 0; i < src_tensor->numel(); i++) {
      dest_data[i] = src_data[i];
    }
  }
}

void HeterCpuWorker::TrainFilesWithProfiler() {
  VLOG(3) << "Begin to train files with profiler";
  platform::SetNumThreads(1);
  device_reader_->Start();

  std::vector<double> op_total_time;
  std::vector<std::string> op_name;
  for (auto& op : ops_) {
    bool need_skip = false;
    for (auto t = 0u; t < skip_ops_.size(); ++t) {
      if (op->Type().find(skip_ops_[t]) != std::string::npos) {
        need_skip = true;
        break;
      }
    }
    if (!need_skip) {
      op_name.push_back(op->Type());
    }
  }
  VLOG(3) << "op name size: " << op_name.size();
  op_total_time.resize(op_name.size());
  for (size_t i = 0; i < op_total_time.size(); ++i) {
    op_total_time[i] = 0.0;
  }

  platform::Timer timeline;
  double total_time = 0.0;
  double read_time = 0.0;
  double pack_time = 0.0;
  double pull_sparse_local_time = 0.0;
  double op_all_time = 0;
  double xpu_op_time = 0;
  double xpu_wait_time = 0;
  double cpu_op_time = 0;
  double collect_label_time = 0;
  double fill_sparse_time = 0;
  double push_sparse_time = 0;

  int batch_cnt = 0;
  int done_cnt = 0;
  int cur_batch;
  uint64_t total_inst = 0;
  wait_queue_.SetCap(1);
  while (1) {
    std::shared_ptr<HeterTask> task;
    task = run_queue_.Get();
    if (!task) {
      double tmp_read_time;
      timeline.Start();
      cur_batch = device_reader_->Next();
      timeline.Pause();
      tmp_read_time = timeline.ElapsedSec();
      if (cur_batch <= 0) {
        if (batch_cnt == done_cnt) {
          break;
        } else {
          continue;
        }
      }
      batch_cnt += 1;
      int taskid = batch_cnt * worker_num_ + thread_id_;
      timeline.Start();
      task = object_pool_.Get();
      task->Reset();
      task->PackTask(thread_scope_, taskid, device_reader_, cur_batch,
                     program_);
      timeline.Pause();
      task->read_time = tmp_read_time;
      task->pack_time = timeline.ElapsedSec();
      task->total_time = tmp_read_time + task->pack_time;
    }
    for (;;) {
      // pull sparse here
      if (task->state_ == PULL_SPARSE) {
        timeline.Start();
        for (int i = 0;
             i < param_.program_config(0).pull_sparse_table_id_size(); ++i) {
          uint64_t tid = static_cast<uint64_t>(
              param_.program_config(0).pull_sparse_table_id(i));
          TableParameter table;
          for (auto j : param_.sparse_table()) {
            if (j.table_id() == tid) {
              table = j;
              break;
            }
          }
          fleet_ptr_->HeterPullSparseVars(thread_id_, task, tid,
                                          sparse_key_names_[tid],
                                          table.fea_dim(),
                                          sparse_value_names_[tid]);
        }
        task->Update();
        // JumpContext(task);
        timeline.Pause();
        task->pull_sparse_local_time += timeline.ElapsedSec();
        task->total_time += timeline.ElapsedSec();
      } else if (task->state_ == OP_RUN) {
        // total_time += task->total_time;
        // read_time += task->read_time;
        // pack_time += task->pack_time;
        // pull_sparse_local_time += task->pull_sparse_local_time;
        for (int i = 0;
             i < param_.program_config(0).pull_sparse_table_id_size(); ++i) {
          uint64_t tid = static_cast<uint64_t>(
              param_.program_config(0).pull_sparse_table_id(i));
          timeline.Start();
          CollectLabelInfo(task, i);
          timeline.Pause();
          task->collect_label_time += timeline.ElapsedSec();
          task->total_time += timeline.ElapsedSec();
          timeline.Start();
          FillSparseValue(task, i);
          timeline.Pause();
          task->fill_sparse_time += timeline.ElapsedSec();
          task->total_time += timeline.ElapsedSec();
          auto nid_iter = std::find(sparse_value_names_[tid].begin(),
                                    sparse_value_names_[tid].end(),
                                    adjust_ins_weight_config_.nid_slot());
          if (nid_iter != sparse_value_names_[tid].end()) {
            AdjustInsWeight(task);
          }
        }
        VLOG(3) << "fill sparse value for all sparse table done.";
        // do computation here
        // int run_op_idx = 0;
        timeline.Start();
        for (int i = 0; i < xpu_begin_op_index_; ++i) {
          auto& op = ops_[i];
          bool need_skip = false;
          for (auto t = 0u; t < skip_ops_.size(); ++t) {
            if (op->Type().find(skip_ops_[t]) != std::string::npos) {
              need_skip = true;
              break;
            }
          }
          if (!need_skip) {
            // timeline.Start();
            op->Run(*(task->scope_), place_);
            // timeline.Pause();
            // op_total_time[run_op_idx++] += timeline.ElapsedSec();
            // total_time += timeline.ElapsedSec();
          }
        }
        task->Update();
        timeline.Pause();
        task->cpu_op_time += timeline.ElapsedSec();
        task->total_time += timeline.ElapsedSec();
      } else if (task->state_ == XPU) {
        timeline.Start();
        VLOG(3) << "call remote xpu taskid = " << task->taskid_;
        std::vector<std::string> send_var_list;
        for (int i = 0; i < trainer_desc_.xpu_recv_list_size(); ++i) {
          send_var_list.push_back(trainer_desc_.xpu_recv_list(i));
        }
        heter_ptr_->CallRemoteXpu(task, this, mpi_rank_, send_var_list);
        timeline.Pause();
        task->xpu_op_time += timeline.ElapsedSec();
        task->total_time += timeline.ElapsedSec();
        task->Update();
        timeline.Start();
        JumpContext(task);
        timeline.Pause();
        task->xpu_wait_time += timeline.ElapsedSec();
        task->total_time += timeline.ElapsedSec();
        break;
      } else if (task->state_ == OP_RUN_END) {
        timeline.Start();
        for (size_t i = xpu_end_op_index_ + 1; i < ops_.size(); ++i) {
          auto& op = ops_[i];
          bool need_skip = false;
          for (auto t = 0u; t < skip_ops_.size(); ++t) {
            if (op->Type().find(skip_ops_[t]) != std::string::npos) {
              need_skip = true;
              break;
            }
          }
          if (!need_skip) {
            op->Run(*(task->scope_), place_);
          }
        }
        // check inf and nan
        for (std::string& var_name : check_nan_var_names_) {
          Variable* var = (task->scope_)->FindVar(var_name);
          if (var == nullptr) {
            continue;
          }
          LoDTensor* tensor = var->GetMutable<LoDTensor>();
          if (tensor == nullptr) {
            continue;
          }
        }
        task->Update();
        timeline.Pause();
        task->cpu_op_time += timeline.ElapsedSec();
        task->total_time += timeline.ElapsedSec();
      } else if (task->state_ == PUSH_GRAD) {
        if (need_to_push_sparse_) {
          // push gradients here
          for (int i = 0;
               i < param_.program_config(0).push_sparse_table_id_size(); ++i) {
            uint64_t tid = static_cast<uint64_t>(
                param_.program_config(0).push_sparse_table_id(i));
            TableParameter table;
            for (auto i : param_.sparse_table()) {
              if (i.table_id() == tid) {
                table = i;
                break;
              }
            }
            timeline.Start();
            fleet_ptr_->HeterPushSparseVars(
                task, tid, sparse_key_names_[tid], sparse_grad_names_[tid],
                table.emb_dim(), &push_sparse_status_, use_cvm_, dump_slot_,
                no_cvm_);
            timeline.Pause();
            task->push_sparse_time += timeline.ElapsedSec();
            task->total_time += timeline.ElapsedSec();
          }
        }
        if (need_to_push_sparse_) {
          VLOG(3) << "push sparse gradient done.";
          int32_t tmp_push_sparse_wait_times = -1;
          static uint32_t push_sparse_wait_times =
              static_cast<uint32_t>(tmp_push_sparse_wait_times);
          if (push_sparse_status_.size() >= push_sparse_wait_times) {
            for (auto& t : push_sparse_status_) {
              t.wait();
            }
            push_sparse_status_.resize(0);
          }
          if (tmp_push_sparse_wait_times == -1) {
            push_sparse_status_.resize(0);
          }
        }
        // thread_scope_->DropKids();
        task->Update();
      } else if (task->state_ == DONE) {
        PrintFetchVars();
        ++done_cnt;
        total_inst += task->cur_batch_;
        object_pool_.Push(task);
        total_time += task->total_time;
        read_time += task->read_time;
        pack_time += task->pack_time;
        pull_sparse_local_time += task->pull_sparse_local_time;
        op_all_time += task->op_all_time;
        xpu_op_time += task->xpu_op_time;
        xpu_wait_time += task->xpu_wait_time;
        cpu_op_time += task->cpu_op_time;
        collect_label_time += task->collect_label_time;
        fill_sparse_time += task->fill_sparse_time;
        push_sparse_time += task->push_sparse_time;
        // ++batch_cnt;
        if (thread_id_ == 0) {
          // should be configured here
          if (done_cnt > 0 && done_cnt % 100 == 0) {
// double op_sum_time = 0;
// std::unordered_map<std::string, double> op_to_time;
// for (size_t i = 0; i < op_total_time.size(); ++i) {
// fprintf(stderr, "op_name:[%zu][%s], op_mean_time:[%fs]\n", i,
// op_name[i].c_str(), op_total_time[i] / done_cnt);
// if (op_to_time.find(op_name[i]) == op_to_time.end()) {
// op_to_time[op_name[i]] = 0.0;
// }
// op_to_time[op_name[i]] += op_total_time[i];
// op_sum_time += op_total_time[i];
// }
// for (auto& i : op_to_time) {
// fprintf(stderr, "op [%s] run total time: [%f]ms\n",
// i.first.c_str(),
// i.second / done_cnt);
// }
            fprintf(stderr, "cpu op run total time: %fs\n",
                    cpu_op_time / done_cnt);
            fprintf(stderr, "xpu op run total time: %fs\n",
                    xpu_op_time / done_cnt);
            fprintf(stderr, "xpu wait total time: %fs\n",
                    xpu_wait_time / done_cnt);
            fprintf(stderr, "pack task time: %fs\n", pack_time / done_cnt);
            fprintf(stderr, "train total time: %fs\n", total_time / done_cnt);
            fprintf(stderr, "pull sparse local time: %fs\n",
                    pull_sparse_local_time / done_cnt);
            fprintf(stderr, "fill sparse time: %fs\n",
                    fill_sparse_time / done_cnt);
            fprintf(stderr, "push sparse time: %fs\n",
                    push_sparse_time / done_cnt);
            fprintf(stderr, "collect label time: %fs\n",
                    collect_label_time / done_cnt);
            fprintf(stderr, "mean read time: %fs\n", read_time / done_cnt);
            fprintf(stderr, "IO percent: %f\n", read_time / total_time * 100);
            fprintf(stderr, "cpu op run percent: %f\n",
                    cpu_op_time / total_time * 100);
            fprintf(stderr, "xpu op run percent: %f\n",
                    xpu_op_time / total_time * 100);
            fprintf(stderr, "xpu wait percent: %f\n",
                    xpu_wait_time / total_time * 100);
            fprintf(stderr, "pack task percent: %f\n",
                    pack_time / total_time * 100);
            fprintf(stderr, "pull sparse local time percent: %f\n",
                    pull_sparse_local_time / total_time * 100);
            fprintf(stderr, "collect label time percent: %f\n",
                    collect_label_time / total_time * 100);
            fprintf(stderr, "fill sparse time percent: %f\n",
                    fill_sparse_time / total_time * 100);
            fprintf(stderr, "push sparse time percent: %f\n",
                    push_sparse_time / total_time * 100);
            fprintf(stderr, "%6.2f instances/s\n", total_inst / total_time);
          }
        }
        break;
      }
    }
  }
  if (copy_table_config_.need_copy()) {
    CopySparseTable();
    CopyDenseTable();
    CopyDenseVars();
  }
}

void HeterCpuWorker::TrainFiles() {
  VLOG(3) << "Begin to train files";
  platform::SetNumThreads(1);
  device_reader_->Start();
  int batch_cnt = 0;
  int done_cnt = 0;
  int cur_batch;
  wait_queue_.SetCap(1);
  need_to_push_dense_ = false;
  while (1) {
// if (copy_table_config_.need_copy()) {
// if (copy_table_config_.sparse_copy_by_feasign()) {
// for (size_t i = 0; i < copy_sparse_tables_.size(); ++i) {
// uint64_t tid = copy_sparse_tables_[i].first;
// feasign_set_[tid].insert(sparse_push_keys_[tid].begin(),
// sparse_push_keys_[tid].end());
// }
// }
// if (batch_cnt % copy_table_config_.batch_num() == 0) {
// CopySparseTable();
// CopyDenseTable();
// CopyDenseVars();
// }
// }
    std::shared_ptr<HeterTask> task;
    task = run_queue_.Get();
    if (!task) {
      cur_batch = device_reader_->Next();
      if (cur_batch <= 0) {
        if (batch_cnt == done_cnt) {
          break;
        } else {
          continue;
        }
      }
      batch_cnt += 1;
      int taskid = batch_cnt * worker_num_ + thread_id_;
      task = object_pool_.Get();
      task->Reset();
      task->PackTask(thread_scope_, taskid, device_reader_, cur_batch,
                     program_);
    }
    for (;;) {
      // pull sparse here
      if (task->state_ == PULL_SPARSE) {
        VLOG(3) << "pull sparse taskid = " << task->taskid_;
        for (int i = 0;
             i < param_.program_config(0).pull_sparse_table_id_size(); ++i) {
          uint64_t tid = static_cast<uint64_t>(
              param_.program_config(0).pull_sparse_table_id(i));
          TableParameter table;
          for (auto j : param_.sparse_table()) {
            if (j.table_id() == tid) {
              table = j;
              break;
            }
          }
          fleet_ptr_->HeterPullSparseVars(thread_id_, task, tid,
                                          sparse_key_names_[tid],
                                          table.fea_dim(),
                                          sparse_value_names_[tid]);
        }
        task->Update();
        // JumpContext(task);
        // break;
      } else if (task->state_ == OP_RUN) {
        VLOG(3) << "oprun taskid = " << task->taskid_;
        for (int i = 0;
             i < param_.program_config(0).pull_sparse_table_id_size(); ++i) {
          uint64_t tid = static_cast<uint64_t>(
              param_.program_config(0).pull_sparse_table_id(i));
          CollectLabelInfo(task, i);
          FillSparseValue(task, i);
          auto nid_iter = std::find(sparse_value_names_[tid].begin(),
                                    sparse_value_names_[tid].end(),
                                    adjust_ins_weight_config_.nid_slot());
          if (nid_iter != sparse_value_names_[tid].end()) {
            AdjustInsWeight(task);
          }
        }
        VLOG(3) << "fill sparse value for all sparse table done.";
        // do computation here
        for (int i = 0; i < xpu_begin_op_index_; ++i) {
          auto& op = ops_[i];
          bool need_skip = false;
          for (auto t = 0u; t < skip_ops_.size(); ++t) {
            if (op->Type().find(skip_ops_[t]) != std::string::npos) {
              need_skip = true;
              break;
            }
          }
          if (!need_skip) {
            VLOG(3) << "run op: " << op->Type();
            op->Run(*(task->scope_), place_);
          }
        }
        task->Update();
      } else if (task->state_ == XPU) {
        VLOG(3) << "call remote xpu taskid = " << task->taskid_;
        std::vector<std::string> send_var_list;
        for (int i = 0; i < trainer_desc_.xpu_recv_list_size(); ++i) {
          send_var_list.push_back(trainer_desc_.xpu_recv_list(i));
        }
        heter_ptr_->CallRemoteXpu(task, this, mpi_rank_, send_var_list);
        task->Update();
        JumpContext(task);
        break;
      } else if (task->state_ == OP_RUN_END) {
        for (size_t i = xpu_end_op_index_ + 1; i < ops_.size(); ++i) {
          auto& op = ops_[i];
          bool need_skip = false;
          for (auto t = 0u; t < skip_ops_.size(); ++t) {
            if (op->Type().find(skip_ops_[t]) != std::string::npos) {
              need_skip = true;
              break;
            }
          }
          if (!need_skip) {
            op->Run(*(task->scope_), place_);
          }
        }
        // check inf and nan
        for (std::string& var_name : check_nan_var_names_) {
          Variable* var = (task->scope_)->FindVar(var_name);
          if (var == nullptr) {
            continue;
          }
          LoDTensor* tensor = var->GetMutable<LoDTensor>();
          if (tensor == nullptr) {
            continue;
          }
        }
        task->Update();
      } else if (task->state_ == PUSH_GRAD) {
        VLOG(3) << "push grad taskid = " << task->taskid_;
        if (need_to_push_sparse_) {
          // push gradients here
          for (int i = 0;
               i < param_.program_config(0).push_sparse_table_id_size(); ++i) {
            uint64_t tid = static_cast<uint64_t>(
                param_.program_config(0).push_sparse_table_id(i));
            TableParameter table;
            for (auto i : param_.sparse_table()) {
              if (i.table_id() == tid) {
                table = i;
                break;
              }
            }
            fleet_ptr_->HeterPushSparseVars(
                task, tid, sparse_key_names_[tid], sparse_grad_names_[tid],
                table.emb_dim(), &push_sparse_status_, use_cvm_, dump_slot_,
                no_cvm_);
          }
        }
        if (need_to_push_sparse_) {
          VLOG(3) << "push sparse gradient done.";
          int32_t tmp_push_sparse_wait_times = -1;
          static uint32_t push_sparse_wait_times =
              static_cast<uint32_t>(tmp_push_sparse_wait_times);
          if (push_sparse_status_.size() >= push_sparse_wait_times) {
            for (auto& t : push_sparse_status_) {
              t.wait();
            }
            push_sparse_status_.resize(0);
          }
          if (tmp_push_sparse_wait_times == -1) {
            push_sparse_status_.resize(0);
          }
        }
// if (need_dump_field_) {
// size_t batch_size = device_reader_->GetCurBatchSize();
// std::vector<std::string> ars(batch_size);
// for (auto& ar : ars) {
// ar.clear();
// }
// auto& ins_id_vec = device_reader_->GetInsIdVec();
// auto& ins_content_vec = device_reader_->GetInsContentVec();
// for (size_t i = 0; i < ins_id_vec.size(); i++) {
// ars[i] += ins_id_vec[i];
// ars[i] = ars[i] + "\t" + ins_content_vec[i];
// }
// for (auto& field : dump_fields_) {
// Variable* var = thread_scope_->FindVar(field);
// if (var == nullptr) {
// continue;
// }
// LoDTensor* tensor = var->GetMutable<LoDTensor>();
// if (!CheckValidOutput(tensor, batch_size)) {
// continue;
// }
// for (size_t i = 0; i < batch_size; ++i) {
// auto output_dim = tensor->dims()[1];
// std::string output_dimstr =
// boost::lexical_cast<std::string>(output_dim);
// ars[i] = ars[i] + "\t" + field + ":" + output_dimstr;
// auto bound = GetTensorBound(tensor, i);
// ars[i] += PrintLodTensor(tensor, bound.first, bound.second);
// }
// }
// // #pragma omp parallel for
// for (size_t i = 0; i < ars.size(); i++) {
// if (ars[i].length() == 0) {
// continue;
// }
// writer_ << ars[i];
// }
// if (need_dump_param_ && thread_id_ == 0) {
// DumpParam();
// }
// }
// thread_scope_->DropKids();
        task->Update();
      } else if (task->state_ == DONE) {
        VLOG(3) << "done taskid = " << task->taskid_;
        object_pool_.Push(task);
        PrintFetchVars();
        ++done_cnt;
        // ++batch_cnt;
        break;
      }
    }
  }
  if (need_dump_field_) {
    // writer_.Flush();
  }
  if (copy_table_config_.need_copy()) {
    CopySparseTable();
    CopyDenseTable();
    CopyDenseVars();
  }
}

}  // end namespace framework
}  // end namespace paddle
#endif
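Both TrainFiles() and TrainFilesWithProfiler() above drive each HeterTask through the same state progression, with the XPU state parking the task in wait_queue_ until the remote call finishes. A compressed sketch of that progression (the enum and helper below are illustrative only; the real states live in the HeterTask definition):

// Sketch only: the per-task state order implemented by the loops above.
enum DemoTaskState { PULL_SPARSE, OP_RUN, XPU, OP_RUN_END, PUSH_GRAD, DONE };

DemoTaskState NextState(DemoTaskState s) {
  switch (s) {
    case PULL_SPARSE: return OP_RUN;      // sparse values pulled for all tables
    case OP_RUN:      return XPU;         // CPU ops before the XPU segment ran
    case OP_RUN_END:  return PUSH_GRAD;   // CPU ops after the XPU segment ran
    case XPU:         return OP_RUN_END;  // remote XPU call done, task resumed
    case PUSH_GRAD:   return DONE;        // sparse gradients pushed
    default:          return DONE;        // DONE: task is recycled via object_pool_
  }
}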
paddle/fluid/framework/heterxpu_trainer.cc
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cstdlib>
#include <ctime>
#include <string>
#include <vector>
#include "io/fs.h"
#include "paddle/fluid/framework/data_feed_factory.h"
#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
#include "paddle/fluid/framework/trainer.h"
#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
#include "paddle/fluid/platform/cuda_device_guard.h"
namespace paddle {
namespace framework {

void HeterXpuTrainer::Initialize(const TrainerDesc& trainer_desc,
                                 Dataset* dataset) {
  srand((unsigned)time(NULL));
  param_ = trainer_desc.downpour_param();
  for (int i = 0; i < param_.dense_table_size(); ++i) {
    uint64_t table_id =
        static_cast<uint64_t>(param_.dense_table(i).table_id());
    auto table = param_.dense_table(i);
    dense_grad_names_[table_id].resize(table.dense_grad_name_size());
    for (int j = 0; j < table.dense_grad_name_size(); ++j) {
      dense_grad_names_[table_id][j] = table.dense_grad_name(j);
    }
  }
  scale_datanorm_ = trainer_desc.scale_datanorm();
  int place_num = trainer_desc.worker_places_size();
  for (int i = 0; i < place_num; ++i) {
    int num = trainer_desc.worker_places(i);
    platform::CUDAPlace place = platform::CUDAPlace(num);
    platform::CUDADeviceGuard guard(place.device);
    cudaStream_t stream;
    PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream));
    copy_streams_.push_back(stream);
    places_.push_back(place);
    cudaEvent_t event;
    PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaEventCreateWithFlags(&event, cudaEventDisableTiming));
    events_.push_back(event);
  }
// thread_num_ = trainer_desc.thread_num();
// SetDataset(dataset);
// dump_fields_path_ = trainer_desc.dump_fields_path();
// dump_converter_ = trainer_desc.dump_converter();
// need_dump_field_ = false;
// if (trainer_desc.dump_fields_size() != 0 && dump_fields_path_ != "") {
// need_dump_field_ = true;
// }
// if (need_dump_field_) {
// auto &file_list = dataset->GetFileList();
// if (file_list.size() == 0) {
// need_dump_field_ = false;
// }
// }
// mpi_rank_ = trainer_desc.mpi_rank();
// mpi_size_ = trainer_desc.mpi_size();
// dump_file_num_ = trainer_desc.dump_file_num();
// const std::vector<paddle::framework::DataFeed *> readers =
// dataset->GetReaders();
// thread_num_ = readers.size();
  for (int i = 0; i < trainer_desc.downpour_param().stat_var_names_size();
       i++) {
    need_merge_var_names_.push_back(
        trainer_desc.downpour_param().stat_var_names(i));
  }
  running_ = true;
  VLOG(3) << "going to initialize pull dense worker";
  pull_dense_worker_ = PullDenseWorker::GetInstance();
  pull_dense_worker_->Initialize(trainer_desc);
  VLOG(3) << "initialize pull dense worker";
  SetDebug(trainer_desc.debug());
  fleet_ptr_ = FleetWrapper::GetInstance();
  heter_ptr_ = HeterWrapper::GetInstance();
  RegisterServiceHandler();
// for (int i = 0; i < trainer_desc.worker_places_size(); ++i) {
// int num = trainer_desc.worker_places(i);
// platform::CUDAPlace place = platform::CUDAPlace(num);
// platform::CUDADeviceGuard guard(place.device);
// cudaStream_t stream;
// PADDLE_ENFORCE_CUDA_SUCCESS(cudaStreamCreate(&stream));
// copy_streams_.push_back(stream);
// places_.push_back(place);
// }
  trainer_desc_ = trainer_desc;
}

void HeterXpuTrainer::CreateThreadParam(const ProgramDesc& program, int num) {
  auto place = places_[num];
  Scope* scope = place_scopes_[num];
  auto stream = copy_streams_[num];
  auto event = events_[num];
  auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
  platform::CUDADeviceGuard guard(dev_id);

  auto& block = program.Block(0);
  for (auto& var : block.AllVars()) {
    if (var->Persistable()) {
      auto name = var->Name();
      Variable* root_var = root_scope_->FindVar(name);
      LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>();
      auto* ptr = scope->Var(name);
      InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
      LoDTensor* thread_tensor = ptr->GetMutable<LoDTensor>();
#define HeterMemcpyFunc(cpp_type, proto_type) \
do { \
if (root_tensor->type() == proto_type) { \
HeterMemCpy<cpp_type>(thread_tensor, root_tensor, place, stream); \
} \
} while (0)
      _ForEachDataType_(HeterMemcpyFunc);
    }
  }
  PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventRecord(event, stream));
  cudaEventSynchronize(event);
}

template <typename T>
void HeterXpuTrainer::HeterMemCpy(LoDTensor* thread_tensor,
                                  LoDTensor* root_tensor,
                                  const paddle::platform::Place& thread_place,
                                  cudaStream_t stream) {
  T* thread_ptr =
      thread_tensor->mutable_data<T>(root_tensor->dims(), thread_place);
  T* root_ptr = root_tensor->data<T>();
  if (platform::is_cpu_place(root_tensor->place())) {
    memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place),
                 thread_ptr, platform::CPUPlace(), root_ptr,
                 sizeof(T) * root_tensor->numel(), stream);
  } else {
    memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, thread_place),
                 thread_ptr,
                 BOOST_GET_CONST(platform::CUDAPlace, root_tensor->place()),
                 root_ptr, sizeof(T) * root_tensor->numel(), stream);
  }
}

void HeterXpuTrainer::DumpWork(int tid) {}

void HeterXpuTrainer::InitTrainerEnv(const ProgramDesc& main_program,
                                     const platform::Place& place) {
  CacheProgram(main_program);
  place_ = place;
  auto& profiler = paddle::ps::CostProfiler::instance();
  profiler.register_profiler("xpu_service_run_task");
  profiler.register_profiler("xpu_service_deserial");
  profiler.register_profiler("xpu_service_launch_kernel");
  profiler.register_profiler("xpu_service_wait");
}

void HeterXpuTrainer::InitOtherEnv(const ProgramDesc& main_program) {
  auto& block = main_program.Block(0);
  pull_dense_worker_->SetRootScope(root_scope_);
  pull_dense_worker_->CreatePinVar();
  for (size_t i = 0; i < places_.size(); ++i) {
    Scope* scope = &(root_scope_->NewScope());
    // for (auto &var : block.AllVars()) {
    //   if (var->Persistable()) {
    //     auto *ptr = scope->Var(var->Name());
    //     InitializeVariable(ptr, var->GetType());
    //   }
    // }
    place_scopes_.push_back(scope);
    CreateThreadParam(main_program, i);
    pull_dense_worker_->AddThreadScope(scope);
    pull_dense_worker_->AddPlace(places_[i]);
    pull_dense_worker_->AddStream(copy_streams_[i]);
  }
  pull_dense_worker_->Start();
  for (auto& stream : copy_streams_) {
    cudaStreamSynchronize(stream);
  }
  op_names_.clear();
  for (auto& op_desc : block.AllOps()) {
    std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
    op_names_.push_back(op_desc->Type());
    OperatorBase* local_op_ptr = local_op.release();
    ops_.push_back(local_op_ptr);
    continue;
  }
  xpu_begin_op_index_ = xpu_end_op_index_ = -1;
  xpu_begin_op_index_ = trainer_desc_.xpu_start_idx();
  xpu_end_op_index_ = trainer_desc_.xpu_end_idx();
  VLOG(0) << "xpu begin: " << xpu_begin_op_index_
          << " xpu end: " << xpu_end_op_index_;
  // CHECK(xpu_begin_op_index_ == 0);
  // CHECK(xpu_end_op_index_ = ops_.size() - 1);
  //// init pool
  for (size_t i = 0; i < 6; ++i) {
    for (size_t j = 0; j < places_.size(); ++j) {
      int num = j;
      std::shared_ptr<HeterServiceContext> context =
          std::make_shared<HeterServiceContext>();
      context->place_num_ = num;
      auto place = places_[num];
      context->scope_ = &(place_scopes_[num]->NewScope());
      auto& block = program_.Block(0);
      for (auto& var : block.AllVars()) {
        if (!var->Persistable()) {
          auto* ptr = context->scope_->Var(var->Name());
          InitializeVariable(ptr, var->GetType());
        }
      }
      for (auto& v : dense_grad_names_) {
        for (auto& name : v.second) {
          auto* ptr = context->scope_->Var(name + "pin");
          InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
        }
      }
      for (auto& op_desc : block.AllOps()) {
        std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
        OperatorBase* local_op_ptr = local_op.release();
        (context->ops_).push_back(local_op_ptr);
      }
      auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
      platform::CUDADeviceGuard guard(dev_id);
      PADDLE_ENFORCE_CUDA_SUCCESS(cudaEventCreateWithFlags(
          &context->event_, cudaEventDisableTiming));
      object_pool_.Push(context);
    }
  }
  VLOG(3) << "init other env done.";
}

void HeterXpuTrainer::Run() {}

int HeterXpuTrainer::EndPass(const HeterRequest* request,
                             HeterResponse* response) {
  // int scope_num = object_pool_.Size();
  for (size_t i = 0; i < need_merge_var_names_.size(); i++) {
    Variable* root_var = root_scope_->FindVar(need_merge_var_names_[i]);
    if (root_var == nullptr) {
      continue;
    }
    LoDTensor* root_tensor = root_var->GetMutable<LoDTensor>();
    for (size_t j = 0; j < place_scopes_.size(); j++) {
      Scope* cur_thread_scope = place_scopes_[j];
      Variable* thread_var =
          cur_thread_scope->FindVar(need_merge_var_names_[i]);
      if (thread_var == nullptr) {
        continue;
      }
      LoDTensor* thread_tensor = thread_var->GetMutable<LoDTensor>();
      // if (root_tensor->numel() != thread_tensor->numel()) {
      //   continue;
      // }
#define MergeCallback(cpp_type, proto_type) \
do { \
if (root_tensor->type() == proto_type) { \
if (thread_tensor->type() != proto_type) { \
VLOG(0) << "Error: thread id=" << j << ", need_merge_var_names_[" << i \
<< "] " << need_merge_var_names_[i] \
<< ", root tensor type=" << root_tensor->type() \
<< ", thread tensor type=" << thread_tensor->type(); \
exit(-1); \
} \
MergeToRootScope<cpp_type>(root_tensor, thread_tensor); \
} \
} while (0)
_ForEachDataType_
(
MergeCallback
);
if
(
platform
::
is_gpu_place
(
thread_tensor
->
place
()))
{
auto
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
thread_tensor
->
place
()).
device
;
platform
::
CUDADeviceGuard
guard
(
dev_id
);
cudaMemset
(
thread_tensor
->
data
<
void
>
(),
0
,
thread_tensor
->
numel
()
*
SizeOfType
(
thread_tensor
->
type
()));
}
else
{
memset
(
thread_tensor
->
data
<
void
>
(),
0
,
thread_tensor
->
numel
()
*
SizeOfType
(
thread_tensor
->
type
()));
}
}
auto
*
merge_var
=
response
->
add_vars
();
heter_ptr_
->
SerializeToReq
(
need_merge_var_names_
[
i
],
root_scope_
,
merge_var
);
if
(
platform
::
is_gpu_place
(
root_tensor
->
place
()))
{
auto
dev_id
=
BOOST_GET_CONST
(
platform
::
CUDAPlace
,
root_tensor
->
place
()).
device
;
platform
::
CUDADeviceGuard
guard
(
dev_id
);
cudaMemset
(
root_tensor
->
data
<
void
>
(),
0
,
root_tensor
->
numel
()
*
SizeOfType
(
root_tensor
->
type
()));
}
else
{
memset
(
root_tensor
->
data
<
void
>
(),
0
,
root_tensor
->
numel
()
*
SizeOfType
(
root_tensor
->
type
()));
}
}
return
0
;
}
template
<
typename
T
>
void
HeterXpuTrainer
::
MergeToRootScope
(
LoDTensor
*
root_tensor
,
LoDTensor
*
tensor
)
{
LoDTensor
tmp_root
;
TensorCopy
(
*
root_tensor
,
platform
::
CPUPlace
(),
&
tmp_root
);
T
*
tmp_root_data
=
tmp_root
.
data
<
T
>
();
LoDTensor
tmp_tensor
;
TensorCopy
(
*
tensor
,
platform
::
CPUPlace
(),
&
tmp_tensor
);
T
*
data
=
tmp_tensor
.
data
<
T
>
();
for
(
int
i
=
0
;
i
<
tmp_tensor
.
numel
();
i
++
)
{
tmp_root_data
[
i
]
+=
data
[
i
];
}
TensorCopy
(
tmp_root
,
root_tensor
->
place
(),
root_tensor
);
}
int
HeterXpuTrainer
::
StopService
(
const
HeterRequest
*
request
,
HeterResponse
*
response
)
{
std
::
unique_lock
<
std
::
mutex
>
lock
(
mutex_
);
running_
=
false
;
cond_
.
notify_one
();
return
0
;
}
int HeterXpuTrainer::RunTask(const HeterRequest* request,
                             HeterResponse* response) {
  auto timer = std::make_shared<paddle::ps::CostTimer>("xpu_service_run_task");
  std::shared_ptr<HeterServiceContext> context = object_pool_.Get();

  if (!context->scope_) {
    int num = rand() % places_.size();
    context->place_num_ = num;
    auto place = places_[num];
    context->scope_ = &(place_scopes_[num]->NewScope());
    auto& block = program_.Block(0);
    for (auto& var : block.AllVars()) {
      if (!var->Persistable()) {
        auto* ptr = context->scope_->Var(var->Name());
        InitializeVariable(ptr, var->GetType());
      }
    }
    for (auto& v : dense_grad_names_) {
      for (auto& name : v.second) {
        auto* ptr = context->scope_->Var(name + "pin");
        InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
      }
    }
    for (auto& op_desc : block.AllOps()) {
      std::unique_ptr<OperatorBase> local_op = OpRegistry::CreateOp(*op_desc);
      OperatorBase* local_op_ptr = local_op.release();
      (context->ops_).push_back(local_op_ptr);
    }
    auto dev_id = BOOST_GET_CONST(platform::CUDAPlace, place).device;
    platform::CUDADeviceGuard guard(dev_id);
    PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaEventCreateWithFlags(&context->event_, cudaEventDisableTiming));
  }

  context->Reset();
  auto place = places_[context->place_num_];
  {
    auto deserial_timer =
        std::make_shared<paddle::ps::CostTimer>("xpu_service_deserial");
    for (int i = 0; i < request->vars_size(); ++i) {
      heter_ptr_->DeSerializeToTensor(context->scope_, request->vars(i), place,
                                      copy_streams_[context->place_num_]);
    }
    PADDLE_ENFORCE_CUDA_SUCCESS(
        cudaEventRecord(context->event_, copy_streams_[context->place_num_]));
    while (cudaEventQuery(context->event_) != cudaSuccess) {
      VLOG(3) << "wait for kernel";
      bthread_yield();
    }
  }

  {
    auto launch_timer =
        std::make_shared<paddle::ps::CostTimer>("xpu_service_launch_kernel");
    for (int i = xpu_begin_op_index_; i <= xpu_end_op_index_; ++i) {
      auto& op = (context->ops_)[i];
      op->Run(*(context->scope_), place);
    }
  }

  auto* dev_ctx = static_cast<platform::CUDADeviceContext*>(
      platform::DeviceContextPool::Instance().Get(place));
  PADDLE_ENFORCE_CUDA_SUCCESS(
      cudaEventRecord(context->event_, dev_ctx->stream()));
  // cudaEventSynchronize(context->event_);
  {
    auto wait_timer =
        std::make_shared<paddle::ps::CostTimer>("xpu_service_wait");
    while (cudaEventQuery(context->event_) != cudaSuccess) {
      VLOG(3) << "wait for kernel";
      bthread_yield();
    }
  }

  for (int i = 0; i < trainer_desc_.xpu_send_list_size(); ++i) {
    const std::string& varname = trainer_desc_.xpu_send_list(i);
    // CHECK(varname == "concat_1.tmp_0@GRAD");
    auto* res_var = response->add_vars();
    heter_ptr_->SerializeToReq(varname, context->scope_, res_var);
  }

  // std::string varname = "concat_1.tmp_0@GRAD";
  //
  // auto* res_var = response->add_vars();
  // heter_ptr_->SerializeToReq(varname, context->scope_, res_var);

  for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
       ++i) {
    uint64_t tid =
        static_cast<uint64_t>(param_.program_config(0).push_dense_table_id(i));
    fleet_ptr_->PushDenseVarsAsync(
        *(context->scope_), tid, dense_grad_names_[tid],
        &(context->push_dense_status_), scale_datanorm_, request->cur_batch(),
        places_[context->place_num_], copy_streams_[context->place_num_],
        context->event_);
  }
  for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
       ++i) {
    uint64_t tid =
        static_cast<uint64_t>(param_.program_config(0).push_dense_table_id(i));
    pull_dense_worker_->IncreaseThreadVersion(0, tid);
  }
  VLOG(3) << "push dense gradient done.";
  context->scope_->DropKids();
  object_pool_.Push(context);
  VLOG(0) << "pool size " << object_pool_.Size();
  return 0;
}
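RunTask never calls cudaEventSynchronize; it records an event and spins on cudaEventQuery, yielding the worker thread so other service requests can run while the copy or kernel finishes. A hedged, self-contained sketch of that pattern, using std::this_thread::yield() in place of bthread_yield():

#include <cuda_runtime.h>
#include <thread>

// Poll a recorded CUDA event without blocking the calling thread
// (minimal sketch of the wait loops used in RunTask above).
inline void SpinWaitEvent(cudaEvent_t event) {
  while (cudaEventQuery(event) != cudaSuccess) {
    std::this_thread::yield();  // let other requests be scheduled
  }
}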
void HeterXpuTrainer::RegisterServiceHandler() {
  heter_ptr_->RegisterServiceHandler(
      0, [this](const HeterRequest* request, HeterResponse* response) -> int {
        return this->RunTask(request, response);
      });
  heter_ptr_->RegisterServiceHandler(
      1, [this](const HeterRequest* request, HeterResponse* response) -> int {
        return this->EndPass(request, response);
      });
  heter_ptr_->RegisterServiceHandler(
      2, [this](const HeterRequest* request, HeterResponse* response) -> int {
        return this->StopService(request, response);
      });
}
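The three registrations map message-type ids 0, 1 and 2 to RunTask, EndPass and StopService. A minimal sketch of that id-to-callback dispatch, with Request/Response standing in for the actual HeterRequest/HeterResponse (this is not the HeterWrapper API, just the idea):

#include <functional>
#include <map>

struct Request {};
struct Response {};
using Handler = std::function<int(const Request*, Response*)>;

// Registry that dispatches on a message-type id, like RegisterServiceHandler.
struct HandlerRegistry {
  std::map<int, Handler> handlers;
  void Register(int msg_type, Handler h) { handlers[msg_type] = std::move(h); }
  int Dispatch(int msg_type, const Request* req, Response* res) {
    auto it = handlers.find(msg_type);
    return it == handlers.end() ? -1 : it->second(req, res);
  }
};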
Scope* HeterXpuTrainer::GetWorkerScope(int thread_id) { return nullptr; }
void HeterXpuTrainer::Finalize() {
  // for (auto &th : threads_) {
  //   th.join();
  // }
  std::unique_lock<std::mutex> lock(mutex_);
  cond_.wait(lock, [this] { return !running_; });
  sleep(3);
  pull_dense_worker_->Stop();
  root_scope_->DropKids();
}
}  // namespace framework
}  // namespace paddle
#endif
paddle/fluid/framework/multi_trainer.cc
@@ -102,6 +102,7 @@ void MultiTrainer::InitTrainerEnv(const ProgramDesc& main_program,
    workers_[i]->SetRootScope(root_scope_);
    workers_[i]->CreateDeviceResource(main_program);  // Program
    workers_[i]->BindingDataFeedMemory();
    workers_[i]->CacheProgram(main_program);
  }
}
paddle/fluid/framework/pull_dense_worker.cc
@@ -56,6 +56,34 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) {
    current_version_[tid] = 0;
  }
  fleet_ptr_ = FleetWrapper::GetInstance();
#ifdef PADDLE_WITH_CUDA
  copy_streams_.clear();
  places_.clear();
  thread_scopes_.clear();
#endif
}
void PullDenseWorker::CreatePinVar() {
#ifdef PADDLE_WITH_CUDA
  // for (auto& v : dense_value_names_) {
  //   for (auto& name : v.second) {
  for (int i = 0; i < dwp_param_.program_config(0).pull_dense_table_id_size();
       ++i) {
    uint64_t tid = static_cast<uint64_t>(
        dwp_param_.program_config(0).pull_dense_table_id(i));
    for (size_t j = 0; j < dense_value_names_[tid].size(); j++) {
      auto& name = dense_value_names_[tid][j];
      Variable* var = root_scope_->FindVar(name);
      LoDTensor* tensor = var->GetMutable<LoDTensor>();
      auto* ptr = root_scope_->Var(name + "pin");
      InitializeVariable(ptr, proto::VarType::LOD_TENSOR);
      LoDTensor* pin_tensor = ptr->GetMutable<LoDTensor>();
      pin_tensor->mutable_data<float>(tensor->dims(),
                                      platform::CUDAPinnedPlace());
    }
  }
#endif
}
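The "<name>pin" variables live in CUDAPinnedPlace so that the later per-place copy in Wait() can run asynchronously on a dedicated copy stream; a pageable host buffer would silently serialize that copy. A small, hedged sketch of the same staging idea using the CUDA runtime directly (helper name and flow are illustrative, not Paddle code):

#include <cuda_runtime.h>

// Stage host data through a pinned (page-locked) buffer and copy it to the
// device asynchronously on the given stream (illustrative only).
void StageToDevice(float* dev_dst, const float* host_src, size_t n,
                   cudaStream_t copy_stream) {
  float* pinned = nullptr;
  cudaHostAlloc(&pinned, n * sizeof(float), cudaHostAllocDefault);
  for (size_t i = 0; i < n; ++i) pinned[i] = host_src[i];  // fill staging buffer
  cudaMemcpyAsync(dev_dst, pinned, n * sizeof(float), cudaMemcpyHostToDevice,
                  copy_stream);
  cudaStreamSynchronize(copy_stream);  // wait before freeing the pinned buffer
  cudaFreeHost(pinned);
}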
void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
@@ -75,6 +103,31 @@ void PullDenseWorker::Wait(std::vector<::std::future<int32_t>>* status_vec) {
    exit(-1);
  }
  status_vec->resize(0);
#ifdef PADDLE_WITH_CUDA
  for (size_t i = 0; i < places_.size(); ++i) {
    // for (auto& v : dense_value_names_) {
    //   for (auto& name : v.second) {
    for (int x = 0;
         x < dwp_param_.program_config(0).pull_dense_table_id_size(); ++x) {
      uint64_t tid = static_cast<uint64_t>(
          dwp_param_.program_config(0).pull_dense_table_id(x));
      for (size_t j = 0; j < dense_value_names_[tid].size(); j++) {
        auto& name = dense_value_names_[tid][j];

        Variable* pin_var = root_scope_->FindVar(name + "pin");
        LoDTensor* pin_tensor = pin_var->GetMutable<LoDTensor>();
        float* pin_w = pin_tensor->data<float>();

        Variable* var = thread_scopes_[i]->FindVar(name);
        LoDTensor* tensor = var->GetMutable<LoDTensor>();
        float* w = tensor->data<float>();
        memory::Copy(BOOST_GET_CONST(platform::CUDAPlace, places_[i]), w,
                     platform::CUDAPinnedPlace(), pin_w,
                     sizeof(float) * tensor->numel(), copy_streams_[i]);
      }
    }
  }
#endif
}
void PullDenseWorker::Stop() {
@@ -91,8 +144,14 @@ void PullDenseWorker::PullDense(bool force_update) {
    uint64_t tid = static_cast<uint64_t>(
        dwp_param_.program_config(0).pull_dense_table_id(i));
    if (force_update || CheckUpdateParam(tid)) {
#ifdef PADDLE_WITH_CUDA
      VLOG(3) << "pull dense " << force_update << " " << tid;
      fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid,
                                     dense_value_names_[tid],
                                     &pull_dense_status_, false);
#else
      fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid,
                                     dense_value_names_[tid],
                                     &pull_dense_status_, true);
#endif
      ResetThreadVersion(tid);
    }
  }
paddle/fluid/framework/trainer.h
@@ -21,9 +21,12 @@ limitations under the License. */
#include <thread> // NOLINT
#include <vector>
#include <ctime>
#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/data_set.h"
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/fleet/heter_wrapper.h"
#include "paddle/fluid/framework/heter_service.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/framework/reader.h"
@@ -62,6 +65,7 @@ class TrainerBase {
  Scope* root_scope_;
  bool debug_;
  Dataset* dataset_ptr_;
  TrainerDesc trainer_desc_;

  // For dump param or field
  bool need_dump_field_ = false;
@@ -118,10 +122,86 @@ class DistMultiTrainer : public MultiTrainer {
  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
  virtual void InitDumpEnv();
  virtual Scope* GetWorkerScope(int thread_id);
  virtual void RegisterHeterCallback();

 protected:
  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
};

#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
class HeterServiceContext {
 public:
  HeterServiceContext() {}
  virtual ~HeterServiceContext() {
    for (OperatorBase* op : ops_) {
      delete op;
    }
    std::vector<OperatorBase*>().swap(ops_);
  }
  void Reset() { push_dense_status_.clear(); }
  int place_num_;
  Scope* scope_{nullptr};
  cudaEvent_t event_;
  std::vector<OperatorBase*> ops_;
  std::vector<::std::future<int32_t>> push_dense_status_;
};

class HeterXpuTrainer : public TrainerBase {
 public:
  HeterXpuTrainer() {}
  virtual ~HeterXpuTrainer() {
    for (OperatorBase* op : ops_) {
      delete op;
    }
    std::vector<OperatorBase*>().swap(ops_);
  }
  virtual void Initialize(const TrainerDesc& trainer_desc, Dataset* data_set);
  virtual void InitTrainerEnv(const ProgramDesc& main_program,
                              const platform::Place& place);
  virtual void InitOtherEnv(const ProgramDesc& main_program);
  virtual void Run();
  virtual void Finalize();
  virtual void DumpWork(int tid);
  virtual void RegisterServiceHandler();
  virtual int RunTask(const HeterRequest* request, HeterResponse* response);
  virtual Scope* GetWorkerScope(int thread_id);
  virtual void CacheProgram(const ProgramDesc& main_program) {
    new (&program_) ProgramDesc(main_program);
  }
  template <typename T>
  void HeterMemCpy(LoDTensor* tensor, LoDTensor* root_tensor,
                   const paddle::platform::Place& thread_place,
                   cudaStream_t stream);
  void CreateThreadParam(const ProgramDesc& program, int num);
  template <typename T>
  void MergeToRootScope(LoDTensor* root_tensor, LoDTensor* thread_tensor);
  int EndPass(const HeterRequest* request, HeterResponse* response);
  int StopService(const HeterRequest* request, HeterResponse* response);

 protected:
  DownpourWorkerParameter param_;
  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
  std::vector<std::string> need_merge_var_names_;
  float scale_datanorm_;
  int xpu_begin_op_index_;
  int xpu_end_op_index_;
  bool running_;
  paddle::platform::Place place_;
  std::mutex mutex_;
  ProgramDesc program_;
  std::condition_variable cond_;
  std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
  std::shared_ptr<paddle::framework::HeterWrapper> heter_ptr_;
  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
  std::vector<OperatorBase*> ops_;
  std::vector<std::string> op_names_;
  std::vector<Scope*> place_scopes_;
  BtObjectPool<HeterServiceContext> object_pool_;
  std::vector<cudaStream_t> copy_streams_;
  std::vector<platform::Place> places_;
  std::vector<cudaEvent_t> events_;
};
#endif
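RunTask takes a HeterServiceContext from object_pool_, reuses its scope, operators and CUDA event across requests, and pushes it back when done, so per-request setup cost is paid only once. BtObjectPool itself is a bthread-aware pool from PSLIB; a plain-mutex approximation of the Get()/Push()/Size() pattern it provides might look like this (illustrative only):

#include <memory>
#include <mutex>
#include <vector>

template <typename T>
class SimpleObjectPool {
 public:
  std::shared_ptr<T> Get() {
    std::lock_guard<std::mutex> lk(mu_);
    if (pool_.empty()) return std::make_shared<T>();  // lazily create
    auto obj = pool_.back();
    pool_.pop_back();
    return obj;
  }
  void Push(std::shared_ptr<T> obj) {
    std::lock_guard<std::mutex> lk(mu_);
    pool_.push_back(std::move(obj));
  }
  size_t Size() {
    std::lock_guard<std::mutex> lk(mu_);
    return pool_.size();
  }

 private:
  std::mutex mu_;
  std::vector<std::shared_ptr<T>> pool_;
};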
#if defined(PADDLE_WITH_NCCL)
class PipelineTrainer : public TrainerBase {
paddle/fluid/framework/trainer_desc.proto
@@ -52,6 +52,12 @@ message TrainerDesc {
  optional bool enable_random_dump = 24 [ default = false ];
  optional bool random_with_lineid = 25 [ default = false ];
  optional int32 dump_interval = 26 [ default = 10000 ];
  repeated int32 worker_places = 27;

  repeated string xpu_send_list = 28;
  repeated string xpu_recv_list = 29;
  optional int32 xpu_start_idx = 30;
  optional int32 xpu_end_idx = 31;

  // device worker parameters
  optional HogwildWorkerParameter hogwild_param = 101;
paddle/fluid/framework/trainer_factory.cc
@@ -63,6 +63,9 @@ std::shared_ptr<TrainerBase> TrainerFactory::CreateTrainer(
REGISTER_TRAINER_CLASS(MultiTrainer);
REGISTER_TRAINER_CLASS(DistMultiTrainer);
#if (defined PADDLE_WITH_CUDA) && (defined PADDLE_WITH_PSLIB)
REGISTER_TRAINER_CLASS(HeterXpuTrainer);
#endif
#if defined(PADDLE_WITH_NCCL)
REGISTER_TRAINER_CLASS(PipelineTrainer);
#endif
paddle/fluid/operators/flatten_op.cc
@@ -241,6 +241,156 @@ class Flatten2GradOp : public framework::OperatorWithKernel {
  }
};

class FlattenContiguousRangeOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext *ctx) const override {
    OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "FlattenContiguousRange");
    OP_INOUT_CHECK(ctx->HasOutput("Out"), "Output", "Out",
                   "FlattenContiguousRange");
    const auto &start_axis = ctx->Attrs().Get<int>("start_axis");
    const auto &stop_axis = ctx->Attrs().Get<int>("stop_axis");
    const auto &in_dims = ctx->GetInputDim("X");
    int in_dims_size = in_dims.size();
    int real_start_axis = start_axis, real_stop_axis = stop_axis;
    if (start_axis < 0) {
      real_start_axis = start_axis + in_dims_size;
    }
    if (stop_axis < 0) {
      real_stop_axis = stop_axis + in_dims_size;
    }
    PADDLE_ENFORCE_GE(real_stop_axis, real_start_axis,
                      platform::errors::InvalidArgument(
                          "The stop_axis should be greater "
                          "than or equal to start_axis."));

    const auto &out_dims =
        GetOutputShape(real_start_axis, real_stop_axis, in_dims);
    ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
    if (in_dims[0] == out_dims[0]) {
      // Only pass LoD when the first dimension of output and Input(X)
      // are the same.
      ctx->ShareLoD("X", "Out");
    }

    OP_INOUT_CHECK(ctx->HasOutput("XShape"), "Output", "XShape", "Flatten2");
    std::vector<int64_t> xshape_dims(in_dims.size() + 1);
    xshape_dims[0] = 0;
    for (int i = 0; i < in_dims.size(); ++i) {
      xshape_dims[i + 1] = in_dims[i];
    }
    ctx->SetOutputDim("XShape", framework::make_ddim(xshape_dims));
    ctx->ShareLoD("X", "XShape");
  }

  static std::vector<int32_t> GetOutputShape(const int start_axis,
                                             const int stop_axis,
                                             const framework::DDim &in_dims) {
    int64_t outer = 1;
    std::vector<int32_t> out_shape;
    int in_dims_size = in_dims.size();
    out_shape.reserve(in_dims_size - stop_axis + start_axis);

    for (int i = 0; i < start_axis; ++i) {
      out_shape.push_back(in_dims[i]);
    }
    for (int i = start_axis; i <= stop_axis; i++) {
      outer *= in_dims[i];
    }
    out_shape.push_back(outer);
    for (int i = stop_axis + 1; i < in_dims_size; i++) {
      out_shape.push_back(in_dims[i]);
    }

    return out_shape;
  }
};
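GetOutputShape keeps the dims before start_axis, multiplies the dims in [start_axis, stop_axis] into one, and keeps the rest. A standalone restatement of that rule, checked against Case 1 of the operator documentation below ((3, 100, 100, 4) flattened over axes [2, 3] gives (3, 100, 400)); axes are assumed already normalized to be non-negative, as InferShape does before calling it:

#include <cassert>
#include <cstdint>
#include <vector>

std::vector<int64_t> FlattenShape(int start_axis, int stop_axis,
                                  const std::vector<int64_t>& in_dims) {
  std::vector<int64_t> out;
  int64_t outer = 1;
  for (int i = 0; i < start_axis; ++i) out.push_back(in_dims[i]);
  for (int i = start_axis; i <= stop_axis; ++i) outer *= in_dims[i];
  out.push_back(outer);
  for (int i = stop_axis + 1; i < static_cast<int>(in_dims.size()); ++i)
    out.push_back(in_dims[i]);
  return out;
}

int main() {
  auto shape = FlattenShape(2, 3, {3, 100, 100, 4});
  assert((shape == std::vector<int64_t>{3, 100, 400}));
  return 0;
}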
class FlattenContiguousRangeOpMaker : public FlattenOpMaker {
 public:
  void Make() override {
    AddInput("X", "(Tensor) A tensor of rank >= axis.");
    AddOutput("Out",
              "A 2D tensor is reshaped input tensor. The input dimensions"
              "up to axis are flattened to the outer dimension of the output"
              "and the remaining input dimensions are flattened into the inner"
              "dimension of the output.");
    AddAttr<int>("start_axis",
                 "(int)"
                 "Indicate the input start dimension (exclusive) to flatten")
        .SetDefault(1);
    AddAttr<int>("stop_axis",
                 "(int)"
                 "Indicate the input stop dimension (exclusive) to flatten")
        .SetDefault(1);
    AddComment(R"DOC(
Flatten Operator

Flattens the input tensor into a new matrix according to start_axis and stop_axis.

Examples:
Case 1:
  Given
    X.shape = (3, 100, 100, 4)
  and
    start_axis = 2, stop_axis = -1
  We get:
    Out.shape = (3, 100, 400)

Case 2:
  Given
    X.shape = (3, 100, 100, 4)
  and
    start_axis = 0, stop_axis = -1
  We get:
    Out.shape = (3 * 100 * 100 * 4)
)DOC");
    AddOutput("XShape",
              "XShape is just used to store the shape and lod of X, which will "
              "be used in FlattenGradOp.")
        .AsIntermediate();
  }
};

template <typename T>
class FlattenContiguousRangeGradOpMaker
    : public framework::SingleGradOpMaker<T> {
 public:
  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;

  void Apply(GradOpPtr<T> grad_op) const override {
    grad_op->SetType("flatten_contiguous_range_grad");
    grad_op->SetInput("XShape", this->Output("XShape"));
    grad_op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out"));
    grad_op->SetOutput(framework::GradVarName("X"), this->InputGrad("X"));
    grad_op->SetAttrMap(this->Attrs());
  }
};

class FlattenContiguousRangeGradOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext *context) const override {
    OP_INOUT_CHECK(context->HasInput("XShape"), "Input", "XShape",
                   "FlattenContiguousRangeGrad");
    OP_INOUT_CHECK(context->HasInput(framework::GradVarName("Out")), "Input",
                   framework::GradVarName("Out"),
                   "FlattenContiguousRangeGrad");
    auto xshape_dims = context->GetInputDim("XShape");
    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());
    context->SetOutputDim(framework::GradVarName("X"), x_dims);
    context->ShareLoD("XShape", framework::GradVarName("X"));
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext &ctx) const override {
    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
                                       ctx, framework::GradVarName("Out")),
                                   ctx.device_context());
  }
};

DECLARE_INPLACE_OP_INFERER(FlattenOpInplaceInferer, {"X", "Out"});
DECLARE_INPLACE_OP_INFERER(FlattenGradInplaceInferer,
                           {framework::GradVarName("Out"),
@@ -266,6 +416,16 @@ REGISTER_OPERATOR(flatten2, ops::Flatten2Op, ops::Flatten2OpMaker,
REGISTER_OPERATOR(flatten2_grad, ops::Flatten2GradOp,
                  ops::FlattenGradInplaceInferer);

REGISTER_OPERATOR(
    flatten_contiguous_range, ops::FlattenContiguousRangeOp,
    ops::FlattenContiguousRangeOpMaker,
    ops::FlattenContiguousRangeGradOpMaker<paddle::framework::OpDesc>,
    ops::FlattenContiguousRangeGradOpMaker<paddle::imperative::OpBase>,
    ops::FlattenOpInplaceInferer);
REGISTER_OPERATOR(flatten_contiguous_range_grad,
                  ops::FlattenContiguousRangeGradOp,
                  ops::FlattenGradInplaceInferer);

REGISTER_OP_CPU_KERNEL(
    flatten, ops::FlattenKernel<paddle::platform::CPUDeviceContext, float>,
    ops::FlattenKernel<paddle::platform::CPUDeviceContext, double>,
@@ -292,3 +452,26 @@ REGISTER_OP_CPU_KERNEL(
    ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int>,
    ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int8_t>,
    ops::Flatten2GradKernel<paddle::platform::CPUDeviceContext, int64_t>);
REGISTER_OP_CPU_KERNEL(
    flatten_contiguous_range,
    ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
                                      float>,
    ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
                                      double>,
    ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext, int>,
    ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
                                      int8_t>,
    ops::FlattenContiguousRangeKernel<paddle::platform::CPUDeviceContext,
                                      int64_t>);
REGISTER_OP_CPU_KERNEL(
    flatten_contiguous_range_grad,
    ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
                                          float>,
    ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
                                          double>,
    ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
                                          int>,
    ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
                                          int8_t>,
    ops::FlattenContiguousRangeGradKernel<paddle::platform::CPUDeviceContext,
                                          int64_t>);
paddle/fluid/operators/flatten_op.cu.cc
@@ -42,3 +42,26 @@ REGISTER_OP_CUDA_KERNEL(
    ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int>,
    ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int8_t>,
    ops::Flatten2GradKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
    flatten_contiguous_range,
    ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
                                      float>,
    ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
                                      double>,
    ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
                                      int>,
    ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
                                      int8_t>,
    ops::FlattenContiguousRangeKernel<paddle::platform::CUDADeviceContext,
                                      int64_t>);
REGISTER_OP_CUDA_KERNEL(
    flatten_contiguous_range_grad,
    ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
                                          float>,
    ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
                                          double>,
    ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
                                          int>,
    ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
                                          int8_t>,
    ops::FlattenContiguousRangeGradKernel<paddle::platform::CUDADeviceContext,
                                          int64_t>);
paddle/fluid/operators/flatten_op.h
@@ -112,5 +112,73 @@ class Flatten2GradKernel : public framework::OpKernel<T> {
  }
};

template <typename DeviceContext, typename T>
class FlattenContiguousRangeKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &context) const override {
    auto &start_axis = context.Attr<int>("start_axis");
    auto &stop_axis = context.Attr<int>("stop_axis");
    auto *in = context.Input<framework::LoDTensor>("X");
    auto x_dims = in->dims();
    int in_dims_size = x_dims.size();
    int real_start_axis = start_axis, real_stop_axis = stop_axis;
    if (start_axis < 0) {
      real_start_axis = start_axis + in_dims_size;
    }
    if (stop_axis < 0) {
      real_stop_axis = stop_axis + in_dims_size;
    }
    auto *out = context.Output<framework::LoDTensor>("Out");

    auto out_dims = framework::make_ddim(
        GetOutputShape(real_start_axis, real_stop_axis, x_dims));

    out->mutable_data(context.GetPlace(), in->type());
    framework::TensorCopy(
        *in, context.GetPlace(),
        context.template device_context<platform::DeviceContext>(), out);
    out->Resize(out_dims);
  }

  static std::vector<int32_t> GetOutputShape(const int start_axis,
                                             const int stop_axis,
                                             const framework::DDim &in_dims) {
    int64_t outer = 1;
    std::vector<int32_t> out_shape;
    int in_dims_size = in_dims.size();
    out_shape.reserve(in_dims_size - stop_axis + start_axis);

    for (int i = 0; i < start_axis; ++i) {
      out_shape.push_back(in_dims[i]);
    }
    for (int i = start_axis; i <= stop_axis; i++) {
      outer *= in_dims[i];
    }
    out_shape.push_back(outer);
    for (int i = stop_axis + 1; i < in_dims_size; i++) {
      out_shape.push_back(in_dims[i]);
    }

    return out_shape;
  }
};

template <typename DeviceContext, typename T>
class FlattenContiguousRangeGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext &ctx) const override {
    auto *d_x = ctx.Output<framework::LoDTensor>(framework::GradVarName("X"));
    auto *d_out =
        ctx.Input<framework::LoDTensor>(framework::GradVarName("Out"));

    auto xshape_dims = ctx.Input<framework::LoDTensor>("XShape")->dims();
    auto x_dims = framework::slice_ddim(xshape_dims, 1, xshape_dims.size());

    d_x->mutable_data(ctx.GetPlace(), d_out->type());
    framework::TensorCopySync(*d_out, ctx.GetPlace(), d_x);
    d_x->Resize(x_dims);
  }
};

}  // namespace operators
}  // namespace paddle
paddle/fluid/operators/fused/CMakeLists.txt
@@ -7,7 +7,12 @@ register_operators(EXCLUDES
    fused_fc_elementwise_layernorm_op
    multihead_matmul_op
    fused_embedding_eltwise_layernorm_op
    fusion_group_op
    fusion_gru_op)

# fusion_gru_op does not have CUDA kernel
op_library(fusion_gru_op)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(fusion_gru);\n")

if (WITH_GPU)
    # fused_bn_activation_op needs cudnn 7.4.1 above
paddle/fluid/operators/fused/fusion_gru_op.cc
@@ -19,6 +19,9 @@ limitations under the License. */
#include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc.h"
#include "paddle/fluid/operators/math/sequence2batch.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
@@ -122,8 +125,17 @@ void FusionGRUOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType FusionGRUOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
  framework::LibraryType library = framework::LibraryType::kPlain;
  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
#ifdef PADDLE_WITH_MKLDNN
  if (platform::CanMKLDNNBeUsed(ctx)) {
    library = framework::LibraryType::kMKLDNN;
    layout = framework::DataLayout::kMKLDNN;
  }
#endif
  return framework::OpKernelType(
      OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(),
      layout, library);
}

void FusionGRUOpMaker::Make() {
@@ -187,6 +199,9 @@ void FusionGRUOpMaker::Make() {
                "bool"
                "use origin mode in article https://arxiv.org/abs/1412.3555")
      .SetDefault(false);
  AddAttr<bool>("use_mkldnn",
                "(bool, default false) Only used in mkldnn kernel")
      .SetDefault(false);
  AddComment(R"DOC(
The Fusion complete GRU Operator.
This operator fuse the fully-connected operator into GRU,
paddle/fluid/operators/fused/mkldnn/fusion_gru_mkldnn_op.cc
0 → 100644
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_gru_op.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
namespace operators {

using paddle::framework::LoDTensor;
using paddle::framework::Tensor;
using paddle::platform::CPUDeviceContext;
using paddle::platform::MKLDNNGetDataType;
using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast;

template <typename T>
class GRUMKLDNNHandler
    : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
 public:
  GRUMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
                   const platform::MKLDNNDeviceContext& dev_ctx,
                   const mkldnn::engine mkldnn_engine,
                   platform::Place cpu_place, const LoDTensor* input,
                   const Tensor* weight_h, const Tensor* h0,
                   const bool is_reverse, const int64_t N, const int64_t Ti,
                   const int64_t IC, const int64_t OC,
                   const std::string& unique_name)
      : platform::MKLDNNHandlerT<T, dnnl::gru_forward>(
            dev_ctx, dev_ctx.GetEngine(), cpu_place,
            platform::CreateKey(unique_name, Ti)),
        N(N),
        Ti(Ti),
        IC(IC),
        OC(OC) {
    // Create memory key without Ti because weights, bias and h0 memories
    // do not depend on Ti size but primitive and input/output memory do
    if (platform::MKLDNNDeviceContext::tls().get_cur_mkldnn_session_id() !=
        platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default) {
      memory_key_ = unique_name;
    } else {
      memory_key_ = unique_name + "-t:" + platform::ThreadIDasStr();
    }

    if (!this->isCached()) {
      // oneDNN kernel has hardcoded activation functions
      PADDLE_ENFORCE_EQ(
          ctx.Attr<std::string>("gate_activation"), "sigmoid",
          platform::errors::Unimplemented(
              "oneDNN fusion_gru supports only sigmoid as a gate activation."));
      PADDLE_ENFORCE_EQ(
          ctx.Attr<std::string>("activation"), "tanh",
          platform::errors::Unimplemented(
              "oneDNN fusion_gru supports only tanh as an activation."));

      // oneDNN RNN dimensions
      const int64_t D = 1;  // Directions
      const int64_t L = 1;  // Layers (PP supports only 1 stacked layer)
      const int64_t G = 3;  // Number of Gates, 3 for GRU

      // Create memory descriptors
      auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType<T>(),
                                    MKLDNNMemoryFormat::any);
      auto weight_x_md = MKLDNNMemDesc(
          {L, D, IC, G, OC}, MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any);
      auto weight_h_md = MKLDNNMemDesc(
          {L, D, OC, G, OC}, MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any);
      auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType<float>(),
                                   MKLDNNMemoryFormat::ldgo);
      auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType<T>(),
                                     MKLDNNMemoryFormat::any);
      auto h0_md = dnnl::memory::desc();
      if (h0) {
        h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
                              MKLDNNMemoryFormat::ldnc);
      }

      // Create GRU oneDNN primitive
      const auto direction =
          is_reverse ? dnnl::rnn_direction::unidirectional_right2left
                     : dnnl::rnn_direction::unidirectional_left2right;

      this->AcquireForwardPrimitiveDescriptor(
          dnnl::prop_kind::forward_inference, direction, input_md, h0_md,
          weight_x_md, weight_h_md, bias_md, hidden_md, dnnl::memory::desc());
    }
  }

  bool is_NTC() {
    return (platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc()) ==
            dnnl::memory::format_tag::ntc);
  }

  void reorderRNNdata(const T* input_data, T* output_data,
                      std::vector<size_t> lod, const bool is_reverse,
                      platform::RNNReorderType reorder_type) {
    switch (reorder_type) {
      // Reorder input memory [WORDS, C] + LoD -> [N, T, C]
      case platform::RNNReorderType::PP_NTC: {
        auto* input_data_iter = input_data;
        for (int n = 0; n < N; ++n) {
          const auto num_elements = (lod[n + 1] - lod[n]) * IC;
          const auto offset = is_reverse ? (Ti * IC - num_elements) : 0;
          memcpy(output_data + n * Ti * IC + offset, input_data_iter,
                 sizeof(T) * num_elements);
          input_data_iter += num_elements;
        }
      } break;
      // Reorder input memory [WORDS, C] + LoD -> [T, N, C]
      case platform::RNNReorderType::PP_TNC: {
        auto* input_data_iter = input_data;
        for (int n = 0; n < N; ++n) {
          const auto num_elements = (lod[n + 1] - lod[n]);
          const auto offset = is_reverse ? (Ti - num_elements) : 0;
          for (size_t t = 0; t < num_elements; ++t) {
            memcpy(output_data + (t + offset) * N * IC + n * IC,
                   input_data_iter, sizeof(T) * IC);
            input_data_iter += IC;
          }
        }
      } break;
      // Reorder output values to PP format [N, T, C] -> [WORDS, C]
      case platform::RNNReorderType::NTC_PP: {
        auto* output_data_iter = output_data;
        for (int n = 0; n < N; ++n) {
          const auto num_elements = (lod[n + 1] - lod[n]) * OC;
          const auto offset = is_reverse ? (Ti * OC - num_elements) : 0;
          memcpy(output_data_iter, input_data + n * Ti * OC + offset,
                 sizeof(T) * num_elements);
          output_data_iter += num_elements;
        }
      } break;
      // Reorder output values to PP format [T, N, C] -> [WORDS, C]
      case platform::RNNReorderType::TNC_PP: {
        auto* output_data_iter = output_data;
        for (int n = 0; n < N; ++n) {
          const auto num_elements = lod[n + 1] - lod[n];
          const auto offset = is_reverse ? (Ti - num_elements) : 0;
          for (size_t t = 0; t < num_elements; ++t) {
            memcpy(output_data_iter,
                   input_data + (t + offset) * N * OC + n * OC,
                   sizeof(T) * OC);
            output_data_iter += OC;
          }
        }
      } break;
    }
  }

  std::shared_ptr<dnnl::memory> AcquireInputMemoryWithReorder(
      const LoDTensor* input, const bool is_reverse) {
    const auto name = this->key_ + "@input_mem";
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));

    if (!memory_p) {
      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_desc(),
                                                this->engine_);
      this->dev_ctx_.SetBlob(name, memory_p);
    }

    const auto& input_lod = input->lod()[0];
    auto* x_data = input->data<T>();

    auto* x_onednn_data = reinterpret_cast<T*>(memory_p->get_data_handle());
    memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC);

    if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) ==
        dnnl::memory::format_tag::ntc) {
      reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
                     platform::RNNReorderType::PP_NTC);
    } else {
      reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
                     platform::RNNReorderType::PP_TNC);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquireOutputMemory() {
    const auto name = this->key_ + "@output_mem";
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));

    if (!memory_p) {
      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->dst_desc(),
                                                this->engine_);
      this->dev_ctx_.SetBlob(name, memory_p);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) {
    const std::string h0_key = memory_key_ + "@h0";
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(h0_key));

    auto* h0_data = to_void_cast(h0->data<T>());

    if (!memory_p) {
      memory_p = std::make_shared<dnnl::memory>(
          this->fwd_pd_->weights_layer_desc(), this->engine_, h0_data);
      this->dev_ctx_.SetBlob(h0_key, memory_p);
    } else {
      memory_p->set_data_handle(h0_data);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquireWeightXMemory(const Tensor* weight_x,
                                                     const bool origin_mode) {
    const std::string wx_key = memory_key_ + "@weight_x";
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wx_key));

    if (!memory_p) {
      auto user_md =
          MKLDNNMemDesc({1, 1, IC, 3, OC}, MKLDNNGetDataType<float>(),
                        MKLDNNMemoryFormat::ldigo);
      auto user_memory = dnnl::memory(user_md, this->engine_);

      auto* weight_x_data =
          reinterpret_cast<float*>(user_memory.get_data_handle());
      memcpy(weight_x_data, weight_x->data<float>(),
             sizeof(float) * IC * 3 * OC);

      if (origin_mode == false) {
        for (int64_t i = 0; i < IC; ++i) {
          for (int64_t j = 0; j < OC; ++j) {
            weight_x_data[j] *= -1;
          }
          weight_x_data += 3 * OC;
        }
      }

      memory_p = std::make_shared<dnnl::memory>(
          this->fwd_pd_->weights_layer_desc(), this->engine_);

      dnnl::stream astream(this->engine_);
      dnnl::reorder(user_memory, *memory_p)
          .execute(astream, user_memory, *memory_p);

      this->dev_ctx_.SetBlob(wx_key, memory_p);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquireWeightHMemory(const Tensor* weight_h,
                                                     const bool origin_mode) {
    const std::string wh_key = memory_key_ + "@weight_h";
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wh_key));

    if (!memory_p) {
      auto user_md =
          MKLDNNMemDesc({1, 1, OC, 3, OC}, MKLDNNGetDataType<float>(),
                        MKLDNNMemoryFormat::ldigo);
      auto user_memory = dnnl::memory(user_md, this->engine_);

      // Reorder weights_h from PP format [OC, 2OC] + [OC, OC] to
      // oneDNN format [OC, 3OC]
      auto* weight_h_data =
          reinterpret_cast<float*>(user_memory.get_data_handle());
      auto* user_weight_h_data = weight_h->data<float>();

      auto src1_iter = user_weight_h_data;
      auto src2_iter = user_weight_h_data + 2 * OC * OC;

      for (int64_t c = 0; c < OC; ++c) {
        memcpy(weight_h_data, src1_iter, 2 * OC * sizeof(float));
        memcpy(weight_h_data + 2 * OC, src2_iter, OC * sizeof(float));

        src1_iter += 2 * OC;
        src2_iter += OC;
        weight_h_data += 3 * OC;
      }

      weight_h_data = reinterpret_cast<float*>(user_memory.get_data_handle());

      if (origin_mode == false) {
        for (int64_t i = 0; i < OC; ++i) {
          for (int64_t j = 0; j < OC; ++j) {
            weight_h_data[j] *= -1;
          }
          weight_h_data += 3 * OC;
        }
      }

      memory_p = std::make_shared<dnnl::memory>(
          this->fwd_pd_->weights_iter_desc(), this->engine_);

      dnnl::stream astream(this->engine_);
      dnnl::reorder(user_memory, *memory_p)
          .execute(astream, user_memory, *memory_p);

      this->dev_ctx_.SetBlob(wh_key, memory_p);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquireBiasMemory(const Tensor* bias,
                                                  const bool origin_mode) {
    const std::string bias_key = memory_key_ + "@bias";
    auto memory_p = std::static_pointer_cast<dnnl::memory>(
        this->dev_ctx_.GetBlob(bias_key));

    if (!memory_p) {
      memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->bias_desc(),
                                                this->engine_);
      auto* bias_data = reinterpret_cast<float*>(memory_p->get_data_handle());
      if (bias) {
        const float* user_bias_data =
            bias->data<float>();  // Bias in oneDNN is always float
        memcpy(bias_data, user_bias_data, sizeof(float) * 3 * OC);
      } else {
        // oneDNN always need bias memory, if it's not provided in PP, let
        // oneDNN allocate memory and set it to 0
        memset(bias_data, 0, sizeof(float) * 3 * OC);
      }

      if (origin_mode == false && bias) {
        for (int64_t i = 0; i < OC; ++i) {
          bias_data[i] *= -1;
        }
      }
      this->dev_ctx_.SetBlob(bias_key, memory_p);
    }
    return memory_p;
  }

 private:
  // RNN dimensions
  // N - Batch Size
  // Ti - Max sentence length
  // IC - Input Channels
  // OC - Output Channels
  const int64_t N, Ti, IC, OC;

  // Memory size of weights, bias and h0 does not depend
  // on Ti size, thus we need another key to cache them
  std::string memory_key_;
};
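reorderRNNdata bridges PaddlePaddle's packed LoD layout ([WORDS, C] plus offsets) and oneDNN's dense, padded RNN layouts. A toy, self-contained walk-through of the PP_NTC branch, assuming IC = 1 and two sentences with LoD {0, 2, 5} (lengths 2 and 3), so Ti = 3; padding goes at the end of a row in forward mode and at the front when is_reverse is true:

#include <cstring>
#include <vector>

std::vector<float> ToNTC(const std::vector<float>& words,
                         const std::vector<size_t>& lod, size_t Ti,
                         bool is_reverse) {
  const size_t N = lod.size() - 1, IC = 1;
  std::vector<float> out(N * Ti * IC, 0.f);  // zero padding
  const float* src = words.data();
  for (size_t n = 0; n < N; ++n) {
    const size_t num = (lod[n + 1] - lod[n]) * IC;
    const size_t offset = is_reverse ? (Ti * IC - num) : 0;
    std::memcpy(out.data() + n * Ti * IC + offset, src, sizeof(float) * num);
    src += num;
  }
  // words = {a0,a1, b0,b1,b2}: forward gives {a0,a1,0, b0,b1,b2},
  // reverse gives {0,a0,a1, b0,b1,b2}.
  return out;
}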
template <typename T>
class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto& dev_ctx =
        ctx.template device_context<platform::MKLDNNDeviceContext>();
    const auto& mkldnn_engine = dev_ctx.GetEngine();

    // Get Tensors
    const auto* input = ctx.Input<LoDTensor>("X");
    const auto* h0 = ctx.Input<Tensor>("H0");
    const auto* weight_x = ctx.Input<Tensor>("WeightX");
    const auto* weight_h = ctx.Input<Tensor>("WeightH");
    const auto* bias = ctx.Input<Tensor>("Bias");
    auto* hidden = ctx.Output<LoDTensor>("Hidden");

    // Get attributes
    const bool is_reverse = ctx.Attr<bool>("is_reverse");
    const bool origin_mode = ctx.Attr<bool>("origin_mode");

    // Get tensor dimensions
    const auto x_dims = framework::vectorize(input->dims());
    const auto weight_h_dims = framework::vectorize(weight_h->dims());
    const auto& input_lod = input->lod()[0];

    // Calculate RNN dimensions
    const int64_t N = input_lod.size() - 1;  // Number of sentences (batches)
    const int64_t Ti =                       // Max length of the sentence in a batch
        [&input_lod]() {
          size_t res = 0;
          for (size_t i = 0; i < (input_lod.size() - 1); ++i) {
            res = std::max(res, input_lod[i + 1] - input_lod[i]);
          }
          return res;
        }();
    const int64_t IC = x_dims[1];          // Input channels
    const int64_t OC = weight_h_dims[0];   // Output channels

    GRUMKLDNNHandler<T> handler(ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(),
                                input, weight_h, h0, is_reverse, N, Ti, IC, OC,
                                ctx.InputName("X") + ctx.InputName("WeightH"));

    auto input_memory_p =
        handler.AcquireInputMemoryWithReorder(input, is_reverse);
    auto weight_x_memory_p =
        handler.AcquireWeightXMemory(weight_x, origin_mode);
    auto weight_h_memory_p =
        handler.AcquireWeightHMemory(weight_h, origin_mode);
    auto bias_memory_p = handler.AcquireBiasMemory(bias, origin_mode);
    auto hidden_onednn_memory_p = handler.AcquireOutputMemory();

    std::unordered_map<int, dnnl::memory> gru_args = {
        {DNNL_ARG_SRC_LAYER, *input_memory_p},
        {DNNL_ARG_WEIGHTS_LAYER, *weight_x_memory_p},
        {DNNL_ARG_WEIGHTS_ITER, *weight_h_memory_p},
        {DNNL_ARG_BIAS, *bias_memory_p},
        {DNNL_ARG_DST_LAYER, *hidden_onednn_memory_p}};

    if (h0) {
      auto h0_memory_p = handler.AcquireH0Memory(h0);
      gru_args.insert({DNNL_ARG_SRC_ITER, *h0_memory_p});
    }

    auto gru_forward_p = handler.AcquireForwardPrimitive();

    dnnl::stream astream(mkldnn_engine);
    gru_forward_p->execute(astream, gru_args);
    astream.wait();

    auto* hidden_onednn_data =
        reinterpret_cast<T*>(hidden_onednn_memory_p->get_data_handle());
    auto* hidden_data = hidden->mutable_data<T>(ctx.GetPlace());
    if (handler.is_NTC()) {
      handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod,
                             is_reverse, platform::RNNReorderType::NTC_PP);
    } else {
      handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod,
                             is_reverse, platform::RNNReorderType::TNC_PP);
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_KERNEL(fusion_gru, MKLDNN, paddle::platform::CPUPlace,
                   ops::FusionGRUMKLDNNKernel<float>);
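The kernel derives the oneDNN RNN dimensions from the LoD of X: N is the number of sequences and Ti the longest one (the lambda above). A small hedged restatement of that bookkeeping, with the lod values purely illustrative:

#include <algorithm>
#include <cstdint>
#include <vector>

// With lod = {0, 2, 5} this yields N = 2 sequences and Ti = 3 time steps.
void RnnDims(const std::vector<size_t>& lod, int64_t* N, int64_t* Ti) {
  *N = static_cast<int64_t>(lod.size()) - 1;
  size_t max_len = 0;
  for (size_t i = 0; i + 1 < lod.size(); ++i)
    max_len = std::max(max_len, lod[i + 1] - lod[i]);
  *Ti = static_cast<int64_t>(max_len);
}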
paddle/fluid/platform/mkldnn_helper.h
@@ -181,6 +181,8 @@ inline mkldnn::memory::format_tag GetMKLDNNFormat(
  if (inner_nblks == 0) {
    if (strides[0] >= strides[1] && strides[1] >= strides[2]) {
      return mkldnn::memory::format_tag::ncw;
    } else if (strides[1] >= strides[0] && strides[0] >= strides[2]) {
      return mkldnn::memory::format_tag::ntc;
    } else {
      return mkldnn::memory::format_tag::nwc;
    }
@@ -420,5 +422,7 @@ inline std::vector<std::vector<int64_t>> ToMkldnnPadding(
  }
}

enum class RNNReorderType { PP_NTC, PP_TNC, NTC_PP, TNC_PP };

}  // namespace platform
}  // namespace paddle
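The stride comparison added to GetMKLDNNFormat simply asks which logical dimension is outermost in memory for a 3-D tensor and maps that ordering to a format tag. A tiny illustrative helper (not part of the Paddle API) expressing the same rule:

#include <array>
#include <string>

// strides sorted descending as (dim0, dim1, dim2) -> ncw;
// dim1 outermost -> ntc; otherwise channel-last -> nwc.
std::string DeduceFormat(const std::array<long, 3>& strides) {
  if (strides[0] >= strides[1] && strides[1] >= strides[2]) return "ncw";
  if (strides[1] >= strides[0] && strides[0] >= strides[2]) return "ntc";
  return "nwc";
}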
paddle/fluid/pybind/CMakeLists.txt
set(PYBIND_DEPS pybind python proto_desc memory executor fleet_wrapper box_wrapper prune
  feed_fetch_method pass_builder parallel_executor profiler layer tracer engine scope_pool
  analysis_predictor imperative_profiler imperative_flag save_load_util dlpack_tensor device_context
  gloo_wrapper infer_io_utils heter_wrapper)

if (WITH_NCCL)
  set(PYBIND_DEPS ${PYBIND_DEPS} nccl_wrapper)
@@ -31,6 +31,7 @@ set(PYBIND_SRCS
global_value_getter_setter.cc
reader_py.cc
fleet_wrapper_py.cc
heter_wrapper_py.cc
gloo_wrapper_py.cc
box_helper_py.cc
data_set_py.cc
paddle/fluid/pybind/heter_wrapper_py.cc
0 → 100644
/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fcntl.h>
#ifdef _POSIX_C_SOURCE
#undef _POSIX_C_SOURCE
#endif
#ifdef _XOPEN_SOURCE
#undef _XOPEN_SOURCE
#endif
#include <string>
#include <vector>
#include "google/protobuf/io/zero_copy_stream_impl.h"
#include "google/protobuf/text_format.h"
#include "paddle/fluid/framework/fleet/heter_wrapper.h"
#include "paddle/fluid/pybind/heter_wrapper_py.h"
namespace py = pybind11;

namespace paddle {
namespace pybind {
#ifdef PADDLE_WITH_PSLIB
void BindHeterWrapper(py::module* m) {
  py::class_<framework::HeterWrapper,
             std::shared_ptr<framework::HeterWrapper>>(*m, "Heter")
      .def(py::init([]() { return framework::HeterWrapper::GetInstance(); }))
      .def("create_client2xpu_connection",
           &framework::HeterWrapper::CreateClient2XpuConnection)
      .def("set_xpu_list", &framework::HeterWrapper::SetXpuList)
      .def("start_xpu_service", &framework::HeterWrapper::StartXpuService)
      .def("end_pass", &framework::HeterWrapper::EndPass)
      .def("stop_xpu_service", &framework::HeterWrapper::StopXpuService);
}  // end HeterWrapper
#endif
}  // end namespace pybind
}  // end namespace paddle
paddle/fluid/pybind/heter_wrapper_py.h
0 → 100644
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "pybind11/pybind11.h"
#include "pybind11/stl.h"
namespace py = pybind11;

namespace paddle {
namespace pybind {
#ifdef PADDLE_WITH_PSLIB
void BindHeterWrapper(py::module* m);
#endif
}  // namespace pybind
}  // namespace paddle
paddle/fluid/pybind/pybind.cc
@@ -66,6 +66,7 @@ limitations under the License. */
#include "paddle/fluid/pybind/fleet_wrapper_py.h"
#include "paddle/fluid/pybind/global_value_getter_setter.h"
#include "paddle/fluid/pybind/gloo_wrapper_py.h"
#include "paddle/fluid/pybind/heter_wrapper_py.h"
#include "paddle/fluid/pybind/imperative.h"
#include "paddle/fluid/pybind/inference_api.h"
#include "paddle/fluid/pybind/ir.h"
@@ -2479,6 +2480,9 @@ All parameter, weight, gradient are variables in Paddle.
      .def("device_count", &ParallelExecutor::DeviceCount);

  BindFleetWrapper(&m);

#ifdef PADDLE_WITH_PSLIB
  BindHeterWrapper(&m);
#endif
  BindGlooWrapper(&m);
  BindBoxHelper(&m);
#ifdef PADDLE_WITH_BOX_PS
python/paddle/fleet/__init__.py
@@ -16,10 +16,13 @@
from .base.distributed_strategy import DistributedStrategy
from .base.fleet_base import Fleet
from .base.util_factory import UtilBase
from .dataset import *
#from .base.role_maker import PaddleCloudRoleMaker

__all__ = [
    "DistributedStrategy", "UtilBase", "DatasetFactory", "DatasetBase",
    "InMemoryDataset", "QueueDataset"
]

fleet = Fleet()
init = fleet.init
python/paddle/fleet/dataset/__init__.py
@@ -10,3 +10,5 @@
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
from .dataset import *
python/paddle/fleet/dataset/dataset.py
0 → 100644
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This is definition of dataset class, which is high performance IO."""
import paddle
import paddle.fluid as fluid
from paddle.fluid.proto import data_feed_pb2
from google.protobuf import text_format
import paddle.fluid.core as core
class DatasetFactory(object):
    """
    DatasetFactory is a factory which creates a dataset by its name;
    you can create "QueueDataset", "InMemoryDataset", or "FileInstantDataset".
    The default is "QueueDataset".

    Example:
        .. code-block:: python

          import paddle.fluid as fluid
          dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
    """

    def __init__(self):
        """ Init. """
        pass

    def create_dataset(self, datafeed_class="QueueDataset"):
        """
        Create "QueueDataset", "InMemoryDataset", or "FileInstantDataset";
        the default is "QueueDataset".

        Args:
            datafeed_class(str): datafeed class name, QueueDataset or InMemoryDataset.
                                 Default is QueueDataset.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
        """
        try:
            dataset = globals()[datafeed_class]()
            return dataset
        except:
            raise ValueError("datafeed class %s does not exist" %
                             datafeed_class)


class DatasetBase(object):
    """ Base dataset class. """

    def __init__(self):
        """ Init. """
        # define class name here
        # to decide whether we need create in memory instance
        self.proto_desc = data_feed_pb2.DataFeedDesc()
        self.proto_desc.pipe_command = "cat"
        self.dataset = core.Dataset("MultiSlotDataset")
        self.thread_num = 1
        self.filelist = []

    def set_pipe_command(self, pipe_command):
        """
        Set pipe command of current dataset.
        A pipe command is a UNIX pipeline command that can be used only

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
              dataset.set_pipe_command("python my_script.py")

        Args:
            pipe_command(str): pipe command
        """
        self.proto_desc.pipe_command = pipe_command

    def set_rank_offset(self, rank_offset):
        """
        Set rank_offset for merge_pv. It sets the message of Pv.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
              dataset.set_rank_offset("rank_offset")

        Args:
            rank_offset(str): rank_offset's name
        """
        self.proto_desc.rank_offset = rank_offset

    def set_fea_eval(self, record_candidate_size, fea_eval=True):
        """
        Set fea eval mode for slots shuffle to debug the importance level of
        slots (features); fea_eval needs to be set True for slots shuffle.

        Args:
            record_candidate_size(int): size of instances candidate to shuffle
                                        one slot
            fea_eval(bool): whether to enable fea eval mode to enable slots shuffle.
                            Default is True.

        Examples:
            .. code-block:: python

            import paddle.fluid as fluid
            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
            dataset.set_fea_eval(1000000, True)
        """
        if fea_eval:
            self.dataset.set_fea_eval(fea_eval, record_candidate_size)
        self.fea_eval = fea_eval

    def slots_shuffle(self, slots):
        """
        Slots Shuffle
        Slots Shuffle is a shuffle method at the slots level, which is usually used
        in sparse features with a large scale of instances. To compare the metric, i.e.
        auc, while doing slots shuffle on one or several slots with baseline to
        evaluate the importance level of slots (features).

        Args:
            slots(list[string]): the set of slots (string) to do slots shuffle.

        Examples:
            import paddle.fluid as fluid
            dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
            dataset.set_merge_by_lineid()
            #suppose there is a slot 0
            dataset.slots_shuffle(['0'])
        """
        if self.fea_eval:
            slots_set = set(slots)
            self.dataset.slots_shuffle(slots_set)

    def set_batch_size(self, batch_size):
        """
        Set batch size. Will be effective during training.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
              dataset.set_batch_size(128)

        Args:
            batch_size(int): batch size
        """
        self.proto_desc.batch_size = batch_size

    def set_pv_batch_size(self, pv_batch_size):
        """
        Set pv batch size. It will be effective during enable_pv_merge.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
              dataset.set_pv_batch(128)

        Args:
            pv_batch_size(int): pv batch size
        """
        self.proto_desc.pv_batch_size = pv_batch_size

    def set_thread(self, thread_num):
        """
        Set thread num; it is the number of readers.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
              dataset.set_thread(12)

        Args:
            thread_num(int): thread num
        """
        self.dataset.set_thread_num(thread_num)
        self.thread_num = thread_num

    def set_filelist(self, filelist):
        """
        Set file list in the current worker.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
              dataset.set_filelist(['a.txt', 'b.txt'])

        Args:
            filelist(list): file list
        """
        self.dataset.set_filelist(filelist)
        self.filelist = filelist

    def set_input_type(self, input_type):
        self.proto_desc.input_type = input_type

    def set_use_var(self, var_list):
        """
        Set Variables which you will use.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
              dataset.set_use_var([data, label])

        Args:
            var_list(list): variable list
        """
        multi_slot = self.proto_desc.multi_slot_desc
        for var in var_list:
            slot_var = multi_slot.slots.add()
            slot_var.is_used = True
            slot_var.name = var.name
            if var.lod_level == 0:
                slot_var.is_dense = True
                slot_var.shape.extend(var.shape)
            if var.dtype == core.VarDesc.VarType.FP32:
                slot_var.type = "float"
            elif var.dtype == core.VarDesc.VarType.INT64:
                slot_var.type = "uint64"
            else:
                raise ValueError(
                    "Currently, fluid.dataset only supports dtype=float32 and dtype=int64"
                )

    def set_hdfs_config(self, fs_name, fs_ugi):
        """
        Set hdfs config: fs name and ugi.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
              dataset.set_hdfs_config("my_fs_name", "my_fs_ugi")

        Args:
            fs_name(str): fs name
            fs_ugi(str): fs ugi
        """
        self.dataset.set_hdfs_config(fs_name, fs_ugi)

    def set_download_cmd(self, download_cmd):
        """
        Set customized download cmd: download_cmd.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
              dataset.set_download_cmd("./read_from_afs")

        Args:
            download_cmd(str): customized download command
        """
        self.dataset.set_download_cmd(download_cmd)

    def _prepare_to_run(self):
        """
        Set data_feed_desc before load or shuffle;
        users do not need to call this function.
        """
        if self.thread_num > len(self.filelist):
            self.thread_num = len(self.filelist)
        self.dataset.set_thread_num(self.thread_num)
        self.dataset.set_data_feed_desc(self.desc())
        self.dataset.create_readers()

    def _finish_to_run(self):
        self.dataset.destroy_readers()

    def desc(self):
        """
        Returns a protobuf message for this DataFeedDesc.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset()
              print(dataset.desc())

        Returns:
            A string message
        """
        return text_format.MessageToString(self.proto_desc)

    def _dynamic_adjust_before_train(self, thread_num):
        pass

    def _dynamic_adjust_after_train(self):
        pass


class InMemoryDataset(DatasetBase):
    """
    InMemoryDataset will load data into memory
    and shuffle data before training.
    This class should be created by DatasetFactory.

    Example:
        dataset = paddle.fluid.DatasetFactory().create_dataset("InMemoryDataset")
    """

    def __init__(self):
        """ Init. """
        super(InMemoryDataset, self).__init__()
        self.proto_desc.name = "MultiSlotInMemoryDataFeed"
        self.fleet_send_batch_size = None
        self.is_user_set_queue_num = False
        self.queue_num = None
        self.parse_ins_id = False
        self.parse_content = False
        self.parse_logkey = False
        self.merge_by_sid = True
        self.enable_pv_merge = False
        self.merge_by_lineid = False
        self.fleet_send_sleep_seconds = None

    def set_feed_type(self, data_feed_type):
        """
        Set data_feed_desc
        """
        self.proto_desc.name = data_feed_type

    def _prepare_to_run(self):
        """
        Set data_feed_desc before load or shuffle;
        users do not need to call this function.
        """
        if self.thread_num <= 0:
            self.thread_num = 1
        self.dataset.set_thread_num(self.thread_num)
        if self.queue_num is None:
            self.queue_num = self.thread_num
        self.dataset.set_queue_num(self.queue_num)
        self.dataset.set_parse_ins_id(self.parse_ins_id)
        self.dataset.set_parse_content(self.parse_content)
        self.dataset.set_parse_logkey(self.parse_logkey)
        self.dataset.set_merge_by_sid(self.merge_by_sid)
        self.dataset.set_enable_pv_merge(self.enable_pv_merge)
        self.dataset.set_data_feed_desc(self.desc())
        self.dataset.create_channel()
        self.dataset.create_readers()

    def _dynamic_adjust_before_train(self, thread_num):
        if not self.is_user_set_queue_num:
            self.dataset.dynamic_adjust_channel_num(thread_num, False)
        self.dataset.dynamic_adjust_readers_num(thread_num)

    def _dynamic_adjust_after_train(self):
        if not self.is_user_set_queue_num:
            self.dataset.dynamic_adjust_channel_num(self.thread_num, False)
        self.dataset.dynamic_adjust_readers_num(self.thread_num)

    def set_queue_num(self, queue_num):
        """
        Set Dataset output queue num; training threads get data from queues.

        Args:
            queue_num(int): dataset output queue num

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              dataset.set_queue_num(12)
        """
        self.is_user_set_queue_num = True
        self.queue_num = queue_num

    def set_parse_ins_id(self, parse_ins_id):
        """
        Set if Dataset needs to parse ins_id.

        Args:
            parse_ins_id(bool): if parse ins_id or not

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              dataset.set_parse_ins_id(True)
        """
        self.parse_ins_id = parse_ins_id

    def set_parse_content(self, parse_content):
        """
        Set if Dataset needs to parse content.

        Args:
            parse_content(bool): if parse content or not

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              dataset.set_parse_content(True)
        """
        self.parse_content = parse_content

    def set_parse_logkey(self, parse_logkey):
        """
        Set if Dataset needs to parse logkey.

        Args:
            parse_logkey(bool): if parse logkey or not

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              dataset.set_parse_logkey(True)
        """
        self.parse_logkey = parse_logkey

    def set_merge_by_sid(self, merge_by_sid):
        """
        Set if Dataset needs to merge sid. If not, one ins means one Pv.

        Args:
            merge_by_sid(bool): if merge sid or not

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              dataset.set_merge_by_sid(True)
        """
        self.merge_by_sid = merge_by_sid

    def set_enable_pv_merge(self, enable_pv_merge):
        """
        Set if Dataset needs to merge pv.

        Args:
            enable_pv_merge(bool): if enable_pv_merge or not

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              dataset.set_enable_pv_merge(True)
        """
        self.enable_pv_merge = enable_pv_merge

    def preprocess_instance(self):
        """
        Merge pv instance and convey it from input_channel to input_pv_channel.
        It will be effective when enable_pv_merge_ is True.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              filelist = ["a.txt", "b.txt"]
              dataset.set_filelist(filelist)
              dataset.load_into_memory()
              dataset.preprocess_instance()
        """
        self.dataset.preprocess_instance()

    def set_current_phase(self, current_phase):
        """
        Set current phase in train. It is useful for unit test.
        current_phase : 1 for join, 0 for update.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              filelist = ["a.txt", "b.txt"]
              dataset.set_filelist(filelist)
              dataset.load_into_memory()
              dataset.set_current_phase(1)
        """
        self.dataset.set_current_phase(current_phase)

    def postprocess_instance(self):
        """
        Divide pv instance and convey it to input_channel.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              filelist = ["a.txt", "b.txt"]
              dataset.set_filelist(filelist)
              dataset.load_into_memory()
              dataset.preprocess_instance()
              exe.train_from_dataset(dataset)
              dataset.postprocess_instance()
        """
        self.dataset.postprocess_instance()

    def set_fleet_send_batch_size(self, fleet_send_batch_size=1024):
        """
        Set fleet send batch size; default is 1024.

        Args:
            fleet_send_batch_size(int): fleet send batch size

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              dataset.set_fleet_send_batch_size(800)
        """
        self.fleet_send_batch_size = fleet_send_batch_size

    def set_fleet_send_sleep_seconds(self, fleet_send_sleep_seconds=0):
        """
        Set fleet send sleep time; default is 0.

        Args:
            fleet_send_sleep_seconds(int): fleet send sleep time

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              dataset.set_fleet_send_sleep_seconds(2)
        """
        self.fleet_send_sleep_seconds = fleet_send_sleep_seconds

    def set_merge_by_lineid(self, merge_size=2):
        """
        Set merge by line id; instances of the same line id will be merged after
        shuffle. You should parse the line id in your data generator.

        Args:
            merge_size(int): ins size to merge. Default is 2.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              dataset.set_merge_by_lineid()
        """
        self.dataset.set_merge_by_lineid(merge_size)
        self.merge_by_lineid = True
        self.parse_ins_id = True

    def set_generate_unique_feasigns(self, generate_uni_feasigns, shard_num):
        self.dataset.set_generate_unique_feasigns(generate_uni_feasigns)
        self.gen_uni_feasigns = generate_uni_feasigns
        self.local_shard_num = shard_num

    def generate_local_tables_unlock(self, table_id, fea_dim, read_thread_num,
                                     consume_thread_num, shard_num):
        self.dataset.generate_local_tables_unlock(
            table_id, fea_dim, read_thread_num, consume_thread_num, shard_num)

    def load_into_memory(self):
        """
        Load data into memory.

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
              filelist = ["a.txt", "b.txt"]
              dataset.set_filelist(filelist)
              dataset.load_into_memory()
        """
        self._prepare_to_run()
        self.dataset.load_into_memory()

    def preload_into_memory(self, thread_num=None):
        """
        Load data into memory in async mode.

        Args:
            thread_num(int): preload thread num

        Examples:
            .. code-block:: python

              import paddle.fluid as fluid
              dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.preload_into_memory()
dataset.wait_preload_done()
"""
self
.
_prepare_to_run
()
if
thread_num
is
None
:
thread_num
=
self
.
thread_num
self
.
dataset
.
set_preload_thread_num
(
thread_num
)
self
.
dataset
.
create_preload_readers
()
self
.
dataset
.
preload_into_memory
()
def
wait_preload_done
(
self
):
"""
Wait preload_into_memory done
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.preload_into_memory()
dataset.wait_preload_done()
"""
self
.
dataset
.
wait_preload_done
()
self
.
dataset
.
destroy_preload_readers
()
def
local_shuffle
(
self
):
"""
Local shuffle
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
dataset.local_shuffle()
"""
self
.
dataset
.
local_shuffle
()
def
global_shuffle
(
self
,
fleet
=
None
,
thread_num
=
12
):
"""
Global shuffle.
Global shuffle can be used only in distributed mode. i.e. multiple
processes on single machine or multiple machines training together.
If you run in distributed mode, you should pass fleet instead of None.
Examples:
.. code-block:: python
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
dataset.global_shuffle(fleet)
Args:
fleet(Fleet): fleet singleton. Default None.
thread_num(int): shuffle thread num. Default is 12.
"""
trainer_num
=
1
if
fleet
is
not
None
:
fleet
.
_role_maker
.
barrier_worker
()
trainer_num
=
fleet
.
worker_num
()
if
self
.
fleet_send_batch_size
is
None
:
self
.
fleet_send_batch_size
=
1024
if
self
.
fleet_send_sleep_seconds
is
None
:
self
.
fleet_send_sleep_seconds
=
0
self
.
dataset
.
register_client2client_msg_handler
()
self
.
dataset
.
set_trainer_num
(
trainer_num
)
self
.
dataset
.
set_fleet_send_batch_size
(
self
.
fleet_send_batch_size
)
self
.
dataset
.
set_fleet_send_sleep_seconds
(
self
.
fleet_send_sleep_seconds
)
if
fleet
is
not
None
:
fleet
.
_role_maker
.
barrier_worker
()
self
.
dataset
.
global_shuffle
(
thread_num
)
if
fleet
is
not
None
:
fleet
.
_role_maker
.
barrier_worker
()
if
self
.
merge_by_lineid
:
self
.
dataset
.
merge_by_lineid
()
if
fleet
is
not
None
:
fleet
.
_role_maker
.
barrier_worker
()
def
release_memory
(
self
):
"""
:api_attr: Static Graph
Release InMemoryDataset memory data, when data will not be used again.
Examples:
.. code-block:: python
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
dataset.global_shuffle(fleet)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
exe.train_from_dataset(fluid.default_main_program(), dataset)
dataset.release_memory()
"""
self
.
dataset
.
release_memory
()
def
get_pv_data_size
(
self
):
"""
Get memory data size of Pv, user can call this function to know the pv num
of ins in all workers after load into memory.
Note:
This function may cause bad performance, because it has barrier
Returns:
The size of memory pv data.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
print dataset.get_pv_data_size()
"""
return
self
.
dataset
.
get_pv_data_size
()
def
get_memory_data_size
(
self
,
fleet
=
None
):
"""
Get memory data size, user can call this function to know the num
of ins in all workers after load into memory.
Note:
This function may cause bad performance, because it has barrier
Args:
fleet(Fleet): Fleet Object.
Returns:
The size of memory data.
Examples:
.. code-block:: python
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
print dataset.get_memory_data_size(fleet)
"""
import
numpy
as
np
local_data_size
=
self
.
dataset
.
get_memory_data_size
()
local_data_size
=
np
.
array
([
local_data_size
])
if
fleet
is
not
None
:
global_data_size
=
local_data_size
*
0
fleet
.
_role_maker
.
all_reduce_worker
(
local_data_size
,
global_data_size
)
return
global_data_size
[
0
]
return
local_data_size
[
0
]
def
get_shuffle_data_size
(
self
,
fleet
=
None
):
"""
Get shuffle data size, user can call this function to know the num
of ins in all workers after local/global shuffle.
Note:
This function may cause bad performance to local shuffle,
because it has barrier. It does not affect global shuffle.
Args:
fleet(Fleet): Fleet Object.
Returns:
The size of shuffle data.
Examples:
.. code-block:: python
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
dataset.global_shuffle(fleet)
print dataset.get_shuffle_data_size(fleet)
"""
import
numpy
as
np
local_data_size
=
self
.
dataset
.
get_shuffle_data_size
()
local_data_size
=
np
.
array
([
local_data_size
])
if
fleet
is
not
None
:
global_data_size
=
local_data_size
*
0
fleet
.
_role_maker
.
all_reduce_worker
(
local_data_size
,
global_data_size
)
return
global_data_size
[
0
]
return
local_data_size
[
0
]
class
QueueDataset
(
DatasetBase
):
"""
QueueDataset, it will process data streamly.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
"""
def
__init__
(
self
):
"""
Initialize QueueDataset
This class should be created by DatasetFactory
"""
super
(
QueueDataset
,
self
).
__init__
()
self
.
proto_desc
.
name
=
"MultiSlotDataFeed"
def
_prepare_to_run
(
self
):
"""
Set data_feed_desc/thread num/filelist before run,
user no need to call this function.
"""
if
self
.
thread_num
>
len
(
self
.
filelist
):
self
.
thread_num
=
len
(
self
.
filelist
)
if
self
.
thread_num
==
0
:
self
.
thread_num
=
1
self
.
dataset
.
set_thread_num
(
self
.
thread_num
)
self
.
dataset
.
set_filelist
(
self
.
filelist
)
self
.
dataset
.
set_data_feed_desc
(
self
.
desc
())
self
.
dataset
.
create_readers
()
def
local_shuffle
(
self
):
"""
Local shuffle data.
Local shuffle is not supported in QueueDataset
NotImplementedError will be raised
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset.local_shuffle()
Raises:
NotImplementedError: QueueDataset does not support local shuffle
"""
raise
NotImplementedError
(
"QueueDataset does not support local shuffle, "
"please use InMemoryDataset for local_shuffle"
)
def
global_shuffle
(
self
,
fleet
=
None
):
"""
Global shuffle data.
Global shuffle is not supported in QueueDataset
NotImplementedError will be raised
Args:
fleet(Fleet): fleet singleton. Default None.
Examples:
.. code-block:: python
import paddle.fluid as fluid
from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
dataset.global_shuffle(fleet)
Raises:
NotImplementedError: QueueDataset does not support global shuffle
"""
raise
NotImplementedError
(
"QueueDataset does not support global shuffle, "
"please use InMemoryDataset for global_shuffle"
)
class
FileInstantDataset
(
DatasetBase
):
"""
FileInstantDataset, it will process data streamly.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory.create_dataset("FileInstantDataset")
"""
def
__init__
(
self
):
"""
Initialize FileInstantDataset
This class should be created by DatasetFactory
"""
super
(
FileInstantDataset
,
self
).
__init__
()
self
.
proto_desc
.
name
=
"MultiSlotFileInstantDataFeed"
def
local_shuffle
(
self
):
"""
Local shuffle
FileInstantDataset does not support local shuffle
"""
raise
NotImplementedError
(
"FileInstantDataset does not support local shuffle, "
"please use InMemoryDataset for local_shuffle"
)
def
global_shuffle
(
self
,
fleet
=
None
):
"""
Global shuffle
FileInstantDataset does not support global shuffle
"""
raise
NotImplementedError
(
"FileInstantDataset does not support global shuffle, "
"please use InMemoryDataset for global_shuffle"
)
class
BoxPSDataset
(
InMemoryDataset
):
"""
BoxPSDataset: derived from InMemoryDataset.
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
"""
def
__init__
(
self
):
"""
Initialize BoxPSDataset
This class should be created by DatasetFactory
"""
super
(
BoxPSDataset
,
self
).
__init__
()
self
.
boxps
=
core
.
BoxPS
(
self
.
dataset
)
self
.
proto_desc
.
name
=
"PaddleBoxDataFeed"
def
set_date
(
self
,
date
):
"""
Workaround for date
"""
year
=
int
(
date
[:
4
])
month
=
int
(
date
[
4
:
6
])
day
=
int
(
date
[
6
:])
self
.
boxps
.
set_date
(
year
,
month
,
day
)
def
begin_pass
(
self
):
"""
Begin Pass
Notify BoxPS to load sparse parameters of next pass to GPU Memory
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
dataset.begin_pass()
"""
self
.
boxps
.
begin_pass
()
def
end_pass
(
self
,
need_save_delta
):
"""
End Pass
Notify BoxPS that current pass ended
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
dataset.end_pass(True)
"""
self
.
boxps
.
end_pass
(
need_save_delta
)
def
wait_preload_done
(
self
):
"""
Wait async preload done
Wait Until Feed Pass Done
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.preload_into_memory()
dataset.wait_preload_done()
"""
self
.
boxps
.
wait_feed_pass_done
()
def
load_into_memory
(
self
):
"""
Load next pass into memory and notify boxps to fetch its emb from SSD
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.load_into_memory()
"""
self
.
_prepare_to_run
()
self
.
boxps
.
load_into_memory
()
def
preload_into_memory
(
self
):
"""
Begin async preload next pass while current pass may be training
Examples:
.. code-block:: python
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("BoxPSDataset")
filelist = ["a.txt", "b.txt"]
dataset.set_filelist(filelist)
dataset.preload_into_memory()
"""
self
.
_prepare_to_run
()
self
.
boxps
.
preload_into_memory
()
def
_dynamic_adjust_before_train
(
self
,
thread_num
):
if
not
self
.
is_user_set_queue_num
:
self
.
dataset
.
dynamic_adjust_channel_num
(
thread_num
,
True
)
self
.
dataset
.
dynamic_adjust_readers_num
(
thread_num
)
def
_dynamic_adjust_after_train
(
self
):
pass
def
slots_shuffle
(
self
,
slots
):
"""
Slots Shuffle
Slots Shuffle is a shuffle method in slots level, which is usually used
in sparse feature with large scale of instances. To compare the metric, i.e.
auc while doing slots shuffle on one or several slots with baseline to
evaluate the importance level of slots(features).
Args:
slots(list[string]): the set of slots(string) to do slots shuffle.
Examples:
import paddle.fluid as fluid
dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset.set_merge_by_lineid()
#suppose there is a slot 0
dataset.slots_shuffle(['0'])
"""
slots_set
=
set
(
slots
)
self
.
boxps
.
slots_shuffle
(
slots_set
)
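Taken together, the InMemoryDataset methods above form one pipeline: configure parsing, load into memory, shuffle, then query sizes and release. A minimal end-to-end sketch of that flow follows; it reuses the file names and the pslib fleet singleton from the docstring examples and is only an illustrative outline, not part of this commit.

    import paddle.fluid as fluid
    from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet

    dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
    dataset.set_filelist(["a.txt", "b.txt"])        # assumed local files, as in the docstrings
    dataset.set_merge_by_lineid(2)                  # also switches on parse_ins_id
    dataset.load_into_memory()                      # runs _prepare_to_run() internally
    dataset.global_shuffle(fleet)                   # worker barriers around the client2client shuffle
    print(dataset.get_shuffle_data_size(fleet))     # all-reduced instance count across workers
    dataset.release_memory()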
python/paddle/fluid/annotations.py  (deleted, 100644 → 0; view file @ 4a15797e)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function
import functools
import sys

__all__ = ['deprecated']


def deprecated(since, instead, extra_message=""):
    def decorator(func):
        err_msg = "API {0} is deprecated since {1}. Please use {2} instead.".format(
            func.__name__, since, instead)
        if len(extra_message) != 0:
            err_msg += "\n"
            err_msg += extra_message

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            print(err_msg, file=sys.stderr)
            return func(*args, **kwargs)

        wrapper.__doc__ += "\n"
        wrapper.__doc__ += err_msg
        return wrapper

    return decorator
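The deleted helper wrapped an API so that every call printed a deprecation notice to stderr and appended it to the docstring. A small sketch of how it was typically applied follows; `old_api` is a hypothetical function used only for illustration, and the commented lines show the paddle.utils.deprecated replacement that later diffs in this commit switch to.

    # usage of the removed decorator; its signature was deprecated(since, instead, extra_message="")
    @deprecated("0.15.0", "paddle.utils.deprecated")
    def old_api(*args, **kwargs):   # hypothetical function, for illustration only
        pass

    # replacement imported elsewhere in this commit (see device.py and nn.py below):
    # from paddle.utils import deprecated
    # @deprecated(since='0.15.0', update_to="paddle.fluid.ParallelExecutor")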
python/paddle/fluid/device_worker.py  (view file @ 862dde5e)
...
@@ -223,7 +223,8 @@ class DownpourSGD(DeviceWorker):
                    dense_table_set.add(i)
                break
        trainer_desc.device_worker_name = "DownpourWorker"
        trainer_desc.device_worker_name = opt_info.get("worker_class",
                                                       "DownpourWorker")
        pull_thread = trainer_desc.pull_dense_param
        pull_thread.device_num = trainer_desc.thread_num
        if opt_info.get("program_id_to_worker") is None:
...
python/paddle/fluid/dygraph/dygraph_to_static/program_translator.py  (view file @ 862dde5e)
...
@@ -36,7 +36,6 @@ from paddle.fluid.wrapped_decorator import signature_safe_contextmanager
from paddle.fluid.dygraph.base import param_guard
from paddle.fluid.data_feeder import check_type
from paddle.fluid.dygraph.dygraph_to_static.partial_program import partial_program_from
from paddle.fluid.annotations import deprecated

__all__ = ['ProgramTranslator', 'convert_to_static']
...
python/paddle/fluid/dygraph/nn.py  (view file @ 862dde5e)
...
@@ -35,7 +35,7 @@ __all__ = [
    'Conv2D', 'Conv3D', 'Pool2D', 'Linear', 'BatchNorm', 'Dropout', 'Embedding',
    'GRUUnit', 'InstanceNorm', 'LayerNorm', 'NCE', 'PRelu',
    'BilinearTensorProduct', 'Conv2DTranspose', 'Conv3DTranspose', 'GroupNorm',
    'SpectralNorm', 'TreeConv'
    'SpectralNorm', 'TreeConv', 'Flatten'
]
...
@@ -3182,3 +3182,62 @@ class TreeConv(layers.Layer):
        else:
            pre_activation = out
        return self._helper.append_activation(pre_activation, act=self._act)


class Flatten(layers.Layer):
    """
    :alias_main: paddle.nn.Flatten
    :alias: paddle.nn.Flatten,paddle.nn.layer.Flatten,paddle.nn.layer.common.Flatten
    This interface is used to construct a callable object of the ``FLatten`` class.
    For more details, refer to code examples.
    It implements flatten a contiguous range of dims into a tensor.

    Equation:

    Parameters:
        start_axis(int): first dim to flatten (default = 1)
        stop_axis(int): last dim to flatten (default = -1).

    Returns:
        None

    Examples:
        .. code-block:: python

          import paddle
          from paddle.imperative import to_variable
          import numpy as np

          inp_np = np.ones([5, 2, 3, 4]).astype('float32')
          paddle.enable_imperative()
          inp_np = to_variable(inp_np)
          flatten = paddle.nn.Flatten(start_axis=1, stop_axis=2)
          flatten_res = flatten(inp_np)
    """

    def __init__(self, start_axis=1, stop_axis=-1):
        super(Flatten, self).__init__()
        self.start_axis = start_axis
        self.stop_axis = stop_axis

    def forward(self, input):
        out = self._helper.create_variable_for_type_inference(input.dtype)
        x_shape = self._helper.create_variable_for_type_inference(input.dtype)

        if in_dygraph_mode():
            dy_out, _ = core.ops.flatten_contiguous_range(
                input, 'start_axis', self.start_axis, 'stop_axis',
                self.stop_axis)
            return dy_out

        self._helper.append_op(
            type="flatten_contiguous_range",
            inputs={"X": input},
            outputs={"Out": out,
                     "XShape": x_shape},
            attrs={"start_axis": self.start_axis,
                   "stop_axis": self.stop_axis})
        return out
...
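As a quick check of what the new layer computes, flattening the contiguous axes from start_axis to stop_axis simply multiplies those dimensions together. A small numpy sketch under that assumption, reusing the [5, 2, 3, 4] input from the docstring example:

    import numpy as np

    inp = np.ones([5, 2, 3, 4]).astype('float32')
    # Flatten(start_axis=1, stop_axis=2) collapses axes 1..2: [5, 2, 3, 4] -> [5, 6, 4]
    ref = inp.reshape([5, 2 * 3, 4])
    print(ref.shape)   # (5, 6, 4)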
python/paddle/fluid/executor.py  (view file @ 862dde5e)
...
@@ -1300,6 +1300,12 @@ class Executor(object):
                          fetch_list=None,
                          fetch_info=None,
                          print_period=100):
        is_heter = 0
        if not program._fleet_opt is None:
            if program._fleet_opt.get("worker_class", "") == "HeterCpuWorker":
                is_heter = 1
            if program._fleet_opt("trainer", "") == "HeterXpuTrainer":
                is_heter = 1
        if scope is None:
            scope = global_scope()
        if fetch_list is None:
...
@@ -1308,6 +1314,11 @@ class Executor(object):
            fetch_info = []
        assert len(fetch_list) == len(fetch_info)
        compiled = isinstance(program, compiler.CompiledProgram)
        if is_heter:
            from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
            from paddle.fluid.incubate.fleet.utils.fleet_util import FleetUtil
            fu = FleetUtil()
            ret = fu.split_program_by_device(program)
        if not compiled:
            # TODO: Need a better way to distinguish and specify different execution mode
            if program._pipeline_opt:
...
@@ -1317,6 +1328,8 @@ class Executor(object):
                trainer = TrainerFactory()._create_trainer(program._fleet_opt)
                trainer._set_thread_barrier(program._is_distributed)
            trainer._set_program(program)
            if is_heter:
                trainer._set_heter_info(ret)
        else:
            if program._pipeline_opt:
                trainer = TrainerFactory()._create_trainer(
...
@@ -1476,6 +1489,60 @@ class Executor(object):
            debug, fetch_list, fetch_info, print_period, fetch_handler)

    def start_heter_trainer(self,
                            program=None,
                            scope=None,
                            debug=False,
                            fetch_list=None,
                            fetch_info=None,
                            print_period=100,
                            fetch_handler=None):
        return self._start_heter_trainer(program, scope, False, debug,
                                         fetch_list, fetch_info, print_period,
                                         fetch_handler)

    def _start_heter_trainer(self,
                             program=None,
                             scope=None,
                             is_infer=False,
                             debug=False,
                             fetch_list=None,
                             fetch_info=None,
                             print_period=100,
                             fetch_handler=None):

        scope, trainer = self._prepare_trainer(
            program=program,
            dataset=None,
            scope=scope,
            thread=1,
            debug=debug,
            fetch_list=fetch_list,
            fetch_info=fetch_info,
            print_period=print_period)

        trainer._set_infer(is_infer)
        trainer._gen_trainer_desc()

        self._dump_debug_info(program=program, trainer=trainer)

        trainer_instance = self._default_executor.init_for_dataset(
            program.desc, trainer._desc(), scope, None)

        #if fetch_handler is not None:
        #    scope0 = trainer_instance.get_worker_scope(0)
        #    fetch_monitor = FetchHandlerMonitor(scope0, fetch_handler)
        #    fetch_monitor.start()
        #    self._default_executor.run_from_dataset(trainer_instance)
        #    fetch_monitor.stop()
        #    self._default_executor.release_trainer(trainer_instance)
        #else:
        self._default_executor.run_from_dataset(trainer_instance)
        #self._default_executor.release_trainer(trainer_instance)

        return trainer_instance

    def train_from_dataset(self,
                           program=None,
                           dataset=None,
...
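The new branch is driven by the keys in program._fleet_opt and is exercised through the added start_heter_trainer entry point. A hedged sketch of how an XPU-side process might call it; the program and startup setup are assumptions, and the _fleet_opt contents come from the optimizer_factory changes later in this diff.

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())
    # assumed: the main program carries a _fleet_opt with "worker_class"/"trainer" set by the
    # distributed optimizer, which is what flips is_heter inside _run_from_dataset
    trainer_instance = exe.start_heter_trainer(program=fluid.default_main_program())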
python/paddle/fluid/incubate/fleet/base/fleet_base.py  (view file @ 862dde5e)
...
@@ -149,6 +149,16 @@ class Fleet(object):
        """
        return self._role_maker.is_server()

    def is_xpu(self):
        """
        Check whether the node is an instance of server.
        Returns:
            bool: True if this is a node of server,
                  False if not.
        """
        return self._role_maker.is_xpu()

    def split_files(self, files):
        """
        split files before distributed training,
...
python/paddle/fluid/incubate/fleet/base/role_maker.py  (view file @ 862dde5e)
...
@@ -28,6 +28,7 @@ __all__ = [
class Role:
    WORKER = 1
    SERVER = 2
    XPU = 3


class MockBarrier(object):
...
@@ -988,6 +989,147 @@ class GeneralRoleMaker(RoleMakerBase):
            http_server.stop()


class HeterRoleMaker(GeneralRoleMaker):
    """
    This role maker is for general use, you can set os.environ to customize:
        PADDLE_PSERVERS_IP_PORT_LIST : all pservers' ip:port, separated by ','
        PADDLE_TRAINER_ENDPOINTS : all trainers' ip:port, separated by ','
        TRAINING_ROLE : TRAINER or PSERVER
        PADDLE_TRAINER_ID : current trainer id (only for trainer),
                            it is index in PADDLE_TRAINER_ENDPOINTS
        PADDLE_PSERVER_ID : current pserver id (only for pserver)
                            it is index in PADDLE_PSERVERS_IP_PORT_LIST
    """

    def generate_role(self):
        """
        generate role for general role maker
        """
        if not self._role_is_generated:
            eplist = os.environ["PADDLE_PSERVERS_IP_PORT_LIST"].split(",")
            training_role = os.environ["TRAINING_ROLE"]
            worker_endpoints = os.environ["PADDLE_TRAINER_ENDPOINTS"].split(",")
            trainers_num = len(worker_endpoints)
            xpu_endpoints = os.environ["PADDLE_XPU_ENDPOINTS"].split(",")
            xpu_num = len(xpu_endpoints)
            if training_role not in ["TRAINER", "PSERVER", "XPU"]:
                raise ValueError(
                    "TRAINING_ROLE must be PSERVER or TRAINER or XPU")
            if training_role == "TRAINER":
                role = Role.WORKER
                current_id = int(os.environ["PADDLE_TRAINER_ID"])
                self._node_type = 1
                self._cur_endpoint = worker_endpoints[current_id]
                gloo = fluid.core.Gloo()
                gloo.init(current_id,
                          len(worker_endpoints),
                          self._hdfs_path.rstrip("/") + "/trainer",
                          self._hdfs_name, self._hdfs_ugi, self._iface,
                          self._prefix)
                self._node_type_comm = gloo
            elif training_role == "XPU":
                role = Role.XPU
                current_id = int(os.environ["PADDLE_XPU_ID"])
                self._node_type = 2
                self._cur_endpoint = xpu_endpoints[current_id]
                gloo = fluid.core.Gloo()
                gloo.init(current_id,
                          len(xpu_endpoints),
                          self._hdfs_path.rstrip("/") + "/xpu", self._hdfs_name,
                          self._hdfs_ugi, self._iface, self._prefix)
                self._node_type_comm = gloo
            elif training_role == "PSERVER":
                role = Role.SERVER
                if os.environ.get("PADDLE_PSERVER_ID") is not None:
                    current_id = int(os.environ["PADDLE_PSERVER_ID"])
                    cur_endpoint = eplist[current_id]
                else:
                    # this is for compatible with paddlecloud
                    cur_ip = os.environ["POD_IP"]
                    cur_port = os.environ["PADDLE_PORT"]
                    cur_endpoint = ":".join([cur_ip, cur_port])
                    current_id = eplist.index(cur_endpoint)
                self._node_type = 0
                self._cur_endpoint = cur_endpoint
                gloo = fluid.core.Gloo()
                gloo.init(current_id,
                          len(eplist),
                          self._hdfs_path.rstrip("/") + "/pserver",
                          self._hdfs_name, self._hdfs_ugi, self._iface,
                          self._prefix)
                self._node_type_comm = gloo

            if training_role == "TRAINER" or training_role == "XPU":
                gloo = fluid.core.Gloo()
                heter_list = worker_endpoints + xpu_endpoints
                gloo.init(
                    heter_list.index(self._cur_endpoint),
                    len(heter_list),
                    self._hdfs_path.rstrip("/") + "/heter", self._hdfs_name,
                    self._hdfs_ugi, self._iface, self._prefix)
                self._heter_comm = gloo

            gloo = fluid.core.Gloo()
            all_list = worker_endpoints + eplist + xpu_endpoints
            gloo.init(
                all_list.index(self._cur_endpoint),
                len(all_list),
                self._hdfs_path.rstrip("/") + "/all", self._hdfs_name,
                self._hdfs_ugi, self._iface, self._prefix)

            self._all_comm = gloo
            self._trainers_num = trainers_num
            self._server_endpoints = eplist
            self._role = role
            self._current_id = current_id
            self._rank = all_list.index(self._cur_endpoint)
            self._size = len(all_list)
            self._worker_endpoints = worker_endpoints
            self._xpu_endpoints = xpu_endpoints
            self._role_is_generated = True

    def is_xpu(self):
        """
        whether current process is server
        """
        if not self._role_is_generated:
            self.generate_role()
        return self._role == Role.XPU

    def is_first_xpu(self):
        """
        whether current process is worker of rank 0
        """
        if not self._role_is_generated:
            self.generate_role()
        return self._role == Role.XPU and self._current_id == 0

    def _barrier_xpu(self):
        """
        barrier all workers in current distributed job
        """
        if not self._role_is_generated:
            self.generate_role()
        if self.is_xpu():
            self._node_type_comm.barrier()

    def _barrier_heter(self):
        """
        barrier all workers in current distributed job
        """
        if not self._role_is_generated:
            self.generate_role()
        if self.is_xpu() or self.is_worker:
            self._heter_comm.barrier()

    def xpu_num(self):
        """
        """
        if not self._role_is_generated:
            self.generate_role()
        return len(self._xpu_endpoints)


class UserDefinedRoleMaker(RoleMakerBase):
    """
    UserDefinedRoleMaker is designed for worker and server assignment
...
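HeterRoleMaker.generate_role() reads its whole configuration from the environment. The sketch below lists the variables an XPU node would export before initialization, with placeholder endpoints; the variable names are the ones read in the code above, the values are assumptions.

    import os

    os.environ["TRAINING_ROLE"] = "XPU"
    os.environ["PADDLE_PSERVERS_IP_PORT_LIST"] = "10.0.0.1:6000,10.0.0.2:6000"  # placeholder
    os.environ["PADDLE_TRAINER_ENDPOINTS"] = "10.0.0.3:6100,10.0.0.4:6100"      # placeholder
    os.environ["PADDLE_XPU_ENDPOINTS"] = "10.0.0.5:6200"                        # placeholder
    os.environ["PADDLE_XPU_ID"] = "0"   # index into PADDLE_XPU_ENDPOINTS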
python/paddle/fluid/incubate/fleet/parameter_server/pslib/__init__.py  (view file @ 862dde5e)
...
@@ -23,6 +23,7 @@ from paddle.fluid.incubate.fleet.base.fleet_base import Fleet
from paddle.fluid.incubate.fleet.base.mode import Mode
from paddle.fluid.incubate.fleet.base.fleet_base import DistributedOptimizer
from paddle.fluid.incubate.fleet.base.role_maker import MPISymetricRoleMaker
from paddle.fluid.incubate.fleet.base.role_maker import HeterRoleMaker


class PSLib(Fleet):
...
@@ -44,6 +45,9 @@ class PSLib(Fleet):
            role_maker = MPISymetricRoleMaker()
        super(PSLib, self).init(role_maker)
        self._fleet_ptr = fluid.core.Fleet()
        self._heter_ptr = None
        if isinstance(role_maker, HeterRoleMaker):
            self._heter_ptr = fluid.core.Heter()

    def _set_client_communication_config(self, request_timeout_ms,
                                         connect_timeout_ms, max_retry):
...
@@ -77,23 +81,35 @@ class PSLib(Fleet):
                raise Exception(
                    "You should run DistributedOptimizer.minimize() first")
            # barrier_all for init_server, wait for server starts
            if isinstance(self._role_maker, HeterRoleMaker):
                if self._role_maker.is_xpu():
                    local_endpoint = self._role_maker.get_local_endpoint()
                    local_endpoint = local_endpoint.split(":")
                    self._heter_ptr.start_xpu_service(
                        str(local_endpoint[0]), int(local_endpoint[1]))
            self._role_maker._barrier_all()
            self.all_ips_ = self._role_maker._all_gather(self._local_ip)
            # worker_index * 2 is for compatible with older versions of pslib
            self._fleet_ptr.init_worker(self._dist_desc_str, self.all_ips_,
                                        self._role_maker._get_size(),
                                        self._role_maker.worker_index() * 2)
            if isinstance(self._role_maker, HeterRoleMaker):
                if self._role_maker.is_worker():
                    self._heter_ptr.set_xpu_list(self._role_maker._xpu_endpoints)
                    self._heter_ptr.create_client2xpu_connection()
            # barrier_all for init_worker
            self._role_maker._barrier_all()
            # prepare for client to client communication
            info = self._fleet_ptr.get_clients_info()
            all_info = self._role_maker._worker_gather(info[0])
            self._fleet_ptr.gather_clients(all_info)
            self._fleet_ptr.set_client2client_config(
                self._client2client_request_timeout_ms,
                self._client2client_connect_timeout_ms,
                self._client2client_max_retry)
            self._fleet_ptr.create_client2client_connection()
            if self._role_maker.is_worker():
                info = self._fleet_ptr.get_clients_info()
                all_info = self._role_maker._worker_gather(info[0])
                self._fleet_ptr.gather_clients(all_info)
                self._fleet_ptr.set_client2client_config(
                    self._client2client_request_timeout_ms,
                    self._client2client_connect_timeout_ms,
                    self._client2client_max_retry)
                self._fleet_ptr.create_client2client_connection()
            # barrier for init model
            self._role_maker._barrier_worker()
            if self._role_maker.is_first_worker():
...
@@ -144,10 +160,16 @@ class PSLib(Fleet):
            >>> fleet.init_server("/you/path/to/model", mode = 0)
        """
        mode = kwargs.get("mode", 0)
        self._role_maker._barrier_worker()
        if self._role_maker.is_first_worker():
            self._fleet_ptr.load_model(model_dir, mode)
        self._role_maker._barrier_worker()
        if isinstance(self._role_maker, HeterRoleMaker):
            self._role_maker._barrier_xpu()
            if self._role_maker.is_first_xpu():
                self._fleet_ptr.load_model(model_dir, mode)
            self._role_maker._barrier_xpu()
        else:
            self._role_maker._barrier_worker()
            if self._role_maker.is_first_worker():
                self._fleet_ptr.load_model(model_dir, mode)
            self._role_maker._barrier_worker()

    def run_server(self):
        """
...
@@ -185,6 +207,54 @@ class PSLib(Fleet):
            raise Exception(
                "You should run DistributedOptimizer.minimize() first")

    def end_pass(self, scope):
        if self._role_maker.worker_index() < self._role_maker.xpu_num():
            self._heter_ptr.end_pass(scope, self._role_maker.worker_index())
            self._heter_ptr.stop_xpu_service(self._role_maker.worker_index())

    def train_from_dataset(self,
                           executor,
                           program=None,
                           dataset=None,
                           scope=None,
                           thread=0,
                           debug=False,
                           fetch_list=None,
                           fetch_info=None,
                           print_period=100,
                           fetch_handler=None):
        """
        """
        if self._role_maker.is_worker():
            self._role_maker._barrier_heter()
        executor.train_from_dataset(program, dataset, scope, thread, debug,
                                    fetch_list, fetch_info, print_period,
                                    fetch_handler)

    def start_heter_trainer(self,
                            executor,
                            program=None,
                            scope=None,
                            debug=False,
                            fetch_list=None,
                            fetch_info=None,
                            print_period=100,
                            fetch_handler=None):
        """
        """

        trainer_instance = executor.start_heter_trainer(
            program, scope, debug, fetch_list, fetch_info, print_period,
            fetch_handler)
        if self._role_maker.is_xpu():
            print("barrier heter")
            self._role_maker._barrier_heter()
            print("barrier heter")
        executor._default_executor.release_trainer(trainer_instance)

    def stop_worker(self):
        """
        stop(): will be called after a user finishes his/her training task. Fleet instance will be
...
@@ -197,6 +267,7 @@ class PSLib(Fleet):
        self._role_maker._barrier_worker()
        if self._role_maker.is_first_worker():
            self._fleet_ptr.stop_server()
            self._heter_ptr.stop_xpu_service()
        self._role_maker._barrier_worker()
        self._role_maker._barrier_all()
        self._role_maker._finalize()
...
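Putting the new PSLib entry points together, one pass of a heterogeneous job is driven roughly as sketched below: CPU workers call train_from_dataset (which first waits on the heter barrier), XPU nodes run the added start_heter_trainer, and end_pass shuts the peer XPU service down. This is only a hedged outline inferred from the methods above; program, dataset and scope construction are omitted.

    if fleet._role_maker.is_worker():
        fleet.init_worker()
        fleet.train_from_dataset(exe, program=main_prog, dataset=dataset)
        fleet.end_pass(scope)        # only workers whose index < xpu_num() talk to an XPU peer
    elif fleet._role_maker.is_xpu():
        fleet.init_worker()          # XPU nodes start their xpu service inside init_worker
        fleet.start_heter_trainer(exe, program=heter_prog)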
python/paddle/fluid/incubate/fleet/parameter_server/pslib/optimizer_factory.py  (view file @ 862dde5e)
...
@@ -509,13 +509,15 @@ class DistributedAdam(DistributedOptimizerImplBase):
        opt_info = {}
        opt_info["program_id_to_worker"] = prog_id_to_worker
        opt_info["program_configs"] = program_configs
        opt_info["trainer"] = "DistMultiTrainer"
        opt_info["trainer"] = strategy.get("trainer", "DistMultiTrainer")
        opt_info["device_worker"] = strategy.get("device_worker", "DownpourSGD")
        opt_info["optimizer"] = "DownpourSGD"
        opt_info["fleet_desc"] = ps_param
        opt_info["worker_skipped_ops"] = worker_skipped_ops
        opt_info["use_cvm"] = strategy.get("use_cvm", False)
        opt_info["no_cvm"] = strategy.get("no_cvm", False)
        opt_info["worker_class"] = strategy.get("worker_class", "DownpourWorker")
        opt_info["stat_var_names"] = strategy.get("stat_var_names", [])
        opt_info["local_tables"] = strategy.get("local_tables", [])
        opt_info["async_tables"] = strategy.get("async_tables", [])
...
@@ -529,6 +531,7 @@ class DistributedAdam(DistributedOptimizerImplBase):
        opt_info["dump_file_num"] = strategy.get("dump_file_num", 16)
        opt_info["dump_fields_path"] = strategy.get("dump_fields_path", "")
        opt_info["dump_param"] = strategy.get("dump_param", [])
        opt_info["worker_places"] = strategy.get("worker_places", [])
        if server._server.downpour_server_param.downpour_table_param[
                0].accessor.accessor_class in [
                    "DownpourCtrAccessor", "DownpourCtrDoubleAccessor",
...
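These strategy.get(...) calls are what connect a user-supplied strategy dict to the heter trainer and worker selection seen in executor.py. A hedged example of the keys a heter job might pass; the key names are the ones read above, the values and the adam optimizer are assumptions.

    strategy = {
        "trainer": "HeterXpuTrainer",
        "worker_class": "HeterCpuWorker",
        "device_worker": "DownpourSGD",
        "worker_places": [0],            # assumed device ids for the heter side
        "use_cvm": True,
        "stat_var_names": [],
    }
    optimizer = fleet.distributed_optimizer(adam, strategy)   # adam: an existing fluid optimizer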
python/paddle/fluid/incubate/fleet/utils/fleet_util.py  (view file @ 862dde5e)
...
@@ -14,6 +14,7 @@
"""Fleet Utils."""

import collections
import copy
import json
import logging
import math
...
@@ -1615,3 +1616,123 @@ class FleetUtil(object):
        """
        program = utils.load_program(prog_path, is_text)
        utils.parse_program(program, output_dir)

    def split_program_by_device(self, program):
        ops_list = []
        type_list = []
        pre = None
        type_cpu = "cpu"
        for op in program.global_block().ops:
            if op.has_attr("op_device"):
                if pre is None or pre != op.attr("op_device"):
                    ops_list.append([])
                    type_list.append(
                        op.attr("op_device")
                        if op.attr("op_device") != "" else type_cpu)
                ops_list[-1].append(op)
                pre = op.attr("op_device")
        l = len(type_list)
        i = 0
        type_heter = None
        while i < l:
            while i < l and type_list[i] == type_cpu:
                i += 1
            if i == l:
                break

            type_heter = type_list[i]
            i += 1
            start = i
            valid = True
            while i < l and type_list[i] != type_heter:
                if type_list[i] != type_cpu:
                    valid = False
                    break
                i += 1

            if i == l:
                break
            elif not valid:
                continue

            for j in range(start, i):
                for op in ops_list[j]:
                    op._set_attr("op_device", type_heter)
                type_list[j] = type_heter
                j += 1

        pre = None
        merged_ops_list = []
        merged_type_list = []
        for i in range(l):
            if pre is None or pre != type_list[i]:
                merged_ops_list.append([])
                merged_type_list.append(type_list[i])
            merged_ops_list[-1].extend(ops_list[i])
            pre = type_list[i]

        data_vars = set()
        for k in program.global_block().vars:
            var = program.global_block().var(k)
            if not var.persistable:
                data_vars.add(var.name)

        l = len(merged_ops_list)
        inputs_pre = set()
        outputs_pre = set()
        in_from_pre = [[] for i in range(l)]
        for i in range(l):
            inputs = set()
            outputs = set()
            for op in merged_ops_list[i]:
                for input in op.input_names:
                    for tmp in op.input(input):
                        if tmp not in outputs:
                            inputs.add(tmp)
                for output in op.output_names:
                    for tmp in op.output(output):
                        outputs.add(tmp)
            if i == 0:
                in_from_pre[i] = []
            elif i == 1:
                in_from_pre[i] = (outputs_pre | data_vars) & inputs
            else:
                in_from_pre[i] = outputs_pre & inputs
            inputs_pre = copy.deepcopy(inputs)
            outputs_pre = copy.deepcopy(outputs)

        l = len(in_from_pre)
        start_list = []
        end_list = []
        send_list = [[] for i in range(l)]
        sum = 0
        program_list = []
        for i in range(l):
            start_list.append(sum)
            end_list.append(sum + len(merged_ops_list[i]) - 1)
            sum += len(merged_ops_list[i])
            if i < l - 1:
                send_list[i].extend(list(in_from_pre[i + 1]))
            prog = program.clone()
            if merged_type_list[i] != type_cpu:
                prog = prog._prune_with_input(
                    list(in_from_pre[i]), list(send_list[i]))
                program_list.append(prog)
            else:
                program_list.append(prog)
        recv_list = [list(i) for i in in_from_pre]
        found = False
        heter_index = None
        for i in range(len(merged_type_list)):
            t = merged_type_list[i]
            if t != type_cpu:
                if found:
                    print("only one region of program can be heter")
                found = True
                heter_index = i
        if heter_index is None:
            print("warning: non heter program")
            return None
        else:
            return [start_list[heter_index], end_list[heter_index], send_list[heter_index], \
                    recv_list[heter_index], program_list[heter_index]]
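split_program_by_device returns a five-element list describing the single non-CPU region it found, or None when the program has no heter region. A small consumption sketch; the names on the left are mine, the ordering follows the return statement above.

    ret = fleet_util.split_program_by_device(main_prog)
    if ret is not None:
        heter_start_op, heter_end_op, send_vars, recv_vars, heter_prog = ret
        # heter_start_op / heter_end_op: op-index range of the heter region
        # send_vars: variables the region must pass on to the following CPU section
        # recv_vars: variables the region receives from the preceding section
        # heter_prog: the program pruned to the heter region's inputs/outputs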
python/paddle/fluid/layers/detection.py  (view file @ 862dde5e)
...
@@ -471,9 +471,9 @@ def rpn_target_assign(bbox_pred,

def sigmoid_focal_loss(x, label, fg_num, gamma=2.0, alpha=0.25):
    """
    :alias_main: paddle.nn.functional.sigmoid_focal_loss
    :alias: paddle.nn.functional.sigmoid_focal_loss,paddle.nn.functional.loss.sigmoid_focal_loss
    :old_api: paddle.fluid.layers.sigmoid_focal_loss
    :alias_main: paddle.nn.functional.sigmoid_focal_loss
    :alias: paddle.nn.functional.sigmoid_focal_loss,paddle.nn.functional.loss.sigmoid_focal_loss
    :old_api: paddle.fluid.layers.sigmoid_focal_loss

    **Sigmoid Focal Loss Operator.**
...
@@ -628,9 +628,9 @@ def detection_output(loc,
                     nms_eta=1.0,
                     return_index=False):
    """
    :alias_main: paddle.nn.functional.detection_output
    :alias: paddle.nn.functional.detection_output,paddle.nn.functional.vision.detection_output
    :old_api: paddle.fluid.layers.detection_output
    :alias_main: paddle.nn.functional.detection_output
    :alias: paddle.nn.functional.detection_output,paddle.nn.functional.vision.detection_output
    :old_api: paddle.fluid.layers.detection_output

    Given the regression locations, classification confidences and prior boxes,
    calculate the detection outputs by performing following steps:
...
@@ -761,9 +761,9 @@ def detection_output(loc,
@templatedoc()
def iou_similarity(x, y, box_normalized=True, name=None):
    """
    :alias_main: paddle.nn.functional.iou_similarity
    :alias: paddle.nn.functional.iou_similarity,paddle.nn.functional.loss.iou_similarity
    :old_api: paddle.fluid.layers.iou_similarity
    :alias_main: paddle.nn.functional.iou_similarity
    :alias: paddle.nn.functional.iou_similarity,paddle.nn.functional.loss.iou_similarity
    :old_api: paddle.fluid.layers.iou_similarity

    ${comment}
...
@@ -821,9 +821,9 @@ def box_coder(prior_box,
              name=None,
              axis=0):
    """
    :alias_main: paddle.nn.functional.box_coder
    :alias: paddle.nn.functional.box_coder,paddle.nn.functional.vision.box_coder
    :old_api: paddle.fluid.layers.box_coder
    :alias_main: paddle.nn.functional.box_coder
    :alias: paddle.nn.functional.box_coder,paddle.nn.functional.vision.box_coder
    :old_api: paddle.fluid.layers.box_coder

    **Box Coder Layer**
...
@@ -1012,9 +1012,9 @@ def yolov3_loss(x,
                name=None,
                scale_x_y=1.):
    """
    :alias_main: paddle.nn.functional.yolov3_loss
    :alias: paddle.nn.functional.yolov3_loss,paddle.nn.functional.vision.yolov3_loss
    :old_api: paddle.fluid.layers.yolov3_loss
    :alias_main: paddle.nn.functional.yolov3_loss
    :alias: paddle.nn.functional.yolov3_loss,paddle.nn.functional.vision.yolov3_loss
    :old_api: paddle.fluid.layers.yolov3_loss

    ${comment}
...
@@ -1139,9 +1139,9 @@ def yolo_box(x,
             name=None,
             scale_x_y=1.):
    """
    :alias_main: paddle.nn.functional.yolo_box
    :alias: paddle.nn.functional.yolo_box,paddle.nn.functional.vision.yolo_box
    :old_api: paddle.fluid.layers.yolo_box
    :alias_main: paddle.nn.functional.yolo_box
    :alias: paddle.nn.functional.yolo_box,paddle.nn.functional.vision.yolo_box
    :old_api: paddle.fluid.layers.yolo_box

    ${comment}
...
@@ -1318,9 +1318,9 @@ def bipartite_match(dist_matrix,
                    dist_threshold=None,
                    name=None):
    """
    :alias_main: paddle.nn.functional.bipartite_match
    :alias: paddle.nn.functional.bipartite_match,paddle.nn.functional.vision.bipartite_match
    :old_api: paddle.fluid.layers.bipartite_match
    :alias_main: paddle.nn.functional.bipartite_match
    :alias: paddle.nn.functional.bipartite_match,paddle.nn.functional.vision.bipartite_match
    :old_api: paddle.fluid.layers.bipartite_match

    This operator implements a greedy bipartite matching algorithm, which is
    used to obtain the matching with the maximum distance based on the input
...
@@ -1412,9 +1412,9 @@ def target_assign(input,
                  mismatch_value=None,
                  name=None):
    """
    :alias_main: paddle.nn.functional.target_assign
    :alias: paddle.nn.functional.target_assign,paddle.nn.functional.extension.target_assign
    :old_api: paddle.fluid.layers.target_assign
    :alias_main: paddle.nn.functional.target_assign
    :alias: paddle.nn.functional.target_assign,paddle.nn.functional.extension.target_assign
    :old_api: paddle.fluid.layers.target_assign

    This operator can be, for given the target bounding boxes or labels,
    to assign classification and regression targets to each prediction as well as
...
@@ -1530,9 +1530,9 @@ def ssd_loss(location,
             normalize=True,
             sample_size=None):
    """
    :alias_main: paddle.nn.functional.ssd_loss
    :alias: paddle.nn.functional.ssd_loss,paddle.nn.functional.loss.ssd_loss
    :old_api: paddle.fluid.layers.ssd_loss
    :alias_main: paddle.nn.functional.ssd_loss
    :alias: paddle.nn.functional.ssd_loss,paddle.nn.functional.loss.ssd_loss
    :old_api: paddle.fluid.layers.ssd_loss

    **Multi-box loss layer for object detection algorithm of SSD**
...
@@ -1777,9 +1777,9 @@ def prior_box(input,
              name=None,
              min_max_aspect_ratios_order=False):
    """
    :alias_main: paddle.nn.functional.prior_box
    :alias: paddle.nn.functional.prior_box,paddle.nn.functional.vision.prior_box
    :old_api: paddle.fluid.layers.prior_box
    :alias_main: paddle.nn.functional.prior_box
    :alias: paddle.nn.functional.prior_box,paddle.nn.functional.vision.prior_box
    :old_api: paddle.fluid.layers.prior_box

    This op generates prior boxes for SSD(Single Shot MultiBox Detector) algorithm.
    Each position of the input produce N prior boxes, N is determined by
...
@@ -1938,9 +1938,9 @@ def density_prior_box(input,
                      flatten_to_2d=False,
                      name=None):
    """
    :alias_main: paddle.nn.functional.density_prior_box
    :alias: paddle.nn.functional.density_prior_box,paddle.nn.functional.vision.density_prior_box
    :old_api: paddle.fluid.layers.density_prior_box
    :alias_main: paddle.nn.functional.density_prior_box
    :alias: paddle.nn.functional.density_prior_box,paddle.nn.functional.vision.density_prior_box
    :old_api: paddle.fluid.layers.density_prior_box

    This op generates density prior boxes for SSD(Single Shot MultiBox Detector)
...
@@ -2130,7 +2130,7 @@ def multi_box_head(inputs,
                   name=None,
                   min_max_aspect_ratios_order=False):
    """
    :api_attr: Static Graph
    :api_attr: Static Graph

    Base on SSD ((Single Shot MultiBox Detector) algorithm, generate prior boxes,
    regression location and classification confidence on multiple input feature
...
@@ -2407,9 +2407,9 @@ def anchor_generator(input,
                     offset=0.5,
                     name=None):
    """
    :alias_main: paddle.nn.functional.anchor_generator
    :alias: paddle.nn.functional.anchor_generator,paddle.nn.functional.vision.anchor_generator
    :old_api: paddle.fluid.layers.anchor_generator
    :alias_main: paddle.nn.functional.anchor_generator
    :alias: paddle.nn.functional.anchor_generator,paddle.nn.functional.vision.anchor_generator
    :old_api: paddle.fluid.layers.anchor_generator

    **Anchor generator operator**
...
@@ -2612,9 +2612,9 @@ def generate_proposal_labels(rpn_rois,
                             is_cls_agnostic=False,
                             is_cascade_rcnn=False):
    """
    :alias_main: paddle.nn.functional.generate_proposal_labels
    :alias: paddle.nn.functional.generate_proposal_labels,paddle.nn.functional.vision.generate_proposal_labels
    :old_api: paddle.fluid.layers.generate_proposal_labels
    :alias_main: paddle.nn.functional.generate_proposal_labels
    :alias: paddle.nn.functional.generate_proposal_labels,paddle.nn.functional.vision.generate_proposal_labels
    :old_api: paddle.fluid.layers.generate_proposal_labels

    **Generate Proposal Labels of Faster-RCNN**
...
@@ -2737,9 +2737,9 @@ def generate_proposal_labels(rpn_rois,
def generate_mask_labels(im_info, gt_classes, is_crowd, gt_segms, rois,
                         labels_int32, num_classes, resolution):
    """
    :alias_main: paddle.nn.functional.generate_mask_labels
    :alias: paddle.nn.functional.generate_mask_labels,paddle.nn.functional.vision.generate_mask_labels
    :old_api: paddle.fluid.layers.generate_mask_labels
    :alias_main: paddle.nn.functional.generate_mask_labels
    :alias: paddle.nn.functional.generate_mask_labels,paddle.nn.functional.vision.generate_mask_labels
    :old_api: paddle.fluid.layers.generate_mask_labels

    **Generate Mask Labels for Mask-RCNN**
...
@@ -2896,9 +2896,9 @@ def generate_proposals(scores,
                       name=None,
                       return_rois_num=False):
    """
    :alias_main: paddle.nn.functional.generate_proposals
    :alias: paddle.nn.functional.generate_proposals,paddle.nn.functional.vision.generate_proposals
    :old_api: paddle.fluid.layers.generate_proposals
    :alias_main: paddle.nn.functional.generate_proposals
    :alias: paddle.nn.functional.generate_proposals,paddle.nn.functional.vision.generate_proposals
    :old_api: paddle.fluid.layers.generate_proposals

    **Generate proposal Faster-RCNN**
...
python/paddle/fluid/layers/device.py  (view file @ 862dde5e)
...
@@ -20,12 +20,12 @@ from __future__ import print_function

from .layer_function_generator import autodoc
from ..framework import unique_name
from ..layer_helper import LayerHelper
from ..annotations import deprecated
from paddle.utils import deprecated

__all__ = []


@deprecated(since='0.15.0', instead="ParallelExecutor")
@deprecated(since='0.15.0', update_to="paddle.fluid.ParallelExecutor")
@autodoc()
def get_places(device_count=None, device_type=None):
    helper = LayerHelper('get_places', **locals())
...
python/paddle/fluid/layers/nn.py  (view file @ 862dde5e)
...
@@ -37,6 +37,7 @@ from functools import reduce
from .. import core
from ..data_feeder import convert_dtype, check_variable_and_dtype, check_type, check_dtype
import paddle
from paddle.utils import deprecated

__all__ = [
    'fc',
...
@@ -9910,7 +9911,7 @@ def flatten(x, axis=1, name=None):
    return out


def stack(x, axis=0):
def stack(x, axis=0, name=None):
    """
    This OP stacks all the inputs :code:`x` along axis.
...
@@ -9990,15 +9991,16 @@ def stack(x, axis=0):
        data = layers.stack(x1) # stack according to axis 0, data.shape=[1, None, 1, 2]

    """
    helper = LayerHelper('stack', **locals())
    axis = 0 if axis is None else axis
    if not isinstance(x, list) and not isinstance(x, tuple):
        x = [x]
    if in_dygraph_mode():
        return core.ops.stack(x, 'axis', axis)

    helper = LayerHelper('stack', **locals())
    out = helper.create_variable_for_type_inference(x[0].dtype)
    if not in_dygraph_mode() and \
        x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
    if x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
        assert len(x) == 1, "If the elements of 'x' in stack are Variable(LoDTensorArray), " \
            "number of the elements must be 1, but received %s." % len(x)
        out_index = helper.create_variable_for_type_inference(dtype="int32")
...
@@ -11614,6 +11616,7 @@ Examples:
    return _elementwise_op(LayerHelper('elementwise_sub', **locals()))


@deprecated(since="2.0.0", update_to="paddle.multiply")
def elementwise_mul(x, y, axis=-1, act=None, name=None):
    """
    :alias_main: paddle.elementwise_mul
...
python/paddle/fluid/reader.py  (view file @ 862dde5e)
...
@@ -28,7 +28,6 @@ from .layers.io import monkey_patch_reader_methods, _copy_reader_var_, double_bu
from .unique_name import UniqueNameGenerator
import logging
import warnings
from .dataset import DatasetBase, InMemoryDataset

### Dygraph DataLoader configs ###
import os
...
@@ -1670,7 +1669,7 @@ class PyReader(DataLoaderBase):

class DatasetLoader(DataLoaderBase):
    def __init__(self, dataset, places, drop_last):
        assert isinstance(dataset,
        assert isinstance(dataset, paddle.fleet.dataset.
                          DatasetBase), "dataset must be type of DatasetBase"
        assert not in_dygraph_mode(
        ), "DatasetLoader is not supported in dygraph mode yet"
...
@@ -1686,7 +1685,7 @@ class DatasetLoader(DataLoaderBase):
            dataset.set_thread(thread_num)

        if isinstance(dataset,
        if isinstance(dataset, paddle.fleet.dataset.
                      InMemoryDataset) and dataset.queue_num > thread_num:
            logging.warn("queue_num {} which is set in Dataset is ignored".
                         format(dataset.queue_num))
...
python/paddle/fluid/tests/unittests/dist_fleet_ctr.py  (view file @ 862dde5e)
...
@@ -210,7 +210,7 @@ class TestDistCTR2x2(FleetDistRunnerBase):
            filelist.append(train_file_path)

        # config dataset
        dataset = fluid.DatasetFactory().create_dataset()
        dataset = paddle.fleet.DatasetFactory().create_dataset()
        dataset.set_batch_size(batch_size)
        dataset.set_use_var(self.feeds)

        pipe_command = 'python ctr_dataset_reader.py'
...
python/paddle/fluid/tests/unittests/mkldnn/test_fusion_gru_mkldnn_op.py  (new file, 0 → 100644; view file @ 862dde5e)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import unittest
import numpy as np
from paddle.fluid.tests.unittests.test_fusion_gru_op import TestFusionGRUOp


class TestFusionGRUMKLDNNOp(TestFusionGRUOp):
    def set_confs(self):
        self.use_mkldnn = True


class TestFusionGRUMKLDNNOpNoInitial(TestFusionGRUOp):
    def set_confs(self):
        self.with_h0 = False
        self.use_mkldnn = True


class TestFusionGRUMKLDNNOpNoBias(TestFusionGRUOp):
    def set_confs(self):
        self.with_bias = False
        self.use_mkldnn = True


class TestFusionGRUMKLDNNOpReverse(TestFusionGRUOp):
    def set_confs(self):
        self.is_reverse = True
        self.use_mkldnn = True


class TestFusionGRUMKLDNNOpOriginMode(TestFusionGRUOp):
    def set_confs(self):
        self.origin_mode = True
        self.use_mkldnn = True


class TestFusionGRUMKLDNNOpMD1(TestFusionGRUOp):
    def set_confs(self):
        self.M = 36
        self.D = 8
        self.use_mkldnn = True


class TestFusionGRUMKLDNNOpMD2(TestFusionGRUOp):
    def set_confs(self):
        self.M = 8
        self.D = 8
        self.use_mkldnn = True


class TestFusionGRUMKLDNNOpMD3(TestFusionGRUOp):
    def set_confs(self):
        self.M = 17
        self.D = 15
        self.use_mkldnn = True


class TestFusionGRUMKLDNNOpBS1(TestFusionGRUOp):
    def set_confs(self):
        self.lod = [[3]]
        self.D = 16
        self.use_mkldnn = True


if __name__ == "__main__":
    unittest.main()
python/paddle/fluid/tests/unittests/test_dataset.py  (view file @ 862dde5e)
...
...
@@ -17,6 +17,7 @@ including create, config, run, etc.
"""
from
__future__
import
print_function
import
paddle
import
paddle.fluid
as
fluid
import
paddle.compat
as
cpt
import
paddle.fluid.core
as
core
...
...
@@ -37,23 +38,26 @@ class TestDataset(unittest.TestCase):
def
test_dataset_create
(
self
):
""" Testcase for dataset create. """
try
:
dataset
=
fluid
.
DatasetFactory
().
create_dataset
(
"InMemoryDataset"
)
dataset
=
paddle
.
fleet
.
DatasetFactory
().
create_dataset
(
"InMemoryDataset"
)
except
:
self
.
assertTrue
(
False
)
try
:
dataset
=
fluid
.
DatasetFactory
().
create_dataset
(
"QueueDataset"
)
dataset
=
paddle
.
fleet
.
DatasetFactory
().
create_dataset
(
"QueueDataset"
)
except
:
self
.
assertTrue
(
False
)
try
:
dataset
=
fluid
.
DatasetFactory
().
create_dataset
(
dataset
=
paddle
.
fleet
.
DatasetFactory
().
create_dataset
(
"FileInstantDataset"
)
except
:
self
.
assertTrue
(
False
)
try
:
dataset
=
fluid
.
DatasetFactory
().
create_dataset
(
"MyOwnDataset"
)
dataset
=
paddle
.
fleet
.
DatasetFactory
().
create_dataset
(
"MyOwnDataset"
)
self
.
assertTrue
(
False
)
except
:
self
.
assertTrue
(
True
)
...
...
@@ -91,7 +95,8 @@ class TestDataset(unittest.TestCase):
name
=
slot
,
shape
=
[
1
],
dtype
=
"int64"
,
lod_level
=
1
)
slots_vars
.
append
(
var
)
dataset
=
fluid
.
DatasetFactory
().
create_dataset
(
"InMemoryDataset"
)
dataset
=
paddle
.
fleet
.
DatasetFactory
().
create_dataset
(
"InMemoryDataset"
)
dataset
.
set_batch_size
(
32
)
dataset
.
set_thread
(
3
)
dataset
.
set_filelist
(
...
...
@@ -125,7 +130,7 @@ class TestDataset(unittest.TestCase):
dataset
.
set_trainer_num
(
4
)
dataset
.
set_hdfs_config
(
"my_fs_name"
,
"my_fs_ugi"
)
dataset
.
set_download_cmd
(
"./read_from_afs my_fs_name my_fs_ugi"
)
dataset
.
enable_pv_merge
(
)
dataset
.
set_enable_pv_merge
(
False
)
thread_num
=
dataset
.
get_thread_num
()
self
.
assertEqual
(
thread_num
,
12
)
...
...
@@ -171,7 +176,8 @@ class TestDataset(unittest.TestCase):
name
=
slot
,
shape
=
[
1
],
dtype
=
"int64"
,
lod_level
=
1
)
slots_vars
.
append
(
var
)
dataset
=
fluid
.
DatasetFactory
().
create_dataset
(
"InMemoryDataset"
)
dataset
=
paddle
.
fleet
.
DatasetFactory
().
create_dataset
(
"InMemoryDataset"
)
dataset
.
set_batch_size
(
32
)
dataset
.
set_thread
(
3
)
dataset
.
set_filelist
([
filename1
,
filename2
])
...
...
@@ -222,7 +228,8 @@ class TestDataset(unittest.TestCase):
name
=
slot
,
shape
=
[
1
],
dtype
=
"int64"
,
lod_level
=
1
)
slots_vars
.
append
(
var
)
dataset
=
fluid
.
DatasetFactory
().
create_dataset
(
"InMemoryDataset"
)
dataset
=
paddle
.
fleet
.
DatasetFactory
().
create_dataset
(
"InMemoryDataset"
)
dataset
.
set_batch_size
(
32
)
dataset
.
set_thread
(
3
)
dataset
.
set_filelist
([
...
...
@@ -293,7 +300,8 @@ class TestDataset(unittest.TestCase):
name
=
slot
,
shape
=
[
1
],
dtype
=
"float32"
,
lod_level
=
1
)
slots_vars
.
append
(
var
)
dataset
=
fluid
.
DatasetFactory
().
create_dataset
(
"InMemoryDataset"
)
dataset
=
paddle
.
fleet
.
DatasetFactory
().
create_dataset
(
"InMemoryDataset"
)
dataset
.
set_batch_size
(
32
)
dataset
.
set_thread
(
1
)
dataset
.
set_parse_ins_id
(
True
)
...
...
@@ -359,7 +367,8 @@ class TestDataset(unittest.TestCase):
name
=
"slot4"
,
shape
=
[
1
],
dtype
=
"float32"
,
lod_level
=
0
)
slots_vars
=
[
var1
,
var2
,
var3
,
var4
]
dataset
=
fluid
.
DatasetFactory
().
create_dataset
(
"InMemoryDataset"
)
dataset
=
paddle
.
fleet
.
DatasetFactory
().
create_dataset
(
"InMemoryDataset"
)
dataset
.
set_batch_size
(
32
)
dataset
.
set_thread
(
1
)
dataset
.
set_parse_ins_id
(
True
)
...
...
@@ -414,7 +423,8 @@ class TestDataset(unittest.TestCase):
name
=
slot
,
shape
=
[
1
],
dtype
=
"float32"
,
lod_level
=
1
)
slots_vars
.
append
(
var
)
dataset
=
fluid
.
DatasetFactory
().
create_dataset
(
"InMemoryDataset"
)
dataset
=
paddle
.
fleet
.
DatasetFactory
().
create_dataset
(
"InMemoryDataset"
)
dataset
.
set_batch_size
(
32
)
dataset
.
set_thread
(
3
)
dataset
.
set_filelist
([
...
...
@@ -507,7 +517,7 @@ class TestDataset(unittest.TestCase):
                    name=slot, shape=[1], dtype="int64", lod_level=1)
                slots_vars.append(var)
-       dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
+       dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
        dataset.set_batch_size(32)
        dataset.set_thread(3)
        dataset.set_filelist(
...
...
@@ -532,7 +542,7 @@ class TestDataset(unittest.TestCase):
        except Exception as e:
            self.assertTrue(False)

-       dataset2 = fluid.DatasetFactory().create_dataset("QueueDataset")
+       dataset2 = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
        dataset2.set_use_var(slots_vars)
        dataset2.set_batch_size(32)
        dataset2.set_thread(3)
...
...
@@ -573,7 +583,7 @@ class TestDataset(unittest.TestCase):
                    name=slot, shape=[1], dtype="float32", lod_level=1)
                slots_vars.append(var)
-       dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
+       dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
        dataset.set_batch_size(32)
        dataset.set_thread(3)
        dataset.set_filelist(
...
...
@@ -628,7 +638,8 @@ class TestDataset(unittest.TestCase):
                    name=slot, shape=[None, 1], dtype="int64", lod_level=1)
                slots_vars.append(var)
-       dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+       dataset = paddle.fleet.DatasetFactory().create_dataset("InMemoryDataset")
        dataset.set_input_type(1)
        dataset.set_batch_size(1)
        dataset.set_thread(2)
...
...
@@ -707,7 +718,7 @@ class TestDatasetWithFetchHandler(unittest.TestCase):
            inputs(list): inputs of get_dataset
            files(list): files of get_dataset
        """
-       dataset = fluid.DatasetFactory().create_dataset("QueueDataset")
+       dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
        dataset.set_batch_size(32)
        dataset.set_thread(3)
        dataset.set_filelist(files)
...
...
@@ -864,7 +875,8 @@ class TestDataset2(unittest.TestCase):
        except ImportError as e:
            print("warning: no mpi4py")
        exe.run(startup_program)
-       dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+       dataset = paddle.fleet.DatasetFactory().create_dataset("InMemoryDataset")
        dataset.set_batch_size(32)
        dataset.set_thread(3)
        dataset.set_filelist([
...
...
@@ -884,9 +896,6 @@ class TestDataset2(unittest.TestCase):
        """
        Testcase for InMemoryDataset from create to run.
        """
-       self.skipTest("parameter server will add pslib UT later")
        with open("test_in_memory_dataset2_run2_a.txt", "w") as f:
            data = "1 1 2 3 3 4 5 5 5 5 1 1\n"
            data += "1 2 2 3 4 4 6 6 6 6 1 2\n"
...
...
@@ -902,7 +911,7 @@ class TestDataset2(unittest.TestCase):
        train_program = fluid.Program()
        startup_program = fluid.Program()
        scope = fluid.Scope()
-       from paddle.fluid.incubate.fleet.parameter_server.distribute_transpiler import fleet
+       from paddle.fluid.incubate.fleet.parameter_server.pslib import fleet
        with fluid.program_guard(train_program, startup_program):
            slots = ["slot1_ff", "slot2_ff", "slot3_ff", "slot4_ff"]
            slots_vars = []
...
...
@@ -936,7 +945,8 @@ class TestDataset2(unittest.TestCase):
        except ImportError as e:
            print("warning: no mpi4py")
        exe.run(startup_program)
-       dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+       dataset = paddle.fleet.DatasetFactory().create_dataset("InMemoryDataset")
        dataset.set_batch_size(32)
        dataset.set_thread(3)
        dataset.set_filelist([
...
...
@@ -952,6 +962,63 @@ class TestDataset2(unittest.TestCase):
            print("warning: catch expected error")
        fleet._opt_info = None
        fleet._fleet_ptr = None
        dataset = paddle.fleet.DatasetFactory().create_dataset("InMemoryDataset")
        dataset.set_rank_offset("")
        dataset.set_pv_batch_size(1)
        dataset.set_hdfs_config("", "")
        d = paddle.fleet.DatasetBase()
        try:
            dataset.set_feed_type("MultiSlotInMemoryDataFeed")
        except:
            print("warning: catch expected error")
        dataset.thread_num = 0
        try:
            dataset._prepare_to_run()
        except:
            print("warning: catch expected error")
        dataset.set_parse_logkey(True)
        dataset.set_merge_by_sid(True)
        dataset.set_enable_pv_merge(True)
        try:
            dataset.preprocess_instance()
        except:
            print("warning: catch expected error")
        try:
            dataset.set_current_phase(1)
        except:
            print("warning: catch expected error")
        try:
            dataset.postprocess_instance()
        except:
            print("warning: catch expected error")
        dataset.set_fleet_send_batch_size(1024)
        try:
            dataset.global_shuffle()
        except:
            print("warning: catch expected error")
        dataset.get_pv_data_size()
        dataset.get_memory_data_size()
        dataset.get_shuffle_data_size()
        dataset = paddle.fleet.DatasetFactory().create_dataset("QueueDataset")
        try:
            dataset.local_shuffle()
        except:
            print("warning: catch expected error")
        try:
            dataset.global_shuffle()
        except:
            print("warning: catch expected error")
        dataset = paddle.fleet.FileInstantDataset()
        try:
            dataset.local_shuffle()
        except:
            print("warning: catch expected error")
        try:
            dataset.global_shuffle()
        except:
            print("warning: catch expected error")
        os.remove("./test_in_memory_dataset2_run2_a.txt")
        os.remove("./test_in_memory_dataset2_run2_b.txt")
...
...
python/paddle/fluid/tests/unittests/test_dataset_dataloader.py
...
...
@@ -12,6 +12,7 @@
# See the License for the specific language governing permissions and
# limitations under the License.
+import paddle
import paddle.fluid as fluid
import numpy as np
import six
...
...
@@ -96,7 +97,8 @@ class DatasetLoaderTestBase(unittest.TestCase):
    def check_batch_number(self, place, randomize_batch_num=False):
        main_prog, startup_prog, feeds = self.build_network()
-       dataset = fluid.DatasetFactory().create_dataset(self.dataset_name)
+       dataset = paddle.fleet.DatasetFactory().create_dataset(self.dataset_name)
        dataset.set_batch_size(BATCH_SIZE)

        if isinstance(place, fluid.CPUPlace):
...
...
python/paddle/fluid/tests/unittests/test_flatten_contiguous_range_op.py
0 → 100644
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from __future__ import print_function

import unittest
import numpy as np
import paddle.fluid as fluid
import paddle
from op_test import OpTest


class TestFlattenOp(OpTest):
    def setUp(self):
        self.op_type = "flatten_contiguous_range"
        self.start_axis = 0
        self.stop_axis = -1
        self.init_test_case()
        self.inputs = {"X": np.random.random(self.in_shape).astype("float64")}
        self.init_attrs()
        self.outputs = {
            "Out": self.inputs["X"].reshape(self.new_shape),
            "XShape": np.random.random(self.in_shape).astype("float32")
        }

    def test_check_output(self):
        self.check_output(no_check_set=["XShape"])

    def test_check_grad(self):
        self.check_grad(["X"], "Out")

    def init_test_case(self):
        self.in_shape = (3, 2, 5, 4)
        self.start_axis = 0
        self.stop_axis = -1
        self.new_shape = (120)

    def init_attrs(self):
        self.attrs = {
            "start_axis": self.start_axis,
            "stop_axis": self.stop_axis
        }


class TestFlattenOp_1(TestFlattenOp):
    def init_test_case(self):
        self.in_shape = (3, 2, 5, 4)
        self.start_axis = 1
        self.stop_axis = 2
        self.new_shape = (3, 10, 4)

    def init_attrs(self):
        self.attrs = {
            "start_axis": self.start_axis,
            "stop_axis": self.stop_axis
        }


class TestFlattenOp_2(TestFlattenOp):
    def init_test_case(self):
        self.in_shape = (3, 2, 5, 4)
        self.start_axis = 0
        self.stop_axis = 1
        self.new_shape = (6, 5, 4)

    def init_attrs(self):
        self.attrs = {
            "start_axis": self.start_axis,
            "stop_axis": self.stop_axis
        }


class TestFlattenOp_3(TestFlattenOp):
    def init_test_case(self):
        self.in_shape = (3, 2, 5, 4)
        self.start_axis = 0
        self.stop_axis = 2
        self.new_shape = (30, 4)

    def init_attrs(self):
        self.attrs = {
            "start_axis": self.start_axis,
            "stop_axis": self.stop_axis
        }


class TestFlattenOp_4(TestFlattenOp):
    def init_test_case(self):
        self.in_shape = (3, 2, 5, 4)
        self.start_axis = -2
        self.stop_axis = -1
        self.new_shape = (3, 2, 20)

    def init_attrs(self):
        self.attrs = {
            "start_axis": self.start_axis,
            "stop_axis": self.stop_axis
        }


class TestFlattenOp_5(TestFlattenOp):
    def init_test_case(self):
        self.in_shape = (3, 2, 5, 4)
        self.start_axis = 2
        self.stop_axis = 2
        self.new_shape = (3, 2, 5, 4)

    def init_attrs(self):
        self.attrs = {
            "start_axis": self.start_axis,
            "stop_axis": self.stop_axis
        }


class TestFlattenOpSixDims(TestFlattenOp):
    def init_test_case(self):
        self.in_shape = (3, 2, 3, 2, 4, 4)
        self.start_axis = 3
        self.stop_axis = 5
        self.new_shape = (3, 2, 3, 32)

    def init_attrs(self):
        self.attrs = {
            "start_axis": self.start_axis,
            "stop_axis": self.stop_axis
        }


class TestFlatten2OpError(unittest.TestCase):
    def test_errors(self):
        image_shape = (2, 3, 4, 4)
        x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
                      image_shape[3]).reshape(image_shape) / 100.
        x = x.astype('float32')

        def test_ValueError1():
            x_var = paddle.nn.data(name="x", shape=image_shape, dtype='float32')
            out = paddle.flatten(x_var, start_axis=2, stop_axis=1)

        self.assertRaises(ValueError, test_ValueError1)

        def test_ValueError2():
            x_var = paddle.nn.data(name="x", shape=image_shape, dtype='float32')
            paddle.flatten(x_var, start_axis=10, stop_axis=1)

        self.assertRaises(ValueError, test_ValueError2)

        def test_ValueError3():
            x_var = paddle.nn.data(name="x", shape=image_shape, dtype='float32')
            paddle.flatten(x_var, start_axis=2, stop_axis=10)

        self.assertRaises(ValueError, test_ValueError3)

        def test_type():
            # dtype must be float32, float64, int8, int32, int64.
            x2 = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
                           image_shape[3]).reshape(image_shape) / 100.
            x2 = x2.astype('float16')
            x2_var = paddle.data(name='x2', shape=[3, 2, 4, 5], dtype='float16')
            paddle.flatten(x2_var)

        self.assertRaises(TypeError, test_type)

        def test_InputError():
            out = paddle.flatten(x)

        self.assertRaises(ValueError, test_InputError)


class TestFlattenPython(unittest.TestCase):
    def test_python_api(self):
        image_shape = (2, 3, 4, 4)
        x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] *
                      image_shape[3]).reshape(image_shape) / 100.
        x = x.astype('float32')

        def test_InputError():
            out = paddle.flatten(x)

        self.assertRaises(ValueError, test_InputError)

        def test_Negative():
            paddle.enable_imperative()
            img = paddle.imperative.to_variable(x)
            out = paddle.flatten(img, start_axis=-2, stop_axis=-1)
            return out.numpy().shape

        res_shape = test_Negative()
        self.assertTrue((2, 3, 16) == res_shape)


if __name__ == "__main__":
    unittest.main()
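The expected shapes hard-coded in the cases above all follow one rule: the dimensions from start_axis through stop_axis (inclusive, with negative axes counted from the end) are collapsed into a single dimension. A minimal numpy-only sketch of that rule, useful for cross-checking the values; the flattened_shape helper is illustrative and not part of this change:

import numpy as np

def flattened_shape(shape, start_axis, stop_axis):
    # Collapse the axes in [start_axis, stop_axis] into one dimension,
    # mirroring how new_shape is written out by hand in the tests above.
    n = len(shape)
    start = start_axis + n if start_axis < 0 else start_axis
    stop = stop_axis + n if stop_axis < 0 else stop_axis
    collapsed = int(np.prod(shape[start:stop + 1]))
    return shape[:start] + (collapsed,) + shape[stop + 1:]

assert flattened_shape((3, 2, 5, 4), 1, 2) == (3, 10, 4)            # TestFlattenOp_1
assert flattened_shape((3, 2, 5, 4), -2, -1) == (3, 2, 20)          # TestFlattenOp_4
assert flattened_shape((3, 2, 3, 2, 4, 4), 3, 5) == (3, 2, 3, 32)   # TestFlattenOpSixDims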
python/paddle/fluid/tests/unittests/test_fleet_rolemaker_2.py
...
...
@@ -14,6 +14,7 @@
"""Test cases for role makers."""
from __future__ import print_function
+import paddle
import os
import unittest
...
...
@@ -162,7 +163,8 @@ class TestCloudRoleMaker2(unittest.TestCase):
            data = "1 1 1 1\n"
            f.write(data)
-       dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+       dataset = paddle.fleet.DatasetFactory().create_dataset("InMemoryDataset")
        dataset.set_filelist(["test_fleet_gloo_role_maker_1.txt"])
        dataset.set_use_var([show, label])
        dataset.load_into_memory()
...
...
python/paddle/fluid/tests/unittests/test_fusion_gru_op.py
...
...
@@ -30,6 +30,7 @@ def fusion_gru(
        wh,  # D x 3D
        bias,  # 1 x 3D
        is_reverse,
+       origin_mode,
        act_state,
        act_gate):
    return gru(fc(x, wx, bias),
...
...
@@ -40,7 +41,8 @@ def fusion_gru(
                   (1, wh.shape[1]), dtype='float32'),
               is_reverse,
-              act_state, act_gate)
+              act_state,
+              act_gate, origin_mode=origin_mode)


class TestFusionGRUOp(OpTest):
...
...
@@ -57,6 +59,8 @@ class TestFusionGRUOp(OpTest):
        self.with_bias = True
        self.act_state = 'tanh'
        self.act_gate = 'sigmoid'
+       self.origin_mode = False
+       self.use_mkldnn = False
        self.set_confs()

        T = sum(self.lod[0])
...
...
@@ -73,7 +77,7 @@ class TestFusionGRUOp(OpTest):
            (N, self.D), dtype='float32')

        _, _, _, hidden = fusion_gru(
-           x, self.lod, h0, wx, wh, bias, self.is_reverse,
+           x, self.lod, h0, wx, wh, bias, self.is_reverse, self.origin_mode,
            ACTIVATION[self.act_state], ACTIVATION[self.act_gate])

        self.inputs = {'X': (x, self.lod), 'WeightX': wx, 'WeightH': wh}
...
...
@@ -89,7 +93,9 @@ class TestFusionGRUOp(OpTest):
        self.attrs = {
            'activation': self.act_state,
            'gate_activation': self.act_gate,
-           'is_reverse': self.is_reverse
+           'is_reverse': self.is_reverse,
+           'origin_mode': self.origin_mode,
+           'use_mkldnn': self.use_mkldnn
        }

    def test_check_output(self):
...
...
python/paddle/fluid/tests/unittests/test_layers.py
...
...
@@ -180,6 +180,51 @@ class TestLayer(LayerTest):

        self.assertRaises(TypeError, test_type)

    def test_Flatten(self):
        inp = np.ones([3, 4, 4, 5], dtype='float32')
        with self.static_graph():
            t = layers.data(
                name='data',
                shape=[3, 4, 4, 5],
                dtype='float32',
                append_batch_size=False)
            flatten = nn.Flatten()
            ret = flatten(t)
            static_ret = self.get_static_graph_result(
                feed={'data': inp}, fetch_list=[ret])[0]
        with self.dynamic_graph():
            t = base.to_variable(inp)
            flatten = nn.Flatten()
            dy_ret = flatten(t)
            dy_ret_value = dy_ret.numpy()

        self.assertTrue(np.array_equal(static_ret, dy_ret_value))

        with self.static_graph():

            # the input of Linear must be Variable.
            def test_Variable():
                inp = np.ones([3, 32, 32], dtype='float32')
                linear = nn.Linear(
                    32, 4,
                    bias_attr=fluid.initializer.ConstantInitializer(value=1))
                linear_ret1 = linear(inp)

            self.assertRaises(TypeError, test_Variable)

            # the input dtype of Linear must be float16 or float32 or float64
            # float16 only can be set on GPU place
            def test_type():
                inp = np.ones([3, 32, 32], dtype='int32')
                linear = nn.Linear(
                    32, 4,
                    bias_attr=fluid.initializer.ConstantInitializer(value=1))
                linear_ret2 = linear(inp)

            self.assertRaises(TypeError, test_type)

    def test_layer_norm(self):
        inp = np.ones([3, 32, 32], dtype='float32')
        with self.static_graph():
...
...
python/paddle/fluid/tests/unittests/test_monitor.py
...
...
@@ -16,6 +16,7 @@ TestCases for Monitor
"""
from __future__ import print_function
+import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
import numpy as np
...
...
@@ -51,7 +52,8 @@ class TestDatasetWithStat(unittest.TestCase):
                    name=slot, shape=[1], dtype="int64", lod_level=1)
                slots_vars.append(var)
-       dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
+       dataset = paddle.fleet.DatasetFactory().create_dataset("InMemoryDataset")
        dataset.set_batch_size(32)
        dataset.set_thread(3)
        dataset.set_filelist([
...
...
python/paddle/fluid/tests/unittests/test_pipeline.py
-# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
...
...
@@ -13,6 +13,7 @@
# limitations under the License.
from __future__ import print_function
+import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import numpy as np
...
...
python/paddle/fluid/tests/unittests/white_list/no_check_set_white_list.py
...
...
@@ -17,6 +17,7 @@ no_check_set_white_list = [
    'fake_quantize_range_abs_max',
    'coalesce_tensor',
    'flatten2',
+   'flatten_contiguous_range',
    'lrn',
    'squeeze2',
    'reshape2',
...
...
python/paddle/fluid/trainer_desc.py
...
...
@@ -15,7 +15,10 @@
import sys
import os

-__all__ = ['TrainerDesc', 'MultiTrainer', 'DistMultiTrainer', 'PipelineTrainer']
+__all__ = [
+    'TrainerDesc', 'MultiTrainer', 'DistMultiTrainer', 'PipelineTrainer',
+    'HeterXpuTrainer'
+]


class TrainerDesc(object):
...
...
@@ -48,6 +51,43 @@ class TrainerDesc(object):
        self._program = None
        self._infer = False

    def _set_heter_info(self, ret):
        #ret = = fu.split_program_by_device(program)
        #start_list, end_list, send_list, recv_list, program_list = fu.split_program_by_device(program)
        #if len(start_list) != 3:
        #    print("start_list len=", len(start_list), " will not set heter info")
        #    return
        #for i in start_list[0]:
        #    self.proto_desc.op_run_start_idx.append(i)
        #for i in end_list[0]:
        #    self.proto_desc.op_run_end_idx.append(i)
        #for i in send_list[0]:
        #    self.proto_desc.op_run_send_list.append(i)
        #for i in recv_list[0]:
        #    self.proto_desc.op_run_recv_list.append(i)
        if ret is None:
            return
        #for i in ret[0]:  # start_list[1]:
        #    self.proto_desc.xpu_start_idx.append(i)
        self.proto_desc.xpu_start_idx = ret[0]
        #for i in ret[1]:  # end_list[1]:
        #    self.proto_desc.o_end_idx.append(i)
        self.proto_desc.xpu_end_idx = ret[1]
        for i in ret[2]:  # send_list[1]:
            self.proto_desc.xpu_send_list.append(i)
        for i in ret[3]:  # recv_list[1]:
            self.proto_desc.xpu_recv_list.append(i)
        #for i in start_list[2]:
        #    self.proto_desc.op_run_end_start_idx.append(i)
        #for i in end_list[2]:
        #    self.proto_desc.op_run_end_idx.append(i)
        #for i in send_list[2]:
        #    self.proto_desc.op_run_end_send_list.append(i)
        #for i in recv_list[2]:
        #    self.proto_desc.op_run_end_recv_list.append(i)

    def _set_fetch_var_and_info(self, fetch_vars, fetch_info, print_period):
        # convert fetch_info to list
        fetch_info = list(fetch_info)
...
...
@@ -122,6 +162,10 @@ class TrainerDesc(object):
        for param in dump_param:
            self.proto_desc.dump_param.append(param)

+   def _set_worker_places(self, worker_places):
+       for place in worker_places:
+           self.proto_desc.worker_places.append(place)
+
    def _set_thread_barrier(self, thread_barrier):
        self.proto_desc.thread_barrier = thread_barrier
...
...
@@ -272,6 +316,30 @@ class DistMultiTrainer(TrainerDesc):
        self._device_worker._gen_worker_desc(self.proto_desc)


class HeterXpuTrainer(TrainerDesc):
    """
    Implement of HeterXpuTrainer.
    It's for Distributed training.
    """

    def __init__(self):
        super(HeterXpuTrainer, self).__init__()
        pass

    def _set_program(self, program):
        super(HeterXpuTrainer, self)._set_program(program)
        self._program = program

    def _gen_trainer_desc(self):
        super(HeterXpuTrainer, self)._gen_trainer_desc()
        self.proto_desc.class_name = "HeterXpuTrainer"
        if self._program == None:
            raise RuntimeError("None Program")
        self._device_worker._set_infer(self._infer)
        self._device_worker._set_program(self._program)
        self._device_worker._gen_worker_desc(self.proto_desc)


class PipelineTrainer(TrainerDesc):
    """
    Implement of PipelineTrainer.
...
...
python/paddle/fluid/trainer_factory.py
...
...
@@ -22,7 +22,7 @@ from paddle.fluid.log_helper import get_logger
local_logger = get_logger(
    __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')

-from .trainer_desc import MultiTrainer, DistMultiTrainer, PipelineTrainer
+from .trainer_desc import MultiTrainer, DistMultiTrainer, PipelineTrainer, HeterXpuTrainer
from .device_worker import Hogwild, DownpourSGD, Section, DownpourSGDOPT
from .framework import Variable
from multiprocessing import Process, Manager
...
...
@@ -75,6 +75,8 @@ class TrainerFactory(object):
            if opt_info.get("dump_param") is not None and len(
                    opt_info.get("dump_param")) != 0:
                trainer._set_dump_param(opt_info["dump_param"])
+           if opt_info.get("worker_places") is not None:
+               trainer._set_worker_places(opt_info["worker_places"])
            if opt_info.get("enable_random_dump") is not None:
                trainer._set_enable_random_dump(opt_info[
                    "enable_random_dump"])
...
...
python/paddle/nn/__init__.py
...
...
@@ -63,6 +63,7 @@ from .layer.common import Pool2D  #DEFINE_ALIAS
from .layer.common import Pad2D  #DEFINE_ALIAS
from .layer.common import Embedding  #DEFINE_ALIAS
from .layer.common import Linear  #DEFINE_ALIAS
+from .layer.common import Flatten  #DEFINE_ALIAS
from .layer.common import UpSample  #DEFINE_ALIAS
from .layer.conv import Conv2D  #DEFINE_ALIAS
from .layer.conv import Conv2DTranspose  #DEFINE_ALIAS
...
...
python/paddle/nn/layer/__init__.py
...
...
@@ -39,6 +39,7 @@ from .common import Pool2D  #DEFINE_ALIAS
from .common import Pad2D  #DEFINE_ALIAS
from .common import Embedding  #DEFINE_ALIAS
from .common import Linear  #DEFINE_ALIAS
+from .common import Flatten  #DEFINE_ALIAS
from .common import UpSample  #DEFINE_ALIAS
from .conv import Conv2D  #DEFINE_ALIAS
from .conv import Conv2DTranspose  #DEFINE_ALIAS
...
...
python/paddle/nn/layer/common.py
...
...
@@ -17,6 +17,7 @@ from ...fluid.dygraph import BilinearTensorProduct  #DEFINE_ALIAS
from ...fluid.dygraph import Pool2D  #DEFINE_ALIAS
from ...fluid.dygraph import Embedding  #DEFINE_ALIAS
from ...fluid.dygraph import Linear  #DEFINE_ALIAS
+from ...fluid.dygraph import Flatten  #DEFINE_ALIAS
from ...fluid.dygraph import layers
from .. import functional as F
...
...
python/paddle/tensor/manipulation.py
...
...
@@ -25,7 +25,6 @@ import numpy as np
from ..fluid.layers import cast  #DEFINE_ALIAS
from ..fluid.layers import expand  #DEFINE_ALIAS
from ..fluid.layers import expand_as  #DEFINE_ALIAS
-from ..fluid.layers import flatten  #DEFINE_ALIAS
from ..fluid.layers import reshape  #DEFINE_ALIAS
from ..fluid.layers import scatter  #DEFINE_ALIAS
from ..fluid.layers import slice  #DEFINE_ALIAS
...
...
@@ -169,6 +168,114 @@ def flip(x, axis, name=None):

reverse = flip  #DEFINE_ALIAS


def flatten(x, start_axis=0, stop_axis=-1, name=None):
    """
    **Flatten op**

    Flattens a contiguous range of axes in a tensor according to start_axis and stop_axis.

    For Example:

    .. code-block:: text

        Case 1:

          Given
            X.shape = (3, 100, 100, 4)

          and
            start_axis = 1
            end_axis = 2

          We get:
            Out.shape = (3, 1000 * 100, 2)

        Case 2:

          Given
            X.shape = (3, 100, 100, 4)

          and
            start_axis = 0
            stop_axis = -1

          We get:
            Out.shape = (3 * 100 * 100 * 4)

    Args:
        x (Variable): A tensor of number of dimentions >= axis. A tensor with data type float32,
                      float64, int8, int32, int64.
        start_axis (int): the start axis to flatten
        stop_axis (int): the stop axis to flatten
        name(str, Optional): For details, please refer to :ref:`api_guide_Name`.
                        Generally, no setting is required. Default: None.

    Returns:
        Variable: A tensor with the contents of the input tensor, with input \
                  axes flattened by indicated start axis and end axis. \
                  A Tensor with data type same as input x.

    Raises:
        ValueError: If x is not a Variable.
        ValueError: If start_axis or stop_axis is illegal.

    Examples:

        .. code-block:: python

            import paddle
            import numpy as np

            paddle.enable_imperative()

            image_shape=(2, 3, 4, 4)
            x = np.arange(image_shape[0] * image_shape[1] * image_shape[2] * image_shape[3]).reshape(image_shape) / 100.
            x = x.astype('float32')

            img = paddle.imperative.to_variable(x)
            out = paddle.flatten(img, start_axis=1, stop_axis=2)
            # out shape is [2, 12, 4]
    """
    if not (isinstance(x, Variable)):
        raise ValueError("The input x should be a Variable")

    check_variable_and_dtype(
        x, 'x', ['float32', 'float64', 'int8', 'int32', 'int64'], 'flatten')
    helper = LayerHelper('flatten', **locals())

    x_dim = len(x.shape)
    if not (isinstance(start_axis, int)) or (
            start_axis > x_dim - 1) or start_axis < -x_dim:
        raise ValueError(
            "The start_axis should be a int, and in range [-rank(x), rank(x))")
    if not (isinstance(stop_axis, int)) or (
            stop_axis > x_dim - 1) or stop_axis < -x_dim:
        raise ValueError(
            "The stop_axis should be a int, and in range [-rank(x), rank(x))")
    if start_axis < 0:
        start_axis = start_axis + x_dim
    if stop_axis < 0:
        stop_axis = stop_axis + x_dim
    if start_axis > stop_axis:
        raise ValueError("The stop_axis should be larger than stat_axis")

    if in_dygraph_mode():
        dy_out, _ = core.ops.flatten_contiguous_range(
            x, 'start_axis', start_axis, 'stop_axis', stop_axis)
        return dy_out

    out = helper.create_variable_for_type_inference(x.dtype)
    x_shape = helper.create_variable_for_type_inference(x.dtype)
    helper.append_op(
        type='flatten_contiguous_range',
        inputs={"X": x},
        outputs={'Out': out,
                 'XShape': x_shape},
        attrs={"start_axis": start_axis,
               "stop_axis": stop_axis})
    return out
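As a cross-check on Case 1 of the docstring above: flattening X.shape = (3, 100, 100, 4) with start_axis = 1 and end_axis = 2 collapses 100 * 100 = 10000 elements into one dimension, so the result shape is (3, 10000, 4). A minimal numpy-only sketch of that arithmetic (illustrative only, independent of the Paddle API):

import numpy as np

x = np.zeros((3, 100, 100, 4), dtype=np.float32)
start_axis, stop_axis = 1, 2
# collapse axes start_axis..stop_axis into a single dimension
collapsed = int(np.prod(x.shape[start_axis:stop_axis + 1]))   # 100 * 100 = 10000
out = x.reshape(x.shape[:start_axis] + (collapsed,) + x.shape[stop_axis + 1:])
print(out.shape)  # (3, 10000, 4)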
def roll(x, shifts, axis=None, name=None):
    """
    :alias_main: paddle.roll
...
...
@@ -252,13 +359,18 @@ def roll(x, shifts, axis=None, name=None):
    return out


-def stack(x, axis=0, out=None, name=None):
+def stack(x, axis=0, name=None):
    """
    :alias_main: paddle.stack
    :alias: paddle.stack, paddle.tensor.stack, paddle.tensor.manipulation.stack

-   This OP stacks all the inputs :code:`x` along axis.
+   This OP stacks all the input tensors ``x`` along ``axis`` dimemsion.
+   All tensors must be of the same shape and same dtype.
+
+   For example, given N tensors of shape [A, B], if ``axis == 0``, the shape of stacked
+   tensor is [N, A, B]; if ``axis == 1``, the shape of stacked
+   tensor is [A, N, B], etc.

    .. code-block:: text
...
...
@@ -284,7 +396,6 @@ def stack(x, axis=0, out=None, name=None):
        Case 2:

          Input:
            x[0].shape = [1, 2]
            x[0].data = [ [1.0 , 2.0 ] ]
...
...
@@ -295,7 +406,7 @@ def stack(x, axis=0, out=None, name=None):
          Attrs:
-           axis = 1 or axis = -2
+           axis = 1 or axis = -2  # If axis = -2, axis = axis+ndim(x[0])+1 = -2+2+1 = 1.

          Output:
            Out.shape = [1, 3, 2]
...
...
@@ -304,65 +415,40 @@ def stack(x, axis=0, out=None, name=None):
                          [5.0, 6.0] ] ]

    Args:
-       x (Variable|list(Variable)): Input :code:`x` can be a single Tensor, a :code:`list` of Tensors.
-                                    If :code:`x` is a :code:`list`, the shapes of all these Tensors
-                                    must be the same. Supposing input is N dims
-                                    Tensors :math:`[d_0, d_1, ..., d_{n-1}]`, the output is N+1 dims
-                                    Tensor :math:`[d_0, d_1, d_{axis-1}, len(x), d_{axis}, ..., d_{n-1}]`.
-                                    Support data types: float32, float64, int32, int64.
-       axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is :math:`[-(R+1), R+1)`.
-                             R is the first tensor of inputs. If ``axis`` < 0, :math:`axis=axis+rank(x[0])+1`.
-                             The default value of axis is 0.
+       x (Tensor|list[Tensor]): Input ``x`` can be a single tensor, or a ``list`` of tensors.
+                                If ``x`` is a ``list``, the Tensors in ``x``
+                                must be of the same shape and dtype. Support data types: float32, float64, int32, int64.
+       axis (int, optional): The axis along which all inputs are stacked. ``axis`` range is ``[-(R+1), R+1)``,
+                             where ``R`` is the number of dimensions of the first input tensor ``x[0]``.
+                             If ``axis < 0``, ``axis = axis+R+1``. The default value of axis is 0.
        name (str, optional): Please refer to :ref:`api_guide_Name`, Default None.

    Returns:
-       Variable: The stacked Tensor, has same data type with input Tensors. Output dim is :math:`rank(x[0])+1`.
+       Tensor: The stacked tensor with same data type as input.

    Example:
        .. code-block:: python

+           import numpy as np
+           import paddle
-           import paddle.fluid as fluid
-           import numpy as np

            data1 = np.array([[1.0, 2.0]])
            data2 = np.array([[3.0, 4.0]])
            data3 = np.array([[5.0, 6.0]])

-           with fluid.dygraph.guard():
-               x1 = fluid.dygraph.to_variable(data1)
-               x2 = fluid.dygraph.to_variable(data2)
-               x3 = fluid.dygraph.to_variable(data3)
-               result = paddle.stack([x1, x2, x3], axis=0)
-               # result shape: [3, 1, 2]
-               # result value: [[[1.0, 2.0]],
-               #                [[3.0, 4.0]],
-               #                [[5.0, 6.0]]]
-   """
-   helper = LayerHelper('stack', **locals())
-   axis = 0 if axis is None else axis
-
-   if not isinstance(x, list) and not isinstance(x, tuple):
-       x = [x]
-   out = helper.create_variable_for_type_inference(x[0].dtype)
-   if not in_dygraph_mode() and \
-           x[0].desc.type() == core.VarDesc.VarType.LOD_TENSOR_ARRAY:
-       assert len(x) == 1, "If the elements of 'x' in stack are Variable(LoDTensorArray), " \
-           "number of the elements must be 1, but received %s." % len(x)
-       out_index = helper.create_variable_for_type_inference(dtype="int32")
-       helper.append_op(
-           type='tensor_array_to_tensor',
-           inputs={'X': x[0]},
-           outputs={'Out': [out],
-                    'OutIndex': [out_index]},
-           attrs={'axis': axis,
-                  'use_stack': True})
-   else:
-       helper.append_op(
-           type='stack',
-           inputs={'X': x},
-           outputs={'Y': out},
-           attrs={'axis': axis})
-
-   return out
+           paddle.enable_imperative()
+           x1 = paddle.imperative.to_variable(data1)
+           x2 = paddle.imperative.to_variable(data2)
+           x3 = paddle.imperative.to_variable(data3)
+           out = paddle.stack([x1, x2, x3], axis=0)
+           print(out.shape)  # [3, 1, 2]
+           print(out.numpy())
+           # [[[1., 2.]],
+           #  [[3., 4.]],
+           #  [[5., 6.]]]
+   """
+   return layers.stack(x, axis, name)
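The shape rule stated in the rewritten docstring can be sanity-checked against numpy, whose np.stack follows the same convention; the snippet below is illustrative only and not part of the change:

import numpy as np

xs = [np.array([[1.0, 2.0]]), np.array([[3.0, 4.0]]), np.array([[5.0, 6.0]])]  # three [1, 2] inputs
print(np.stack(xs, axis=0).shape)  # (3, 1, 2), matching the axis=0 example above
print(np.stack(xs, axis=1).shape)  # (1, 3, 2), matching the axis=1 / axis=-2 case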
def split(x, num_or_sections, axis=0, name=None):
...
...
python/paddle/utils/__init__.py
...
...
@@ -16,8 +16,9 @@ from .plot import Ploter
from .profiler import ProfilerOptions
from .profiler import Profiler
from .profiler import get_profiler
+from .deprecated import deprecated

-__all__ = ['dump_config', 'Ploter']
+__all__ = ['dump_config', 'Ploter', 'deprecated']

#TODO: define new api under this directory
# __all__ = ['unique_name',
...
...
python/paddle/utils/deprecated.py
0 → 100644
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
decorator to deprecate a function or class
"""

import warnings
import functools
import paddle


def deprecated(update_to="", since="", reason=""):
    """Decorate a function to signify its deprecation.

    This function wraps a method that will soon be removed and does two things:
        - The docstring of the API will be modified to include a notice
          about deprecation."
        - Raises a :class:`~exceptions.DeprecatedWarning` when old API is called.

    Args:
        since(str): The version at which the decorated method is considered deprecated.
        update_to(str): The new API users should use.
        reason(str): The reason why the API is deprecated.

    Returns:
        decorator: decorated function or class.
    """

    def decorator(func):
        """construct warning message, and return a decorated function or class."""
        assert isinstance(update_to, str), 'type of "update_to" must be str.'
        assert isinstance(since, str), 'type of "since" must be str.'
        assert isinstance(reason, str), 'type of "reason" must be str.'

        _since = since.strip()
        _update_to = update_to.strip()
        _reason = reason.strip()

        msg = 'API "{}.{}" is deprecated'.format(func.__module__, func.__name__)
        if len(_since) > 0:
            msg += " since {}".format(_since)
        msg += ", and may be removed in future versions."
        if len(_update_to) > 0:
            assert _update_to.startswith(
                "paddle."
            ), 'Argument update_to must start with "paddle.", your value is "{}"'.format(
                update_to)
            msg += ' Use "{}" instead.'.format(_update_to)
        if len(_reason) > 0:
            msg += "\nreason: {}".format(_reason)

        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            """deprecated warning should be fired in 3 circumstances:
            1. current version is develop version, i.e. "0.0.0", because we assume develop version is always the latest version.
            2. since version is empty, in this case, API is deprecated in all versions.
            3. current version is newer than since version.
            """
            v_current = [int(i) for i in paddle.__version__.split(".")]
            v_current += [0] * (4 - len(v_current))
            v_since = [int(i) for i in _since.split(".")]
            v_since += [0] * (4 - len(v_since))
            if paddle.__version__ == "0.0.0" or _since == "" or v_current >= v_since:
                warnings.simplefilter('always', DeprecationWarning)  # turn off filter
                warnings.warn(msg, category=DeprecationWarning, stacklevel=2)
                warnings.simplefilter('default', DeprecationWarning)  # reset filter
            return func(*args, **kwargs)

        return wrapper

    return decorator
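A minimal usage sketch of the decorator defined above; the decorated function, version string, and reason text are made up for illustration, and the import path follows the paddle/utils/__init__.py change in this commit:

from paddle.utils import deprecated

@deprecated(since="1.8.0",
            update_to="paddle.flatten",
            reason="hypothetical example: the API moved to the paddle namespace")
def old_flatten(x):
    return x

old_flatten(1)
# Emits a DeprecationWarning built from the arguments above whenever the installed
# paddle version is >= 1.8.0, or when it is the develop build "0.0.0", per wrapper().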