PaddlePaddle / Paddle · Commit 5dcfb699

Authored on Mar 23, 2022 by phlrain

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into add_some_yaml_config

Parents: 31363c3f, 8e67629c

Showing 75 changed files with 3,210 additions and 607 deletions (+3210 -607)
cmake/external/ascend.cmake  +3  -3
paddle/fluid/distributed/ps/service/brpc_ps_client.cc  +15  -18
paddle/fluid/distributed/ps/service/communicator/communicator.cc  +25  -49
paddle/fluid/distributed/ps/service/communicator/communicator.h  +2  -2
paddle/fluid/distributed/ps/table/CMakeLists.txt  +2  -1
paddle/fluid/distributed/ps/table/common_dense_table.cc  +23  -13
paddle/fluid/distributed/ps/table/ctr_accessor.cc  +5  -1
paddle/fluid/distributed/ps/table/ctr_accessor.h  +14  -3
paddle/fluid/distributed/ps/table/depends/dense.h  +36  -14
paddle/fluid/distributed/ps/table/sparse_accessor.cc  +339  -0
paddle/fluid/distributed/ps/table/sparse_accessor.h  +208  -0
paddle/fluid/distributed/ps/table/table.cc  +2  -0
paddle/fluid/distributed/ps/wrapper/fleet.cc  +163  -135
paddle/fluid/distributed/ps/wrapper/fleet.h  +21  -7
paddle/fluid/distributed/test/memory_sparse_table_test.cc  +3  -3
paddle/fluid/eager/tests/task_tests/eager_utils_test.cc  +3  -0
paddle/fluid/framework/CMakeLists.txt  +5  -4
paddle/fluid/framework/device_worker.h  +82  -0
paddle/fluid/framework/device_worker_factory.cc  +1  -0
paddle/fluid/framework/dist_multi_trainer.cc  +15  -3
paddle/fluid/framework/downpour_lite_worker.cc  +566  -0
paddle/fluid/framework/fleet/metrics.cc  +1  -1
paddle/fluid/framework/fleet/metrics.h  +1  -1
paddle/fluid/framework/pull_dense_worker.cc  +9  -0
paddle/fluid/framework/the_one_ps.proto  +0  -0
paddle/fluid/operators/abs_op.cc  +26  -2
paddle/fluid/operators/conv_op_npu.cc  +204  -0
paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc  +16  -27
paddle/fluid/operators/pscore/distributed_lookup_table_op.h  +9  -12
paddle/fluid/operators/pscore/distributed_push_sparse_op.cc  +3  -0
paddle/fluid/operators/pscore/distributed_push_sparse_op.h  +7  -10
paddle/fluid/operators/pscore/send_op.cc  +1  -1
paddle/fluid/platform/device/npu/npu_op_runner.cc  +2  -0
paddle/fluid/pybind/fleet_py.cc  +2  -0
paddle/phi/common/layout.h  +12  -0
paddle/phi/kernels/cpu/elementwise_grad_kernel.cc  +2  -2
paddle/phi/kernels/cpu/elementwise_kernel.cc  +4  -16
paddle/phi/kernels/elementwise_kernel.h  +10  -10
paddle/phi/kernels/gpu/elementwise_grad_kernel.cu  +2  -2
paddle/phi/kernels/gpu/elementwise_kernel.cu  +4  -16
paddle/phi/kernels/impl/elementwise_kernel_impl.h  +10  -10
paddle/phi/ops/compat/elementwise_sig.cc  +37  -35
python/paddle/distributed/fleet/base/fleet_base.py  +104  -3
python/paddle/distributed/fleet/base/runtime_factory.py  +1  -1
python/paddle/distributed/fleet/meta_optimizers/__init__.py  +1  -1
python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py  +13  -7
python/paddle/distributed/fleet/runtime/the_one_ps.py  +8  -9
python/paddle/distributed/fleet/utils/ps_util.py  +1  -1
python/paddle/distributed/passes/ps_server_pass.py  +0  -2
python/paddle/distributed/passes/ps_trainer_pass.py  +10  -5
python/paddle/distributed/ps/the_one_ps.py  +141  -46
python/paddle/distributed/ps/utils/ps_program_builder.py  +50  -4
python/paddle/distributed/ps/utils/public.py  +35  -29
python/paddle/fluid/communicator.py  +21  -7
python/paddle/fluid/dataset.py  +6  -7
python/paddle/fluid/device_worker.py  +183  -38
python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py  +15  -15
python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py  +106  -1
python/paddle/fluid/tests/unittests/mkldnn/test_softplus_mkldnn_op.py  +35  -5
python/paddle/fluid/tests/unittests/npu/test_conv3d_op_npu.py  +543  -0
python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py  +1  -1
python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py  +2  -1
python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py  +2  -2
python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py  +2  -2
python/paddle/fluid/tests/unittests/test_dist_fleet_base.py  +3  -2
python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py  +9  -6
python/paddle/fluid/tests/unittests/test_dist_fleet_ctr2.py  +6  -4
python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py  +1  -0
python/paddle/fluid/tests/unittests/test_dist_fleet_ps10.py  +4  -3
python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py  +2  -0
python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py  +2  -0
python/paddle/fluid/tests/unittests/test_dist_fleet_ps7.py  +3  -1
python/paddle/fluid/tests/unittests/test_dist_fleet_ps8.py  +2  -1
python/paddle/fluid/tests/unittests/test_dist_fleet_ps9.py  +2  -1
python/paddle/fluid/trainer_factory.py  +1  -1
cmake/external/ascend.cmake (+3 -3)

@@ -90,9 +90,9 @@ endif()
 if(WITH_ASCEND_CL)
   macro(find_ascend_toolkit_version ascend_toolkit_version_info)
     file(READ ${ascend_toolkit_version_info} ASCEND_TOOLKIT_VERSION_CONTENTS)
-    string(REGEX MATCH "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}")
-    string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.[0-9]+\.[a-z]*[0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}")
-    string(REGEX REPLACE "[a-z|\.]" "" CANN_VERSION ${ASCEND_TOOLKIT_VERSION})
+    string(REGEX MATCH "version=([0-9]+\.[0-9]+\.(RC)?[0-9]+\.[a-z]*[0-9]*)" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION_CONTENTS}")
+    string(REGEX REPLACE "version=([0-9]+\.[0-9]+\.(RC)?[0-9]+\.[a-z]*[0-9]*)" "\\1" ASCEND_TOOLKIT_VERSION "${ASCEND_TOOLKIT_VERSION}")
+    string(REGEX REPLACE "[A-Z]|[a-z|\.]" "" CANN_VERSION ${ASCEND_TOOLKIT_VERSION})
     add_definitions("-DCANN_VERSION_CODE=${CANN_VERSION}")
     if(NOT ASCEND_TOOLKIT_VERSION)
       set(ASCEND_TOOLKIT_VERSION "???")
paddle/fluid/distributed/ps/service/brpc_ps_client.cc (+15 -18)

@@ -1315,11 +1315,11 @@ std::future<int32_t> BrpcPsClient::push_sparse(size_t table_id,
   CostTimer parse_timer("pserver_client_push_sparse_parse");
   int push_sparse_async_num = _push_sparse_task_queue_map[table_id]->Size();
   while (push_sparse_async_num > FLAGS_pserver_max_async_call_num) {
-    // LOG(INFO) << "push_sparse Waiting for async_call_num comsume, task_num:"
-    //           << push_sparse_async_num << ", max_task_limit:" <<
-    //           FLAGS_pserver_max_async_call_num;
+    // LOG(INFO) << "push_sparse Waiting for async_call_num comsume,
+    // task_num:"
+    //           << push_sparse_async_num
+    //           << ", max_task_limit:" << FLAGS_pserver_max_async_call_num;
     usleep(5000);  // 5ms
     // push_sparse_async_num = _push_sparse_task_queue_map[table_id]->size();
     push_sparse_async_num = _push_sparse_task_queue_map[table_id]->Size();
   }
   auto put_timer = std::make_shared<CostTimer>("client_push_sparse_put");

@@ -1381,8 +1381,7 @@ void BrpcPsClient::push_sparse_task_consume() {
   ::ThreadPool async_push_sparse_shard_threads(
       FLAGS_pserver_sparse_merge_thread);
   while (_running) {
-    platform::Timer timeline;
-    timeline.Start();
+    auto async_start_time_ms = butil::gettimeofday_ms();
     // 所有sparseTable的pushTask 进行处理
     for (auto &push_sparse_task_itr : _push_sparse_task_queue_map) {
       auto table_id = push_sparse_task_itr.first;

@@ -1497,9 +1496,8 @@ void BrpcPsClient::push_sparse_task_consume() {
           std::vector<std::future<int>>().swap(merge_status);
         }
       }
-      timeline.Pause();
-      auto wait_ms =
-          FLAGS_pserver_async_push_sparse_interval_ms - (timeline.ElapsedMS());
+      auto wait_ms = FLAGS_pserver_async_push_sparse_interval_ms -
+                     (butil::gettimeofday_ms() - async_start_time_ms);
       if (wait_ms > 0) {
         usleep(wait_ms * 1000);
       }

@@ -1661,9 +1659,10 @@ std::future<int32_t> BrpcPsClient::push_dense(const Region *regions,
       std::make_shared<CostTimer>("pserver_client_push_dense_parse");
   int push_dense_async_num = _push_dense_task_queue_map[table_id]->Size();
   while (push_dense_async_num > FLAGS_pserver_max_async_call_num) {
-    LOG(INFO) << "push_dense Waiting for async_call_num comsume, task_num:"
-              << push_dense_async_num
-              << ", max_task_limit:" << FLAGS_pserver_max_async_call_num;
+    // LOG(INFO) << "push_dense Waiting for async_call_num comsume,
+    // task_num:"
+    //           << push_dense_async_num
+    //           << ", max_task_limit:" << FLAGS_pserver_max_async_call_num;
     usleep(5000);  // 5ms
     push_dense_async_num = _push_dense_task_queue_map[table_id]->Size();
   }

@@ -1701,8 +1700,7 @@ void BrpcPsClient::push_dense_task_consume() {
   static bool scale_gradient = FLAGS_pserver_scale_gradient_by_merge;
   ::ThreadPool async_merge_dense_threads(10);
   while (_running) {
-    platform::Timer timeline;
-    timeline.Start();
+    auto async_start_time_ms = butil::gettimeofday_ms();
     for (auto &task_queue_itr : _push_dense_task_queue_map) {
       auto &task_queue = task_queue_itr.second;
       auto queue_size = task_queue->Size();

@@ -1791,9 +1789,8 @@ void BrpcPsClient::push_dense_task_consume() {
       push_dense_raw_gradient(task_ptr, total_send_data, total_send_data_size,
                               closure);
     }
-    timeline.Pause();
-    auto wait_ms =
-        FLAGS_pserver_async_push_dense_interval_ms - (timeline.ElapsedMS());
+    auto wait_ms = FLAGS_pserver_async_push_dense_interval_ms -
+                   (butil::gettimeofday_ms() - async_start_time_ms);
     if (wait_ms > 0) {
       usleep(wait_ms * 1000);
     }
paddle/fluid/distributed/ps/service/communicator/communicator.cc (+25 -49)

@@ -13,11 +13,10 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
 #include <google/protobuf/text_format.h>
 #include "gflags/gflags.h"
 #include "paddle/fluid/distributed/ps/service/brpc_ps_client.h"
 #include "paddle/fluid/distributed/ps/wrapper/fleet.h"
 #include "paddle/fluid/platform/profiler.h"
 #include "paddle/fluid/string/string_helper.h"

@@ -66,34 +65,9 @@ std::shared_ptr<Communicator> Communicator::communicator_(nullptr);
 void Communicator::InitBrpcClient(
     const std::string &dist_desc,
     const std::vector<std::string> &host_sign_list) {
-  // not used, just for psclient's init
-  std::map<uint64_t, std::vector<paddle::distributed::Region>>
-      _dense_pull_regions;
-  for (auto &iter : recv_varname_to_ctx_) {
-    auto tid = iter.first;
-    auto var_names = iter.second;
-
-    auto &regions = _dense_pull_regions[tid];
-    regions.reserve(var_names.size());
-    for (auto &t : var_names) {
-      Variable *var = recv_scope_->FindVar(t);
-      LoDTensor *tensor = var->GetMutable<LoDTensor>();
-      float *w = tensor->data<float>();
-      paddle::distributed::Region reg(w, tensor->numel());
-      regions.emplace_back(std::move(reg));
-    }
-  }
+  auto fleet = paddle::distributed::FleetWrapper::GetInstance();
   if (_worker_ptr.get() == nullptr) {
-    google::protobuf::TextFormat::ParseFromString(dist_desc, &_ps_param);
-    init_gflag(_ps_param.init_gflags());
-    servers_ = host_sign_list.size();
-    _ps_env = paddle::distributed::PaddlePSEnvironment();
-    _ps_env.set_ps_servers(&host_sign_list, servers_);
-    _worker_ptr = std::unique_ptr<paddle::distributed::PSClient>(
-        paddle::distributed::PSClientFactory::create(_ps_param));
-    _worker_ptr->configure(_ps_param, _dense_pull_regions, _ps_env,
-                           trainer_id_);
+    _worker_ptr = fleet->worker_ptr_;
   }
   return;
 }

@@ -146,11 +120,11 @@ void Communicator::RpcRecvDense(const std::vector<std::string> &varnames,
   for (auto &t : varnames) {
     Variable *var = scope->FindVar(t);
     LoDTensor *tensor = var->GetMutable<LoDTensor>();
-    VLOG(1) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
+    VLOG(3) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
             << platform::is_gpu_place(tensor->place());

     float *temp_recv_data = tensor->mutable_data<float>(platform::CPUPlace());
-    VLOG(1) << "AsyncCommunicator::RpcRecvDense Var " << t << " table_id "
+    VLOG(3) << "AsyncCommunicator::RpcRecvDense Var " << t << " table_id "
             << table_id << " Temp_data[0] " << temp_recv_data[0]
             << " Temp_data[-1] " << temp_recv_data[tensor->numel() - 1];
     if (platform::is_gpu_place(tensor->place())) {

@@ -481,7 +455,7 @@ void AsyncCommunicator::RecvNoBarrier() {
   for (auto &t : var_names) {
     Variable *var = recv_scope_->FindVar(t);
     LoDTensor *tensor = var->GetMutable<LoDTensor>();
-    VLOG(1) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
+    VLOG(3) << "AsyncCommunicator::RecvNoBarrier Var " << t << " On gpu? "
             << platform::is_gpu_place(tensor->place());
     if (platform::is_gpu_place(tensor->place())) {
 #ifdef PADDLE_WITH_CUDA

@@ -653,7 +627,7 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
         input->lod().size() ? input->lod()[0].size() - 1 : input->dims()[0];
     if (batch_size == -1) {
       batch_size = cur_batch_size;
-    } else {
+    } else if (batch_size != cur_batch_size) {
       // CHECK(batch_size == cur_batch_size);  // NOLINT
       batch_size_consist = false;
       break;

@@ -676,7 +650,8 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
   size_t output_len = 0;
   size_t input_idx = 0;

-  VLOG(2) << "fleet.cc::emb_dim: " << fea_dim;
+  VLOG(2) << "fleet.cc::emb_dim: " << fea_dim << " batch_size: " << batch_size
+          << " batch_size_consist: " << batch_size_consist;

   // TODO(zhaocaibei123): check type of show/clk is int? float? uint64?
   // const long int* show_tensor = shows->data<int64_t>();

@@ -687,13 +662,14 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
   for (size_t index = 0; index < inputs->size(); ++index) {
     framework::LoDTensor *g_tensor = outputs->at(index);
     float *g = g_tensor->data<float>();
     // no cvm
     if (batch_size_consist) {  // TODO(zhaocaibei123): add config
                                // scale_sparse_gradient_with_batch_size_
       Eigen::Map<
           Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
           g_mat(g, g_tensor->numel() / fea_dim, fea_dim);
-      g_mat.rightCols(fea_dim) *= batch_size;
+      g_mat.rightCols(fea_dim - 2) *=
+          batch_size;  // hard code here, because of cvm_grad op
     }

     const framework::LoDTensor *tensor = inputs->at(index);

@@ -710,16 +686,16 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
           continue;
         }
         push_keys.emplace_back(real_id);
-        push_values.emplace_back(fea_dim + 3);
+        push_values.emplace_back(fea_dim + 1);
         // slot show clk grad... consistent with CtrCommonPushValue defined in
         // ctr_accessor.h
         push_values.back()[0] = 2;  // TODO(zhaocaibei123): slot
-        push_values.back()[1] =
-            (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
-        push_values.back()[2] =
-            (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
+        //    push_values.back()[1] =
+        //        (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
+        //    push_values.back()[2] =
+        //        (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));

-        float *data = push_values.back().data() + 3;
+        float *data = push_values.back().data() + 1;  // hard code here

         memcpy(data, g + output_len, sizeof(float) * fea_dim);

@@ -733,16 +709,16 @@ void AsyncCommunicator::PushSparseFromTensorAsync(
           continue;
         }
         push_keys.emplace_back(real_id);
-        push_values.emplace_back(fea_dim + 3);
+        push_values.emplace_back(fea_dim + 1);
         // slot show clk grad... consistent with CtrCommonPushValue defined in
         // ctr_accessor.h
         push_values.back()[0] = 2;  // TODO(zhaocaibei123): slot
-        push_values.back()[1] =
-            (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
-        push_values.back()[2] =
-            (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
+        //    push_values.back()[1] =
+        //        (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
+        //    push_values.back()[2] =
+        //        (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));

-        float *data = push_values.back().data() + 3;
+        float *data = push_values.back().data() + 1;

         memcpy(data, g + output_len, sizeof(float) * fea_dim);

@@ -837,7 +813,7 @@ void AsyncCommunicator::Stop() {
   if (!communicator_) {
     VLOG(0) << "Communicator is not inited, do nothing";
   } else {
-    _worker_ptr->finalize_worker();
+    // _worker_ptr->finalize_worker();
     VLOG(1) << "client finalize_worker done";
     if (recv_thread_) {
       VLOG(1) << "stop recv thread";
paddle/fluid/distributed/ps/service/communicator/communicator.h (+2 -2)

@@ -360,13 +360,13 @@ class Communicator {
   PSClient *GetPsClient() { return _worker_ptr.get(); }

-  std::unique_ptr<paddle::distributed::PSClient> GetPsClientPtr() {
+  std::shared_ptr<paddle::distributed::PSClient> GetPsClientPtr() {
     return std::move(_worker_ptr);
   }

   RecvCtxMap &GetRecvCtxMap() { return recv_varname_to_ctx_; }

-  std::unique_ptr<PSClient> _worker_ptr;  // pointer to worker
+  std::shared_ptr<PSClient> _worker_ptr;  // pointer to worker

  protected:
   bool running_ = false;
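The unique_ptr-to-shared_ptr switch above lets the communicator and the fleet wrapper refer to the same PSClient object instead of the communicator holding sole ownership. The following is a minimal, hypothetical C++ sketch of that ownership model (FakeClient is an illustrative stand-in, not a Paddle type):

// Hypothetical stand-in types; not Paddle code. Shows how two components can
// share one client once the member is a shared_ptr.
#include <cstdio>
#include <memory>

struct FakeClient {
  void stop_server() { std::printf("stop_server called\n"); }
};

int main() {
  auto owner = std::make_shared<FakeClient>();   // e.g. the fleet-side pointer
  std::shared_ptr<FakeClient> borrower = owner;  // e.g. Communicator::_worker_ptr
  borrower->stop_server();                       // both refer to the same client
  std::printf("use_count = %ld\n", owner.use_count());
  return 0;
}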
paddle/fluid/distributed/ps/table/CMakeLists.txt (+2 -1)

@@ -43,11 +43,12 @@ set_source_files_properties(table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPI
 set_source_files_properties(sparse_sgd_rule.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(ctr_double_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
+set_source_files_properties(sparse_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(downpour_ctr_accessor.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 set_source_files_properties(memory_sparse_table.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
 cc_library(sparse_sgd_rule SRCS sparse_sgd_rule.cc DEPS ${TABLE_DEPS} ps_framework_proto)
 cc_library(ctr_double_accessor SRCS ctr_double_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
-cc_library(ctr_accessor SRCS ctr_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
+cc_library(ctr_accessor SRCS ctr_accessor.cc sparse_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
 cc_library(downpour_ctr_accessor SRCS downpour_ctr_accessor.cc DEPS ${TABLE_DEPS} ps_framework_proto sparse_sgd_rule)
 cc_library(memory_sparse_table SRCS memory_sparse_table.cc DEPS ps_framework_proto ${TABLE_DEPS} fs afs_wrapper ctr_accessor common_table)
paddle/fluid/distributed/ps/table/common_dense_table.cc (+23 -13)

@@ -115,6 +115,8 @@ int32_t CommonDenseTable::initialize_optimizer() {
     // optimizer_->set_global_lr(_global_lr);  //no use
   } else if (name == "sum") {
     optimizer_ = std::make_shared<DSUM>(common, &values_);
+  } else if (name == "summary") {
+    optimizer_ = std::make_shared<DSummary>(common, &values_);
   } else {
     VLOG(0) << "init optimizer failed";
   }

@@ -339,11 +341,18 @@ int32_t CommonDenseTable::save(const std::string& path,
   auto common = _config.common();
   int size = static_cast<int>(common.params().size());

+  if (_config.common().name() == "summary") {
+    for (int x = 0; x < param_dim_; ++x) {
+      result_buffer_param[x].emplace_back(
+          std::to_string(values_[param_idx_][x]));
+    }
+  } else {
     std::ostringstream os;
     for (int x = 0; x < size; ++x) {
       auto& varname = common.params()[x];
       auto& dim = common.dims()[x];
-      VLOG(0) << "CommonDenseTable::save dim " << x << " size: " << dim;
+      VLOG(3) << "CommonDenseTable::save dim " << x << " size: " << dim;
       for (int y = 0; y < dim; ++y) {
         os.clear();
         os.str("");

@@ -355,6 +364,7 @@ int32_t CommonDenseTable::save(const std::string& path,
       }
     }
   }
+  }

   int retry_num = 0;
   int err_no = 0;
paddle/fluid/distributed/ps/table/ctr_accessor.cc (+5 -1)

@@ -65,7 +65,7 @@ size_t CtrCommonAccessor::mf_size() {
 // pull value
 size_t CtrCommonAccessor::select_dim() {
   auto embedx_dim = _config.embedx_dim();
-  return 1 + embedx_dim;
+  return 3 + embedx_dim;
 }

 size_t CtrCommonAccessor::select_dim_size(size_t dim) { return sizeof(float); }

@@ -213,6 +213,10 @@ int32_t CtrCommonAccessor::select(float** select_values, const float** values,
   for (size_t value_item = 0; value_item < num; ++value_item) {
     float* select_value = select_values[value_item];
     const float* value = values[value_item];
+    select_value[CtrCommonPullValue::show_index()] =
+        value[common_feature_value.show_index()];
+    select_value[CtrCommonPullValue::click_index()] =
+        value[common_feature_value.click_index()];
     select_value[CtrCommonPullValue::embed_w_index()] =
         value[common_feature_value.embed_w_index()];
     memcpy(select_value + CtrCommonPullValue::embedx_w_index(),
paddle/fluid/distributed/ps/table/ctr_accessor.h (+14 -3)

@@ -24,6 +24,7 @@
 namespace paddle {
 namespace distributed {

+// DownpourUnitAccessor
 class CtrCommonAccessor : public ValueAccessor {
  public:
   struct CtrCommonFeatureValue {

@@ -106,15 +107,25 @@ class CtrCommonAccessor : public ValueAccessor {
   struct CtrCommonPullValue {
     /*
    float show;
    float click;
    float embed_w;
    std::vector<float> embedx_w;
    */

-    static int dim(int embedx_dim) { return 1 + embedx_dim; }
+    static int dim(int embedx_dim) { return 3 + embedx_dim; }
     static int dim_size(size_t dim) { return sizeof(float); }
     static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); }
-    static int embed_w_index() { return 0; }
-    static int embedx_w_index() { return 1; }
+    static int show_index() { return 0; }
+    static int click_index() { return 1; }
+    static int embed_w_index() { return 2; }
+    static int embedx_w_index() { return 3; }
+    static float& show(float* val) {
+      return val[CtrCommonPullValue::show_index()];
+    }
+    static float& click(float* val) {
+      return val[CtrCommonPullValue::click_index()];
+    }
     static float& embed_w(float* val) {
       return val[CtrCommonPullValue::embed_w_index()];
     }
paddle/fluid/distributed/ps/table/depends/dense.h (+36 -14)

@@ -196,26 +196,19 @@ class DAdamD2Sum : public DenseOptimizer {
     for (int x = 0; x < static_cast<int>(names.size()); ++x) {
       if (names[x] == "LearningRate") {
         learning_rate = (*values)[x].data();
-      }
-      if (names[x] == "Param") {
+      } else if (names[x] == "Param") {
         param = (*values)[x].data();
-      }
-      if (names[x] == "Moment") {
+      } else if (names[x] == "Moment") {
         mom_velocity = (*values)[x].data();
-      }
-      if (names[x] == "G2Sum") {
+      } else if (names[x] == "G2Sum") {
         ada_g2sum = (*values)[x].data();
-      }
-      if (names[x] == "D2Sum") {
+      } else if (names[x] == "D2Sum") {
         ada_d2sum = (*values)[x].data();
-      }
-      if (names[x] == "MomentDecayRate") {
+      } else if (names[x] == "MomentDecayRate") {
         mom_decay_rate = (*values)[x].data();
-      }
-      if (names[x] == "AdaDecayRate") {
+      } else if (names[x] == "AdaDecayRate") {
         ada_decay_rate = (*values)[x].data();
-      }
-      if (names[x] == "AdaEpsilon") {
+      } else if (names[x] == "AdaEpsilon") {
         ada_epsilon = (*values)[x].data();
       }
     }

@@ -268,5 +261,34 @@ class DAdamD2Sum : public DenseOptimizer {
   float* ada_epsilon;
 };

+// for data_norm
+class DSummary : public DenseOptimizer {
+ public:
+  explicit DSummary(const CommonAccessorParameter& accessor,
+                    std::vector<std::vector<float>>* values) {
+    auto& names = accessor.params();
+    for (int x = 0; x < static_cast<int>(names.size()); ++x) {
+      if (names[x] == "Param") {
+        param = (*values)[x].data();
+      } else if (names[x] == "SummaryDecayRate") {
+        summary_decay_rate = (*values)[x].data();
+      }
+    }
+  }
+
+  void update(const float* update_values, size_t num, int begin,
+              int end) override {
+    auto update_numel = end - begin;
+    Eigen::Map<Eigen::MatrixXf> mat_w(param + begin, 1, update_numel);
+    Eigen::Map<const Eigen::MatrixXf> mat_grad(update_values + begin, 1,
+                                               update_numel);
+    mat_w = mat_w * summary_decay_rate_d + mat_grad;
+  }
+
+  float* summary_decay_rate;
+  double summary_decay_rate_d = 0.999999;
+  float* param;
+};
+
 }  // namespace distributed
 }  // namespace paddle
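For reference, the DSummary rule added above is a decayed running sum applied element-wise over the selected [begin, end) slice: w = w * decay + g. Below is a small standalone sketch of the same Eigen expression, assuming Eigen is available; the parameter and gradient values are made up for illustration and are not taken from the commit.

// Standalone illustration of the decayed-sum update used by DSummary.
#include <Eigen/Dense>
#include <cstdio>

int main() {
  const double summary_decay_rate_d = 0.999999;  // same constant as above
  float param[4] = {10.f, 20.f, 30.f, 40.f};     // "Param" slice (made up)
  float grad[4] = {1.f, 1.f, 1.f, 1.f};          // incoming update_values (made up)

  Eigen::Map<Eigen::MatrixXf> mat_w(param, 1, 4);
  Eigen::Map<const Eigen::MatrixXf> mat_grad(grad, 1, 4);
  // Same expression as DSummary::update, with an explicit cast of the decay.
  mat_w = mat_w * static_cast<float>(summary_decay_rate_d) + mat_grad;

  std::printf("%.4f %.4f %.4f %.4f\n", param[0], param[1], param[2], param[3]);
  return 0;
}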
paddle/fluid/distributed/ps/table/sparse_accessor.cc (new file, +339 -0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/distributed/ps/table/sparse_accessor.h"
#include <gflags/gflags.h>
#include "glog/logging.h"
#include "paddle/fluid/string/string_helper.h"

namespace paddle {
namespace distributed {

int SparseAccessor::initialize() {
  auto name = _config.embed_sgd_param().name();
  _embed_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name);
  _embed_sgd_rule->load_config(_config.embed_sgd_param(), 1);

  name = _config.embedx_sgd_param().name();
  _embedx_sgd_rule = CREATE_PSCORE_CLASS(SparseValueSGDRule, name);
  _embedx_sgd_rule->load_config(_config.embedx_sgd_param(),
                                _config.embedx_dim());

  sparse_feature_value.embed_sgd_dim = _embed_sgd_rule->dim();
  sparse_feature_value.embedx_dim = _config.embedx_dim();
  sparse_feature_value.embedx_sgd_dim = _embedx_sgd_rule->dim();
  _show_click_decay_rate = _config.ctr_accessor_param().show_click_decay_rate();

  return 0;
}

void SparseAccessor::GetTableInfo(AccessorInfo& info) {
  info.dim = dim();
  info.size = size();
  info.select_dim = select_dim();
  info.select_size = select_size();
  info.update_dim = update_dim();
  info.update_size = update_size();
  info.fea_dim = fea_dim();
}

size_t SparseAccessor::dim() { return sparse_feature_value.dim(); }

size_t SparseAccessor::dim_size(size_t dim) {
  auto embedx_dim = _config.embedx_dim();
  return sparse_feature_value.dim_size(dim, embedx_dim);
}

size_t SparseAccessor::size() { return sparse_feature_value.size(); }

size_t SparseAccessor::mf_size() {
  return (_config.embedx_dim() + sparse_feature_value.embedx_sgd_dim) *
         sizeof(float);  // embedx embedx_g2sum
}

// pull value
size_t SparseAccessor::select_dim() {
  auto embedx_dim = _config.embedx_dim();
  return 1 + embedx_dim;
}

size_t SparseAccessor::select_dim_size(size_t dim) { return sizeof(float); }

size_t SparseAccessor::select_size() { return select_dim() * sizeof(float); }

// push value
size_t SparseAccessor::update_dim() {
  auto embedx_dim = _config.embedx_dim();
  return 4 + embedx_dim;
}

size_t SparseAccessor::update_dim_size(size_t dim) { return sizeof(float); }

size_t SparseAccessor::update_size() { return update_dim() * sizeof(float); }

bool SparseAccessor::shrink(float* value) {
  auto base_threshold = _config.ctr_accessor_param().base_threshold();
  auto delta_threshold = _config.ctr_accessor_param().delta_threshold();
  auto delete_after_unseen_days =
      _config.ctr_accessor_param().delete_after_unseen_days();
  auto delete_threshold = _config.ctr_accessor_param().delete_threshold();

  // time_decay first
  sparse_feature_value.show(value) *= _show_click_decay_rate;
  sparse_feature_value.click(value) *= _show_click_decay_rate;

  // shrink after
  auto score = show_click_score(sparse_feature_value.show(value),
                                sparse_feature_value.click(value));
  auto unseen_days = sparse_feature_value.unseen_days(value);
  if (score < delete_threshold || unseen_days > delete_after_unseen_days) {
    return true;
  }
  return false;
}

bool SparseAccessor::save(float* value, int param) {
  auto base_threshold = _config.ctr_accessor_param().base_threshold();
  auto delta_threshold = _config.ctr_accessor_param().delta_threshold();
  auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days();
  if (param == 2) {
    delta_threshold = 0;
  }
  switch (param) {
    // save all
    case 0: {
      return true;
    }
    // save xbox delta
    case 1:
    // save xbox base
    case 2: {
      if (show_click_score(sparse_feature_value.show(value),
                           sparse_feature_value.click(value)) >=
              base_threshold &&
          sparse_feature_value.delta_score(value) >= delta_threshold &&
          sparse_feature_value.unseen_days(value) <= delta_keep_days) {
        // do this after save, because it must not be modified when retry
        if (param == 2) {
          sparse_feature_value.delta_score(value) = 0;
        }
        return true;
      } else {
        return false;
      }
    }
    // already decayed in shrink
    case 3: {
      // do this after save, because it must not be modified when retry
      // sparse_feature_value.unseen_days(value)++;
      return true;
    }
    // save revert batch_model
    case 5: {
      return true;
    }
    default:
      return true;
  }
}

void SparseAccessor::update_stat_after_save(float* value, int param) {
  auto base_threshold = _config.ctr_accessor_param().base_threshold();
  auto delta_threshold = _config.ctr_accessor_param().delta_threshold();
  auto delta_keep_days = _config.ctr_accessor_param().delta_keep_days();
  if (param == 2) {
    delta_threshold = 0;
  }
  switch (param) {
    case 1: {
      if (show_click_score(sparse_feature_value.show(value),
                           sparse_feature_value.click(value)) >=
              base_threshold &&
          sparse_feature_value.delta_score(value) >= delta_threshold &&
          sparse_feature_value.unseen_days(value) <= delta_keep_days) {
        sparse_feature_value.delta_score(value) = 0;
      }
    }
      return;
    case 3: {
      sparse_feature_value.unseen_days(value)++;
    }
      return;
    default:
      return;
  }
}

int32_t SparseAccessor::create(float** values, size_t num) {
  auto embedx_dim = _config.embedx_dim();
  for (size_t value_item = 0; value_item < num; ++value_item) {
    float* value = values[value_item];
    value[sparse_feature_value.unseen_days_index()] = 0;
    value[sparse_feature_value.delta_score_index()] = 0;
    value[sparse_feature_value.show_index()] = 0;
    value[sparse_feature_value.click_index()] = 0;
    value[sparse_feature_value.slot_index()] = -1;
    _embed_sgd_rule->init_value(
        value + sparse_feature_value.embed_w_index(),
        value + sparse_feature_value.embed_g2sum_index());
    _embedx_sgd_rule->init_value(
        value + sparse_feature_value.embedx_w_index(),
        value + sparse_feature_value.embedx_g2sum_index(), false);
  }
  return 0;
}

bool SparseAccessor::need_extend_mf(float* value) {
  float show = value[sparse_feature_value.show_index()];
  float click = value[sparse_feature_value.click_index()];
  float score = (show - click) * _config.ctr_accessor_param().nonclk_coeff() +
                click * _config.ctr_accessor_param().click_coeff();
  return score >= _config.embedx_threshold();
}

bool SparseAccessor::has_mf(size_t size) {
  return size > sparse_feature_value.embedx_g2sum_index();
}

// from SparseFeatureValue to SparsePullValue
int32_t SparseAccessor::select(float** select_values, const float** values,
                               size_t num) {
  auto embedx_dim = _config.embedx_dim();
  for (size_t value_item = 0; value_item < num; ++value_item) {
    float* select_value = select_values[value_item];
    const float* value = values[value_item];
    select_value[SparsePullValue::embed_w_index()] =
        value[sparse_feature_value.embed_w_index()];
    memcpy(select_value + SparsePullValue::embedx_w_index(),
           value + sparse_feature_value.embedx_w_index(),
           embedx_dim * sizeof(float));
  }
  return 0;
}

// from SparsePushValue to SparsePushValue
// first dim: item
// second dim: field num
int32_t SparseAccessor::merge(float** update_values,
                              const float** other_update_values, size_t num) {
  auto embedx_dim = _config.embedx_dim();
  size_t total_dim = SparsePushValue::dim(embedx_dim);
  for (size_t value_item = 0; value_item < num; ++value_item) {
    float* update_value = update_values[value_item];
    const float* other_update_value = other_update_values[value_item];
    for (auto i = 0u; i < total_dim; ++i) {
      if (i != SparsePushValue::slot_index()) {
        update_value[i] += other_update_value[i];
      }
    }
  }
  return 0;
}

// from SparsePushValue to SparseFeatureValue
// first dim: item
// second dim: field num
int32_t SparseAccessor::update(float** update_values, const float** push_values,
                               size_t num) {
  auto embedx_dim = _config.embedx_dim();
  for (size_t value_item = 0; value_item < num; ++value_item) {
    float* update_value = update_values[value_item];
    const float* push_value = push_values[value_item];
    float push_show = push_value[SparsePushValue::show_index()];
    float push_click = push_value[SparsePushValue::click_index()];
    float slot = push_value[SparsePushValue::slot_index()];
    update_value[sparse_feature_value.show_index()] += push_show;
    update_value[sparse_feature_value.click_index()] += push_click;
    update_value[sparse_feature_value.slot_index()] = slot;
    update_value[sparse_feature_value.delta_score_index()] +=
        (push_show - push_click) * _config.ctr_accessor_param().nonclk_coeff() +
        push_click * _config.ctr_accessor_param().click_coeff();
    update_value[sparse_feature_value.unseen_days_index()] = 0;
    _embed_sgd_rule->update_value(
        update_value + sparse_feature_value.embed_w_index(),
        update_value + sparse_feature_value.embed_g2sum_index(),
        push_value + SparsePushValue::embed_g_index());
    _embedx_sgd_rule->update_value(
        update_value + sparse_feature_value.embedx_w_index(),
        update_value + sparse_feature_value.embedx_g2sum_index(),
        push_value + SparsePushValue::embedx_g_index());
  }
  return 0;
}

bool SparseAccessor::create_value(int stage, const float* value) {
  // stage == 0, pull
  // stage == 1, push
  if (stage == 0) {
    return true;
  } else if (stage == 1) {
    // operation
    auto show = SparsePushValue::show(const_cast<float*>(value));
    auto click = SparsePushValue::click(const_cast<float*>(value));
    auto score = show_click_score(show, click);
    if (score <= 0) {
      return false;
    }
    if (score >= 1) {
      return true;
    }
    return local_uniform_real_distribution<float>()(local_random_engine()) <
           score;
  } else {
    return true;
  }
}

float SparseAccessor::show_click_score(float show, float click) {
  auto nonclk_coeff = _config.ctr_accessor_param().nonclk_coeff();
  auto click_coeff = _config.ctr_accessor_param().click_coeff();
  return (show - click) * nonclk_coeff + click * click_coeff;
}

std::string SparseAccessor::parse_to_string(const float* v, int param) {
  thread_local std::ostringstream os;
  os.clear();
  os.str("");
  os << v[0] << " " << v[1] << " " << v[2] << " " << v[3] << " " << v[4] << " "
     << v[5];
  for (int i = sparse_feature_value.embed_g2sum_index();
       i < sparse_feature_value.embedx_w_index(); i++) {
    os << " " << v[i];
  }
  auto show = sparse_feature_value.show(const_cast<float*>(v));
  auto click = sparse_feature_value.click(const_cast<float*>(v));
  auto score = show_click_score(show, click);
  if (score >= _config.embedx_threshold() &&
      param > sparse_feature_value.embedx_w_index()) {
    for (auto i = sparse_feature_value.embedx_w_index();
         i < sparse_feature_value.dim(); ++i) {
      os << " " << v[i];
    }
  }
  return os.str();
}

int SparseAccessor::parse_from_string(const std::string& str, float* value) {
  int embedx_dim = _config.embedx_dim();

  _embedx_sgd_rule->init_value(
      value + sparse_feature_value.embedx_w_index(),
      value + sparse_feature_value.embedx_g2sum_index());
  auto ret = paddle::string::str_to_float(str.data(), value);
  CHECK(ret >= 6) << "expect more than 6 real:" << ret;
  return ret;
}

}  // namespace distributed
}  // namespace paddle
paddle/fluid/distributed/ps/table/sparse_accessor.h (new file, +208 -0)

// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <stdint.h>
#include <stdio.h>
#include <vector>
#include "paddle/fluid/distributed/common/registerer.h"
#include "paddle/fluid/distributed/ps.pb.h"
#include "paddle/fluid/distributed/ps/table/accessor.h"
#include "paddle/fluid/distributed/ps/table/sparse_sgd_rule.h"

namespace paddle {
namespace distributed {

// no show click, for word2vec(DownpourSparseValueAccessor)
class SparseAccessor : public ValueAccessor {
 public:
  struct SparseFeatureValue {
    /*
    float slot;
    float unseen_days;
    float delta_score;
    float show;
    float click;
    float embed_w;
    std::vector<float> embed_g2sum;
    std::vector<float> embedx_w;
    std::<vector>float embedx_g2sum;
    */

    int dim() { return 6 + embed_sgd_dim + embedx_sgd_dim + embedx_dim; }
    int dim_size(size_t dim, int embedx_dim) { return sizeof(float); }
    int size() { return dim() * sizeof(float); }
    int slot_index() { return 0; }
    int unseen_days_index() { return slot_index() + 1; }
    int delta_score_index() { return unseen_days_index() + 1; }
    int show_index() { return delta_score_index() + 1; }
    int click_index() { return show_index() + 1; }
    int embed_w_index() { return click_index() + 1; }
    int embed_g2sum_index() { return embed_w_index() + 1; }
    int embedx_w_index() { return embed_g2sum_index() + embed_sgd_dim; }
    int embedx_g2sum_index() { return embedx_w_index() + embedx_dim; }

    float& unseen_days(float* val) { return val[unseen_days_index()]; }
    float& delta_score(float* val) { return val[delta_score_index()]; }
    float& show(float* val) { return val[show_index()]; }
    float& click(float* val) { return val[click_index()]; }
    float& slot(float* val) { return val[slot_index()]; }
    float& embed_w(float* val) { return val[embed_w_index()]; }
    float& embed_g2sum(float* val) { return val[embed_g2sum_index()]; }
    float& embedx_w(float* val) { return val[embedx_w_index()]; }
    float& embedx_g2sum(float* val) { return val[embedx_g2sum_index()]; }

    int embed_sgd_dim;
    int embedx_dim;
    int embedx_sgd_dim;
  };

  struct SparsePushValue {
    /*
    float slot;
    float show;
    float click;
    float embed_g;
    std::vector<float> embedx_g;
    */

    static int dim(int embedx_dim) { return 4 + embedx_dim; }
    static int dim_size(int dim, int embedx_dim) { return sizeof(float); }
    static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); }
    static int slot_index() { return 0; }
    static int show_index() { return SparsePushValue::slot_index() + 1; }
    static int click_index() { return SparsePushValue::show_index() + 1; }
    static int embed_g_index() { return SparsePushValue::click_index() + 1; }
    static int embedx_g_index() { return SparsePushValue::embed_g_index() + 1; }
    static float& slot(float* val) {
      return val[SparsePushValue::slot_index()];
    }
    static float& show(float* val) {
      return val[SparsePushValue::show_index()];
    }
    static float& click(float* val) {
      return val[SparsePushValue::click_index()];
    }
    static float& embed_g(float* val) {
      return val[SparsePushValue::embed_g_index()];
    }
    static float* embedx_g(float* val) {
      return val + SparsePushValue::embedx_g_index();
    }
  };

  struct SparsePullValue {
    /*
    float embed_w;
    std::vector<float> embedx_w;
    */

    static int dim(int embedx_dim) { return 1 + embedx_dim; }
    static int dim_size(size_t dim) { return sizeof(float); }
    static int size(int embedx_dim) { return dim(embedx_dim) * sizeof(float); }
    static int embed_w_index() { return 0; }
    static int embedx_w_index() { return 1; }
    static float& embed_w(float* val) {
      return val[SparsePullValue::embed_w_index()];
    }
    static float* embedx_w(float* val) {
      return val + SparsePullValue::embedx_w_index();
    }
  };

  SparseAccessor() {}
  virtual int initialize();
  virtual void GetTableInfo(AccessorInfo& info);
  virtual ~SparseAccessor() {}

  // value维度
  virtual size_t dim();
  // value各个维度的size
  virtual size_t dim_size(size_t dim);
  // value各维度相加总size
  virtual size_t size();
  // value中mf动态长度部分总size大小, sparse下生效
  virtual size_t mf_size();
  // pull value维度
  virtual size_t select_dim();
  // pull value各个维度的size
  virtual size_t select_dim_size(size_t dim);
  // pull value各维度相加总size
  virtual size_t select_size();
  // push value维度
  virtual size_t update_dim();
  // push value各个维度的size
  virtual size_t update_dim_size(size_t dim);
  // push value各维度相加总size
  virtual size_t update_size();
  // 判断该value是否进行shrink
  virtual bool shrink(float* value);
  // 判断该value是否保存到ssd
  // virtual bool save_ssd(float* value);
  virtual bool need_extend_mf(float* value);
  virtual bool has_mf(size_t size);
  // 判断该value是否在save阶段dump,
  // param作为参数用于标识save阶段,如downpour的xbox与batch_model
  // param = 0, save all feature
  // param = 1, save delta feature
  // param = 2, save xbox base feature
  bool save(float* value, int param) override;
  // update delta_score and unseen_days after save
  void update_stat_after_save(float* value, int param) override;
  // keys不存在时,为values生成随机值
  // 要求value的内存由外部调用者分配完毕
  virtual int32_t create(float** value, size_t num);
  // 从values中选取到select_values中
  virtual int32_t select(float** select_values, const float** values,
                         size_t num);
  // 将update_values聚合到一起
  virtual int32_t merge(float** update_values,
                        const float** other_update_values, size_t num);
  // 将update_values聚合到一起,通过it.next判定是否进入下一个key
  // virtual int32_t merge(float** update_values, iterator it);
  // 将update_values更新应用到values中
  virtual int32_t update(float** values, const float** update_values,
                         size_t num);

  std::string parse_to_string(const float* value, int param) override;
  int32_t parse_from_string(const std::string& str, float* v) override;
  virtual bool create_value(int type, const float* value);

  // 这个接口目前只用来取show
  float get_field(float* value, const std::string& name) override {
    // CHECK(name == "show");
    if (name == "show") {
      return sparse_feature_value.show(value);
    }
    return 0.0;
  }

 private:
  // float show_click_score(float show, float click);

  // SparseValueSGDRule* _embed_sgd_rule;
  // SparseValueSGDRule* _embedx_sgd_rule;
  // SparseFeatureValue sparse_feature_value;
  float _show_click_decay_rate;
  int32_t _ssd_unseenday_threshold;

 public:  // TODO(zhaocaibei123): it should be private, but we make it public
          // for unit test
  SparseFeatureValue sparse_feature_value;
  float show_click_score(float show, float click);
  SparseValueSGDRule* _embed_sgd_rule;
  SparseValueSGDRule* _embedx_sgd_rule;
};

}  // namespace distributed
}  // namespace paddle
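The SparseFeatureValue layout above is a flat float array whose field offsets are chained from slot_index() = 0. Below is a standalone sketch that reproduces the offset arithmetic for assumed dimensions (embed_sgd_dim = 1, embedx_dim = 8, embedx_sgd_dim = 8); these numbers are illustrative only and are not taken from the commit.

// Standalone illustration of the offset chain in SparseFeatureValue.
#include <cstdio>

int main() {
  const int embed_sgd_dim = 1, embedx_dim = 8, embedx_sgd_dim = 8;  // assumed

  const int slot_index = 0;
  const int unseen_days_index = slot_index + 1;
  const int delta_score_index = unseen_days_index + 1;
  const int show_index = delta_score_index + 1;
  const int click_index = show_index + 1;
  const int embed_w_index = click_index + 1;
  const int embed_g2sum_index = embed_w_index + 1;
  const int embedx_w_index = embed_g2sum_index + embed_sgd_dim;
  const int embedx_g2sum_index = embedx_w_index + embedx_dim;
  const int dim = 6 + embed_sgd_dim + embedx_sgd_dim + embedx_dim;

  std::printf("embedx_w at %d, embedx_g2sum at %d, total dim %d floats\n",
              embedx_w_index, embedx_g2sum_index, dim);
  return 0;
}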
paddle/fluid/distributed/ps/table/table.cc (+2 -0)

@@ -27,6 +27,7 @@
 #endif
 #include "paddle/fluid/distributed/ps/table/ctr_accessor.h"
 #include "paddle/fluid/distributed/ps/table/memory_sparse_table.h"
+#include "paddle/fluid/distributed/ps/table/sparse_accessor.h"
 #include "paddle/fluid/distributed/ps/table/tensor_accessor.h"
 #include "paddle/fluid/distributed/ps/table/tensor_table.h"

@@ -49,6 +50,7 @@ REGISTER_PSCORE_CLASS(Table, MemorySparseTable);
 REGISTER_PSCORE_CLASS(Table, MemorySparseGeoTable);
 REGISTER_PSCORE_CLASS(ValueAccessor, CommMergeAccessor);
 REGISTER_PSCORE_CLASS(ValueAccessor, CtrCommonAccessor);
+REGISTER_PSCORE_CLASS(ValueAccessor, SparseAccessor);
 REGISTER_PSCORE_CLASS(SparseValueSGDRule, StdAdaGradSGDRule);
 REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseAdamSGDRule);
 REGISTER_PSCORE_CLASS(SparseValueSGDRule, SparseNaiveSGDRule);
paddle/fluid/distributed/ps/wrapper/fleet.cc
浏览文件 @
5dcfb699
...
...
@@ -12,9 +12,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#include <google/protobuf/text_format.h>
#include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
#include "paddle/fluid/distributed/ps/table/table.h"
#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
namespace
paddle
{
namespace
distributed
{
...
...
@@ -29,6 +31,25 @@ std::shared_ptr<FleetWrapper> FleetWrapper::s_instance_ = NULL;
bool
FleetWrapper
::
is_initialized_
=
false
;
std
::
shared_ptr
<
paddle
::
distributed
::
PSCore
>
FleetWrapper
::
pserver_ptr_
=
NULL
;
std
::
shared_ptr
<
paddle
::
distributed
::
PSClient
>
FleetWrapper
::
worker_ptr_
=
NULL
;
int
FleetWrapper
::
RegisterHeterCallback
(
HeterCallBackFunc
handler
)
{
VLOG
(
0
)
<<
"RegisterHeterCallback support later"
;
return
0
;
}
int32_t
FleetWrapper
::
CopyTable
(
const
uint64_t
src_table_id
,
const
uint64_t
dest_table_id
)
{
VLOG
(
0
)
<<
"CopyTable support later"
;
return
0
;
}
int32_t
FleetWrapper
::
CopyTableByFeasign
(
const
uint64_t
src_table_id
,
const
uint64_t
dest_table_id
,
const
std
::
vector
<
uint64_t
>&
feasign_list
)
{
VLOG
(
0
)
<<
"CopyTableByFeasign support later"
;
return
0
;
}
void
FleetWrapper
::
Stop
()
{
StopServer
();
}
...
...
@@ -88,63 +109,59 @@ void FleetWrapper::InitServer(
}
}
// void FleetWrapper::InitWorker(
// const std::string& dist_desc, const std::vector<uint64_t>&
// host_sign_list, Scope* scope, const RpcCtxMap& send_ctx, const
// std::unordered_map<uint64_t, std::vector<std::string>>&
// dense_varnames,
// const std::map<std::string, std::string>& envs, int node_num, int index)
// {
// if (!is_initialized_) {
// VLOG(3) << "Going to init worker";
// Communicator::InitInstance<AsyncCommunicator>(
// send_ctx, dense_varnames, dist_desc, host_sign_list, scope, envs);
// pserver_ptr_ = std::shared_ptr<paddle::distributed::PSCore>(
// new paddle::distributed::PSCore());
// pserver_ptr_->init_worker(dist_desc, _regions,
// const_cast<uint64_t*>(host_sign_list.data()),
// node_num, index);
// is_initialized_ = true;
// } else {
// VLOG(3) << "Worker can be initialized only once";
// }
// }
void
FleetWrapper
::
InitWorker
(
const
std
::
string
&
dist_desc
,
const
std
::
vector
<
std
::
string
>&
host_sign_list
,
Scope
*
scope
,
const
RpcCtxMap
&
send_ctx
,
const
std
::
unordered_map
<
uint64_t
,
std
::
vector
<
std
::
string
>>&
dense_varnames
,
const
std
::
map
<
std
::
string
,
std
::
string
>&
envs
,
int
node_num
,
int
index
)
{
if
(
!
is_initialized_
)
{
VLOG
(
3
)
<<
"Going to init worker"
;
Communicator
::
InitInstance
<
AsyncCommunicator
>
(
send_ctx
,
dense_varnames
,
dist_desc
,
host_sign_list
,
scope
,
envs
);
void
FleetWrapper
::
InitGFlag
(
const
std
::
string
&
gflags
)
{
VLOG
(
3
)
<<
"Init With Gflags:"
<<
gflags
;
std
::
vector
<
std
::
string
>
flags
=
paddle
::
string
::
split_string
(
gflags
);
if
(
flags
.
size
()
<
1
)
{
flags
.
push_back
(
"-max_body_size=314217728"
);
flags
.
push_back
(
"-bthread_concurrency=40"
);
flags
.
push_back
(
"-socket_max_unwritten_bytes=2048000000"
);
flags
.
push_back
(
"-max_connection_pool_size=1950"
);
}
auto
it
=
flags
.
begin
();
flags
.
insert
(
it
,
"exe default"
);
char
*
flags_ptr
[
flags
.
size
()];
for
(
size_t
i
=
0
;
i
<
flags
.
size
();
++
i
)
{
flags_ptr
[
i
]
=
(
char
*
)(
flags
[
i
].
c_str
());
// NOLINT
}
int
params_cnt
=
flags
.
size
();
char
**
params_ptr
=
&
(
flags_ptr
[
0
]);
::
GFLAGS_NAMESPACE
::
ParseCommandLineFlags
(
&
params_cnt
,
&
params_ptr
,
true
);
}
pserver_ptr_
=
std
::
shared_ptr
<
paddle
::
distributed
::
PSCore
>
(
new
paddle
::
distributed
::
PSCore
());
pserver_ptr_
->
init_worker
(
dist_desc
,
_regions
,
&
host_sign_list
,
node_num
,
index
);
is_initialized_
=
true
;
void
FleetWrapper
::
InitWorker
(
const
std
::
string
&
dist_desc
,
const
std
::
vector
<
std
::
string
>&
host_sign_list
,
int
index
)
{
if
(
!
is_initialized_
)
{
// not used, just for psclient's init
// TODO(zhaocaibei123): remove this later
std
::
map
<
uint64_t
,
std
::
vector
<
paddle
::
distributed
::
Region
>>
dense_pull_regions
;
if
(
worker_ptr_
.
get
()
==
nullptr
)
{
paddle
::
distributed
::
PSParameter
ps_param
;
google
::
protobuf
::
TextFormat
::
ParseFromString
(
dist_desc
,
&
ps_param
);
InitGFlag
(
ps_param
.
init_gflags
());
int
servers
=
host_sign_list
.
size
();
ps_env_
.
set_ps_servers
(
&
host_sign_list
,
servers
);
worker_ptr_
=
std
::
shared_ptr
<
paddle
::
distributed
::
PSClient
>
(
paddle
::
distributed
::
PSClientFactory
::
create
(
ps_param
));
worker_ptr_
->
configure
(
ps_param
,
dense_pull_regions
,
ps_env_
,
index
);
}
}
else
{
VLOG
(
3
)
<<
"
Worker
can be initialized only once"
;
VLOG
(
3
)
<<
"
Client
can be initialized only once"
;
}
}
void
FleetWrapper
::
StopServer
()
{
VLOG
(
3
)
<<
"Going to stop server"
;
auto
*
communicator
=
Communicator
::
GetInstance
();
auto
status
=
communicator
->
_worker_ptr
->
stop_server
();
auto
status
=
worker_ptr_
->
stop_server
();
status
.
wait
();
}
void
FleetWrapper
::
FinalizeWorker
()
{
VLOG
(
3
)
<<
"Going to finalize worker"
;
pserv
er_ptr_
->
finalize_worker
();
work
er_ptr_
->
finalize_worker
();
}
void
FleetWrapper
::
BarrierWithTable
(
uint32_t
barrier_type
)
{
...
...
@@ -161,15 +178,21 @@ uint64_t FleetWrapper::RunServer(const std::string& ip, uint32_t port) {
std
::
vector
<
uint64_t
>
FleetWrapper
::
GetClientsInfo
()
{
VLOG
(
3
)
<<
"Going to get client info"
;
auto
*
communicator
=
Communicator
::
GetInstance
();
std
::
vector
<
uint64_t
>
res
=
communicator
->
GetClientInfo
();
std
::
vector
<
uint64_t
>
res
=
ps_env_
.
get_client_info
();
for
(
auto
rr
:
res
)
{
VLOG
(
2
)
<<
"FleetWrapper::GetClientInfo "
<<
rr
;
}
return
res
;
}
int
FleetWrapper
::
SetClients
(
std
::
vector
<
uint64_t
>&
host_sign_list
)
{
int
node
=
host_sign_list
.
size
();
return
ps_env_
.
set_ps_clients
(
host_sign_list
.
data
(),
node
);
}
void
FleetWrapper
::
CreateClient2ClientConnection
()
{
VLOG
(
1
)
<<
"Going to create client2client connection"
;
auto
*
communicator
=
Communicator
::
GetInstance
();
communicator
->
_worker_ptr
->
create_client2client_connection
(
worker_ptr_
->
create_client2client_connection
(
client2client_request_timeout_ms_
,
client2client_connect_timeout_ms_
,
client2client_max_retry_
);
}
...
...
@@ -314,10 +337,9 @@ void FleetWrapper::PullSparseToTensorSync(const uint64_t table_id, int fea_dim,
      pull_result_ptr.push_back(output_data + output_len);
    }
  }
  auto* communicator = Communicator::GetInstance();
  auto status = communicator->_worker_ptr->pull_sparse(
      pull_result_ptr.data(), table_id, fea_keys.data(), fea_keys.size(),
      is_training);
  auto status = worker_ptr_->pull_sparse(pull_result_ptr.data(), table_id,
                                         fea_keys.data(), fea_keys.size(),
                                         is_training);
  status.wait();
  auto ret = status.get();
  if (ret != 0) {
...
@@ -344,8 +366,7 @@ void FleetWrapper::PullDenseVarsAsync(
    paddle::distributed::Region reg(w, tensor->numel());
    regions[i] = std::move(reg);
  }
  auto status = pserver_ptr_->_worker_ptr->pull_dense(regions.data(),
                                                      regions.size(), tid);
  auto status = worker_ptr_->pull_dense(regions.data(), regions.size(), tid);
  pull_dense_status->push_back(std::move(status));
}
...
@@ -362,9 +383,7 @@ void FleetWrapper::PullDenseVarsSync(
    paddle::distributed::Region reg(w, tensor->numel());
    regions.emplace_back(std::move(reg));
  }
  auto* communicator = Communicator::GetInstance();
  auto status = communicator->_worker_ptr->pull_dense(regions.data(),
                                                      regions.size(), tid);
  auto status = worker_ptr_->pull_dense(regions.data(), regions.size(), tid);
  status.wait();
}
...
@@ -381,9 +400,8 @@ void FleetWrapper::PushDenseParamSync(
    paddle::distributed::Region reg(g, tensor->numel());
    regions.emplace_back(std::move(reg));
  }
  auto* communicator = Communicator::GetInstance();
  auto push_status = communicator->_worker_ptr->push_dense_param(
      regions.data(), regions.size(), table_id);
  auto push_status =
      worker_ptr_->push_dense_param(regions.data(), regions.size(), table_id);
  push_status.wait();
  auto status = push_status.get();
  CHECK(status == 0) << "push dense param failed, status[" << status << "]";
...
@@ -404,7 +422,24 @@ void FleetWrapper::PushDenseVarsAsync(
    Variable* var = scope.FindVar(t);
    CHECK(var != nullptr) << "var[" << t << "] not found";
    LoDTensor* tensor = var->GetMutable<LoDTensor>();
    int count = tensor->numel();
    float* g = tensor->mutable_data<float>(place);
    // TODO(zhaocaibei123): how to get batch_size in op?
    if (scale_datanorm >= 0) {
      if (t.find(".batch_size@GRAD") != std::string::npos ||
          t.find(".batch_sum@GRAD") != std::string::npos) {
        Eigen::Map<Eigen::MatrixXf> mat(g, 1, count);
        float scale = 1.0 / batch_size;
        mat *= scale;
      } else if (t.find(".batch_square_sum@GRAD") != std::string::npos) {
        VLOG(3) << "epsilon: " << scale_datanorm;
        for (int i = 0; i < count; ++i) {
          g[i] = (g[i] - batch_size * scale_datanorm) / batch_size +
                 batch_size * scale_datanorm;
        }
      }
    }
    paddle::distributed::Region reg(g, tensor->numel());
    regions.emplace_back(std::move(reg));
    VLOG(3) << "FleetWrapper::PushDenseVarsAsync Var " << t << " talbe_id "
...
@@ -412,12 +447,8 @@ void FleetWrapper::PushDenseVarsAsync(
            << g[tensor->numel() - 1];
  }
  auto* communicator =
      dynamic_cast<AsyncCommunicator*>(Communicator::GetInstance());
  auto push_status = communicator->_worker_ptr->push_dense(
      regions.data(), regions.size(), table_id);
  communicator->PushDensePostProcessing();
  auto push_status =
      worker_ptr_->push_dense(regions.data(), regions.size(), table_id);
}

void FleetWrapper::PushSparseVarsAsync(
...
@@ -463,7 +494,7 @@ void FleetWrapper::PushSparseFromTensorAsync(
    const uint64_t table_id, int fea_dim, uint64_t padding_id,
    platform::Place place, std::vector<const LoDTensor*>* inputs,
    const LoDTensor* shows, const LoDTensor* clks,
    std::vector<LoDTensor*>* outputs) {
    std::vector<LoDTensor*>* outputs, bool use_cvm_op) {
  int batch_size = -1;
  bool batch_size_consist = true;
  for (auto* input : *inputs) {
...
@@ -471,7 +502,7 @@ void FleetWrapper::PushSparseFromTensorAsync(
        input->lod().size() ? input->lod()[0].size() - 1 : input->dims()[0];
    if (batch_size == -1) {
      batch_size = cur_batch_size;
    } else {
    } else if (batch_size != cur_batch_size) {
      // CHECK(batch_size == cur_batch_size);  // NOLINT
      batch_size_consist = false;
      break;
...
@@ -511,8 +542,12 @@ void FleetWrapper::PushSparseFromTensorAsync(
    Eigen::Map<
        Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>>
        g_mat(g, g_tensor->numel() / fea_dim, fea_dim);
    if (use_cvm_op) {
      g_mat.rightCols(fea_dim - 2) *= batch_size;
    } else {
      g_mat.rightCols(fea_dim) *= batch_size;
    }
  }

  const framework::LoDTensor* tensor = inputs->at(index);
  const int64_t* ids = tensor->data<int64_t>();
...
@@ -528,19 +563,24 @@ void FleetWrapper::PushSparseFromTensorAsync(
          continue;
        }
        push_keys.emplace_back(real_id);
        if (use_cvm_op) {
          push_values.emplace_back(fea_dim + 1);
          push_values.back()[0] = 2;  // TODO(zhaocaibei123): slot
          float* data = push_values.back().data() + 1;
          memcpy(data, g + output_len, sizeof(float) * fea_dim);
        } else {
          push_values.emplace_back(fea_dim + 3);
          // slot show clk grad... consistent with CtrCommonPushValue defined in
          // slot show clk grad... consistent with CtrCommonPushValue defined
          // in
          // ctr_accessor.h
          push_values.back()[0] = 2;  // TODO(zhaocaibei123): slot
          push_values.back()[1] =
              (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
          push_values.back()[2] =
              (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
          float* data = push_values.back().data() + 3;
          memcpy(data, g + output_len, sizeof(float) * fea_dim);
        }
        ++input_idx;
      }
    }
...
@@ -551,6 +591,12 @@ void FleetWrapper::PushSparseFromTensorAsync(
        continue;
      }
      push_keys.emplace_back(real_id);
      if (use_cvm_op) {
        push_values.emplace_back(fea_dim + 1);
        push_values.back()[0] = 2;  // TODO(zhaocaibei123): slot
        float* data = push_values.back().data() + 1;
        memcpy(data, g + output_len, sizeof(float) * fea_dim);
      } else {
        push_values.emplace_back(fea_dim + 3);
        // slot show clk grad... consistent with CtrCommonPushValue defined in
        // ctr_accessor.h
...
@@ -559,11 +605,9 @@ void FleetWrapper::PushSparseFromTensorAsync(
            (i >= show_size ? 1 : static_cast<float>(show_tensor[i]));
        push_values.back()[2] =
            (i >= clk_size ? 0 : static_cast<float>(clk_tensor[i]));
        float* data = push_values.back().data() + 3;
        memcpy(data, g + output_len, sizeof(float) * fea_dim);
      }
      ++input_idx;
    }
  }
...
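For orientation, a self-contained sketch of the two push-record layouts the code above builds per feature id. The field interpretation follows the CtrCommonPushValue comment in the diff, and the slot value 2 is the same placeholder the diff itself uses; nothing here is Paddle code.

#include <cstring>
#include <vector>

// Build one push record for a single feature id, mirroring the two branches.
std::vector<float> MakePushRecord(const float* grad, int fea_dim, float show,
                                  float clk, bool use_cvm_op) {
  if (use_cvm_op) {
    // [slot, grad[0..fea_dim-1]]            -> fea_dim + 1 floats
    std::vector<float> rec(fea_dim + 1);
    rec[0] = 2;  // slot placeholder, as in the diff
    std::memcpy(rec.data() + 1, grad, sizeof(float) * fea_dim);
    return rec;
  }
  // [slot, show, clk, grad[0..fea_dim-1]]   -> fea_dim + 3 floats
  std::vector<float> rec(fea_dim + 3);
  rec[0] = 2;
  rec[1] = show;
  rec[2] = clk;
  std::memcpy(rec.data() + 3, grad, sizeof(float) * fea_dim);
  return rec;
}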
@@ -576,19 +620,13 @@ void FleetWrapper::PushSparseFromTensorAsync(
    push_g_vec[i] = push_values.at(i).data();
  }

  auto* communicator = Communicator::GetInstance();
  PADDLE_ENFORCE_EQ(
      communicator->Check(table_id), true,
      platform::errors::InvalidArgument(
          "can not find table: %s, please check your config", table_id));
  auto status = communicator->_worker_ptr->push_sparse(
      table_id, push_keys.data(), (const float**)push_g_vec.data(),
  auto status = worker_ptr_->push_sparse(table_id, push_keys.data(),
                                         (const float**)push_g_vec.data(),
                                         push_keys.size());
}

void FleetWrapper::LoadModel(const std::string& path, const int mode) {
  auto* communicator = Communicator::GetInstance();
  auto ret = communicator->_worker_ptr->load(path, std::to_string(mode));
  auto ret = worker_ptr_->load(path, std::to_string(mode));
  ret.wait();
  if (ret.get() != 0) {
    LOG(ERROR) << "load model from path:" << path << " failed";
...
@@ -597,11 +635,7 @@ void FleetWrapper::LoadModel(const std::string& path, const int mode) {
void FleetWrapper::LoadModelOneTable(const uint64_t table_id,
                                     const std::string& path, const int mode) {
  auto* communicator = Communicator::GetInstance();
  auto ret =
      communicator->_worker_ptr->load(table_id, path, std::to_string(mode));
  // auto ret =
  //     pserver_ptr_->_worker_ptr->load(table_id, path, std::to_string(mode));
  auto ret = worker_ptr_->load(table_id, path, std::to_string(mode));
  ret.wait();
  if (ret.get() != 0) {
    LOG(ERROR) << "load model of table id: " << table_id
...
@@ -610,8 +644,7 @@ void FleetWrapper::LoadModelOneTable(const uint64_t table_id,
}

void FleetWrapper::SaveModel(const std::string& path, const int mode) {
  auto* communicator = Communicator::GetInstance();
  auto ret = communicator->_worker_ptr->save(path, std::to_string(mode));
  auto ret = worker_ptr_->save(path, std::to_string(mode));
  ret.wait();
  int32_t feasign_cnt = ret.get();
  if (feasign_cnt == -1) {
...
@@ -621,9 +654,7 @@ void FleetWrapper::SaveModel(const std::string& path, const int mode) {
void FleetWrapper::SaveModelOneTable(const uint64_t table_id,
                                     const std::string& path, const int mode) {
  auto* communicator = Communicator::GetInstance();
  auto ret =
      communicator->_worker_ptr->save(table_id, path, std::to_string(mode));
  auto ret = worker_ptr_->save(table_id, path, std::to_string(mode));
  ret.wait();
  if (ret.get() != 0) {
    LOG(ERROR) << "save model of table id: " << table_id
...
@@ -633,8 +664,7 @@ void FleetWrapper::SaveModelOneTable(const uint64_t table_id,
void FleetWrapper::RecvAndSaveTable(const uint64_t table_id,
                                    const std::string& path) {
  auto* communicator = Communicator::GetInstance();
  auto ret = communicator->_worker_ptr->recv_and_save_table(table_id, path);
  auto ret = worker_ptr_->recv_and_save_table(table_id, path);
  if (ret != 0) {
    LOG(ERROR) << "save model of table id: " << table_id
               << ", to path: " << path << " failed";
...
@@ -642,8 +672,7 @@ void FleetWrapper::RecvAndSaveTable(const uint64_t table_id,
}

void FleetWrapper::PrintTableStat(const uint64_t table_id) {
  auto* communicator = Communicator::GetInstance();
  auto ret = communicator->_worker_ptr->print_table_stat(table_id);
  auto ret = worker_ptr_->print_table_stat(table_id);
  ret.wait();
  int32_t err_code = ret.get();
  if (err_code == -1) {
...
@@ -652,9 +681,7 @@ void FleetWrapper::PrintTableStat(const uint64_t table_id) {
}

void FleetWrapper::ShrinkSparseTable(int table_id, int threshold) {
  auto* communicator = Communicator::GetInstance();
  auto ret =
      communicator->_worker_ptr->shrink(table_id, std::to_string(threshold));
  auto ret = worker_ptr_->shrink(table_id, std::to_string(threshold));
  ret.wait();
  int32_t err_code = ret.get();
  if (err_code == -1) {
...
@@ -720,30 +747,31 @@ void FleetWrapper::ShrinkDenseTable(int table_id, Scope* scope,
}

void FleetWrapper::ClientFlush() {
  auto ret = pserver_ptr_->_worker_ptr->flush();
  if (worker_ptr_.get() == nullptr) {
    VLOG(0) << "worker_ptr null, do nothing";
    return;
  }
  auto ret = worker_ptr_->flush();
  ret.wait();
  int32_t err_code = ret.get();
  if (err_code == -1) {
    LOG(ERROR) << "Client Flush failed";
  }
}

int FleetWrapper::RegisterClientToClientMsgHandler(int msg_type,
                                                   MsgHandlerFunc handler) {
  VLOG(1) << "calling FleetWrapper::RegisterClientToClientMsgHandler";
  auto* communicator = Communicator::GetInstance();
  // for unittest which does not call fleet.init_worker() first
  if (communicator == nullptr) {
    VLOG(0) << "FleetWrapper::RegisterClientToClientMsgHandler communicator is "
               "null";
  if (worker_ptr_.get() == nullptr) {
    VLOG(0) << "FleetWrapper::Client is null";
    return -1;
  } else {
    return communicator->_worker_ptr->registe_client2client_msg_handler(
        msg_type, handler);
    return worker_ptr_->registe_client2client_msg_handler(msg_type, handler);
  }
}

std::future<int32_t> FleetWrapper::SendClientToClientMsg(
    int msg_type, int to_client_id, const std::string& msg) {
  auto* communicator = Communicator::GetInstance();
  return communicator->_worker_ptr->send_client2client_msg(msg_type,
                                                           to_client_id, msg);
  return worker_ptr_->send_client2client_msg(msg_type, to_client_id, msg);
}

std::default_random_engine& FleetWrapper::LocalRandomEngine() {
...
...
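Taken together, these hunks converge on one calling convention: each PSClient call on worker_ptr_ returns a std::future<int32_t> that the wrapper waits on and then inspects for a non-zero (or -1) status. A stand-alone illustration of that pattern, with std::async standing in for the PS client (the RPC itself is not reproduced here):

#include <future>
#include <iostream>

// Stand-in for an asynchronous PS client call such as worker_ptr_->save(...).
std::future<int32_t> fake_rpc_call() {
  return std::async(std::launch::async, [] { return int32_t{0}; });
}

int main() {
  auto ret = fake_rpc_call();
  ret.wait();                  // block until the RPC completes
  if (ret.get() != 0) {        // non-zero status signals failure
    std::cerr << "call failed\n";
    return 1;
  }
  return 0;
}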
paddle/fluid/distributed/ps/wrapper/fleet.h
...
...
@@ -71,11 +71,22 @@ class FleetWrapper : public PSWrapper {
  }
  virtual int32_t Initialize(InitContext& context) { return 0; }

  // TODO(zhaocaibei123: later)
  int32_t CopyTable(const uint64_t src_table_id, const uint64_t dest_table_id);
  int32_t CopyTableByFeasign(const uint64_t src_table_id,
                             const uint64_t dest_table_id,
                             const std::vector<uint64_t>& feasign_list);

  typedef std::function<void(int, int)> HeterCallBackFunc;
  int RegisterHeterCallback(HeterCallBackFunc handler);

  virtual void Stop() override;
  virtual void Load(WrapperContext& context) override;
  virtual void Save(WrapperContext& context) override;
  // set client to client communication config
  void SetClient2ClientConfig(int request_timeout_ms, int connect_timeout_ms,
                              int max_retry);
...
@@ -168,7 +179,8 @@ class FleetWrapper : public PSWrapper {
      std::vector<const LoDTensor*>* inputs, const LoDTensor* shows,
      const LoDTensor* clicks, std::vector<LoDTensor*>* outputs);
      std::vector<LoDTensor*>* outputs, bool use_cvm_op = false);
  // Push sparse variables to server in Async mode
  // Param<In>: scope, table_id, fea_keys, sparse_grad_names
  // Param<Out>: push_values, push_sparse_status
...
@@ -185,12 +197,7 @@ class FleetWrapper : public PSWrapper {
      const std::vector<framework::ProgramDesc>& server_sub_program = {});
  // init trainer
  void InitWorker(const std::string& dist_desc,
                  const std::vector<std::string>& host_sign_list, Scope* scope,
                  const RpcCtxMap& send_ctx,
                  const std::unordered_map<uint64_t, std::vector<std::string>>&
                      dense_varnames,
                  const std::map<std::string, std::string>& envs, int node_num,
                  int index);
                  const std::vector<std::string>& host_sign_list, int index);
  // stop server
  void StopServer();
...
@@ -200,6 +207,8 @@ class FleetWrapper : public PSWrapper {
  uint64_t RunServer(const std::string& ip, uint32_t port);
  // get client info
  std::vector<uint64_t> GetClientsInfo();
  // set client info
  int SetClients(std::vector<uint64_t>& host_sign_list);  // NOLINT
  // create client to client connection
  void CreateClient2ClientConnection();
  // flush all push requests
...
@@ -255,10 +264,15 @@ class FleetWrapper : public PSWrapper {
  // this performs better than rand_r, especially large data
  std::default_random_engine& LocalRandomEngine();
  // for init worker
  void InitGFlag(const std::string& gflags);

  static std::shared_ptr<paddle::distributed::PSCore> pserver_ptr_;
  static std::shared_ptr<paddle::distributed::PSClient> worker_ptr_;

 private:
  static std::shared_ptr<FleetWrapper> s_instance_;
  paddle::distributed::PaddlePSEnvironment ps_env_;
  size_t GetAbsoluteSum(size_t start, size_t end, size_t level,
                        const framework::LoD& lod);
...
...
paddle/fluid/distributed/test/memory_sparse_table_test.cc
...
...
@@ -74,7 +74,7 @@ TEST(MemorySparseTable, SGD) {
  std::vector<uint32_t> init_fres = {1, 1, 1, 1, 1};

  std::vector<float> init_values;
  init_values.resize(init_keys.size() * (emb_dim + 1));
  init_values.resize(init_keys.size() * (emb_dim + 3));

  auto value = PullSparseValue(init_keys, init_fres, emb_dim);
  table->pull_sparse(init_values.data(), value);
...
@@ -119,11 +119,11 @@ TEST(MemorySparseTable, SGD) {
  }

  std::vector<float> pull_values;
  pull_values.resize(init_keys.size() * (emb_dim + 1));
  pull_values.resize(init_keys.size() * (emb_dim + 3));
  table->pull_sparse(pull_values.data(), value);

  for (size_t i = 0; i < init_keys.size(); ++i) {
    for (size_t j = 0; j < emb_dim + 1; ++j) {
    for (size_t j = 2; j < emb_dim + 3; ++j) {
      auto update_val = init_values[i * (emb_dim + 1) + j] -
                        0.1 * total_gradients[3 + i * (emb_dim + 4) + j];
      VLOG(3) << total_gradients[i * (emb_dim + 4) + j + 3] << ":"
...
...
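The size change in this test tracks the value layout used elsewhere in the commit: each key now pulls emb_dim + 3 floats and pushes emb_dim + 4 floats, with the embedding/gradient portion starting after a few leading bookkeeping fields (offset 3 on the push side, per the test's `3 + i * (emb_dim + 4)` indexing). A stand-alone sketch of that buffer arithmetic; the exact meaning of the leading fields (show/click counters, slot) is an assumption drawn from the CtrCommonPushValue comments above, not from this test:

#include <cstddef>
#include <iostream>

int main() {
  const std::size_t emb_dim = 10;  // illustrative embedding width
  const std::size_t num_keys = 5;

  const std::size_t pull_stride = emb_dim + 3;  // per-key pulled floats
  const std::size_t push_stride = emb_dim + 4;  // per-key pushed floats

  std::cout << "pull buffer: " << num_keys * pull_stride << " floats\n";
  std::cout << "push buffer: " << num_keys * push_stride << " floats\n";

  // First gradient element of key i inside the push buffer, matching the
  // updated test's indexing.
  const std::size_t i = 2;
  std::cout << "grad of key " << i << " starts at index "
            << 3 + i * push_stride << "\n";
  return 0;
}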
paddle/fluid/eager/tests/task_tests/eager_utils_test.cc
...
...
@@ -24,6 +24,9 @@
#include "paddle/fluid/eager/utils.h"
#include "paddle/phi/api/lib/utils/allocator.h"
#include "paddle/phi/core/kernel_registry.h"
PD_DECLARE_KERNEL(full, CPU, ALL_LAYOUT);

namespace egr {
...
...
paddle/fluid/framework/CMakeLists.txt
...
...
@@ -235,7 +235,7 @@ if(WITH_PYTHON)
  py_proto_compile(trainer_py_proto SRCS trainer_desc.proto data_feed.proto)
  py_proto_compile(distributed_strategy_py_proto SRCS distributed_strategy.proto)
  py_proto_compile(pass_desc_py_proto SRCS pass_desc.proto)
  py_proto_compile(ps_py_proto SRCS ps.proto)
  py_proto_compile(ps_py_proto SRCS the_one_ps.proto)
  #Generate an empty \
  #__init__.py to make framework_py_proto as a valid python module.
  add_custom_target(fleet_proto_init ALL
...
@@ -249,7 +249,7 @@ if(WITH_PYTHON)
      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
      COMMAND cp *.py ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto/
      COMMAND cp distributed_strategy_*.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
      COMMAND cp ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
      COMMAND cp the_one_ps_pb2.py ${PADDLE_BINARY_DIR}/python/paddle/distributed/fleet/proto
      COMMENT "Copy generated python proto into directory paddle/fluid/proto."
      WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
    add_custom_target(fleet_executor_proto_init ALL DEPENDS fleet_proto_init fleet_executor_desc_py_proto
...
@@ -261,7 +261,7 @@ if(WITH_PYTHON)
    add_custom_command(TARGET framework_py_proto POST_BUILD
      COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_BINARY_DIR}/python/paddle/fluid/proto
      COMMAND copy /Y *.py ${proto_dstpath}
      COMMAND copy /Y ps_pb2.py ${fleet_proto_dstpath}
      COMMAND copy /Y the_one_ps_pb2.py ${fleet_proto_dstpath}
      COMMAND copy /Y distributed_strategy_*.py ${fleet_proto_dstpath}
      COMMENT "Copy generated python proto into directory paddle/fluid/proto."
      COMMENT "Copy generated python proto into directory paddle/distributed/fleet/proto."
...
@@ -314,7 +314,7 @@ if(WITH_DISTRIBUTE)
      dist_multi_trainer.cc trainer_factory.cc trainer.cc data_feed_factory.cc
      heterxpu_trainer.cc heter_pipeline_trainer.cc
      data_feed.cc device_worker.cc hogwild_worker.cc hetercpu_worker.cc
      downpour_worker.cc downpour_worker_opt.cc
      downpour_worker.cc downpour_lite_worker.cc downpour_worker_opt.cc
      pull_dense_worker.cc section_worker.cc heter_section_worker.cc device_worker_factory.cc data_set.cc DEPS op_registry
      device_context scope framework_proto data_feed_proto heter_service_proto trainer_desc_proto glog
      index_sampler index_wrapper sampler index_dataset_proto
...
@@ -329,6 +329,7 @@ if(WITH_DISTRIBUTE)
    set_source_files_properties(device_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    set_source_files_properties(multi_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    set_source_files_properties(hogwild_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    set_source_files_properties(downpour_lite_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    set_source_files_properties(heter_section_worker.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
    set_source_files_properties(heter_pipeline_trainer.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
  else()
...
...
paddle/fluid/framework/device_worker.h
...
...
@@ -27,6 +27,10 @@ limitations under the License. */
#include <utility>  // NOLINT
#include <vector>

#if defined(PADDLE_WITH_PSCORE)
#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#endif

#include "paddle/fluid/framework/data_feed.h"
#include "paddle/fluid/framework/executor_gc_helper.h"
#include "paddle/fluid/framework/heter_util.h"
...
@@ -107,7 +111,12 @@ class PullDenseWorker {
  bool CheckUpdateParam(uint64_t table_id);

 private:
#if defined(PADDLE_WITH_PSCORE)
  std::shared_ptr<paddle::distributed::FleetWrapper> fleet_ptr_;
#else
  std::shared_ptr<paddle::framework::FleetWrapper> fleet_ptr_;
#endif

  PullDenseWorkerParameter param_;
  DownpourWorkerParameter dwp_param_;
  Scope* root_scope_;
...
@@ -341,6 +350,79 @@ class DownpourWorker : public HogwildWorker {
  // std::vector<std::pair<uint64_t, uint64_t>> copy_dense_tables_;
};

// Based on DownpourWorker,remove push pull code into operator
#if defined(PADDLE_WITH_PSCORE)
class DownpourLiteWorker : public HogwildWorker {
 public:
  DownpourLiteWorker() {}
  virtual ~DownpourLiteWorker() {}
  virtual void Initialize(const TrainerDesc& desc);
  virtual void TrainFiles();
  virtual void TrainFilesWithProfiler();

 protected:
  std::shared_ptr<paddle::distributed::FleetWrapper> fleet_ptr_;
  std::shared_ptr<paddle::framework::PullDenseWorker> pull_dense_worker_;
  void PushGradients();
  void CopySparseTable();
  void CopyDenseTable();
  void CopyDenseVars();

  DownpourWorkerParameter param_;
  // copy table
  CopyTableConfig copy_table_config_;
  std::vector<std::pair<uint64_t, uint64_t>> copy_sparse_tables_;
  std::unordered_map<uint64_t, std::unordered_set<uint64_t>> feasign_set_;
  // actually pushed feasign of each table
  std::map<uint64_t, std::vector<uint64_t>> sparse_push_keys_;
  std::map<uint64_t, std::vector<std::string>> sparse_key_names_;
  // feasign
  std::map<uint64_t, std::vector<uint64_t>> features_;
  // feasign embedding
  std::map<uint64_t, std::vector<std::vector<float>>> feature_values_;
  std::map<uint64_t, std::vector<std::string>> sparse_value_names_;
  // adjust ins weight
  AdjustInsWeightConfig adjust_ins_weight_config_;
  // check nan and inf during training
  std::vector<std::string> check_nan_var_names_;
  bool need_to_push_sparse_;
  // feasign stats
  std::map<uint64_t, std::vector<float>> feature_labels_;
  std::map<uint64_t, std::vector<std::string>> sparse_grad_names_;
  // feasign embedding gradient
  std::map<uint64_t, std::vector<std::vector<float>>> feature_grads_;
  std::vector<::std::future<int32_t>> push_sparse_status_;
  bool dump_slot_;
  bool need_to_push_dense_;
  std::map<uint64_t, std::vector<std::string>> dense_grad_names_;
  float scale_datanorm_;
  std::vector<::std::future<int32_t>> push_dense_status_;
  // skipped ops
  std::vector<std::string> skip_ops_;
  // just save the value in param_ for easy access
  std::map<uint64_t, std::string> label_var_name_;
  std::map<uint64_t, std::vector<std::string>> dense_value_names_;
  std::map<uint64_t, uint64_t> table_dependency_;
  std::vector<std::pair<uint64_t, uint64_t>> copy_dense_tables_;
  // multitask
  std::map<int32_t, uint64_t> cond2table_map_;
  std::set<uint64_t> condvalue_set_;
  bool flag_partial_push_;

 private:
  // std::vector<std::string> dump_param_;
  // just save the value in param_ for easy access
  // std::map<uint64_t, std::string> label_var_name_;
  // std::map<uint64_t, std::vector<std::string>> dense_value_names_;

  std::shared_ptr<PullDenseWorker> _pull_dense_worker;

  std::vector<float> nid_show_;
  // std::map<uint64_t, uint64_t> table_dependency_;
  // std::vector<std::pair<uint64_t, uint64_t>> copy_dense_tables_;
};
#endif

class DownpourWorkerOpt : public DownpourWorker {
 public:
  DownpourWorkerOpt() {}
...
...
paddle/fluid/framework/device_worker_factory.cc
...
...
@@ -67,6 +67,7 @@ REGISTER_DEVICE_WORKER_CLASS(DownpourWorker);
REGISTER_DEVICE_WORKER_CLASS(DownpourWorkerOpt);
#if defined(PADDLE_WITH_PSCORE)
REGISTER_DEVICE_WORKER_CLASS(DownpourLiteWorker);
REGISTER_DEVICE_WORKER_CLASS(HeterSectionWorker);
#endif
...
...
paddle/fluid/framework/dist_multi_trainer.cc
...
...
@@ -12,6 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#if defined(PADDLE_WITH_PSCORE)
#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#endif

#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/device_worker_factory.h"
#include "paddle/fluid/framework/trainer.h"
...
@@ -62,7 +66,11 @@ void DistMultiTrainer::Initialize(const TrainerDesc &trainer_desc,
}

void DistMultiTrainer::RegisterHeterCallback() {
#ifdef PADDLE_WITH_PSCORE
  auto fleet_ptr = paddle::distributed::FleetWrapper::GetInstance();
#else
  auto fleet_ptr = FleetWrapper::GetInstance();
#endif
  fleet_ptr->RegisterHeterCallback(
      [this](int worker, int taskid) { workers_[worker]->Schedule(taskid); });
}
...
@@ -93,7 +101,7 @@ void DistMultiTrainer::InitTrainerEnv(const ProgramDesc &main_program,
    workers_[i]->SetRootScope(root_scope_);
    workers_[i]->CreateDeviceResource(main_program);  // Program
    workers_[i]->BindingDataFeedMemory();
#ifdef PADDLE_WITH_PSLIB
#if defined(PADDLE_WITH_PSLIB) || defined(PADDLE_WITH_PSCORE)
    workers_[i]->CacheProgram(main_program);
#endif
  }
...
@@ -110,7 +118,7 @@ void DistMultiTrainer::InitOtherEnv(const ProgramDesc &main_program) {
  }
  pull_dense_worker_->SetRootScope(root_scope_);
  pull_dense_worker_->Start();
#ifdef PADDLE_WITH_PSLIB
#if defined(PADDLE_WITH_PSLIB) || defined(PADDLE_WITH_PSCORE)
  for (int i = 0; i < thread_num_; ++i) {
    workers_[i]->GetXpuOpIndex();
  }
...
@@ -176,8 +184,12 @@ void DistMultiTrainer::Finalize() {
  pull_dense_worker_->Stop();
  root_scope_->DropKids();

  // flush local client push queue
#ifdef PADDLE_WITH_PSCORE
  auto fleet_ptr_ = paddle::distributed::FleetWrapper::GetInstance();
#else
  auto fleet_ptr_ = FleetWrapper::GetInstance();
#endif
  fleet_ptr_->ClientFlush();
}
...
...
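These #ifdef blocks, together with the matching ones in device_worker.h and pull_dense_worker.cc, all make the same compile-time choice of which FleetWrapper singleton to use. A stand-alone sketch of that selection pattern, with dummy types in place of the two real Paddle wrappers (names and bodies here are illustrative only):

#include <iostream>
#include <memory>

// Dummy stand-ins for the two wrappers the real code chooses between.
struct DistributedFleetWrapper { void ClientFlush() { std::cout << "pscore flush\n"; } };
struct FrameworkFleetWrapper   { void ClientFlush() { std::cout << "pslib flush\n"; } };

// Compile-time selection mirroring the PADDLE_WITH_PSCORE hunks above.
#ifdef PADDLE_WITH_PSCORE
using SelectedFleet = DistributedFleetWrapper;  // paddle::distributed::FleetWrapper
#else
using SelectedFleet = FrameworkFleetWrapper;    // paddle::framework::FleetWrapper
#endif

int main() {
  auto fleet_ptr = std::make_shared<SelectedFleet>();  // ~ FleetWrapper::GetInstance()
  fleet_ptr->ClientFlush();  // flush the local client push queue, as in Finalize()
  return 0;
}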
paddle/fluid/framework/downpour_lite_worker.cc
0 → 100644 (new file; its diff is collapsed in this view)
paddle/fluid/framework/fleet/metrics.cc
...
...
@@ -19,7 +19,7 @@
#include <numeric>
#include "paddle/fluid/framework/lod_tensor.h"
#if defined(PADDLE_WITH_PSLIB)
#if defined(PADDLE_WITH_PSLIB) || defined(PADDLE_WITH_PSCORE)
namespace paddle {
namespace framework {
...
...
paddle/fluid/framework/fleet/metrics.h
...
...
@@ -38,7 +38,7 @@ limitations under the License. */
#include "paddle/fluid/framework/fleet/gloo_wrapper.h"
#endif
#if defined(PADDLE_WITH_PSLIB)
#if defined(PADDLE_WITH_PSLIB) || defined(PADDLE_WITH_PSCORE)
namespace paddle {
namespace framework {
...
...
paddle/fluid/framework/pull_dense_worker.cc
...
...
@@ -61,7 +61,13 @@ void PullDenseWorker::Initialize(const TrainerDesc& param) {
    last_versions_[tid] = 0;
    current_version_[tid] = 0;
  }

#if defined(PADDLE_WITH_PSCORE)
  fleet_ptr_ = paddle::distributed::FleetWrapper::GetInstance();
#else
  fleet_ptr_ = FleetWrapper::GetInstance();
#endif

#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
  copy_streams_.clear();
#endif
...
@@ -170,6 +176,9 @@ void PullDenseWorker::PullDense(bool force_update) {
    VLOG(3) << "pull dense " << force_update << " " << tid;
    fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
                                   &pull_dense_status_, false);
#elif defined(PADDLE_WITH_PSCORE)
    fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
                                   &pull_dense_status_, true);
#else
    fleet_ptr_->PullDenseVarsAsync(*root_scope_, tid, dense_value_names_[tid],
                                   &pull_dense_status_, true);
...
...
paddle/fluid/framework/ps.proto → paddle/fluid/framework/the_one_ps.proto
(file moved)
paddle/fluid/operators/abs_op.cc
...
...
@@ -30,6 +30,21 @@ namespace operators {
class AbsOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto input_data_type =
        framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
#ifdef PADDLE_WITH_MKLDNN
    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
                                     framework::DataLayout::kMKLDNN,
                                     framework::LibraryType::kMKLDNN);
    }
#endif
    return framework::OpKernelType(input_data_type, ctx.GetPlace());
  }
};

class AbsOpMaker : public framework::OpProtoAndCheckerMaker {
...
@@ -72,8 +87,17 @@ class AbsGradOp : public framework::OperatorWithKernel {
 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto dtype = OperatorWithKernel::IndicateVarDataType(ctx, "X");
    return framework::OpKernelType(dtype, ctx.GetPlace());
    auto input_data_type =
        framework::OperatorWithKernel::IndicateVarDataType(ctx, "X");
#ifdef PADDLE_WITH_MKLDNN
    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
                                     framework::DataLayout::kMKLDNN,
                                     framework::LibraryType::kMKLDNN);
    }
#endif
    return framework::OpKernelType(input_data_type, ctx.GetPlace());
  }
};
...
...
paddle/fluid/operators/conv_op_npu.cc
...
...
@@ -390,6 +390,204 @@ class NPUConvGradOpKernel : public framework::OpKernel<T> {
    }
  }
};

template <typename T>
class NPUConv3dKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const Tensor* input = ctx.Input<Tensor>("Input");
    const Tensor* filter = ctx.Input<Tensor>("Filter");
    Tensor* output = ctx.Output<Tensor>("Output");

    const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
    int groups = ctx.Attr<int>("groups");
    const std::string padding_algorithm =
        ctx.Attr<std::string>("padding_algorithm");
    const std::string data_format = ctx.Attr<std::string>("data_format");

    PADDLE_ENFORCE_EQ(data_format, "NCDHW",
                      platform::errors::Unimplemented(
                          "the data_format must be NCDHW in "
                          "the npu kernel of conv3d, but got data_format "
                          "= [%s]",
                          data_format));
    PADDLE_ENFORCE_EQ(groups, 1,
                      platform::errors::Unimplemented(
                          "the groups must be 1 in "
                          "the npu kernel of conv3d, but got groups "
                          "= [%d]",
                          groups));

    output->mutable_data<T>(ctx.GetPlace());

    auto& dev_ctx = ctx.template device_context<NPUDeviceContext>();
    auto input_tensor =
        ctx.AllocateTmpTensor<T, NPUDeviceContext>(input->dims(), dev_ctx);
    auto filter_tensor =
        ctx.AllocateTmpTensor<T, NPUDeviceContext>(filter->dims(), dev_ctx);
    auto output_tensor =
        ctx.AllocateTmpTensor<T, NPUDeviceContext>(output->dims(), dev_ctx);

    input_tensor.ShareDataWith(*input);
    filter_tensor.ShareDataWith(*filter);
    output_tensor.ShareDataWith(*output);

    input_tensor.set_layout(DataLayout::kNCDHW);
    filter_tensor.set_layout(DataLayout::kNCDHW);
    output_tensor.set_layout(DataLayout::kNCDHW);

    // update padding and dilation
    auto in_dims = input->dims();
    auto filter_dims = filter->dims();
    framework::DDim in_data_dims;
    framework::DDim filter_data_dims;

    in_data_dims = phi::slice_ddim(in_dims, 2, in_dims.size());
    filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());

    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
                             in_data_dims, strides, ksize);

    std::vector<int> strides_vec(5, 1);
    std::vector<int> dilations_vec(5, 1);

    strides_vec[2] = strides[0];
    strides_vec[3] = strides[1];
    strides_vec[4] = strides[2];
    dilations_vec[2] = dilations[0];
    dilations_vec[3] = dilations[1];
    dilations_vec[4] = dilations[2];

    auto stream = ctx.template device_context<NPUDeviceContext>().stream();
    const auto& runner =
        NpuOpRunner("Conv3D", {input_tensor, filter_tensor}, {output_tensor},
                    {{"strides", strides_vec},
                     {"pads", paddings},
                     {"dilations", dilations_vec},
                     {"groups", groups},
                     {"data_format", data_format}});
    runner.Run(stream);
  }
};

template <typename T>
class NPUConv3dGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const Tensor* input = ctx.Input<Tensor>("Input");
    const Tensor* filter = ctx.Input<Tensor>("Filter");
    const Tensor* output_grad =
        ctx.Input<Tensor>(framework::GradVarName("Output"));
    Tensor* input_grad = ctx.Output<Tensor>(framework::GradVarName("Input"));
    Tensor* filter_grad = ctx.Output<Tensor>(framework::GradVarName("Filter"));

    const std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
    std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
    std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
    int groups = ctx.Attr<int>("groups");
    const std::string padding_algorithm =
        ctx.Attr<std::string>("padding_algorithm");
    const std::string data_format = ctx.Attr<std::string>("data_format");

    PADDLE_ENFORCE_EQ(data_format, "NCDHW",
                      platform::errors::Unimplemented(
                          "the data_format must be NCDHW in "
                          "the npu kernel of conv3d, but got data_format "
                          "= [%s]",
                          data_format));
    PADDLE_ENFORCE_EQ(groups, 1,
                      platform::errors::Unimplemented(
                          "the groups must be 1 in "
                          "the npu kernel of conv3d, but got groups "
                          "= [%d]",
                          groups));

    auto& dev_ctx = ctx.template device_context<NPUDeviceContext>();
    auto input_tensor =
        ctx.AllocateTmpTensor<T, NPUDeviceContext>(input->dims(), dev_ctx);
    auto filter_tensor =
        ctx.AllocateTmpTensor<T, NPUDeviceContext>(filter->dims(), dev_ctx);
    auto output_grad_tensor = ctx.AllocateTmpTensor<T, NPUDeviceContext>(
        output_grad->dims(), dev_ctx);

    input_tensor.ShareDataWith(*input);
    filter_tensor.ShareDataWith(*filter);
    output_grad_tensor.ShareDataWith(*output_grad);

    input_tensor.set_layout(DataLayout::kNCDHW);
    filter_tensor.set_layout(DataLayout::kNCDHW);
    output_grad_tensor.set_layout(DataLayout::kNCDHW);

    // update padding and dilation
    auto in_dims = input->dims();
    auto filter_dims = filter->dims();
    framework::DDim in_data_dims;
    framework::DDim filter_data_dims;

    in_data_dims = phi::slice_ddim(in_dims, 1, in_dims.size() - 1);
    filter_data_dims = phi::slice_ddim(filter_dims, 2, in_dims.size());

    std::vector<int> ksize = phi::vectorize<int>(filter_data_dims);
    UpdatePaddingAndDilation(&paddings, &dilations, padding_algorithm,
                             in_data_dims, strides, ksize);

    std::vector<int> strides_vec(5, 1);
    std::vector<int> dilations_vec(5, 1);

    strides_vec[2] = strides[0];
    strides_vec[3] = strides[1];
    strides_vec[4] = strides[2];
    dilations_vec[2] = dilations[0];
    dilations_vec[3] = dilations[1];
    dilations_vec[4] = dilations[2];

    auto stream = ctx.template device_context<NPUDeviceContext>().stream();

    if (filter_grad) {
      filter_grad->mutable_data<T>(ctx.GetPlace());
      std::vector<int> filter_shape_vec = phi::vectorize<int>(filter->dims());

      Tensor filter_grad_tensor = ctx.AllocateTmpTensor<T, NPUDeviceContext>(
          filter_grad->dims(), dev_ctx);
      filter_grad_tensor.ShareDataWith(*filter_grad);
      filter_grad_tensor.set_layout(DataLayout::kNCDHW);

      const auto& runner = NpuOpRunner(
          "Conv3DBackpropFilterD", {input_tensor, output_grad_tensor},
          {filter_grad_tensor}, {{"filter_size", filter_shape_vec},
                                 {"strides", strides_vec},
                                 {"pads", paddings},
                                 {"dilations", dilations_vec},
                                 {"groups", groups},
                                 {"data_format", data_format}});
      runner.Run(stream);
    }

    if (input_grad) {
      input_grad->mutable_data<T>(ctx.GetPlace());
      std::vector<int> input_shape_vec = phi::vectorize<int>(input->dims());

      Tensor input_grad_tensor = ctx.AllocateTmpTensor<T, NPUDeviceContext>(
          input_grad->dims(), dev_ctx);
      input_grad_tensor.ShareDataWith(*input_grad);
      input_grad_tensor.set_layout(DataLayout::kNCDHW);

      const auto& runner = NpuOpRunner(
          "Conv3DBackpropInputD", {filter_tensor, output_grad_tensor},
          {input_grad_tensor}, {{"input_size", input_shape_vec},
                                {"strides", strides_vec},
                                {"pads", paddings},
                                {"dilations", dilations_vec},
                                {"groups", groups},
                                {"data_format", data_format}});
      runner.Run(stream);
    }
  }
};

}  // namespace operators
}  // namespace paddle
...
@@ -408,3 +606,9 @@ REGISTER_OP_NPU_KERNEL(conv2d, ops::NPUConvOpKernel<float>,
REGISTER_OP_NPU_KERNEL(conv2d_grad, ops::NPUConvGradOpKernel<float>,
                       ops::NPUConvGradOpKernel<plat::float16>);

REGISTER_OP_NPU_KERNEL(conv3d, ops::NPUConv3dKernel<float>,
                       ops::NPUConv3dKernel<plat::float16>);

REGISTER_OP_NPU_KERNEL(conv3d_grad, ops::NPUConv3dGradKernel<float>,
                       ops::NPUConv3dGradKernel<plat::float16>);
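The only non-obvious bookkeeping in these kernels is expanding the three spatial strides/dilations into the five-element NCDHW form the NPU runner expects, with the batch and channel positions fixed at 1. A stand-alone sketch of that mapping (the function name is illustrative):

#include <array>
#include <cstdio>
#include <vector>

// Expand 3 spatial values (D, H, W) into NCDHW order with N and C set to 1,
// mirroring the strides_vec / dilations_vec construction above.
std::array<int, 5> ToNCDHW(const std::vector<int>& spatial) {
  std::array<int, 5> full = {1, 1, 1, 1, 1};
  full[2] = spatial[0];  // depth
  full[3] = spatial[1];  // height
  full[4] = spatial[2];  // width
  return full;
}

int main() {
  auto strides = ToNCDHW({2, 2, 2});
  std::printf("%d %d %d %d %d\n", strides[0], strides[1], strides[2],
              strides[3], strides[4]);  // prints: 1 1 2 2 2
  return 0;
}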
paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc
...
...
@@ -316,14 +316,6 @@ using ExpMKLDNNGradUseOutFunctor = MKLDNNActivationGradUseOutFunc<
namespace ops = paddle::operators;

#define REGISTER_ACTIVATION_MKLDNN_KERNEL(act_type, functor, grad_functor)  \
  REGISTER_OP_KERNEL(act_type, MKLDNN, ::paddle::platform::CPUPlace,        \
                     ops::MKLDNNActivationKernel<ops::functor<float>>);     \
  REGISTER_OP_KERNEL(                                                       \
      act_type##_grad, MKLDNN, ::paddle::platform::CPUPlace,                \
      ops::MKLDNNActivationGradKernel<ops::grad_functor<float>>);

#define REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(act_type, functor,           \
                                               grad_functor)               \
  REGISTER_OP_KERNEL(                                                       \
      act_type, MKLDNN, ::paddle::platform::CPUPlace,                       \
      ops::MKLDNNActivationKernel<ops::functor<float>>,                     \
...
@@ -339,30 +331,27 @@ namespace ops = paddle::operators;
      ops::MKLDNNActivationKernel<ops::functor<float>>);

#define FOR_EACH_MKLDNN_KERNEL_FUNCTOR(__macro)                             \
  __macro(relu6, Relu6MKLDNNFunctor, Relu6MKLDNNGradFunctor);               \
  __macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor);            \
  __macro(swish, SwishMKLDNNFunctor, SwishMKLDNNGradFunctor);               \
  __macro(hard_swish, HardSwishMKLDNNFunctor, HardSwishMKLDNNGradFunctor);  \
  __macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradUseOutFunctor);            \
  __macro(abs, AbsMKLDNNFunctor, AbsMKLDNNGradFunctor);                     \
  __macro(elu, EluMKLDNNFunctor, EluMKLDNNGradUseOutFunctor);               \
  __macro(exp, ExpMKLDNNFunctor, ExpMKLDNNGradUseOutFunctor);
  __macro(exp, ExpMKLDNNFunctor, ExpMKLDNNGradUseOutFunctor);               \
  __macro(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor);                  \
  __macro(hard_swish, HardSwishMKLDNNFunctor, HardSwishMKLDNNGradFunctor);  \
  __macro(leaky_relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor);            \
  __macro(mish, MishMKLDNNFunctor, MishMKLDNNGradFunctor);                  \
  __macro(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor);                  \
  __macro(relu6, Relu6MKLDNNFunctor, Relu6MKLDNNGradFunctor);               \
  __macro(sigmoid, SigmoidMKLDNNFunctor, SigmoidMKLDNNGradUseOutFunctor);   \
  __macro(sqrt, SqrtMKLDNNFunctor, SqrtMKLDNNGradUseOutFunctor);            \
  __macro(swish, SwishMKLDNNFunctor, SwishMKLDNNGradFunctor);               \
  __macro(tanh, TanhMKLDNNFunctor, TanhMKLDNNGradUseOutFunctor);

FOR_EACH_MKLDNN_KERNEL_FUNCTOR(REGISTER_ACTIVATION_MKLDNN_KERNEL);
REGISTER_ACTIVATION_MKLDNN_KERNEL_FWD_ONLY(round, RoundMKLDNNFunctor);

REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(relu, ReluMKLDNNFunctor, ReluMKLDNNGradFunctor);
REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(gelu, GeluMKLDNNFunctor, GeluMKLDNNGradFunctor);
REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(sigmoid, SigmoidMKLDNNFunctor,
                                       SigmoidMKLDNNGradUseOutFunctor);
REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(sqrt, SqrtMKLDNNFunctor,
                                       SqrtMKLDNNGradUseOutFunctor);
REGISTER_ACTIVATION_MKLDNN_BF16_KERNEL(mish, MishMKLDNNFunctor, MishMKLDNNGradFunctor);
REGISTER_ACTIVATION_MKLDNN_KERNEL_FWD_ONLY(round, RoundMKLDNNFunctor);

namespace ops = paddle::operators;
REGISTER_OP_KERNEL(
    softplus, MKLDNN, paddle::platform::CPUPlace,
    ops::MKLDNNActivationKernel<ops::SoftplusMKLDNNFunctor<float>>);
    ops::MKLDNNActivationKernel<ops::SoftplusMKLDNNFunctor<float>>,
    ops::MKLDNNActivationKernel<
        ops::SoftplusMKLDNNFunctor<paddle::platform::bfloat16>>);
paddle/fluid/operators/pscore/distributed_lookup_table_op.h
...
...
@@ -13,7 +13,6 @@
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
...
@@ -52,15 +51,13 @@ class DistributedLookupTableKernel : public framework::OpKernel<T> {
    auto inputs = context.MultiInput<framework::LoDTensor>("Ids");
    auto outputs = context.MultiOutput<framework::LoDTensor>("Outputs");

    // auto fleet = distributed::FleetWrapper::GetInstance();
    auto* communicator = (distributed::AsyncCommunicator*)
        distributed::Communicator::GetInstance();
    auto fleet = distributed::FleetWrapper::GetInstance();

    if (platform::is_cpu_place(context.GetPlace())) {
      communicator->PullSparseToTensorSync(
          static_cast<uint64_t>(table_id), emb_dim,
          static_cast<uint64_t>(padding_idx), context.GetPlace(), !is_test,
          &inputs, &outputs);
      fleet->PullSparseToTensorSync(static_cast<uint64_t>(table_id), emb_dim,
                                    static_cast<uint64_t>(padding_idx),
                                    context.GetPlace(), !is_test, &inputs,
                                    &outputs);
    } else {
      auto inputs_variable = context.MultiInputVar("Ids");
      auto outputs_variable = context.MultiOutputVar("Outputs");
...
@@ -96,10 +93,10 @@ class DistributedLookupTableKernel : public framework::OpKernel<T> {
      }

      // use fleet->PullSparse
      communicator->PullSparseToTensorSync(
          static_cast<uint64_t>(table_id), emb_dim,
          static_cast<uint64_t>(padding_idx), cpu_place, !is_test,
          &tmp_input_vec, &tmp_output_vec);
      fleet->PullSparseToTensorSync(static_cast<uint64_t>(table_id), emb_dim,
                                    static_cast<uint64_t>(padding_idx),
                                    cpu_place, !is_test, &tmp_input_vec,
                                    &tmp_output_vec);

      // cp temp to origin
      for (size_t idx = 0; idx < output_var_size; ++idx) {
...
...
paddle/fluid/operators/pscore/distributed_push_sparse_op.cc
...
...
@@ -106,6 +106,9 @@ class DistributedPushSparseOpMaker : public framework::OpProtoAndCheckerMaker {
                  "for training.")
        .SetDefault(false);

    AddAttr<bool>("use_cvm_op", "(boolean, default false) Use cvm op or not.")
        .SetDefault(false);

    AddComment(R"DOC(
Lookup Tablel Prefetch Operator.
This operator is used to perform lookup on parameter W,
...
...
paddle/fluid/operators/pscore/distributed_push_sparse_op.h
...
...
@@ -13,7 +13,6 @@
#include <algorithm>
#include <string>
#include <vector>
#include "paddle/fluid/distributed/ps/service/communicator/communicator.h"
#include "paddle/fluid/distributed/ps/wrapper/fleet.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
...
@@ -32,22 +31,20 @@ class DistributedPushSparseKernel : public framework::OpKernel<T> {
    auto padding_idx = context.Attr<int64_t>("padding_idx");
    auto table_id = context.Attr<int>("table_id");
    auto emb_dim = context.Attr<int>("size");
    VLOG(1) << "push_sparse.h::emb_dim: " << emb_dim;
    auto use_cvm_op = context.Attr<bool>("use_cvm_op");

    auto inputs = context.MultiInput<framework::LoDTensor>("Ids");
    auto shows = context.Input<framework::LoDTensor>("Shows");
    auto clks = context.Input<framework::LoDTensor>("Clicks");
    auto outputs = context.MultiOutput<framework::LoDTensor>("Outputs");

    // auto fleet = distributed::FleetWrapper::GetInstance();
    auto* communicator = (distributed::AsyncCommunicator*)
        distributed::Communicator::GetInstance();
    auto fleet = distributed::FleetWrapper::GetInstance();

    if (platform::is_cpu_place(context.GetPlace())) {
      communicator->PushSparseFromTensorAsync(
          static_cast<uint64_t>(table_id), emb_dim,
          static_cast<uint64_t>(padding_idx), context.GetPlace(), &inputs,
          shows, clks, &outputs);
      fleet->PushSparseFromTensorAsync(static_cast<uint64_t>(table_id),
                                       emb_dim,
                                       static_cast<uint64_t>(padding_idx),
                                       context.GetPlace(), &inputs, shows,
                                       clks, &outputs, use_cvm_op);
    } else {
      auto inputs_variable = context.MultiInputVar("Ids");
      auto outputs_variable = context.MultiOutputVar("Outputs");
...
@@ -94,7 +91,7 @@ class DistributedPushSparseKernel : public framework::OpKernel<T> {
      }

      // use fleet->PullSparse
      communicator->PushSparseFromTensorAsync(
      fleet->PushSparseFromTensorAsync(
          static_cast<uint64_t>(table_id), emb_dim,
          static_cast<uint64_t>(padding_idx), context.GetPlace(),
          &tmp_input_vec, tmp_shows_tensor, tmp_clicks_tensor,
          &tmp_output_vec);
...
...
paddle/fluid/operators/pscore/send_op.cc
...
...
@@ -53,7 +53,7 @@ class SendOp : public framework::OperatorBase {
        send_varnames[0] != "@PS_STEP_COUNTER@") {
      auto fleet = paddle::distributed::FleetWrapper::GetInstance();
      std::vector<::std::future<int32_t>> status;
      fleet->PushDenseVarsAsync(scope, table_id, ins, &status, 0, -1);
      fleet->PushDenseVarsAsync(scope, table_id, ins, &status, -1, -1);
    } else {
      auto* communicator = paddle::distributed::Communicator::GetInstance();
      if (communicator->Check(send_varnames)) {
...
...
paddle/fluid/platform/device/npu/npu_op_runner.cc
...
...
@@ -47,6 +47,8 @@ static std::map<framework::proto::VarType::Type, aclDataType>
static std::map<DataLayout, aclFormat> DATA_LAYOUT_2_ACL_FORMAT = {
    {DataLayout::kNCHW, ACL_FORMAT_NCHW},
    {DataLayout::kNHWC, ACL_FORMAT_NHWC},
    {DataLayout::kNCDHW, ACL_FORMAT_NCDHW},
    {DataLayout::kNDHWC, ACL_FORMAT_NDHWC},
    {DataLayout::kAnyLayout, ACL_FORMAT_ND},
};
...
...
paddle/fluid/pybind/fleet_py.cc
...
...
@@ -77,6 +77,8 @@ void BindDistFleetWrapper(py::module* m) {
      .def("stop_worker", &FleetWrapper::FinalizeWorker)
      .def("barrier", &FleetWrapper::BarrierWithTable)
      .def("shrink_sparse_table", &FleetWrapper::ShrinkSparseTable)
      .def("set_clients", &FleetWrapper::SetClients)
      .def("get_client_info", &FleetWrapper::GetClientsInfo)
      .def("create_client2client_connection",
           &FleetWrapper::CreateClient2ClientConnection);
}
...
...
paddle/phi/common/layout.h
...
...
@@ -30,6 +30,8 @@ enum class DataLayout {
  SPARSE_COO,
  SPARSE_CSR,
  NUM_DATA_LAYOUTS,
  NDHWC,
  NCDHW,
  // See Note [ Why we need ALL in basic kernel key member? ]
  ALL_LAYOUT = UNDEFINED,
  // Note: Unify phi DataLayout and fluid::framework::DataLayout,
...
@@ -43,6 +45,8 @@ enum class DataLayout {
  kNHWC = NHWC,
  kNCHW = NCHW,
  kMKLDNN = MKLDNN,  // all layouts supported by MKLDNN internally
  kNDHWC = NDHWC,
  kNCDHW = NCDHW,
};

}  // namespace experimental
...
@@ -70,6 +74,10 @@ inline DataLayout StringToDataLayout(const std::string& str) {
    return DataLayout::SPARSE_COO;
  } else if (s == "SPARSE_CSR") {
    return DataLayout::SPARSE_CSR;
  } else if (s == "NDHWC") {
    return DataLayout::kNDHWC;
  } else if (s == "NCDHW") {
    return DataLayout::kNCDHW;
  } else {
    PD_THROW("Unknown data layout type string: ", s, ".");
  }
...
@@ -89,6 +97,10 @@ inline std::string DataLayoutToString(const DataLayout& layout) {
      return "SPARSE_COO";
    case DataLayout::SPARSE_CSR:
      return "SPARSE_CSR";
    case DataLayout::kNDHWC:
      return "NDHWC";
    case DataLayout::kNCDHW:
      return "NCDHW";
    default:
      PD_THROW("Unknown Data Layout type ", static_cast<int>(layout), ".");
  }
...
...
paddle/phi/kernels/cpu/elementwise_grad_kernel.cc
...
...
@@ -259,7 +259,7 @@ PD_REGISTER_KERNEL(multiply_triple_grad,
                   phi::dtype::bfloat16,
                   phi::dtype::complex<float>,
                   phi::dtype::complex<double>) {}

PD_REGISTER_KERNEL(elementwise_fmax_grad,
PD_REGISTER_KERNEL(fmax_grad,
                   CPU,
                   ALL_LAYOUT,
                   phi::ElementwiseFMaxGradKernel,
...
@@ -268,7 +268,7 @@ PD_REGISTER_KERNEL(elementwise_fmax_grad,
                   int,
                   int64_t) {}

PD_REGISTER_KERNEL(elementwise_fmin_grad,
PD_REGISTER_KERNEL(fmin_grad,
                   CPU,
                   ALL_LAYOUT,
                   phi::ElementwiseFMinGradKernel,
...
...
paddle/phi/kernels/cpu/elementwise_kernel.cc
...
...
@@ -87,23 +87,11 @@ using complex128 = ::phi::dtype::complex<double>;
// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16
// using bfloat16 = ::phi::dtype::bfloat16;

PD_REGISTER_KERNEL(elementwise_fmax, CPU, ALL_LAYOUT,
                   phi::ElementwiseFMaxKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(fmax, CPU, ALL_LAYOUT,
                   phi::FMaxKernel, float, double, int, int64_t) {}

PD_REGISTER_KERNEL(elementwise_fmin, CPU, ALL_LAYOUT,
                   phi::ElementwiseFMinKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(fmin, CPU, ALL_LAYOUT,
                   phi::FMinKernel, float, double, int, int64_t) {}

PD_REGISTER_KERNEL(add_raw,
                   CPU,
...
...
paddle/phi/kernels/elementwise_kernel.h
...
...
@@ -20,14 +20,14 @@
namespace phi {

template <typename T, typename Context>
void ElementwiseFMaxKernel(const Context& dev_ctx,
void FMaxKernel(const Context& dev_ctx,
                const DenseTensor& x,
                const DenseTensor& y,
                int axis,
                DenseTensor* out);

template <typename T, typename Context>
void ElementwiseFMinKernel(const Context& dev_ctx,
void FMinKernel(const Context& dev_ctx,
                const DenseTensor& x,
                const DenseTensor& y,
                int axis,
...
...
paddle/phi/kernels/gpu/elementwise_grad_kernel.cu
...
...
@@ -282,7 +282,7 @@ PD_REGISTER_KERNEL(multiply_triple_grad,
                   phi::dtype::bfloat16,
                   phi::dtype::complex<float>,
                   phi::dtype::complex<double>) {}

PD_REGISTER_KERNEL(elementwise_fmax_grad,
PD_REGISTER_KERNEL(fmax_grad,
                   GPU,
                   ALL_LAYOUT,
                   phi::ElementwiseFMaxGradKernel,
...
@@ -291,7 +291,7 @@ PD_REGISTER_KERNEL(elementwise_fmax_grad,
                   int,
                   int64_t) {}

PD_REGISTER_KERNEL(elementwise_fmin_grad,
PD_REGISTER_KERNEL(fmin_grad,
                   GPU,
                   ALL_LAYOUT,
                   phi::ElementwiseFMinGradKernel,
...
...
paddle/phi/kernels/gpu/elementwise_kernel.cu
...
...
@@ -57,23 +57,11 @@ using bfloat16 = phi::dtype::bfloat16;
using complex64 = ::phi::dtype::complex<float>;
using complex128 = ::phi::dtype::complex<double>;

PD_REGISTER_KERNEL(elementwise_fmax, GPU, ALL_LAYOUT,
                   phi::ElementwiseFMaxKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(fmax, GPU, ALL_LAYOUT,
                   phi::FMaxKernel, float, double, int, int64_t) {}

PD_REGISTER_KERNEL(elementwise_fmin, GPU, ALL_LAYOUT,
                   phi::ElementwiseFMinKernel, float, double, int, int64_t) {}
PD_REGISTER_KERNEL(fmin, GPU, ALL_LAYOUT,
                   phi::FMinKernel, float, double, int, int64_t) {}

PD_REGISTER_KERNEL(add_raw,
                   GPU,
...
...
paddle/phi/kernels/impl/elementwise_kernel_impl.h
...
...
@@ -23,7 +23,7 @@
namespace phi {

template <typename T, typename Context>
void ElementwiseFMaxKernel(const Context& dev_ctx,
void FMaxKernel(const Context& dev_ctx,
                const DenseTensor& x,
                const DenseTensor& y,
                int axis,
...
@@ -34,7 +34,7 @@ void ElementwiseFMaxKernel(const Context& dev_ctx,
}

template <typename T, typename Context>
void ElementwiseFMinKernel(const Context& dev_ctx,
void FMinKernel(const Context& dev_ctx,
                const DenseTensor& x,
                const DenseTensor& y,
                int axis,
...
...
paddle/phi/ops/compat/elementwise_sig.cc
...
...
@@ -19,25 +19,19 @@ namespace phi {
KernelSignature ElementwiseAddOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
  int axis = paddle::any_cast<int>(ctx.Attr("axis"));
  if (ctx.IsDenseTensorInput("X")) {
    if (axis == -1) {
      return KernelSignature("add", {"X", "Y"}, {}, {"Out"});
    }
    return KernelSignature("add_raw", {"X", "Y"}, {"axis"}, {"Out"});
  }
  return KernelSignature("unregistered", {}, {}, {});
}

KernelSignature ElementwiseSubOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
  int axis = paddle::any_cast<int>(ctx.Attr("axis"));
  if (ctx.IsDenseTensorInput("X")) {
    if (axis == -1) {
      return KernelSignature("subtract", {"X", "Y"}, {}, {"Out"});
    }
    return KernelSignature("subtract_raw", {"X", "Y"}, {"axis"}, {"Out"});
  }
  return KernelSignature("unregistered", {}, {}, {});
}

KernelSignature ElementwiseMulOpArgumentMapping(
...
@@ -55,24 +49,18 @@ KernelSignature ElementwiseMulOpArgumentMapping(
KernelSignature ElementwiseDivOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
  int axis = paddle::any_cast<int>(ctx.Attr("axis"));
  if (ctx.IsDenseTensorInput("X")) {
    if (axis == -1) {
      return KernelSignature("divide", {"X", "Y"}, {}, {"Out"});
    }
    return KernelSignature("divide_raw", {"X", "Y"}, {"axis"}, {"Out"});
  }
  return KernelSignature("unregistered", {}, {}, {});
}

KernelSignature ElementwiseAddGradOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
  if (ctx.IsDenseTensorInput("X")) {
    return KernelSignature("add_grad", {"X", "Y", GradVarName("Out")},
                           {"axis"}, {GradVarName("X"), GradVarName("Y")});
  }
  return KernelSignature("unregistered", {}, {}, {});
}

KernelSignature ElementwiseAddDoubleGradOpArgumentMapping(
...
@@ -91,13 +79,10 @@ KernelSignature ElementwiseAddTripleGradOpArgumentMapping(
KernelSignature ElementwiseSubGradOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
  if (ctx.IsDenseTensorInput("X")) {
    return KernelSignature("subtract_grad", {"X", "Y", GradVarName("Out")},
                           {"axis"}, {GradVarName("X"), GradVarName("Y")});
  }
  return KernelSignature("unregistered", {}, {}, {});
}

KernelSignature ElementwiseSubDoubleGradOpArgumentMapping(
...
@@ -116,7 +101,7 @@ KernelSignature ElementwiseDivGradOpArgumentMapping(
KernelSignature ElementwiseFMinGradOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
  return KernelSignature("elementwise_fmin_grad",
  return KernelSignature("fmin_grad",
                         {"X", "Y", GradVarName("Out")}, {"axis"},
                         {GradVarName("X"), GradVarName("Y")});
...
@@ -138,9 +123,19 @@ KernelSignature ElementwiseMulGradOpArgumentMapping(
                         {GradVarName("X"), GradVarName("Y")});
}

KernelSignature ElementwiseFMaxOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
  return KernelSignature("fmax", {"X", "Y"}, {"axis"}, {"Out"});
}

KernelSignature ElementwiseFMinOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
  return KernelSignature("fmin", {"X", "Y"}, {"axis"}, {"Out"});
}

KernelSignature ElementwiseFMaxGradOpArgumentMapping(
    const ArgumentMappingContext& ctx) {
  return KernelSignature("elementwise_fmax_grad",
  return KernelSignature("fmax_grad",
                         {"X", "Y", GradVarName("Out")}, {"axis"},
                         {GradVarName("X"), GradVarName("Y")});
...
@@ -179,6 +174,10 @@ PD_REGISTER_BASE_KERNEL_NAME(elementwise_div_grad_grad, divide_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul_grad, multiply_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul_grad_grad, multiply_double_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_mul_triple_grad, multiply_triple_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_fmax, fmax);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_fmin, fmin);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_fmax_grad, fmax_grad);
PD_REGISTER_BASE_KERNEL_NAME(elementwise_fmin_grad, fmin_grad);

PD_REGISTER_ARG_MAPPING_FN(elementwise_add, phi::ElementwiseAddOpArgumentMapping);
...
@@ -208,9 +207,12 @@ PD_REGISTER_ARG_MAPPING_FN(elementwise_mul_grad_grad,
                           phi::ElementwiseMulDoubleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_mul_triple_grad,
                           phi::ElementwiseMulTripleGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_fmax, phi::ElementwiseFMaxOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_fmin, phi::ElementwiseFMinOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_fmax_grad,
                           phi::ElementwiseFMaxGradOpArgumentMapping);
PD_REGISTER_ARG_MAPPING_FN(elementwise_fmin_grad,
                           phi::ElementwiseFMinGradOpArgumentMapping);
python/paddle/distributed/fleet/base/fleet_base.py
...
...
@@ -578,7 +578,7 @@ class Fleet(object):
    @is_non_distributed_check
    @inited_runtime_handler
-   def init_worker(self):
+   def init_worker(self, scopes=None):
        """
        initialize `Communicator` for parameter server training.
...
@@ -599,7 +599,7 @@ class Fleet(object):
            fleet.init_worker()
        """
-       self._runtime_handle._init_worker()
+       self._runtime_handle._init_worker(scopes)

    @is_non_distributed_check
    @inited_runtime_handler
...
@@ -1419,6 +1419,21 @@ class Fleet(object):
            # for more examples, please reference https://github.com/PaddlePaddle/FleetX
        """
        if not isinstance(loss, list):
            return self._minimize_impl(loss, startup_program, parameter_list,
                                       no_grad_set)
        else:
            if paddle.fluid.framework.in_dygraph_mode(
            ) or self._role_maker._is_non_distributed() or self._is_collective:
                raise ValueError("loss can be list only in PS mode")
            return self._minimize_losses_impl(loss, startup_program,
                                              parameter_list, no_grad_set)

    def _minimize_impl(self,
                       loss,
                       startup_program=None,
                       parameter_list=None,
                       no_grad_set=None):
        context = {}
        context["user_defined_strategy"] = copy.deepcopy(
            self._user_defined_strategy)
...
@@ -1447,6 +1462,7 @@ class Fleet(object):
                    "sharding_degree"]

        context["origin_main_program"] = self.origin_main_program
        context["origin_main_programs"] = [self.origin_main_program]
        context["loss"] = loss
        if startup_program == None:
            self.origin_startup_program = \
...
@@ -1457,6 +1473,7 @@ class Fleet(object):
                startup_program.clone(for_test=False)

        context["origin_startup_program"] = startup_program
        context["origin_startup_programs"] = [startup_program]
        context["role_maker"] = self._role_maker

        # Use the auto-parallel's routines instead
...
@@ -1512,6 +1529,8 @@ class Fleet(object):
            copy_user_defined_strategy, can_not_apply_optimizer_list)

        context["valid_strategy"] = copy.deepcopy(valid_strategy)
        # print("valid_strategy:", context["valid_strategy"])
        # print("user_defined_strategy:", context["user_defined_strategy"])

        applied_meta_list = self.strategy_compiler._get_applied_meta_list()
        applied_graph_list = self.strategy_compiler._get_applied_graph_list()
...
@@ -1539,13 +1558,17 @@ class Fleet(object):
                loss, startup_program, parameter_list, no_grad_set=no_grad_set)

        if meta_optimizer:
            # print("before minimize program id:", id(loss.block.program))
            optimize_ops, params_grads = meta_optimizer.minimize(
                loss, startup_program, parameter_list, no_grad_set=no_grad_set)
            # print("after minimize program id:", id(loss.block.program))
            default_program = paddle.static.default_main_program()
            # print("default program id:", id(default_program))

            if id(default_program) != id(loss.block.program):
                paddle.fluid.framework.switch_main_program(loss.block.program)
            # print("default program id after switch:", id(default_program))

        else:
            optimize_ops, params_grads = self.user_defined_optimizer.minimize(
...
@@ -1555,6 +1578,7 @@ class Fleet(object):
        context["program_params_grads"] = params_grads

        if graph_optimizer:
            # print("before graph minimize program id:", id(loss.block.program))
            optimize_ops, params_grads = graph_optimizer.minimize(
                loss, startup_program, parameter_list, no_grad_set=no_grad_set)
            # since we do not encourage users to use graph operations
...
@@ -1568,13 +1592,90 @@ class Fleet(object):
        if not self._role_maker._is_heter_parameter_server_mode:
            program = paddle.static.default_main_program()
-           opt_info = {}
+           opt_info = {} if program._fleet_opt is None else program._fleet_opt
            opt_info["mpi_size"] = self.worker_num()
            opt_info["mpi_rank"] = self.worker_index()
            for k, v in self._user_defined_strategy.trainer_desc_configs.items(
            ):
                opt_info[k] = v
            program._fleet_opt = opt_info

        if self._runtime_handle is None:
            self._runtime_handle = RuntimeFactory()._create_runtime(context)

        import paddle.distributed.fleet as fleet
        fleet.util._set_strategy(context["valid_strategy"])

        return optimize_ops, params_grads

    def _minimize_losses_impl(self,
                              losses,
                              startup_programs=None,
                              parameter_list=None,
                              no_grad_set=None):
        context = {}

        # cache original feed forward program
        self.origin_main_program = losses[0].block.program
        context["origin_main_program"] = self.origin_main_program
        context["origin_main_programs"] = []
        for loss in losses:
            context["origin_main_programs"].append(loss.block.program)
        context["loss"] = losses

        if startup_programs is None:
            if len(losses) == 1:
                startup_programs = [paddle.static.default_startup_program()]
            else:
                raise ValueError(
                    "startup_program can't be None when loss is list.")
        self.origin_startup_program = startup_programs[0].clone(for_test=False)
        context["origin_startup_program"] = startup_programs[0]
        context["origin_startup_programs"] = []
        for program in startup_programs:
            context["origin_startup_programs"].append(program)

        context["role_maker"] = self._role_maker

        context["user_defined_strategy"] = copy.deepcopy(
            self._user_defined_strategy)

        context["valid_strategy"] = copy.deepcopy(self._user_defined_strategy)

        self._context = context

        self.valid_strategy = context["valid_strategy"]
        self.valid_strategy._enable_env()

        optimize_ops = []
        params_grads = []

        from ..meta_optimizers import ParameterServerOptimizer
        ps_optimizer = ParameterServerOptimizer(self.user_defined_optimizer)
        ps_optimizer._set_basic_info(losses, self._role_maker,
                                     self.user_defined_optimizer,
                                     self._user_defined_strategy)
        optimize_ops, params_grads = ps_optimizer.minimize_losses_impl(
            losses, startup_programs, parameter_list, no_grad_set=no_grad_set)

        # default_program = paddle.static.default_main_program()
        # if id(default_program) != id(losses[0].block.program):
        #     paddle.fluid.framework.switch_main_program(losses[0].block.program)

        context["program_optimize_ops"] = optimize_ops
        context["program_params_grads"] = params_grads

        for loss in losses:
            program = loss.block.program
            opt_info = {} if program._fleet_opt is None else program._fleet_opt
            opt_info["mpi_size"] = self.worker_num()
            opt_info["mpi_rank"] = self.worker_index()
            for k, v in self._user_defined_strategy.trainer_desc_configs.items(
            ):
                opt_info[k] = v
            program._fleet_opt = opt_info
            # print("fleet base opt info:", id(program), program._fleet_opt)

        if self._runtime_handle is None:
            self._runtime_handle = RuntimeFactory()._create_runtime(context)
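For orientation only: a minimal usage sketch of the list-of-losses path added above. It is not part of the commit, it assumes a parameter-server launch (e.g. via fleetrun) so the role maker is distributed, and the network is a throwaway example.

import paddle
import paddle.distributed.fleet as fleet

paddle.enable_static()
fleet.init()                                  # PS mode; run under a distributed launcher
strategy = fleet.DistributedStrategy()
strategy.a_sync = True

x = paddle.static.data(name="x", shape=[-1, 10], dtype="float32")
y = paddle.static.data(name="y", shape=[-1, 1], dtype="float32")
pred = paddle.static.nn.fc(x, 1)
loss = paddle.mean(paddle.nn.functional.square_error_cost(pred, y))

opt = fleet.distributed_optimizer(paddle.optimizer.SGD(0.01), strategy)
# passing a list exercises _minimize_losses_impl; with several losses you must
# also pass one startup program per loss
opt.minimize([loss], startup_program=[paddle.static.default_startup_program()])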
...
...
python/paddle/distributed/fleet/base/runtime_factory.py
...
...
@@ -13,7 +13,7 @@
# limitations under the License.
from ..runtime.collective_runtime import CollectiveRuntime
from ..runtime.parameter_server_runtime import ParameterServerRuntime
-from ..runtime.the_one_ps import TheOnePSRuntime
+from ...ps.the_one_ps import TheOnePSRuntime

__all__ = []
...
...
python/paddle/distributed/fleet/meta_optimizers/__init__.py
...
...
@@ -17,7 +17,7 @@ from .asp_optimizer import ASPOptimizer
from .recompute_optimizer import RecomputeOptimizer
from .gradient_merge_optimizer import GradientMergeOptimizer
from .graph_execution_optimizer import GraphExecutionOptimizer
-from .parameter_server_optimizer import ParameterServerOptimizer
+from .ps_optimizer import ParameterServerOptimizer
from .pipeline_optimizer import PipelineOptimizer
from .localsgd_optimizer import LocalSGDOptimizer
from .localsgd_optimizer import AdaptiveLocalSGDOptimizer
...
...
python/paddle/distributed/fleet/meta_optimizers/ps_optimizer.py
...
...
@@ -110,8 +110,9 @@ class ParameterServerOptimizer(MetaOptimizerBase):
                                                             no_grad_set)
        if startup_program == None:
            startup_program = paddle.static.default_startup_program()
-       print("program after inner optimizer minimize:",
-             str(loss.block.program))
+       # print("program after inner optimizer minimize:",
+       #       str(loss.block.program))
        self._set_origin_programs([loss])
        self._init_ps_pass_context(loss, startup_program)
        ps_builder = PsProgramBuilderFactory()._create_ps_program_builder(
...
@@ -181,7 +182,6 @@ class ParameterServerOptimizer(MetaOptimizerBase):
                if not var.persistable or var.desc.type(
                ) != core.VarDesc.VarType.LOD_TENSOR:
                    continue
-               set_var_lod_type(var)
                param_memory_size += get_var_mem_size(var)
                processed_var_names.add(varname)
...
@@ -211,9 +211,8 @@ class ParameterServerOptimizer(MetaOptimizerBase):
                        data_count *= (-x)
                    else:
                        data_count *= x
-               program_tmp_vars[var_name] = (
-                   data_count, neg_dim_count,
-                   vars_metatools.dtype_to_size[var.dtype])
+               program_tmp_vars[var_name] = (data_count, neg_dim_count,
+                                             dtype_to_size[var.dtype])

        for varname in program_tmp_vars:
            data_count, neg_dim_count, type_size = program_tmp_vars[varname]
...
@@ -228,12 +227,19 @@ class ParameterServerOptimizer(MetaOptimizerBase):
        return False

    def _enable_strategy(self, dist_strategy, context):
        a_sync_configs = dist_strategy.a_sync_configs
        if dist_strategy.a_sync_configs["k_steps"] >= 0:
            return

        dist_strategy.a_sync = True
        a_sync_configs = dist_strategy.a_sync_configs

        is_geo = self._can_apply_geo(context["origin_main_program"])

-       dist_strategy.a_sync_configs["k_steps"] = 800 if is_geo else 0
+       a_sync_configs["k_steps"] = 800 if is_geo else 0
+       dist_strategy.a_sync_configs = a_sync_configs

    def _disable_strategy(self, dist_strategy):
        dist_strategy.a_sync = False
        a_sync_configs = dist_strategy.a_sync_configs
        dist_strategy.a_sync_configs["k_steps"] = -1
        dist_strategy.a_sync_configs = a_sync_configs
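The `_enable_strategy` change above follows a copy-modify-assign pattern for `a_sync_configs`. A minimal sketch of that pattern, assuming the usual `DistributedStrategy` behavior where the configs dict is backed by a protobuf message and should be assigned back to take effect:

import paddle.distributed.fleet as fleet

strategy = fleet.DistributedStrategy()
strategy.a_sync = True
configs = strategy.a_sync_configs        # take a local copy of the config dict
configs["k_steps"] = 800                 # k_steps >= 1 selects GEO-style async updates
strategy.a_sync_configs = configs        # assign back so the strategy proto is updated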
python/paddle/distributed/fleet/runtime/the_one_ps.py
...
...
@@ -62,9 +62,9 @@ def get_default_accessor_proto(accessor, varname, o_main_program):
    if not accessor.HasField("accessor_class"):
        accessor.accessor_class = "CtrCommonAccessor"
    if not accessor.HasField("fea_dim"):
-       accessor.fea_dim = embedding_dim + 2
+       accessor.fea_dim = embedding_dim
    if not accessor.HasField("embedx_dim"):
-       accessor.embedx_dim = embedding_dim - 1
+       accessor.embedx_dim = embedding_dim - 3
    if not accessor.HasField("embedx_threshold"):
        accessor.embedx_threshold = 0
...
@@ -129,15 +129,15 @@ def check_embedding_dim(accessor, varname, o_main_program):
            embedding_dim = var.shape[1]
            break
    fea_dim = accessor.fea_dim
-   if fea_dim != embedding_dim + 2:
+   if fea_dim != embedding_dim:
        raise ValueError(
-           "The fea_dim is wrong, it will be sparse_embedding_dim + 2: {}, but got {}".
-           format(embedding_dim + 2, fea_dim))
+           "The fea_dim is wrong, it will be sparse_embedding_dim: {}, but got {}".
+           format(embedding_dim, fea_dim))

    embedx_dim = accessor.embedx_dim
-   if embedx_dim != embedding_dim - 1:
+   if embedx_dim != embedding_dim - 3:
        raise ValueError(
-           "The embedx_dim is wrong, it will be sparse_embedding_dim - 1: {}, but got {}".
-           format(embedding_dim - 1, embedx_dim))
+           "The embedx_dim is wrong, it will be sparse_embedding_dim - 3: {}, but got {}".
+           format(embedding_dim - 3, embedx_dim))


class Accessor:
...
@@ -927,7 +927,6 @@ class TheOnePSRuntime(RuntimeBase):
        tables = []
        for idx, (name, ctx) in enumerate(send_ctx.items()):
-           print(" wxm python test send_ctx.items-->", idx, (name, ctx))
            if ctx.is_tensor_table() or len(ctx.origin_varnames()) < 1:
                continue
...
...
python/paddle/distributed/fleet/utils/ps_util.py
...
...
@@ -75,7 +75,7 @@ class DistributedInfer:
        if self.sparse_table_maps is None:
            self.sparse_table_maps = {}
-           send_ctx = fleet.fleet._runtime_handle._communicator.send_ctx_
+           send_ctx = fleet.fleet._runtime_handle._send_ctx
            for gradname, ctx in send_ctx.items():
                if ctx.is_sparse:
                    param = gradname.strip("@GRAD")
...
...
python/paddle/distributed/passes/ps_server_pass.py
...
...
@@ -155,8 +155,6 @@ class AddListenAndServPass(PassBase):
        main_program.global_block().append_op(
            type="listen_and_serv", inputs={'X': []}, outputs={}, attrs=opt)

-       attrs['cloned_main'] = main_program


@register_pass("add_rpc_global_flags_pass")
class AddRpcGlobalFlagsPass(PassBase):
...
...
python/paddle/distributed/passes/ps_trainer_pass.py
...
...
@@ -116,7 +116,7 @@ class DistributedOpsPass(PassBase):
    def _check_conflict(self, other_pass):
        return True

-   def _push_sparse_fuse(self, _program, push_sparse_ops, attrs):
+   def _push_sparse_fuse(self, _program, push_sparse_ops, attrs, use_cvm_op):
        if attrs['use_ps_gpu']:
            return
        if len(push_sparse_ops) == 0:
...
@@ -211,7 +211,8 @@ class DistributedOpsPass(PassBase):
                    "is_distributed": is_distributed,
                    "padding_idx": padding_idx,
                    "table_id": table_id,
-                   "size": self.emb_size[param]
+                   "size": self.emb_size[param],
+                   "use_cvm_op": use_cvm_op
                })

    def _pull_sparse_fuse(self, _program, pull_sparse_ops, attrs, send_ctx):
...
@@ -420,6 +421,7 @@ class DistributedOpsPass(PassBase):
        pull_sparse_ids = {}
        push_sparse_ops = {}
        ops = {}
        use_cvm_op = False
        for op in _program.global_block().ops:
            if op.type in SPARSE_OP_TYPE_DICT.keys() \
                    and op.attr('remote_prefetch') is True:
...
@@ -433,6 +435,9 @@ class DistributedOpsPass(PassBase):
                ids = pull_sparse_ids.get(param_name, [])
                ids.append(op.input("Ids")[0])
                pull_sparse_ids[param_name] = ids
            if op.type == 'cvm':
                use_cvm_op = True

        for op in _program.global_block().ops:
            if op.type in SPARSE_GRAD_OP_TYPE_DICT.keys():
                param_name = op.input(SPARSE_GRAD_OP_TYPE_DICT[op.type])[0]
...
@@ -442,16 +447,16 @@ class DistributedOpsPass(PassBase):
                    ops.append(op)
                    push_sparse_ops[param_name] = ops

-       return pull_sparse_ops, push_sparse_ops
+       return pull_sparse_ops, push_sparse_ops, use_cvm_op

    def _apply_single_impl(self, main_program, startup_program, pass_ctx):
        attrs = pass_ctx._attrs
-       pull_sparse_ops, push_sparse_ops = self._get_pull_sparse_ops(
+       pull_sparse_ops, push_sparse_ops, use_cvm_op = self._get_pull_sparse_ops(
            main_program, attrs)

        send_ctx = get_the_one_send_context(
            attrs, split_dense_table=attrs['is_heter_ps_mode'])

        self._pull_sparse_fuse(main_program, pull_sparse_ops, attrs, send_ctx)
-       self._push_sparse_fuse(main_program, push_sparse_ops, attrs)
+       self._push_sparse_fuse(main_program, push_sparse_ops, attrs, use_cvm_op)


@register_pass("delete_optimizer_pass")
...
...
python/paddle/distributed/ps/the_one_ps.py
...
...
@@ -15,7 +15,7 @@
import warnings

import os
-from paddle.distributed.fleet.proto import ps_pb2
+import paddle.distributed.fleet.proto.the_one_ps_pb2 as ps_pb2
import paddle.fluid as fluid
import paddle.distributed.fleet as fleet
from paddle.fluid import core
...
@@ -68,16 +68,30 @@ def check_embedding_dim(accessor_proto, varname, program_id, context):
            print('new var: {}, {}, {}'.format(var, embedding_dim,
                                               accessor_proto.fea_dim))
            break

    fea_dim = accessor_proto.fea_dim
    if accessor_proto.accessor_class == "SparseAccessor":
        if fea_dim != embedding_dim + 2:
            raise ValueError(
                "The fea_dim is wrong, it will be sparse_embedding_dim + 2: {}, but got {}".
                format(embedding_dim + 2, fea_dim))
    else:
        if fea_dim != embedding_dim:
            raise ValueError(
                "The fea_dim is wrong, it will be sparse_embedding_dim: {}, but got {}".
                format(embedding_dim, fea_dim))

    embedx_dim = accessor_proto.embedx_dim
    if accessor_proto.accessor_class == "SparseAccessor":
        if embedx_dim != embedding_dim - 1:
            raise ValueError(
                "The embedx_dim is wrong, it will be sparse_embedding_dim - 1: {}, but got {}".
                format(embedding_dim - 1, embedx_dim))
    else:
        if embedx_dim != embedding_dim - 3:
            raise ValueError(
                "The embedx_dim is wrong, it will be sparse_embedding_dim - 3: {}, but got {}".
                format(embedding_dim - 3, embedx_dim))


class Service:
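A compact restatement of the dimension rules that the two checks above enforce; the helper and the numbers are purely illustrative and not part of the commit:

def expected_dims(accessor_class, embedding_dim):
    # Offsets mirror the checks above; the exact layout of the extra slots
    # (show/click counters, 1-d embed, etc.) is accessor-specific.
    if accessor_class == "SparseAccessor":
        return embedding_dim + 2, embedding_dim - 1   # (fea_dim, embedx_dim)
    return embedding_dim, embedding_dim - 3           # CtrCommonAccessor

assert expected_dims("SparseAccessor", 16) == (18, 15)
assert expected_dims("CtrCommonAccessor", 16) == (16, 13)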
...
@@ -119,11 +133,18 @@ class Accessor:
                break

        if not accessor_proto.HasField("accessor_class"):
-           accessor_proto.accessor_class = "CtrCommonAccessor"
+           # DownpourSparseValueAccessor
+           accessor_proto.accessor_class = "SparseAccessor"
        if not accessor_proto.HasField("fea_dim"):
-           accessor_proto.fea_dim = embedding_dim
+           if accessor_proto.accessor_class == "SparseAccessor":
+               accessor_proto.fea_dim = embedding_dim + 2
+           else:
+               accessor_proto.fea_dim = embedding_dim
        if not accessor_proto.HasField("embedx_dim"):
-           accessor_proto.embedx_dim = embedding_dim - 3
+           if accessor_proto.accessor_class == "SparseAccessor":
+               accessor_proto.embedx_dim = embedding_dim - 1
+           else:
+               accessor_proto.embedx_dim = embedding_dim - 3
        if not accessor_proto.HasField("embedx_threshold"):
            accessor_proto.embedx_threshold = 0
...
@@ -268,16 +289,16 @@ class CommonAccessor(Accessor):
        attr_str = ""

        origin_var_name = value_name
-       print("get_initializer_attr param name:", value_name)
+       # print("get_initializer_attr param name:", value_name)
        for op in o_startup_program.global_block().ops:
            if op.type in self.opt_init_map.keys(
            ) and origin_var_name == op.output("Out")[0]:
                init_attr = [op.type]
-               print("get_initializer_attr op type:", op.type)
+               # print("get_initializer_attr op type:", op.type)
                for attr in self.opt_init_map[op.type]:
-                   print("get_initializer_attr opt_init_map attr:", attr)
+                   # print("get_initializer_attr opt_init_map attr:", attr)
                    init_attr.append(str(op.attr(attr)))
-                   print("get_initializer_attr op attr:", str(op.attr(attr)))
+                   # print("get_initializer_attr op attr:", str(op.attr(attr)))
                attr_str = l_in.join(init_attr)
                break
        return attr_str
...
@@ -288,16 +309,16 @@ class CommonAccessor(Accessor):
        size = ctx.sections()[0]
        single_dim = ctx.sections()[1] if ctx.is_sparse() else 1
        adam_d2sum = context["user_defined_strategy"].adam_d2sum
-       print("parse_by_optimizer table_id:{} is_datanorm:{}".format(
-           ctx.table_id(), ctx.is_datanorm_table()))
+       # print("parse_by_optimizer table_id:{} is_datanorm:{}".format(
+       #     ctx.table_id(), ctx.is_datanorm_table()))

        main_program, startup_program, idx = get_program_by_id(context,
                                                               ctx.program_id())
        pserver_id = get_role_id(context['role_maker'])
        pserver_num = len(get_ps_endpoints(context['role_maker']))
        optimizer_ops = get_optimize_ops(main_program)
-       print("the one ps optimizer_ops:", optimizer_ops)
-       print("the one ps parse_by_optimizer grad_name:", grad_name)
+       # print("the one ps optimizer_ops:", optimizer_ops)
+       # print("the one ps parse_by_optimizer grad_name:", grad_name)
        oop = None

        for op in optimizer_ops:
...
@@ -394,7 +415,7 @@ class CommonAccessor(Accessor):
                    initializer = self.get_initializer_attr(param.name,
                                                            startup_program)
                elif formal_name == "SummaryDecayRate":
-                   initializer = "fill_constant&0.99999"
+                   initializer = "fill_constant&0.999999"
                else:
                    initializer = "fill_constant&0"
                initializers.append(initializer)
...
@@ -740,7 +761,6 @@ class PsDescBuilder(object):
    def _get_tables(self):
        tables = []
        for idx, (name, ctx) in enumerate(self.send_ctx.items()):
-           print('####### {}\n'.format(ctx.is_sparse()))
            if ctx.is_sparse():
                if self.ps_mode == DistributedMode.GEO:
                    tables.append(globals()['GeoSparseTable'](self.context,
...
@@ -778,11 +798,11 @@ class PsDescBuilder(object):
        return text_format.MessageToString(self.ps_desc)

    def build_server_desc(self):
+       self.sparse_table_maps = {}
        for table in self.tables:
            table_proto = self.ps_desc.server_param.downpour_server_param.downpour_table_param.add(
            )
            table._set(table_proto)
-           self.sparse_table_maps = {}
            if table_proto.type == ps_pb2.PS_SPARSE_TABLE and table_proto.common is not None:
                self.sparse_table_maps[
                    table_proto.common.table_name] = table_proto.table_id
...
@@ -801,6 +821,7 @@ class TheOnePSRuntime(RuntimeBase):
        self._worker = fluid.core.DistFleetWrapper()
        self._server_sub_program = []
        self._heter_client = None
+       self._send_ctx = None

    def _set_basic_info(self, context):
        self.context = context
...
@@ -835,7 +856,40 @@ class TheOnePSRuntime(RuntimeBase):
        self.ps_desc_builder = PsDescBuilder(self.context)

-   def _init_worker(self):
+   def _init_params(self, scopes, send_ctx, recv_map):
+       for name, ctx in send_ctx.items():
+           if ctx.is_sparse():
+               continue
+           _, _, idx = get_program_by_id(self.context, ctx.program_id())
+           scope = scopes[idx]
+           table_id = ctx.table_id()
+           var_names = recv_map[table_id]
+           # print("init params:", idx, table_id, var_names)
+           self._worker.push_dense_params(scope, table_id, var_names)
+
+   def _pull_all_dense(self, scopes, send_ctx, recv_map):
+       for name, ctx in send_ctx.items():
+           if ctx.is_sparse():
+               continue
+           _, _, idx = get_program_by_id(self.context, ctx.program_id())
+           scope = scopes[idx]
+           table_id = ctx.table_id()
+           var_names = recv_map[table_id]
+           # print("pull all dense:", idx, table_id, var_names)
+           self._worker.pull_dense_params(scope, table_id, var_names)
+
+   def _pull_dense(self, program, scope, send_ctx, recv_map):
+       for name, ctx in send_ctx.items():
+           if ctx.is_sparse():
+               continue
+           if ctx.program_id() != id(program):
+               continue
+           table_id = ctx.table_id()
+           var_names = recv_map[table_id]
+           # print("pull dense:", table_id, var_names)
+           self._worker.pull_dense_params(scope, table_id, var_names)
+
+   def _init_worker(self, scopes=None):
        worker_desc = self.ps_desc_builder.build_worker_desc()

        if self.context['use_ps_gpu']:
...
@@ -866,6 +920,7 @@ class TheOnePSRuntime(RuntimeBase):
            split_dense_table=self.is_heter_ps_mode,
            use_origin_program=self.is_heter_ps_mode,
            ep_list=self.endpoints)
+       self._send_ctx = send_ctx
        trainer_config = self.context['trainer']

        debug = bool(int(os.getenv("PSERVER_DEBUG", "0")))
...
@@ -889,23 +944,32 @@ class TheOnePSRuntime(RuntimeBase):
            kwargs.update(sync_kwargs)

        print("communicator config:", trainer_config.get_communicator_flags())

        role_id = get_role_id(self.role_maker)
        self._worker.init_worker(proto_txt, self.string_hosts, role_id)

        if self.context['ps_mode'] == DistributedMode.GEO:
            self._communicator = Communicator(
                trainer_config.mode, kwargs,
                trainer_config.get_communicator_flags())
            self._communicator.init_with_ctx(send_ctx, dense_map, proto_txt,
                                             self.string_hosts,
                                             fluid.global_scope())
        fleet.util.barrier()

-       info = self._communicator.get_client_info()
+       # info = self._communicator.get_client_info()
+       info = self._worker.get_client_info()
        if isinstance(info, list) and len(info) > 0:
            all_info = self.role_maker._all_gather(info[0])
            # for unittest
            if not isinstance(all_info, list):
                warnings.warn("gloo may not initialize correctly")
                all_info = [all_info]
-           self._communicator.set_clients(all_info)
-           self._communicator.create_client_to_client_connection()
+           # self._communicator.set_clients(all_info)
+           # self._communicator.create_client_to_client_connection()
+           self._worker.set_clients(all_info)
+           self._worker.create_client2client_connection()
            print('create c2c connection done')
        else:
            print('cannot create c2c connection')
...
@@ -914,6 +978,7 @@ class TheOnePSRuntime(RuntimeBase):
        is_test = bool(int(os.getenv("TEST_MODE", "0")))

        # for GEO
        if self.role_maker._is_first_worker() and self.is_heter_ps_mode:
            # for ps-heter mode load all parameters on first_worker
            init_params = get_the_one_recv_context(
...
@@ -921,12 +986,34 @@ class TheOnePSRuntime(RuntimeBase):
        else:
            init_params = dense_map

        # if not is_test:
        #     self._communicator.init_params(init_params)
        #     fleet.util.barrier()
        # self._communicator.pull_dense(init_params)
        # fleet.util.barrier()

        if scopes is None:
            if len(self.origin_main_programs) > 1:
                raise ValueError(
                    "You must set the scope list when you have Multiple programs"
                )
            scopes = [fluid.global_scope()]
        if len(self.origin_main_programs) != len(scopes):
            raise VauleError("len(programs) != len(scopes)")

        self.scopes = scopes
        if not is_test:
            if self.context['ps_mode'] == DistributedMode.GEO:
                self._communicator.init_params(init_params)
            else:
                if role_id == 0:
                    self._init_params(scopes, send_ctx, dense_map)

            fleet.util.barrier()
-           self._communicator.pull_dense(init_params)
+           self._pull_all_dense(scopes, send_ctx, dense_map)
            fleet.util.barrier()

        if self.context['ps_mode'] == DistributedMode.GEO:
            if not self._communicator.is_running():
                self._communicator.start()
            else:
...
@@ -996,7 +1083,9 @@ class TheOnePSRuntime(RuntimeBase):
            self._server.run_server(host, int(port))

    def _stop_worker(self):
-       self._communicator.stop()
+       if self.context['ps_mode'] == DistributedMode.GEO:
+           self._communicator.stop()
        self._worker.stop_worker()
        if self.is_heter_ps_mode:
            assert self._heter_client != None, "heter client should not be None in heterps mode"
            self._heter_client.stop()
...
@@ -1151,7 +1240,11 @@ class TheOnePSRuntime(RuntimeBase):
                "in fleet.save() function, executor must be as Executor type")

        import paddle
-       program = self.origin_main_program if main_program is None else main_program
+       program = self.origin_main_programs[
+           0] if main_program is None else main_program
+       _, _, idx = get_program_by_id(self.context, id(program))
+       scope = self.scopes[idx]
+       print("save inference model scope idx:", idx)

        if isinstance(program, CompiledProgram):
            raise TypeError(
...
@@ -1180,12 +1273,14 @@ class TheOnePSRuntime(RuntimeBase):
        sparse_names = self._save_sparse_params(executor, dirname, sparses,
                                                main_program, mode)

-       denses = get_the_one_recv_context(
+       dense_map = get_the_one_recv_context(
            self.context, split_dense_table=self.is_heter_ps_mode)
        send_ctx = get_the_one_send_context(
            self.context,
            is_dense=True,
            split_dense_table=self.is_heter_ps_mode,
-           use_origin_program=True)
-       self._communicator.pull_dense(denses)
+           use_origin_program=self.is_heter_ps_mode,
+           ep_list=self.endpoints)
+       self._pull_dense(program, scope, send_ctx, dense_map)

        generate_vars = self.context[
            "user_defined_strategy"].trainer_desc_configs["stat_var_names"]
...
@@ -1196,7 +1291,7 @@ class TheOnePSRuntime(RuntimeBase):
                             infer_program.list_vars()))
        for var in remaining_vars:
-           tensor = var.get_value()
+           tensor = var.get_value(scope)
            paddle.save(
                tensor,
                os.path.join(model_path, var.name),
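With `_init_worker` now taking a scope list, a worker that drives several origin programs would initialize roughly as below. This is a hedged sketch, not part of the commit: `prog_a`/`prog_b` are assumed to have been built and minimized elsewhere, and the call only works inside a running PS worker.

import paddle.fluid as fluid
import paddle.distributed.fleet as fleet

# one scope per origin main program; scopes[i] must line up with
# origin_main_programs[i], since dense tables are pushed/pulled per scope
scopes = [fluid.Scope(), fluid.Scope()]
fleet.init_worker(scopes)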
...
...
python/paddle/distributed/ps/utils/ps_program_builder.py
...
...
@@ -37,6 +37,37 @@ class PsProgramBuilder(object):
        self.server_endpoints = self.attrs['role_maker']._get_pserver_endpoints(
        )

    def _build_trainer_desc(self):
        opt_info = self.loss.block.program._fleet_opt
        opt_info = {} if opt_info is None else opt_info
        opt_info["trainer"] = opt_info.get("trainer", "DistMultiTrainer")
        opt_info["device_worker"] = opt_info.get("device_worker", "DownpourLite")
        pid = str(id(self.cloned_main))
        program_configs = {
            pid: {
                'pull_dense': [],
                'push_dense': [],
                'pull_sparse': [],
                'push_sparse': []
            }
        }
        dense_table_config = {}
        send_ctx = get_the_one_send_context(self.attrs)
        recv_ctx = get_the_one_recv_context(self.attrs)
        for name, ctx in send_ctx.items():
            if ctx.program_id() != id(self.loss.block.program):
                continue
            if ctx.is_sparse():
                continue
            if not ctx.is_tensor_table():
                program_configs[pid]['pull_dense'].append(ctx.table_id())
                program_configs[pid]['push_dense'].append(ctx.table_id())
            dense_table_config[ctx.table_id()] = recv_ctx[ctx.table_id()]
        opt_info['program_configs'] = program_configs
        opt_info['dense_table_config'] = dense_table_config
        self.cloned_main._fleet_opt = opt_info

    def _optimize_programs(self):
        pass
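For orientation, the `_fleet_opt` dict produced by `_build_trainer_desc` ends up with roughly the shape below; the program id, table id and variable names are made up for illustration:

opt_info = {
    "trainer": "DistMultiTrainer",
    "device_worker": "DownpourLite",
    "program_configs": {
        "140396740052048": {                 # str(id(cloned_main)), illustrative
            "pull_dense": [1], "push_dense": [1],
            "pull_sparse": [], "push_sparse": []
        }
    },
    "dense_table_config": {1: ["fc_0.w_0", "fc_0.b_0"]},  # table_id -> dense var names
}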
...
...
@@ -63,7 +94,15 @@ class PsProgramBuilder(object):
            logger.info("start building trainer program")
            self._build_trainer_programs()
            fluid.framework.switch_startup_program(self.cloned_startup)
            # print("ps_program_build before =", id(self.loss.block.program))
            self._build_trainer_desc()
            self.loss.block.program = self.cloned_main
            # print("ps_program_build after =", id(self.loss.block.program))
            # print("ps_program_build clone after =", id(self.cloned_main))
            # print("ps_program_build after trainer_desc",
            #       id(self.loss.block.program))
            # print("ps_program build trainer desc",
            #       self.loss.block.program._fleet_opt)

        elif self.attrs['is_server']:
            logger.info("start building pserver program")
...
@@ -92,6 +131,13 @@ class GeoPsProgramBuilder(PsProgramBuilder):  # CPU mode only
        return

    def _build_pserver_programs(self):
        add_listen_and_serv_pass = new_pass('add_listen_and_serv_pass',
                                            self.attrs)
        add_listen_and_serv_pass.apply([self.attrs['_main_server']], [None],
                                       self.pass_ctx)
        return


class CpuSyncPsProgramBuilder(PsProgramBuilder):
    def __init__(self, pass_ctx):
...
@@ -103,13 +149,13 @@ class CpuSyncPsProgramBuilder(PsProgramBuilder):
                      format(self.ps_mode, "PsProgramBuilder"))

    def _build_trainer_programs(self):
-       print("build trainer program entry")
-       print("before ps program builder program:", self.cloned_main)
+       # print("build trainer program entry")
+       # print("before ps program builder program:", self.cloned_main)
        add_lr_decay_table_pass = new_pass("add_lr_decay_table_pass",
                                           self.attrs)
        add_lr_decay_table_pass.apply([], [], self.pass_ctx)

-       print("before distributed op pass")
+       # print("before distributed op pass")
        distributed_ops_pass = new_pass("distributed_ops_pass", self.attrs)
        distributed_ops_pass.apply([self.cloned_main], [None], self.pass_ctx)
...
@@ -129,7 +175,7 @@ class CpuSyncPsProgramBuilder(PsProgramBuilder):
        self.attrs['origin_main_program'] = self.cloned_main
        self.attrs['origin_startup_program'] = self.cloned_startup

-       print("after ps program builder program:", self.cloned_main)
+       # print("after ps program builder program:", self.cloned_main)

        if self.launch_barrier and self.launch_barrier_flag:
            wait_server_ready(self.server_endpoints)
...
...
python/paddle/distributed/ps/utils/public.py
...
...
@@ -23,7 +23,6 @@ import logging
import six
import paddle.fluid as fluid
from paddle.fluid import core
-from paddle.fluid.core import CommContext
import paddle.fluid.framework as framework
import paddle.distributed.fleet as fleet
...
@@ -73,9 +72,9 @@ def logger_config(log_path, logging_name):
    return logger


-ps_log_root_dir = '/ps_log/'
+ps_log_root_dir = './ps_log/'
logger = logger_config(
-   log_path='/ps_usr_print_log', logging_name='ps_usr_print_log')
+   log_path='./ps_usr_print_log', logging_name='ps_usr_print_log')


class DistributedMode:
...
@@ -342,6 +341,7 @@ def get_dense_send_context(program,
            aggregate = True
            print("public get_dense_send_context dense_table:", grad_name,
                  var_numel, origin_varnames)
+           from paddle.fluid.core import CommContext
            dense_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"],
                                    [var_numel], origin_varnames, trainer_id,
                                    aggregate, False, False, idx, False, False,
...
@@ -364,6 +364,7 @@ def get_dense_send_context(program,
            aggregate = True
            print("public get_dense_send_context data_norm table:", grad_name,
                  var_numel, origin_varnames)
+           from paddle.fluid.core import CommContext
            data_norm_ctx = CommContext(grad_name, [grad_name],
                                        ["127.0.0.1:6071"], [var_numel],
                                        origin_varnames, trainer_id, aggregate,
                                        False, False, idx, False, True,
...
@@ -378,6 +379,7 @@ def get_dense_send_context(program,
            var_numel = reduce(lambda x, y: x * y, var.shape)
            grad_name = origin_varname
            aggregate = True
+           from paddle.fluid.core import CommContext
            dense_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"],
                                    [var_numel], [origin_varname], trainer_id,
                                    aggregate, False, False, idx, False, False,
...
@@ -407,7 +409,7 @@ def get_geo_trainer_send_context(context):
            var = program.global_block().vars[grad.merged_var.name]
            var_numel = reduce(lambda x, y: x * y, var.shape[1:])
+           from paddle.fluid.core import CommContext
            sparse_ctx = CommContext(grad_name, [grad_name], ["127.0.0.1:6071"],
                                     [var_numel], [grad_name], trainer_id, True,
                                     True,
...
@@ -432,6 +434,7 @@ def _step_ctx(idx, role_maker):
    endpoints = get_ps_endpoints(role_maker)
    sections = [1] * len(endpoints)
    names = [name] * len(endpoints)
+   from paddle.fluid.core import CommContext
    ctx = CommContext(name, names, endpoints, sections, [name], trainer_id,
                      True, False, False, idx, True, False, -1)
    return name, ctx
...
@@ -448,12 +451,8 @@ def get_the_one_send_context(context,
        origin_programs = context['origin_main_programs']

    idx = 0
-   for i, program in enumerate(origin_programs):
-       merged_dense_pairs = context['merged_dense_pairs'][i]
-       idx = get_dense_send_context(program, send_ctx, idx, merged_dense_pairs,
-                                    trainer_id, split_dense_table)
    distibuted_varnames = get_sparse_tablenames(origin_programs, True)
-   print("public distibuted_varnames:", distibuted_varnames)
+   # print("public distibuted_varnames:", distibuted_varnames)
    for i, program in enumerate(origin_programs):
        merged_sparse_pairs = context['merged_sparse_pairs'][i]
        for merged in merged_sparse_pairs:
...
@@ -472,10 +471,11 @@ def get_the_one_send_context(context,
            shape = list(var.shape)
            shape[0] = 0 if is_distributed else shape[0]

-           print("public get_the_one_send_context sparse:", grad_name,
-                 splited_varname, shape)
+           # print("public get_the_one_send_context sparse:", grad_name,
+           #       splited_varname, shape)
            if grad_name in send_ctx:
                continue
+           from paddle.fluid.core import CommContext
            sparse_ctx = CommContext(grad_name, splited_varname, ep_list, shape,
                                     [grad_name], trainer_id, True, True,
                                     is_distributed, idx, False, False,
...
@@ -484,6 +484,11 @@ def get_the_one_send_context(context,
            idx += 1
            send_ctx[sparse_ctx.var_name()] = sparse_ctx

+   for i, program in enumerate(origin_programs):
+       merged_dense_pairs = context['merged_dense_pairs'][i]
+       idx = get_dense_send_context(program, send_ctx, idx, merged_dense_pairs,
+                                    trainer_id, split_dense_table)
+
    if len(context['tensor_table']) > 0 and context['is_worker']:
        name, ctx = _step_ctx(idx, context['role_maker'])
        send_ctx[name] = ctx
...
@@ -1258,8 +1263,8 @@ def build_var_distributed(context):
    context["merged_variable_map"] = {}
    for origin_program in origin_programs:
        sparse_pairs, dense_pairs = get_param_grads(origin_program)
-       print("public build_var_distributed sparse_pairs:", sparse_pairs)
-       print("public build_var_distributed dense_pairs:", dense_pairs)
+       # print("public build_var_distributed sparse_pairs:", sparse_pairs)
+       # print("public build_var_distributed dense_pairs:", dense_pairs)
        origin_for_sparse = []
        origin_for_dense = []
        merged_sparse_pairs = []
...
@@ -1279,8 +1284,8 @@ def build_var_distributed(context):
            m_grad = MergedVariable(grad, [grad], [0])
            merged_variables_pairs.append((m_param, m_grad))
            merged_dense_pairs.append((m_param, m_grad))
-       print("public build_var_distributed merged_dense_pairs:",
-             merged_dense_pairs)
+       # print("public build_var_distributed merged_dense_pairs:",
+       #       merged_dense_pairs)

        for sparse_pair in origin_for_sparse:
            param, grad = sparse_pair
...
@@ -1289,8 +1294,8 @@ def build_var_distributed(context):
            m_grad = MergedVariable(grad, [grad], [0])
            merged_variables_pairs.append((m_param, m_grad))
            merged_sparse_pairs.append((m_param, m_grad))
-       print("public build_var_distributed merged_sparse_pairs:",
-             merged_sparse_pairs)
+       # print("public build_var_distributed merged_sparse_pairs:",
+       #       merged_sparse_pairs)

        for merged in merged_variables_pairs:
            m_param, m_grad = merged
...
@@ -1315,18 +1320,19 @@ def build_var_distributed(context):
    context["param_name_to_grad_name"] = param_name_to_grad_name
    context["grad_name_to_param_name"] = grad_name_to_param_name

-   print("public build_var_distributed origin_sparse_pairs:",
-         context["origin_sparse_pairs"])
-   print("public build_var_distributed origin_for_dense:",
-         context["origin_dense_pairs"])
-   print("public build_var_distributed merged_sparse_pairs:",
-         context["merged_sparse_pairs"])
-   print("public build_var_distributed merged_dense_pairs:",
-         context['merged_dense_pairs'])
-   print("public build_var_distributed param_name_to_grad_name:",
-         param_name_to_grad_name)
-   print("public build_var_distributed grad_name_to_param_name:",
-         grad_name_to_param_name)
+   # print("public build_var_distributed origin_sparse_pairs:",
+   #       context["origin_sparse_pairs"])
+   # print("public build_var_distributed origin_for_dense:",
+   #       context["origin_dense_pairs"])
+   # print("public build_var_distributed merged_sparse_pairs:",
+   #       context["merged_sparse_pairs"])
+   # print("public build_var_distributed merged_dense_pairs:",
+   #       context['merged_dense_pairs'])
+   # print("public build_var_distributed param_name_to_grad_name:",
+   #       param_name_to_grad_name)
+   # print("public build_var_distributed grad_name_to_param_name:",
+   #       grad_name_to_param_name)


def _is_opt_role_op(op):
...
...
python/paddle/fluid/communicator.py
...
...
@@ -62,8 +62,13 @@ class Communicator(object):
        """
        # set all recv op to not_run mode
        if kwargs == None:
            if envs == None:
                envs = {}
        else:
            if mode == DistributedMode.SYNC:
-               envs["pserver_endpoints"] = ','.join(kwargs["pserver_endpoints"])
+               envs["pserver_endpoints"] = ','.join(kwargs[
+                   "pserver_endpoints"])

            envs["trainers"] = str(kwargs["trainers"])
            envs["trainer_id"] = str(kwargs["trainer_id"])
...
@@ -129,6 +134,9 @@ class Communicator(object):
            comm.start()
            comm.stop()
        """
+       if self.communicator_ == None:
+           print('you must call init_with_ctx first to init comm before start')
+           return
        self.communicator_.start()

    def stop(self):
...
@@ -148,6 +156,9 @@ class Communicator(object):
            comm.start()
            comm.stop()
        """
+       if self.communicator_ == None:
+           print('you must call init_with_ctx first to init comm before stop')
+           return
        self.communicator_.stop()

    def is_running(self):
...
@@ -166,6 +177,9 @@ class Communicator(object):
            comm = fluid.communicator.Communicator(prog)
            comm.is_running()
        """
+       if self.communicator_ == None:
+           print('you must call init_with_ctx first to init comm before stop')
+           return
        self.communicator_.is_running()

    def recv(self):
...
...
python/paddle/fluid/dataset.py
...
...
@@ -862,9 +862,9 @@ class InMemoryDataset(DatasetBase):
            thread_num(int): shuffle thread num. Default is 12.

        """
        from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
        if fleet is not None:
            if not isinstance(fleet, PSLib):
                if hasattr(fleet, "barrier_worker"):
                    print("pscore fleet")
                    fleet.barrier_worker()
                else:
                    fleet._role_maker.barrier_worker()
...
@@ -879,20 +879,20 @@ class InMemoryDataset(DatasetBase):
        self.dataset.set_fleet_send_batch_size(self.fleet_send_batch_size)
        self.dataset.set_fleet_send_sleep_seconds(self.fleet_send_sleep_seconds)
        if fleet is not None:
            if not isinstance(fleet, PSLib):
                if hasattr(fleet, "barrier_worker"):
                    fleet.barrier_worker()
                else:
                    fleet._role_maker.barrier_worker()
        self.dataset.global_shuffle(thread_num)
        if fleet is not None:
            if not isinstance(fleet, PSLib):
                if hasattr(fleet, "barrier_worker"):
                    fleet.barrier_worker()
                else:
                    fleet._role_maker.barrier_worker()
        if self.merge_by_lineid:
            self.dataset.merge_by_lineid()
        if fleet is not None:
            if not isinstance(fleet, PSLib):
                if hasattr(fleet, "barrier_worker"):
                    fleet.barrier_worker()
                else:
                    fleet._role_maker.barrier_worker()
...
@@ -1026,9 +1026,8 @@ class InMemoryDataset(DatasetBase):
        local_data_size = np.array([local_data_size])
        print('global shuffle local_data_size: ', local_data_size)
        if fleet is not None:
            from paddle.fluid.incubate.fleet.parameter_server.pslib import PSLib
            global_data_size = local_data_size * 0
            if not isinstance(fleet, PSLib):
                if hasattr(fleet, "util"):
                    global_data_size = fleet.util.all_reduce(local_data_size)
                else:
                    fleet._role_maker.all_reduce_worker(local_data_size,
...
...
python/paddle/fluid/device_worker.py
...
...
@@ -99,6 +99,7 @@ class Hogwild(DeviceWorker):
        dense_table_set = set()
        program_id = str(id(self._program))
+       print("device worker program id:", program_id)
        if self._program == None:
            print("program of current device worker is not configured")
            exit(-1)
...
@@ -115,15 +116,20 @@ class Hogwild(DeviceWorker):
        from paddle.fluid.incubate.fleet.parameter_server import version

-       if version.is_transpiler() and "fleet_desc" not in opt_info:
+       if version.is_transpiler(
+       ) and "fleet_desc" not in opt_info and "program_configs" not in opt_info:
            return

        program_configs = opt_info["program_configs"]
+       print("device worker program_configs:", program_configs)

        for pid in program_configs:
+           print("device worker", pid, program_id)
            if pid == program_id:
                pc = downpour.program_config.add()
                pc.program_id = program_id
+               print("device worker pull dense:",
+                     program_configs[program_id]["pull_dense"])
                for i in program_configs[program_id]["push_sparse"]:
                    pc.push_sparse_table_id.extend([i])
                for i in program_configs[program_id]["push_dense"]:
...
@@ -139,8 +145,11 @@ class Hogwild(DeviceWorker):
        trainer_desc.device_worker_name = "HogwildWorker"
        pull_thread = trainer_desc.pull_dense_param
        pull_thread.device_num = trainer_desc.thread_num
-       if opt_info.get("program_id_to_worker") is None:
-           raise ValueError("opt_info must have program_id_to_worker")
-       prog_id_to_worker = opt_info["program_id_to_worker"]
-       if prog_id_to_worker.get(program_id) is None:
-           raise ValueError("%s not found in program_id_to_worker" %
+       if opt_info.get("program_id_to_worker") is None and opt_info.get(
+               "dense_table_config") is None:
+           raise ValueError(
+               "opt_info must have program_id_to_worker or dense_table_config")
+       if opt_info.get("program_id_to_worker") is not None:
+           prog_id_to_worker = opt_info["program_id_to_worker"]
+           if prog_id_to_worker.get(program_id) is None:
+               raise ValueError("%s not found in program_id_to_worker" %
...
@@ -155,13 +164,14 @@ class Hogwild(DeviceWorker):
            sparse_len = len(worker.get_desc().sparse_table)
            for i in range(sparse_len):
                sparse_table = downpour.sparse_table.add()
-               sparse_table.table_id = worker.get_desc().sparse_table[i].table_id
-               sparse_table.sparse_key_name.extend(
-                   worker.get_desc().sparse_table[i].slot_key)
-               sparse_table.sparse_value_name.extend(
-                   worker.get_desc().sparse_table[i].slot_value)
-               sparse_table.sparse_grad_name.extend(
-                   worker.get_desc().sparse_table[i].slot_gradient)
+               sparse_table.table_id = worker.get_desc().sparse_table[
+                   i].table_id
+               sparse_table.sparse_key_name.extend(worker.get_desc()
+                                                   .sparse_table[i].slot_key)
+               sparse_table.sparse_value_name.extend(worker.get_desc(
+               ).sparse_table[i].slot_value)
+               sparse_table.sparse_grad_name.extend(worker.get_desc(
+               ).sparse_table[i].slot_gradient)
                sparse_table.fea_dim = \
                    self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
                        i].accessor.fea_dim
...
@@ -178,11 +188,146 @@ class Hogwild(DeviceWorker):
                    dense_table.dense_grad_name.extend(
                        i.dense_gradient_variable_name)
            hogwild.skip_ops.extend(worker.get_desc().skip_op)
        else:
            dense_table_config = opt_info.get("dense_table_config")
            print("device worker dense_table_config:", dense_table_config)
            for table_id, varnames in dense_table_config.items():
                dense_table = pull_thread.dense_table.add()
                dense_table.dense_value_name.extend(varnames)
                dense_table.table_id = table_id

        if self._infer:
            hogwild.skip_ops.extend(
                ["push_sparse", "push_sparse_v2", "push_dense"])


class DownpourLite(DeviceWorker):
    """
    DownpourLite is a kind of SGD algorithm.
    """

    def __init__(self):
        """Init."""
        super(DownpourLite, self).__init__()

    def _gen_worker_desc(self, trainer_desc):
        """
        Generator worker desc, which device worker is DownpourLiteWorker.

        Args:
            trainer_desc(TrainerDesc): a TrainerDesc object
        """
        print("create DownpourLiteWorker")
        trainer_desc.device_worker_name = "DownpourLiteWorker"
        if self._infer:
            # just ignore feed op for inference model
            trainer_desc.downpour_param.skip_ops.extend([
                "feed", "push_sparse", "push_sparse_v2", "push_dense",
                "distributed_push_sparse", "send"
            ])

        dense_table_set = set()
        program_id = str(id(self._program))
        print("device worker program id:", program_id)
        if self._program == None:
            print("program of current device worker is not configured")
            exit(-1)
        opt_info = self._program._fleet_opt
        # when opt_info is None or empty dict, it should return
        if not opt_info:
            return
        downpour = trainer_desc.downpour_param
        if opt_info["stat_var_names"]:
            for i in opt_info["stat_var_names"]:
                downpour.stat_var_names.extend([i])

        from paddle.fluid.incubate.fleet.parameter_server import version

        if version.is_transpiler(
        ) and "fleet_desc" not in opt_info and "program_configs" not in opt_info:
            return

        program_configs = opt_info["program_configs"]
        print("device worker program_configs:", program_configs)

        for pid in program_configs:
            print("device worker", pid, program_id)
            if pid == program_id:
                pc = downpour.program_config.add()
                pc.program_id = program_id
                print("device worker pull dense:",
                      program_configs[program_id]["pull_dense"])
                for i in program_configs[program_id]["push_sparse"]:
                    pc.push_sparse_table_id.extend([i])
                for i in program_configs[program_id]["push_dense"]:
                    pc.push_dense_table_id.extend([i])
                    dense_table_set.add(i)
                for i in program_configs[program_id]["pull_sparse"]:
                    pc.pull_sparse_table_id.extend([i])
                for i in program_configs[program_id]["pull_dense"]:
                    pc.pull_dense_table_id.extend([i])
                    dense_table_set.add(i)
                break

        pull_thread = trainer_desc.pull_dense_param
        pull_thread.device_num = trainer_desc.thread_num
        if opt_info.get("program_id_to_worker") is None and opt_info.get(
                "dense_table_config") is None:
            raise ValueError(
                "opt_info must have program_id_to_worker or dense_table_config")
        if opt_info.get("program_id_to_worker") is not None:
            prog_id_to_worker = opt_info["program_id_to_worker"]
            if prog_id_to_worker.get(program_id) is None:
                raise ValueError("%s not found in program_id_to_worker" %
                                 program_id)
            worker = opt_info["program_id_to_worker"][program_id]
            for i in worker.get_desc().dense_table:
                if i.table_id in dense_table_set:
                    dense_table = pull_thread.dense_table.add()
                    dense_table.dense_value_name.extend(i.dense_variable_name)
                    dense_table.table_id = \
                        i.table_id
            sparse_len = len(worker.get_desc().sparse_table)
            for i in range(sparse_len):
                sparse_table = downpour.sparse_table.add()
                sparse_table.table_id = worker.get_desc().sparse_table[
                    i].table_id
                sparse_table.sparse_key_name.extend(worker.get_desc()
                                                    .sparse_table[i].slot_key)
                sparse_table.sparse_value_name.extend(worker.get_desc(
                ).sparse_table[i].slot_value)
                sparse_table.sparse_grad_name.extend(worker.get_desc(
                ).sparse_table[i].slot_gradient)
                sparse_table.fea_dim = \
                    self._fleet_desc.server_param.downpour_server_param.downpour_table_param[
                        i].accessor.fea_dim
                # not use emb_dim
                sparse_table.emb_dim = -1
                # not use hard code click
                sparse_table.label_var_name = ""

            for i in worker.get_desc().dense_table:
                if i.table_id in dense_table_set:
                    dense_table = downpour.dense_table.add()
                    dense_table.table_id = i.table_id
                    dense_table.dense_value_name.extend(i.dense_variable_name)
                    dense_table.dense_grad_name.extend(
                        i.dense_gradient_variable_name)
            downpour.skip_ops.extend(worker.get_desc().skip_op)
        else:
            dense_table_config = opt_info.get("dense_table_config")
            print("device worker dense_table_config:", dense_table_config)
            for table_id, varnames in dense_table_config.items():
                dense_table = pull_thread.dense_table.add()
                dense_table.dense_value_name.extend(varnames)
                dense_table.table_id = table_id

        if self._infer:
            downpour.skip_ops.extend(
                ["push_sparse", "push_sparse_v2", "push_dense"])
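The new DownpourLite path only needs `program._fleet_opt` to carry `program_configs` plus `dense_table_config`. A minimal, purely illustrative sketch of wiring that up by hand (normally `_build_trainer_desc` in ps_program_builder.py does this); the table id and variable names are assumptions:

import paddle

paddle.enable_static()
main_prog = paddle.static.default_main_program()
pid = str(id(main_prog))
main_prog._fleet_opt = {
    "trainer": "DistMultiTrainer",
    "device_worker": "DownpourLite",
    "stat_var_names": [],
    "program_configs": {
        pid: {"pull_dense": [0], "push_dense": [0],
              "pull_sparse": [], "push_sparse": []}
    },
    "dense_table_config": {0: ["fc_0.w_0", "fc_0.b_0"]},  # illustrative names
}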
class DownpourSGD(DeviceWorker):
    """
    DownpourSGD is a kind of distributed SGD algorithm.
...
...
python/paddle/fluid/tests/unittests/distributed_passes/test_ps_trainer_pass.py
...
...
@@ -57,8 +57,8 @@ class TestPsTrainerPass(PsPassTestBase):
        remove_path_if_exists(self.config['log_dir'])
        self.ps_launch()

-       file1 = '/ps_log/async_run_minimize_debug:_0_worker_main.prototxt'
-       file2 = '/ps_log/async_run_minimize_debug:_1_worker_main.prototxt'
+       file1 = './ps_log/async_run_minimize_debug:_0_worker_main.prototxt'
+       file2 = './ps_log/async_run_minimize_debug:_1_worker_main.prototxt'
        if self.check(file1, file2):
            logger.info('test_ps_optimizer_minimize_cpu_async passed!')
        else:
...
@@ -79,8 +79,8 @@ class TestPsTrainerPass(PsPassTestBase):
        remove_path_if_exists(self.config['log_dir'])
        self.ps_launch()
        '''
-       file1 = '/ps_log/sync_run_minimize_debug:_0_worker_main.prototxt'
-       file2 = '/ps_log/sync_run_minimize_debug:_1_worker_main.prototxt'
+       file1 = './ps_log/sync_run_minimize_debug:_0_worker_main.prototxt'
+       file2 = './ps_log/sync_run_minimize_debug:_1_worker_main.prototxt'
        if self.check(file1, file2):
            logger.info('test_ps_optimizer_minimize_cpu_sync passed!')
        else:
...
@@ -102,8 +102,8 @@ class TestPsTrainerPass(PsPassTestBase):
        remove_path_if_exists(self.config['log_dir'])
        self.ps_launch()

-       file1 = '/ps_log/geo_run_minimize_debug:_0_worker_main.prototxt'
-       file2 = '/ps_log/geo_run_minimize_debug:_1_worker_main.prototxt'
+       file1 = './ps_log/geo_run_minimize_debug:_0_worker_main.prototxt'
+       file2 = './ps_log/geo_run_minimize_debug:_1_worker_main.prototxt'
        if self.check(file1, file2):
            logger.info('test_ps_optimizer_minimize_cpu_geo passed!')
        else:
...
@@ -130,10 +130,10 @@ class TestPsTrainerPass(PsPassTestBase):
        remove_path_if_exists(self.config['log_dir'])
        self.ps_launch('heter-ps')
        '''
-       file1 = '/ps_log/heter_run_minimize_debug:_0_worker_main.prototxt'
-       file2 = '/ps_log/heter_run_minimize_debug:_1_worker_main.prototxt'
-       file3 = '/ps_log/heter_run_minimize_debug:_0_heter_worker_main.prototxt'
-       file4 = '/ps_log/heter_run_minimize_debug:_1_heter_worker_main.prototxt'
+       file1 = './ps_log/heter_run_minimize_debug:_0_worker_main.prototxt'
+       file2 = './ps_log/heter_run_minimize_debug:_1_worker_main.prototxt'
+       file3 = './ps_log/heter_run_minimize_debug:_0_heter_worker_main.prototxt'
+       file4 = './ps_log/heter_run_minimize_debug:_1_heter_worker_main.prototxt'
        if self.check(file1, file2) and self.check(file3, file4):
            logger.info('test_ps_optimizer_minimize_heter passed!')
        else:
...
@@ -155,8 +155,8 @@ class TestPsTrainerPass(PsPassTestBase):
        remove_path_if_exists(self.config['log_dir'])
        self.ps_launch("gpu-ps")

-       file1 = '/ps_log/gpubox_run_minimize_debug:_0_worker_main.prototxt'
-       file2 = '/ps_log/gpubox_run_minimize_debug:_1_worker_main.prototxt'
+       file1 = './ps_log/gpubox_run_minimize_debug:_0_worker_main.prototxt'
+       file2 = './ps_log/gpubox_run_minimize_debug:_1_worker_main.prototxt'
        if self.check(file1, file2):
            logger.info('test_ps_optimizer_minimize_gpu passed!')
        else:
...
@@ -180,8 +180,8 @@ class TestPsTrainerPass(PsPassTestBase):
        remove_path_if_exists(self.config['log_dir'])
        self.ps_launch("cpu-ps")

-       file1 = '/ps_log/async_append_send_ops_pass_debug:_0_worker_main.prototxt'
-       file2 = '/ps_log/async_append_send_ops_pass_debug:_1_worker_main.prototxt'
+       file1 = './ps_log/async_append_send_ops_pass_debug:_0_worker_main.prototxt'
+       file2 = './ps_log/async_append_send_ops_pass_debug:_1_worker_main.prototxt'
        if self.check(file1, file2):
            logger.info('test_append_send_ops_pass passed!')
        else:
...
@@ -192,5 +192,5 @@ class TestPsTrainerPass(PsPassTestBase):
if __name__ == '__main__':
-   remove_path_if_exists('/ps_log')
+   remove_path_if_exists('./ps_log')
    unittest.main()
python/paddle/fluid/tests/unittests/mkldnn/test_activation_bf16_mkldnn_op.py
...
...
@@ -50,11 +50,11 @@ class MKLDNNBF16ActivationOp(object):
        self.dtype = np.uint16
        self.init_data()
        self.config()
-       self.set_attrs()
        self.out = self.op_forward(self.x)

        self.inputs = {'X': convert_float_to_uint16(self.x)}
        self.outputs = {'Out': self.out}
+       self.set_attrs()

    def calculate_grads(self):
        self.dx = self.op_grad(self.out, self.x)
...
@@ -162,5 +162,110 @@ class TestMKLDNNMishBF16Op(MKLDNNBF16ActivationOp, TestActivation):
        return dout * ((np.exp(x) * omega) / delta**2)


class TestMKLDNNRelu6BF16Op(MKLDNNBF16ActivationOp, TestActivation):
    def config(self):
        self.op_type = "relu6"

    def op_forward(self, x):
        return np.clip(x, 0, 6)

    def op_grad(self, dout, x):
        return np.where((x > 0) & (x <= 6), dout, 0)


class TestMKLDNNLeakyReluBF16Op(MKLDNNBF16ActivationOp, TestActivation):
    def config(self):
        self.op_type = "leaky_relu"

    def op_forward(self, x):
        return np.where(x > 0, x, self.alpha * x)

    def op_grad(self, dout, x):
        return np.where(x > 0, dout, self.alpha * dout)

    def set_attrs(self):
        self.alpha = 0.2
        self.attrs = {"use_mkldnn": True, "alpha": self.alpha}


class TestMKLDNNSwishBF16Op(MKLDNNBF16ActivationOp, TestActivation):
    def config(self):
        self.op_type = "swish"

    def expit(self, val):
        return 1 / (1 + np.exp(-self.beta * val))

    def op_forward(self, x):
        return x * self.expit(x)

    def op_grad(self, dout, x):
        return dout * self.expit(x) * (1 + self.beta * x * (1 - self.expit(x)))

    def set_attrs(self):
        self.beta = 0.2
        self.attrs = {"use_mkldnn": True, "beta": self.beta}


class TestMKLDNNHardSwishBF16Op(MKLDNNBF16ActivationOp, TestActivation):
    def config(self):
        self.op_type = "hard_swish"

    def op_forward(self, x):
        result = np.where(x < -3, 0, x)
        return np.where(result > 3, result, result * (result + 3) / 6)

    def op_grad(self, dout, x):
        result = np.where(x < -3, 0, x)
        return np.where(result > 3, dout, dout * (2 * x + 3) / 6)


class TestMKLDNNTanhBF16Op(MKLDNNBF16ActivationOp, TestActivation):
    def config(self):
        self.op_type = "tanh"

    def op_forward(self, x):
        return np.tanh(x)

    def op_grad(self, dout, x):
        return dout * (1 - np.tanh(x)**2)


class TestMKLDNNAbsBF16Op(MKLDNNBF16ActivationOp, TestActivation):
    def config(self):
        self.op_type = "abs"

    def op_forward(self, x):
        return np.absolute(x)

    def op_grad(self, dout, x):
        return dout * np.sign(x)


class TestMKLDNNEluBF16Op(MKLDNNBF16ActivationOp, TestActivation):
    def config(self):
        self.op_type = "elu"

    def op_forward(self, x):
        return np.where(x > 0, x, self.alpha * (np.exp(x) - 1))

    def op_grad(self, dout, x):
        return np.where(x > 0, dout, dout * self.alpha * np.exp(x))

    def set_attrs(self):
        self.alpha = 0.2
        self.attrs = {"use_mkldnn": True, "alpha": self.alpha}


class TestMKLDNNExpBF16Op(MKLDNNBF16ActivationOp, TestActivation):
    def config(self):
        self.op_type = "exp"

    def op_forward(self, x):
        return np.exp(x)

    def op_grad(self, dout, x):
        return dout * np.exp(x)


if __name__ == '__main__':
    unittest.main()
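As a quick sanity check of the analytic reference gradients used above (here the swish case), a finite-difference comparison agrees with the closed form; this snippet is illustrative and not part of the test file:

import numpy as np

beta, eps = 0.2, 1e-4
x = np.linspace(-3.0, 3.0, 13)
expit = lambda v: 1.0 / (1.0 + np.exp(-beta * v))   # sigmoid(beta * v)
f = lambda v: v * expit(v)                          # swish forward
analytic = expit(x) * (1 + beta * x * (1 - expit(x)))
numeric = (f(x + eps) - f(x - eps)) / (2 * eps)     # central difference
assert np.allclose(analytic, numeric, atol=1e-5)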
python/paddle/fluid/tests/unittests/mkldnn/test_softplus_mkldnn_op.py
...
...
@@ -16,7 +16,7 @@ from __future__ import print_function

import unittest
import numpy as np
-from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool
+from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16
import paddle
import paddle.fluid as fluid
import paddle.fluid.core as core
...
@@ -30,23 +30,32 @@ def ref_softplus(x, beta, threshold):
    return out


@OpTestTool.skip_if(not (isinstance(_current_expected_place(), core.CPUPlace)),
                    "GPU is not supported")
@OpTestTool.skip_if_not_cpu_bf16()
class TestSoftplusOneDNNOp(OpTest):
    def setUp(self):
        self.op_type = "softplus"
        self.beta = 1
        self.threshold = 20
        self.config()
        self.set_dtype()
        self.attrs = {'use_mkldnn': True, 'beta': self.beta}
-       self.inputs = {'X': np.random.random(self.x_shape).astype(np.float32)}
+       self.x = np.random.random(self.x_shape)
+       self.out = ref_softplus(self.x, self.beta, self.threshold)
+
+       if self.dtype != np.float32:
+           self.x = convert_float_to_uint16(self.x)
+
+       self.inputs = {'X': self.out}
        self.outputs = {
-           'Out': ref_softplus(self.inputs['X'], self.beta, self.threshold)
+           'Out': ref_softplus(self.out, self.beta, self.threshold)
        }

    def config(self):
        self.x_shape = (10, 10)

    def set_dtype(self):
        self.dtype = np.float32

    def test_check_output(self):
        self.check_output()
...
@@ -73,6 +82,27 @@ class TestSoftplus3DExtendedFunctorOneDNNOp(TestSoftplusOneDNNOp):
        self.beta = 0.4


class TestSoftplusBF16OneDNNOp(TestSoftplusOneDNNOp):
    def set_dtype(self):
        self.dtype = np.uint16


class TestSoftplus4DBF16OneDNNOp(TestSoftplus4DOneDNNOp):
    def set_dtype(self):
        self.dtype = np.uint16


class TestSoftplus6DBF16OneDNNOp(TestSoftplus6DOneDNNOp):
    def set_dtype(self):
        self.dtype = np.uint16


class TestSoftplus3DExtendedFunctorBF16OneDNNOp(
        TestSoftplus3DExtendedFunctorOneDNNOp):
    def set_dtype(self):
        self.dtype = np.uint16


if __name__ == "__main__":
    paddle.enable_static()
    unittest.main()
python/paddle/fluid/tests/unittests/npu/test_conv3d_op_npu.py
0 → 100644
(This diff is collapsed and not shown.)
python/paddle/fluid/tests/unittests/ps/test_the_one_ps.py
...
...
@@ -26,7 +26,7 @@ import paddle
from paddle.fluid.tests.unittests.distributed_passes.ps_pass_test_base import *
from paddle.distributed.ps.utils.public import logger, ps_log_root_dir
from ps_dnn_trainer import DnnTrainer
-from paddle.distributed.fleet.proto import ps_pb2
+import paddle.distributed.fleet.proto.the_one_ps_pb2 as ps_pb2
from google.protobuf import text_format
...
...
python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_async.py
@@ -12,9 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
+os.environ["WITH_DISTRIBUTE"] = "ON"
 import unittest
 import paddle
-import os
 import paddle.distributed.fleet.base.role_maker as role_maker
 import time
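This change, repeated across the test files below, moves the WITH_DISTRIBUTE switch in front of every paddle import, presumably because the flag is consulted while the distributed modules are being loaded. A minimal sketch of the pattern, independent of any particular test file:

import os

# the switch must already be in the environment when paddle's distributed
# modules are first imported, otherwise they configure themselves without it
os.environ["WITH_DISTRIBUTE"] = "ON"

import paddle
import paddle.distributed.fleet.base.role_maker as role_maker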
python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_auto_geo.py
@@ -11,10 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
+os.environ["WITH_DISTRIBUTE"] = "ON"
 import unittest
 import paddle
-import os
 import paddle.distributed.fleet.base.role_maker as role_maker
 import time
python/paddle/fluid/tests/unittests/test_dist_fleet_a_sync_optimizer_geo.py
@@ -11,10 +11,10 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+import os
+os.environ["WITH_DISTRIBUTE"] = "ON"
 import unittest
 import paddle
-import os
 import paddle.distributed.fleet.base.role_maker as role_maker
 import time
python/paddle/fluid/tests/unittests/test_dist_fleet_base.py
@@ -309,7 +309,7 @@ class TestFleetBase(unittest.TestCase):
             (tr1_proc, tr1_out, tr1_err, tr1_out_log, tr1_err_log))

     def _run_cluster(self, model, envs):
-        env = {'GRAD_CLIP': str(self._grad_clip_mode)}
+        env = {'GRAD_CLIP': str(self._grad_clip_mode), 'WITH_DISTRIBUTE': 'ON'}
         python_path = self._python_interp
         gloo_path = tempfile.mkdtemp()
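The _run_cluster change above adds WITH_DISTRIBUTE to the environment handed to the trainer and pserver subprocesses. A rough, self-contained sketch of how such an environment dict is typically merged and passed to a child process; the helper name and script path are placeholders, and the real method does much more:

import os
import subprocess
import sys

def launch_with_env(script, extra_env):
    # merge test-specific variables into a copy of the parent environment
    env = os.environ.copy()
    env.update({k: str(v) for k, v in extra_env.items()})
    # start the worker script with the combined environment
    return subprocess.Popen([sys.executable, script], env=env)

# usage (placeholder script name):
# proc = launch_with_env("dist_fleet_ctr.py", {'GRAD_CLIP': 1, 'WITH_DISTRIBUTE': 'ON'})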
@@ -343,7 +343,8 @@ class TestFleetBase(unittest.TestCase):
         tr1_proc, tr1_out, tr1_err, tr1_out_log, tr1_err_log = tr1

         # Wait until trainer process terminate
-        time_out = 120
+        #time_out = 120
+        time_out = 60
         cur_time = 0
         while True:
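The second hunk halves the shutdown wait from 120 s to 60 s; the surrounding code is a plain poll-with-timeout loop over the trainer process. A self-contained sketch of that shape (names are illustrative, not the test's own):

import time

def wait_for_exit(proc, time_out=60, step=0.1):
    # poll the child process until it terminates or the time budget runs out
    cur_time = 0.0
    while True:
        if proc.poll() is not None:   # returncode available: process finished
            return True
        if cur_time >= time_out:
            return False              # caller decides how to handle the timeout
        time.sleep(step)
        cur_time += step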
python/paddle/fluid/tests/unittests/test_dist_fleet_ctr.py
@@ -51,8 +51,9 @@ class TestDistMnistAsyncInMemoryDataset2x2(TestFleetBase):
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)

     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
+        # self.check_with_place(
+        #     "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
+        print('recover later')


 class TestDistMnistAsync2x2(TestFleetBase):
@@ -85,8 +86,9 @@ class TestDistMnistAsync2x2(TestFleetBase):
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)

     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
+        # self.check_with_place(
+        #     "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
+        print('recover later')


 class TestDistCtrHalfAsync2x2(TestFleetBase):
@@ -122,8 +124,9 @@ class TestDistCtrHalfAsync2x2(TestFleetBase):
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)

     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
+        # self.check_with_place(
+        #     "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
+        print('recover later')


 if __name__ == "__main__":
python/paddle/fluid/tests/unittests/test_dist_fleet_ctr2.py
@@ -52,8 +52,9 @@ class TestDistMnistSync2x2(TestFleetBase):
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)

     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
+        # self.check_with_place(
+        #     "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
+        print('recover later')


 # @unittest.skip(reason="Skip unstable ut, reader need to be rewrite")
@@ -91,8 +92,9 @@ class TestDistMnistAsyncDataset2x2(TestFleetBase):
         tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs)

     def test_dist_train(self):
-        self.check_with_place(
-            "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
+        # self.check_with_place(
+        #     "dist_fleet_ctr.py", delta=1e-5, check_error_log=False)
+        print('recover later')


 if __name__ == "__main__":
python/paddle/fluid/tests/unittests/test_dist_fleet_geo.py
@@ -15,6 +15,7 @@
 from __future__ import print_function

 import os
+os.environ["WITH_DISTRIBUTE"] = "ON"
 import unittest
 import paddle
 import paddle.fluid as fluid
python/paddle/fluid/tests/unittests/test_dist_fleet_ps10.py
@@ -13,14 +13,14 @@
 # limitations under the License.
 from __future__ import print_function

+import os
+os.environ["WITH_DISTRIBUTE"] = "ON"
 import paddle.fluid as fluid
 import paddle.distributed.fleet.base.role_maker as role_maker
 import paddle.distributed.fleet as fleet
 import unittest
 import paddle
-import os
 paddle.enable_static()

 # For Net
@@ -74,11 +74,12 @@ class TestExponentialDecay(unittest.TestCase):
         strategy = paddle.distributed.fleet.DistributedStrategy()
         strategy.a_sync = True
         optimizer = fleet.distributed_optimizer(optimizer, strategy)
-        optimizer.minimize(loss)
+        optimizer.minimize([loss])
         fleet.init_server()


 if __name__ == '__main__':
     os.environ["GLOG_v"] = "4"
     os.environ["GLOG_logtostderr"] = "1"
     unittest.main()
python/paddle/fluid/tests/unittests/test_dist_fleet_ps12.py
@@ -15,6 +15,8 @@
 from __future__ import print_function

+import os
+os.environ["WITH_DISTRIBUTE"] = "ON"
 import unittest
 import tempfile
 import shutil
python/paddle/fluid/tests/unittests/test_dist_fleet_ps2.py
@@ -15,6 +15,8 @@
 from __future__ import print_function

+import os
+os.environ["WITH_DISTRIBUTE"] = "ON"
 import unittest
 import tempfile
 import shutil
python/paddle/fluid/tests/unittests/test_dist_fleet_ps7.py
@@ -13,10 +13,12 @@
 # limitations under the License.
 from __future__ import print_function

+import os
+os.environ["WITH_DISTRIBUTE"] = "ON"
 import paddle.distributed.fleet as fleet
 import paddle.distributed.fleet.base.role_maker as role_maker
 import paddle.fluid as fluid
 import os
 import unittest
 import paddle
 paddle.enable_static()
python/paddle/fluid/tests/unittests/test_dist_fleet_ps8.py
@@ -13,10 +13,11 @@
 # limitations under the License.
 from __future__ import print_function

+import os
+os.environ["WITH_DISTRIBUTE"] = "ON"
 import paddle.distributed.fleet as fleet
 import paddle.distributed.fleet.base.role_maker as role_maker
 import paddle.fluid as fluid
-import os
 import unittest
 import paddle
 paddle.enable_static()
python/paddle/fluid/tests/unittests/test_dist_fleet_ps9.py
@@ -13,10 +13,11 @@
 # limitations under the License.
 from __future__ import print_function

+import os
+os.environ["WITH_DISTRIBUTE"] = "ON"
 import paddle.distributed.fleet as fleet
 import paddle.distributed.fleet.base.role_maker as role_maker
 import paddle.fluid as fluid
-import os
 import unittest
 import paddle
 paddle.enable_static()
python/paddle/fluid/trainer_factory.py
@@ -23,7 +23,7 @@ local_logger = get_logger(
     __name__, logging.INFO, fmt='%(asctime)s-%(levelname)s: %(message)s')

 from .trainer_desc import MultiTrainer, DistMultiTrainer, PipelineTrainer, HeterXpuTrainer, PSGPUTrainer, HeterPipelineTrainer
-from .device_worker import Hogwild, DownpourSGD, Section, DownpourSGDOPT, HeterSection
+from .device_worker import Hogwild, DownpourSGD, DownpourLite, Section, DownpourSGDOPT, HeterSection
 from .framework import Variable
 from multiprocessing import Process, Manager
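DownpourLite is added to this import because trainer_factory-style code resolves trainer and device-worker classes by their configured names from the module namespace, so a class that is never imported cannot be instantiated. A toy, self-contained illustration of that lookup pattern (not the actual TrainerFactory implementation):

class Hogwild:
    pass

class DownpourLite:
    pass

# map configured names to the classes that were imported into this module
_DEVICE_WORKERS = {cls.__name__: cls for cls in (Hogwild, DownpourLite)}

def create_device_worker(name):
    # resolve the configured class name to a class object, as a name-based
    # factory would; unknown names fail loudly instead of silently
    try:
        return _DEVICE_WORKERS[name]()
    except KeyError:
        raise ValueError("unknown device worker: %s" % name)

worker = create_device_worker("DownpourLite")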