urhero / Paddle - forked from PaddlePaddle / Paddle
Commit 4427df37 (unverified)
Authored by liuyuhui on Dec 26, 2020; committed via GitHub on Dec 26, 2020
Parent: 0b74428d

[Kunlun] PR2: Support MultiDevicePass and BKCL in parallel executor (#29574)

Showing 59 changed files with 1,479 additions and 290 deletions.
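One implementation idiom recurs across the C++ diffs below: per-device collective calls are first captured as closures, then issued between a group-start and a group-end so the backend can batch them. A condensed, self-contained sketch of that shape follows; the group functions here are stand-ins for the bkcl_group_start()/bkcl_group_end() calls used in the patch, not real BKCL bindings:

#include <functional>
#include <iostream>
#include <vector>

// Stand-ins for bkcl_group_start()/bkcl_group_end(); illustration only.
static void group_start() { std::cout << "group start\n"; }
static void group_end() { std::cout << "group end\n"; }

int main() {
  // Mirrors the std::vector<std::function<void()>> all_reduce_calls pattern
  // used in all_reduce_op_handle.cc and reduce_op_handle.cc below.
  std::vector<std::function<void()>> all_reduce_calls;
  for (int dev_id = 0; dev_id < 2; ++dev_id) {
    all_reduce_calls.emplace_back(
        [dev_id] { std::cout << "all_reduce on device " << dev_id << "\n"; });
  }
  group_start();
  for (auto &call : all_reduce_calls) call();
  group_end();
  return 0;
}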
Changed files:
CMakeLists.txt  +9 -1
cmake/external/xpu.cmake  +14 -0
paddle/fluid/framework/details/all_reduce_op_handle.cc  +65 -0
paddle/fluid/framework/details/all_reduce_op_handle.h  +16 -2
paddle/fluid/framework/details/bkcl_op_handle.h  +131 -0
paddle/fluid/framework/details/broadcast_op_handle.cc  +67 -1
paddle/fluid/framework/details/broadcast_op_handle.h  +26 -2
paddle/fluid/framework/details/broadcast_op_handle_test.cc  +16 -4
paddle/fluid/framework/details/broadcast_op_handle_test.h  +48 -14
paddle/fluid/framework/details/build_strategy.cc  +45 -11
paddle/fluid/framework/details/build_strategy.h  +10 -2
paddle/fluid/framework/details/execution_strategy.h  +4 -7
paddle/fluid/framework/details/fused_all_reduce_op_handle.cc  +14 -2
paddle/fluid/framework/details/fused_all_reduce_op_handle.h  +9 -0
paddle/fluid/framework/details/fused_broadcast_op_handle.h  +11 -4
paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc  +25 -12
paddle/fluid/framework/details/gather_op_handle_test.cc  +2 -2
paddle/fluid/framework/details/multi_devices_helper.h  +1 -0
paddle/fluid/framework/details/op_handle_base.cc  +60 -7
paddle/fluid/framework/details/op_handle_base.h  +4 -2
paddle/fluid/framework/details/reduce_op_handle.cc  +55 -1
paddle/fluid/framework/details/reduce_op_handle.h  +18 -0
paddle/fluid/framework/details/reduce_op_handle_test.cc  +3 -3
paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc  +2 -2
paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc  +21 -0
paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc  +28 -0
paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h  +5 -0
paddle/fluid/framework/parallel_executor.cc  +198 -22
paddle/fluid/framework/parallel_executor.h  +2 -0
paddle/fluid/framework/var_type_traits.cc  +4 -0
paddle/fluid/framework/var_type_traits.h  +11 -0
paddle/fluid/framework/var_type_traits_test.cc  +3 -0
paddle/fluid/platform/bkcl_helper.h  +280 -0
paddle/fluid/platform/device_context.h  +28 -2
paddle/fluid/pybind/pybind.cc  +6 -5
python/paddle/fluid/compiler.py  +8 -7
python/paddle/fluid/framework.py  +1 -0
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py  +29 -9
python/paddle/fluid/tests/unittests/seresnext_net.py  +13 -8
python/paddle/fluid/tests/unittests/seresnext_test_base.py  +12 -11
python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py  +19 -17
python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py  +11 -9
python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py  +20 -18
python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py  +9 -9
python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py  +2 -2
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py  +1 -1
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py  +9 -9
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py  +3 -3
python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py  +4 -4
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py  +44 -28
python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py  +10 -10
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py  +5 -2
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py  +2 -2
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py  +3 -2
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py  +2 -2
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py  +22 -21
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py  +3 -2
python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py  +4 -4
python/paddle/fluid/tests/unittests/test_program_prune_backward.py  +2 -2
CMakeLists.txt
@@ -29,7 +29,7 @@ include(generic) # simplify cmake module
 find_package(CUDA QUIET)
 option(WITH_GPU "Compile PaddlePaddle with NVIDIA GPU" ${CUDA_FOUND})
 option(WITH_TENSORRT "Compile PaddlePaddle with NVIDIA TensorRT" OFF)
-option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN" OFF)
+option(WITH_XPU "Compile PaddlePaddle with BAIDU KUNLUN XPU" OFF)
 option(WITH_WIN_DUMP_DBG "Compile with windows core dump debug mode" OFF)
 if (WITH_GPU AND WITH_XPU)
   message(FATAL_ERROR "Error when compile GPU and XPU at the same time")
@@ -166,6 +166,7 @@ option(WITH_DGC "Use DGC(Deep Gradient Compression) or not" ${WITH_DISTRIBUTE}
 option(SANITIZER_TYPE "Choose the type of sanitizer, options are: Address, Leak, Memory, Thread, Undefined" OFF)
 option(WITH_LITE "Compile Paddle Fluid with Lite Engine" OFF)
 option(WITH_NCCL "Compile PaddlePaddle with NCCL support" ON)
+option(WITH_XPU_BKCL "Compile PaddlePaddle with BAIDU KUNLUN XPU BKCL" OFF)
 option(WITH_CRYPTO "Compile PaddlePaddle with crypto support" ON)
 option(WITH_ARM "Compile PaddlePaddle with arm support" OFF)
 option(WITH_SW "Compile PaddlePaddle with sw support" OFF)
@@ -213,6 +214,13 @@ if (NOT WITH_GPU AND WITH_NCCL)
       "Disable NCCL when compiling without GPU" FORCE)
 endif()

+if (NOT WITH_XPU AND WITH_XPU_BKCL)
+  MESSAGE(WARNING
+    "Disable BKCL when compiling without XPU. Force WITH_XPU_BKCL=OFF.")
+  set(WITH_XPU_BKCL OFF CACHE STRING
+      "Disable BKCL when compiling without XPU" FORCE)
+endif()
+
 if(WITH_NCCL)
   add_definitions("-DPADDLE_WITH_NCCL")
   include(nccl)
cmake/external/xpu.cmake
@@ -47,4 +47,18 @@ set_property(TARGET shared_xpuapi PROPERTY IMPORTED_LOCATION "${XPU_API_LIB}")
 generate_dummy_static_lib(LIB_NAME "xpulib" GENERATOR "xpu.cmake")

-TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB})
+if (WITH_XPU_BKCL)
+  MESSAGE(STATUS "Compile with XPU BKCL!")
+  ADD_DEFINITIONS(-DPADDLE_WITH_XPU_BKCL)
+  SET(XPU_BKCL_LIB_NAME "libbkcl.so")
+  SET(XPU_BKCL_LIB "${XPU_LIB_DIR}/${XPU_BKCL_LIB_NAME}")
+  SET(XPU_BKCL_INC_DIR "${THIRD_PARTY_PATH}/install/xpu/include")
+  INCLUDE_DIRECTORIES(${XPU_BKCL_INC_DIR})
+  TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB} ${XPU_BKCL_LIB})
+else(WITH_XPU_BKCL)
+  TARGET_LINK_LIBRARIES(xpulib ${XPU_API_LIB} ${XPU_RT_LIB})
+endif(WITH_XPU_BKCL)
+
 ADD_DEPENDENCIES(xpulib ${XPU_PROJECT})
paddle/fluid/framework/details/all_reduce_op_handle.cc
@@ -43,6 +43,19 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                         "number of local scopes is %d.",
                         places_.size(), local_scopes_.size()));
 }
+#elif defined(PADDLE_WITH_XPU_BKCL)
+AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
+                                     const std::vector<Scope *> &local_scopes,
+                                     const std::vector<platform::Place> &places,
+                                     const platform::BKCLCommunicator *ctxs)
+    : BKCLOpHandleBase(node, places, ctxs), local_scopes_(local_scopes) {
+  PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
+                    platform::errors::InvalidArgument(
+                        "The number of places and the number of local scopes "
+                        "should be equal, but got number of places is %d and "
+                        "number of local scopes is %d.",
+                        places_.size(), local_scopes_.size()));
+}
 #else
 AllReduceOpHandle::AllReduceOpHandle(ir::Node *node,
                                      const std::vector<Scope *> &local_scopes,
@@ -98,6 +111,9 @@ void AllReduceOpHandle::AllReduceImpl(
   places.reserve(num_places);
   int64_t numel = -1;
   bool is_gpu_place = false;
+#if defined(PADDLE_WITH_XPU_BKCL)
+  bool is_xpu_place = false;
+#endif
   auto dtype = static_cast<framework::proto::VarType::Type>(0);
   for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
     auto &local_scope = local_exec_scopes_[i];
@@ -117,6 +133,9 @@ void AllReduceOpHandle::AllReduceImpl(
                             in_var_handles[i]->name(), numel));
       dtype = lod_tensor.type();
       is_gpu_place = platform::is_gpu_place(lod_tensor.place());
+#if defined(PADDLE_WITH_XPU_BKCL)
+      is_xpu_place = platform::is_xpu_place(lod_tensor.place());
+#endif
     }
     PADDLE_ENFORCE_EQ(
         numel, static_cast<int64_t>(lod_tensor.numel()),
@@ -128,6 +147,12 @@ void AllReduceOpHandle::AllReduceImpl(
         platform::errors::PreconditionNotMet(
             "The dtype of tensors of the same variable in different local "
             "scopes should be equal."));
+#if defined(PADDLE_WITH_XPU_BKCL)
+    PADDLE_ENFORCE_EQ(is_xpu_place, platform::is_xpu_place(lod_tensor.place()),
+                      platform::errors::PreconditionNotMet(
+                          "The place type of tensors of the same variable "
+                          "in different local scopes should be equal."));
+#endif
     PADDLE_ENFORCE_EQ(is_gpu_place, platform::is_gpu_place(lod_tensor.place()),
                       platform::errors::PreconditionNotMet(
                           "The place type of tensors of the same variable "
@@ -179,6 +204,25 @@ void AllReduceOpHandle::AllReduceFunc(
 #else
     PADDLE_THROW(
         platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
+  } else if (is_xpu_place(places[0])) {
+#if defined(PADDLE_WITH_XPU_BKCL)
+    PADDLE_ENFORCE_NOT_NULL(bkcl_ctxs_,
+                            platform::errors::InvalidArgument(
+                                "The bkcl context should not be NULL."));
+    BKCLDataType bkcl_dtype = platform::ToBKCLDataType(dtype);
+    std::vector<std::function<void()>> all_reduce_calls;
+    for (size_t i = 0; i < local_exec_scopes_.size(); ++i) {
+      auto &p = places[i];
+      void *buffer = const_cast<void *>(lod_tensor_data.at(i));
+      all_reduce_calls.emplace_back([=] {
+        BKCLAllReduce(p, buffer, buffer, numel, bkcl_dtype, BKCL_ADD);
+      });
+    }
+    BKCLAllReduceFunc(all_reduce_calls);
+#else
+    PADDLE_THROW(
+        platform::errors::PreconditionNotMet("Not compiled with BKCL."));
+#endif
   } else {  // Special handle CPU only Operator's gradient. Like CRF
     auto &trg = *local_exec_scopes_[0]
@@ -205,6 +249,27 @@ void AllReduceOpHandle::AllReduceFunc(
   VLOG(10) << Name() << " size:" << numel * SizeOfType(dtype);
 }

+#if defined(PADDLE_WITH_XPU_BKCL)
+void AllReduceOpHandle::BKCLAllReduceFunc(
+    const std::vector<std::function<void()>> &all_reduce_calls) {
+  this->RunAndRecordEvent([&] {
+    if (all_reduce_calls.size() == 1UL) {
+      all_reduce_calls[0]();
+    } else {
+      PADDLE_ENFORCE_EQ(
+          bkcl_group_start(), BKCL_SUCCESS,
+          platform::errors::PreconditionNotMet("bkcl_group_start failed"));
+      for (auto &call : all_reduce_calls) {
+        call();
+      }
+      PADDLE_ENFORCE_EQ(
+          bkcl_group_end(), BKCL_SUCCESS,
+          platform::errors::PreconditionNotMet("bkcl_group_end failed"));
+    }
+  });
+}
+#endif
+
 #if defined(PADDLE_WITH_NCCL)
 void AllReduceOpHandle::NCCLAllReduceFunc(
     const std::vector<std::function<void()>> &all_reduce_calls) {
paddle/fluid/framework/details/all_reduce_op_handle.h
@@ -34,6 +34,9 @@ class NCCLCommunicator;
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"
+#elif defined(PADDLE_WITH_XPU_BKCL)
+#include "paddle/fluid/framework/details/bkcl_op_handle.h"
+#include "paddle/fluid/platform/bkcl_helper.h"
 #endif

 namespace paddle {
@@ -46,6 +49,12 @@ class AllReduceOpHandle : public NCCLOpHandleBase {
   AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places,
                     const platform::NCCLCommunicator *ctxs);
+#elif defined(PADDLE_WITH_XPU_BKCL)
+class AllReduceOpHandle : public BKCLOpHandleBase {
+ public:
+  AllReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
+                    const std::vector<platform::Place> &places,
+                    const platform::BKCLCommunicator *ctxs);
 #else
 class AllReduceOpHandle : public OpHandleBase {
  public:
@@ -65,8 +74,8 @@ class AllReduceOpHandle : public OpHandleBase {
   std::vector<Scope *> local_scopes_;

-#ifndef PADDLE_WITH_NCCL
-  // NCCLOpHandleBase already have these attributes.
+#if !(PADDLE_WITH_NCCL || PADDLE_WITH_XPU_BKCL)
+  // NCCLOpHandleBase and BKCLOpHandleBase already have these attributes.
   // Will polish it by class inheritance framework.
   std::vector<platform::Place> places_;
 #endif
@@ -78,6 +87,11 @@ class AllReduceOpHandle : public OpHandleBase {
   void SyncNCCLAllReduce();
 #endif

+#if defined(PADDLE_WITH_XPU_BKCL)
+  void BKCLAllReduceFunc(
+      const std::vector<std::function<void()>> &all_reduce_calls);
+#endif
+
   void AllReduceImpl(const std::vector<VarHandle *> &in_var_handles,
                      const std::vector<VarHandle *> &out_var_handles);
paddle/fluid/framework/details/bkcl_op_handle.h (new file, mode 100644)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once

#include "xpu/bkcl.h"

#include <string>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/framework/details/op_handle_base.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/bkcl_helper.h"

DECLARE_bool(sync_bkcl_allreduce);

namespace paddle {
namespace framework {
namespace details {

class BKCLOpHandleBase : public OpHandleBase {
 public:
  BKCLOpHandleBase(ir::Node* node, const std::vector<platform::Place>& places,
                   const platform::BKCLCommunicator* bkcl_ctxs)
      : OpHandleBase(node), places_(places), bkcl_ctxs_(bkcl_ctxs) {
    if (bkcl_ctxs == nullptr) {
      return;
    }
    // init device context
    auto default_bkcl_ctxs = bkcl_ctxs_->DefaultFlatCtx();
    for (auto& p : places_) {
      this->SetDeviceContext(p, default_bkcl_ctxs->DevCtx(p));
    }
  }

  virtual ~BKCLOpHandleBase() {}

  void SetRunEnv(int run_order, bool use_hierarchical_allreduce) {
    PADDLE_ENFORCE_GE(
        run_order, 0,
        platform::errors::InvalidArgument(
            "The argument run_order must be >= 0, but got %d.", run_order));
    PADDLE_ENFORCE_NE(use_hierarchical_allreduce, true,
                      platform::errors::Unimplemented(
                          "xpu doesn't support hierarchical_allreduce"));

    run_order_ = run_order;
    use_hierarchical_allreduce_ = use_hierarchical_allreduce;

    VLOG(10) << "SetRunEnv "
             << " run_order:" << run_order
             << ", use_hierarchical_allreduce:" << use_hierarchical_allreduce;

    if (bkcl_ctxs_ == nullptr) {
      return;
    }

    if (!use_hierarchical_allreduce_) {
      auto ctxs = bkcl_ctxs_->GetFlatCtx(run_order);
      for (auto& p : places_) {
        this->SetDeviceContext(p, ctxs->DevCtx(p));
      }
      return;
    }
  }

  void FlatBKCLAllReduce(platform::Place place, const void* sendbuff,
                         void* recvbuff, size_t count, BKCLDataType datatype,
                         BKCLOp op) {
    PADDLE_ENFORCE_GE(
        run_order_, 0,
        platform::errors::InvalidArgument(
            "The argument run_order_ must be >= 0, but got %d.", run_order_));
    auto flat_bkcl_ctxs = bkcl_ctxs_->GetFlatCtx(run_order_);
    int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
    auto& bkcl_ctx = flat_bkcl_ctxs->at(dev_id);
    auto comm = bkcl_ctx.comm_;

    VLOG(10) << "before all reduce buffer:" << sendbuff << ", numel:" << count
             << ", dev_id:" << dev_id << ", dtype:" << datatype
             << ", place:" << place;

    PADDLE_ENFORCE_EQ(
        bkcl_all_reduce(comm, sendbuff, recvbuff, count, datatype, op, NULL),
        BKCL_SUCCESS,
        platform::errors::PreconditionNotMet("bckl all reduce failed"));
  }

  void BKCLAllReduce(platform::Place place, const void* sendbuff,
                     void* recvbuff, size_t count, BKCLDataType datatype,
                     BKCLOp op) {
    PADDLE_ENFORCE_GE(
        run_order_, 0,
        platform::errors::InvalidArgument(
            "The argument run_order_ must be >= 0, but got %d.", run_order_));
    PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_, false,
                      platform::errors::Unimplemented(
                          "xpu doesn't support hierarchical all reduce"));
    if (!use_hierarchical_allreduce_) {
      FlatBKCLAllReduce(place, sendbuff, recvbuff, count, datatype, op);
      return;
    }
  }

 protected:
  std::vector<platform::Place> places_;
  const platform::BKCLCommunicator* bkcl_ctxs_{nullptr};

  // When multi trainer call collective function, they need run the same order.
  // Or the program will hang. So we use allreduce_deps_pass to set this
  // run_order_.
  int run_order_{0};
  // Use 2d allreduce or not.
  bool use_hierarchical_allreduce_{false};
};

}  // namespace details
}  // namespace framework
}  // namespace paddle
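For orientation, a hedged sketch of how a concrete op handle might drive the base class above; the subclass and method names are hypothetical, and OpHandleBase's pure virtuals (Name, RunImpl) are omitted:

// Hypothetical subclass, for illustration only (not part of this commit).
class ExampleBKCLHandle : public BKCLOpHandleBase {
 public:
  using BKCLOpHandleBase::BKCLOpHandleBase;

  void ReduceInPlace(platform::Place place, void* buf, size_t numel,
                     BKCLDataType dtype) {
    // XPU supports only flat (non-hierarchical) all-reduce here.
    SetRunEnv(/*run_order=*/0, /*use_hierarchical_allreduce=*/false);
    // Send and receive buffers may alias, as AllReduceOpHandle does above.
    BKCLAllReduce(place, buf, buf, numel, dtype, BKCL_ADD);
  }
};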
paddle/fluid/framework/details/broadcast_op_handle.cc
@@ -80,7 +80,7 @@ void BroadcastOpHandle::BroadcastOneVar(
                            &VariableVisitor::GetMutableTensor(out_var));
       });
     }
-  } else {
+  } else if (platform::is_gpu_place(in_tensor.place())) {
 #if defined(PADDLE_WITH_NCCL)
     VarHandle *out_handle = nullptr;
     int root_id =
@@ -141,6 +141,72 @@ void BroadcastOpHandle::BroadcastOneVar(
 #else
     PADDLE_THROW(
         platform::errors::PreconditionNotMet("Not compiled with NCLL."));
 #endif
+  } else {
+#if defined(PADDLE_WITH_XPU_BKCL)
+    VarHandle *out_handle = nullptr;
+    int root_id = BOOST_GET_CONST(platform::XPUPlace, in_tensor.place()).device;
+    std::vector<std::function<void()>> broadcast_calls;
+    int type = platform::ToBKCLDataType(in_tensor.type());
+    size_t numel = static_cast<size_t>(in_tensor.numel());
+    for (auto out_var_handle : out_var_handles) {
+      Variable *out_var = var_scopes.at(out_var_handle->scope_idx())
+                              ->FindVar(out_var_handle->name());
+      int dst_id =
+          BOOST_GET_CONST(platform::XPUPlace, out_var_handle->place()).device;
+      auto &bkcl_ctx = bkcl_ctxs_->at(dst_id);
+      void *send_recv_buffer = nullptr;
+      if (root_id == dst_id) {
+        send_recv_buffer = const_cast<void *>(in_tensor.data<void>());
+        out_handle = out_var_handle;
+      } else {
+        send_recv_buffer = VariableVisitor::GetMutableTensor(out_var)
+                               .Resize(in_tensor.dims())
+                               .mutable_data(out_var_handle->place());
+      }
+      broadcast_calls.emplace_back(
+          [send_recv_buffer, numel, type, root_id, &bkcl_ctx] {
+            PADDLE_ENFORCE_EQ(
+                bkcl_broadcast(bkcl_ctx.comm(), send_recv_buffer,
+                               send_recv_buffer, numel,
+                               static_cast<BKCLDataType>(type), root_id,
+                               nullptr),
+                BKCL_SUCCESS,
+                platform::errors::Unavailable("bkcl_broadcast failed"));
+          });
+    }
+
+    WaitInputVarGenerated();
+    this->RunAndRecordEvent([&] {
+      {
+        PADDLE_ENFORCE_EQ(
+            bkcl_group_start(), BKCL_SUCCESS,
+            platform::errors::Unavailable("bkcl_group_start failed"));
+        for (auto &call : broadcast_calls) {
+          call();
+        }
+        PADDLE_ENFORCE_EQ(
+            bkcl_group_end(), BKCL_SUCCESS,
+            platform::errors::Unavailable("bkcl_group_end failed"));
+      }
+
+      if (!out_handle->IsTheSameVar(in_var_handle)) {
+        auto out_var = var_scopes.at(in_var_handle.scope_idx())
+                           ->FindVar(out_var_handles[0]->name());
+        paddle::framework::TensorCopy(
+            in_tensor, in_var_handle.place(),
+            *(dev_ctxes_.at(in_var_handle.place())),
+            &VariableVisitor::GetMutableTensor(out_var));
+      }
+    });
+#else
+    PADDLE_THROW(
+        platform::errors::PreconditionNotMet("Not compiled with BKCL."));
+#endif
   }
 }
paddle/fluid/framework/details/broadcast_op_handle.h
@@ -34,12 +34,19 @@ class Node;
 }  // namespace ir
 }  // namespace framework
 namespace platform {
+#if defined(PADDLE_WITH_NCCL)
 struct NCCLContextMap;
+#endif
+#if defined(PADDLE_WITH_XPU_BKCL)
+struct BKCLContextMap;
+#endif
 }  // namespace platform
 }  // namespace paddle

 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
+#elif defined(PADDLE_WITH_XPU_BKCL)
+#include "paddle/fluid/platform/bkcl_helper.h"
 #endif

 namespace paddle {
@@ -63,11 +70,26 @@ struct BroadcastOpHandle : public OpHandleBase {
       }
     }
   }
-#else
+#endif
+#if defined(PADDLE_WITH_XPU_BKCL)
+  BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
+                    const std::vector<platform::Place> &places,
+                    const platform::BKCLContextMap *bkcl_ctxs)
+      : OpHandleBase(node),
+        local_scopes_(local_scopes),
+        places_(places),
+        bkcl_ctxs_(bkcl_ctxs) {
+    if (bkcl_ctxs_) {
+      for (auto &p_ctx : bkcl_ctxs_->contexts_) {
+        this->SetDeviceContext(platform::XPUPlace(p_ctx.first),
+                               p_ctx.second.ctx_.get());
+      }
+    }
+  }
+#endif
   BroadcastOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                     const std::vector<platform::Place> &places)
       : OpHandleBase(node), local_scopes_(local_scopes), places_(places) {}
-#endif

   std::string Name() const override;
@@ -86,6 +108,8 @@ struct BroadcastOpHandle : public OpHandleBase {
   std::vector<platform::Place> places_;
 #if defined(PADDLE_WITH_NCCL)
   const platform::NCCLContextMap *nccl_ctxs_;
+#elif defined(PADDLE_WITH_XPU_BKCL)
+  const platform::BKCLContextMap *bkcl_ctxs_;
 #endif

   void InitOutputValue(const VarHandle &in_var_handle,
paddle/fluid/framework/details/broadcast_op_handle_test.cc
@@ -18,10 +18,12 @@ namespace paddle {
 namespace framework {
 namespace details {

+using DeviceType = paddle::platform::DeviceType;
+
 TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) {
   TestBroadcastOpHandle test_op;
   size_t input_scope_idx = 0;
-  test_op.InitCtxOnGpu(false);
+  test_op.InitCtxOnDevice(p::kCPU);
   test_op.InitBroadcastOp(input_scope_idx);
   test_op.TestBroadcastLodTensor(input_scope_idx);
 }
@@ -29,7 +31,7 @@ TEST(BroadcastTester, TestCPUBroadcastTestLodTensor) {
 TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
   TestBroadcastOpHandle test_op;
   size_t input_scope_idx = 0;
-  test_op.InitCtxOnGpu(false);
+  test_op.InitCtxOnDevice(p::kCPU);
   test_op.InitBroadcastOp(input_scope_idx);
   test_op.TestBroadcastSelectedRows(input_scope_idx);
 }
@@ -38,7 +40,7 @@ TEST(BroadcastTester, TestCPUBroadcastTestSelectedRows) {
 TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) {
   TestBroadcastOpHandle test_op;
   size_t input_scope_idx = 0;
-  test_op.InitCtxOnGpu(true);
+  test_op.InitCtxOnDevice(p::kCUDA);
   test_op.InitBroadcastOp(input_scope_idx);
   test_op.TestBroadcastLodTensor(input_scope_idx);
 }
@@ -46,12 +48,22 @@ TEST(BroadcastTester, TestGPUBroadcastTestLodTensor) {
 TEST(BroadcastTester, TestGPUBroadcastTestSelectedRows) {
   TestBroadcastOpHandle test_op;
   size_t input_scope_idx = 0;
-  test_op.InitCtxOnGpu(true);
+  test_op.InitCtxOnDevice(p::kCUDA);
   test_op.InitBroadcastOp(input_scope_idx);
   test_op.TestBroadcastSelectedRows(input_scope_idx);
 }
 #endif

+#if defined(PADDLE_WITH_XPU_BKCL)
+TEST(BroadcastTester, TestXPUBroadcastTestLodTensor) {
+  TestBroadcastOpHandle test_op;
+  size_t input_scope_idx = 0;
+  test_op.InitCtxOnDevice(p::kXPU);
+  test_op.InitBroadcastOp(input_scope_idx);
+  test_op.TestBroadcastLodTensor(input_scope_idx);
+}
+#endif
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
paddle/fluid/framework/details/broadcast_op_handle_test.h
@@ -33,7 +33,7 @@ struct VarHandle;
 namespace f = paddle::framework;
 namespace p = paddle::platform;

-using UseDevice = paddle::framework::details::ExecutionStrategy::UseDevice;
+using DeviceType = paddle::platform::DeviceType;

 // test data amount
 const f::DDim kDims = {20, 20};
@@ -47,11 +47,15 @@ struct TestBroadcastOpHandle {
   std::vector<VarHandleBase *> vars_;
   std::vector<std::unique_ptr<ir::Node>> nodes_;
   std::vector<p::Place> place_list_;
-  bool use_gpu_;
+  DeviceType use_device_;
 #if defined(PADDLE_WITH_NCCL)
   std::unique_ptr<platform::NCCLContextMap> nccl_ctxs_;
 #endif
+#if defined(PADDLE_WITH_XPU_BKCL)
+  std::unique_ptr<platform::BKCLContextMap> bkcl_ctxs_;
+#endif

   void WaitAll() {
     for (size_t j = 0; j < ctxs_.size(); ++j) {
       ctxs_[j]->Wait();
@@ -60,12 +64,36 @@ struct TestBroadcastOpHandle {
     if (nccl_ctxs_) {
       nccl_ctxs_->WaitAll();
     }
 #endif
+#if defined(PADDLE_WITH_XPU_BKCL)
+    if (bkcl_ctxs_) {
+      bkcl_ctxs_->WaitAll();
+    }
+#endif
   }

-  void InitCtxOnGpu(bool use_gpu) {
-    use_gpu_ = use_gpu;
-    if (use_gpu_) {
+  void InitCtxOnDevice(DeviceType use_device) {
+    use_device_ = use_device;
+    if (use_device_ == p::kXPU) {
+#if defined(PADDLE_WITH_XPU_BKCL)
+      int count = p::GetXPUDeviceCount();
+      if (count <= 1) {
+        LOG(WARNING) << "Cannot test multi-xpu Broadcast, because the XPU "
+                        "device count is "
+                     << count;
+        exit(0);
+      }
+      for (int i = 0; i < count; ++i) {
+        auto p = p::XPUPlace(i);
+        place_list_.push_back(p);
+        ctxs_.emplace_back(new p::XPUDeviceContext(p));
+      }
+      bkcl_ctxs_.reset(new platform::BKCLContextMap(place_list_));
+#else
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with BKCL."));
+#endif
+    } else if (use_device_ == p::kCUDA) {
 #if defined(PADDLE_WITH_NCCL)
       int count = p::GetCUDADeviceCount();
       if (count <= 1) {
@@ -91,6 +119,9 @@ struct TestBroadcastOpHandle {
         place_list_.push_back(p);
         ctxs_.emplace_back(new p::CPUDeviceContext(p));
       }
+#if defined(PADDLE_WITH_XPU_BKCL)
+      bkcl_ctxs_.reset(nullptr);
+#endif
 #if defined(PADDLE_WITH_NCCL)
       nccl_ctxs_.reset(nullptr);
 #endif
@@ -111,22 +142,25 @@ struct TestBroadcastOpHandle {
     nodes_.emplace_back(
         ir::CreateNodeForTest("node0", ir::Node::Type::kOperation));
-    if (use_gpu_) {
+    if (use_device_ == p::kCUDA) {
 #if defined(PADDLE_WITH_NCCL)
       op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                          place_list_, nccl_ctxs_.get());
 #else
       PADDLE_THROW(
-          platform::errors::PreconditionNotMet("Not compiled with NCLL."));
+          platform::errors::PreconditionNotMet("Not compiled with NCCL."));
 #endif
-    } else {
-#if defined(PADDLE_WITH_NCCL)
+    } else if (use_device_ == p::kXPU) {
+#if defined(PADDLE_WITH_XPU_BKCL)
       op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
-                                         place_list_, nccl_ctxs_.get());
+                                         place_list_, bkcl_ctxs_.get());
 #else
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with BKCL."));
+#endif
+    } else {
       op_handle_ = new BroadcastOpHandle(nodes_.back().get(), local_scopes_,
                                          place_list_);
-#endif
     }

     op_handle_->SetLocalExecScopes(scope_map);
@@ -149,7 +183,7 @@ struct TestBroadcastOpHandle {
     op_handle_->AddInput(dummy_var_handle);

     for (size_t j = 0; j < place_list_.size(); ++j) {
-      if (!use_gpu_) {
+      if (use_device_ != p::kCUDA) {
         op_handle_->SetDeviceContext(place_list_[j], ctxs_[j].get());
       }
       nodes_.emplace_back(
@@ -275,7 +309,7 @@ struct TestBroadcastOpHandle {
     f::LoD lod{{0, 10, 20}};
     auto send_vector = InitLoDTensor("input", input_scope_idx, lod);

-    UseDevice use_device = UseDevice::kCPU;
+    DeviceType use_device = p::kCPU;
     op_handle_->Run(use_device);

     WaitAll();
@@ -290,7 +324,7 @@ struct TestBroadcastOpHandle {
     int height = static_cast<int>(kDims[0] * 2);
     auto send_vector =
         InitSelectedRows("input", input_scope_idx, rows, height);

-    UseDevice use_device = UseDevice::kCPU;
+    DeviceType use_device = p::kCPU;
     op_handle_->Run(use_device);

     WaitAll();
paddle/fluid/framework/details/build_strategy.cc
@@ -313,10 +313,13 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
                                 const std::vector<Scope *> &local_scopes,
                                 const size_t &nranks,
 #if defined(PADDLE_WITH_NCCL)
-                                const bool use_cuda,
+                                DeviceType use_device,
                                 platform::NCCLCommunicator *nccl_ctxs) const {
+#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
+                                DeviceType use_device,
+                                platform::BKCLCommunicator *bkcl_ctxs) const {
 #else
-                                const bool use_cuda) const {
+                                DeviceType use_device) const {
 #endif
   VLOG(1) << "apply all passes";
   // Create a default one if not finalized by user.
@@ -336,9 +339,16 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
       pass->Set<size_t>(kNRanks, new size_t(nranks));

 #if defined(PADDLE_WITH_NCCL)
-      platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
+      platform::NCCLCommunicator *nctx =
+          (use_device == p::kCUDA) ? nccl_ctxs : nullptr;
       pass->Erase(kNCCLCtxs);
       pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
+#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
+      // ToDo: more check
+      platform::BKCLCommunicator *bkcl_ctx =
+          (use_device == p::kXPU) ? bkcl_ctxs : nullptr;
+      pass->Erase(kBKCLCtxs);
+      pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, bkcl_ctx);
 #endif
     } else if (pass->Type() == "fuse_all_reduce_op_pass") {
       pass->Erase(kNRanks);
@@ -349,12 +359,24 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
       pass->SetNotOwned<const std::vector<Scope *>>(kLocalScopes,
                                                     &local_scopes);
 #if defined(PADDLE_WITH_NCCL)
-      platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
+      platform::NCCLCommunicator *nctx =
+          (use_device == p::kCUDA) ? nccl_ctxs : nullptr;
       pass->Erase(kNCCLCtxs);
       pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
       pass->Erase(kUseHierarchicalAllReduce);
       pass->Set<bool>(kUseHierarchicalAllReduce,
                       new bool(use_hierarchical_allreduce_));
+#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
+      platform::BKCLCommunicator *nctx =
+          (use_device == p::kXPU) ? bkcl_ctxs : nullptr;
+      pass->Erase(kBKCLCtxs);
+      pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, nctx);
+      pass->Erase(kUseHierarchicalAllReduce);
+      PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_, false,
+                        platform::errors::Unimplemented(
+                            "xpu doesn't support hierarchical_allreduce"));
+      pass->Set<bool>(kUseHierarchicalAllReduce,
+                      new bool(use_hierarchical_allreduce_));
 #endif
     } else if (pass->Type() == "coalesce_grad_tensor_pass") {
       pass->Erase(kNRanks);
@@ -364,35 +386,47 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
               << enable_sequential_execution_;
     } else if (pass->Type() == "all_reduce_deps_pass") {
 #if defined(PADDLE_WITH_NCCL)
-      platform::NCCLCommunicator *nctx = use_cuda ? nccl_ctxs : nullptr;
+      platform::NCCLCommunicator *nctx =
+          (use_device == p::kCUDA) ? nccl_ctxs : nullptr;
       pass->Erase(kNCCLCtxs);
       pass->SetNotOwned<platform::NCCLCommunicator>(kNCCLCtxs, nctx);
       pass->Erase(kUseHierarchicalAllReduce);
       pass->Set<bool>(kUseHierarchicalAllReduce,
                       new bool(use_hierarchical_allreduce_));
+#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
+      platform::BKCLCommunicator *nctx =
+          (use_device == p::kXPU) ? bkcl_ctxs : nullptr;
+      pass->Erase(kBKCLCtxs);
+      pass->SetNotOwned<platform::BKCLCommunicator>(kBKCLCtxs, nctx);
+      pass->Erase(kUseHierarchicalAllReduce);
+      PADDLE_ENFORCE_EQ(use_hierarchical_allreduce_, false,
+                        platform::errors::Unimplemented(
+                            "xpu doesn't support hierarchical_allreduce"));
+      pass->Set<bool>(kUseHierarchicalAllReduce,
+                      new bool(use_hierarchical_allreduce_));
 #endif
       VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this)
               << ", num_trainers:" << num_trainers_;
     } else if (pass->Type() == "fuse_relu_depthwise_conv_pass") {
-      if (!use_cuda) {
+      if (use_device != p::kCUDA) {
         LOG(WARNING) << "fuse_relu_depthwise_conv_pass is only supported on "
                         "GPU, skipped.";
         continue;
       }
     } else if (pass->Type() == "fusion_group_pass") {
-      pass->Set<bool>("use_gpu", new bool(use_cuda));
-      if (!use_cuda) {
+      pass->Set<bool>("use_gpu", new bool((use_device == p::kCUDA)));
+      if (use_device != p::kCUDA) {
         LOG(WARNING) << "fusion_group_pass is only supported on GPU, skipped.";
         continue;
       }
     } else if (pass->Type() == "fuse_bn_act_pass") {
-      if (!use_cuda) {
+      if (use_device != p::kCUDA) {
         LOG(WARNING) << "fuse_bn_act_pass is only supported on "
                         "GPU, skipped.";
         continue;
       }
     } else if (pass->Type() == "fuse_bn_add_act_pass") {
-      if (!use_cuda) {
+      if (use_device != p::kCUDA) {
         LOG(WARNING) << "fuse_bn_add_act_pass is only supported on "
                         "GPU, skipped.";
         continue;
@@ -401,7 +435,7 @@ ir::Graph *BuildStrategy::Apply(ir::Graph *graph,
       pass->Set("mkldnn_enabled_op_types",
                 new std::unordered_set<std::string>(mkldnn_enabled_op_types_));
     } else if (pass->Type() == "backward_optimizer_op_deps_pass") {
-      if (!use_cuda) {
+      if (use_device != p::kCUDA) {
         VLOG(1) << "backward_optimizer_op_deps_pass is only supported on "
                    "GPU, skipped.";
         continue;
paddle/fluid/framework/details/build_strategy.h
@@ -41,11 +41,15 @@ class NCCLCommunicator;
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
+#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
+#include "paddle/fluid/platform/bkcl_helper.h"
 #endif

 namespace paddle {
 namespace framework {
 namespace details {

+using DeviceType = paddle::platform::DeviceType;
+namespace p = paddle::platform;
+
 struct BuildStrategy {
   // ParallelExecutor supports two modes of ReduceStrategy, kAllReduce and
@@ -147,6 +151,7 @@ struct BuildStrategy {
   // NCCL config
   size_t nccl_comm_num_{1};
+  size_t bkcl_comm_num_{1};
   // The picture is here:
   // https://github.com/PaddlePaddle/Paddle/pull/17263#discussion_r285411396
   bool use_hierarchical_allreduce_{false};
@@ -181,10 +186,13 @@ struct BuildStrategy {
                   const std::vector<Scope *> &local_scopes,
                   const size_t &nranks,
 #if defined(PADDLE_WITH_NCCL)
-                  const bool use_cuda,
+                  DeviceType use_device,
                   platform::NCCLCommunicator *nccl_ctxs) const;
+#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_BKCL)
+                  DeviceType use_device,
+                  platform::BKCLCommunicator *bkcl_ctxs) const;
 #else
-                  const bool use_cuda) const;
+                  DeviceType use_device) const;
 #endif

   // If set true, ParallelExecutor would build the main_program into multiple
paddle/fluid/framework/details/execution_strategy.h
@@ -14,22 +14,19 @@
 #pragma once
 #include <cstddef>  // for size_t
+#include "paddle/fluid/platform/device_context.h"

 namespace paddle {
 namespace framework {
 namespace details {

+using DeviceType = paddle::platform::DeviceType;
+namespace p = paddle::platform;
+
 struct ExecutionStrategy {
   enum ExecutorType { kDefault = 0, kExperimental = 1 };
-  enum UseDevice {
-    kCPU = 0,
-    kCUDA = 1,
-    kXPU = 2,
-  };

   // num_threads indicates the size of thread pool.
   size_t num_threads_{0};
-  UseDevice use_device_{kCUDA};
+  DeviceType use_device_ = p::kCUDA;
   // Note that allow_op_delay is invalid now.
   bool allow_op_delay_{false};
   // num_iteration_per_drop_scope indicates how many
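The net effect for callers: the UseDevice enum that lived on ExecutionStrategy is gone, and device selection now goes through platform::DeviceType. A minimal before/after sketch, assuming the Paddle headers referenced above:

details::ExecutionStrategy exec_strategy;
// Before this commit: exec_strategy.use_device_ = ExecutionStrategy::UseDevice::kXPU;
exec_strategy.use_device_ = p::kXPU;  // p = paddle::platform; also p::kCPU, p::kCUDA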
paddle/fluid/framework/details/fused_all_reduce_op_handle.cc
@@ -37,6 +37,13 @@ FusedAllReduceOpHandle::FusedAllReduceOpHandle(
     const platform::NCCLCommunicator *ctxs)
     : AllReduceOpHandle(node, local_scopes, places, ctxs),
       num_of_all_reduce_(num_of_all_reduce) {}
+#elif defined(PADDLE_WITH_XPU_BKCL)
+FusedAllReduceOpHandle::FusedAllReduceOpHandle(
+    ir::Node *node, const std::vector<Scope *> &local_scopes,
+    const std::vector<platform::Place> &places, const size_t num_of_all_reduce,
+    const platform::BKCLCommunicator *ctxs)
+    : AllReduceOpHandle(node, local_scopes, places, ctxs),
+      num_of_all_reduce_(num_of_all_reduce) {}
 #else
 FusedAllReduceOpHandle::FusedAllReduceOpHandle(
     ir::Node *node, const std::vector<Scope *> &local_scopes,
@@ -73,9 +80,14 @@ void FusedAllReduceOpHandle::RunImpl() {
           "handles is %d, and the number of output variable handles is %d.",
           in_var_handles.size(), out_var_handles.size()));

-  // Note: some gradient op doesn't have CUDAKernel, so the gradients of
-  // those op are in CPUPlace, in this case, the all reduce should not be fused.
+// Note: some gradient op doesn't have CUDAKernel, so the gradients of
+// those op are in CPUPlace, in this case, the all reduce should not be fused.
+#if defined(PADDLE_WITH_XPU_BKCL)
+  // TODO(liuyuhui): XPU don't support fuse all reduce for now
+  if (InputIsInDifferentPlace(in_var_handles) || true) {
+#else
   if (InputIsInDifferentPlace(in_var_handles)) {
+#endif
     for (size_t j = 0; j < num_of_all_reduce_; ++j) {
       std::vector<VarHandle *> dev_inputs;
       std::vector<VarHandle *> dev_outputs;
paddle/fluid/framework/details/fused_all_reduce_op_handle.h
@@ -36,6 +36,8 @@ class NCCLCommunicator;
 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/framework/details/nccl_op_handle.h"
 #include "paddle/fluid/platform/nccl_helper.h"
+#elif defined(PADDLE_WITH_XPU_BKCL)
+#include "paddle/fluid/platform/bkcl_helper.h"
 #endif

 namespace paddle {
@@ -49,6 +51,13 @@ struct FusedAllReduceOpHandle : public AllReduceOpHandle {
                          const std::vector<platform::Place> &places,
                          const size_t num_of_all_reduce,
                          const platform::NCCLCommunicator *ctxs);
+#elif defined(PADDLE_WITH_XPU_BKCL)
+struct FusedAllReduceOpHandle : public AllReduceOpHandle {
+  FusedAllReduceOpHandle(ir::Node *node,
+                         const std::vector<Scope *> &local_scopes,
+                         const std::vector<platform::Place> &places,
+                         const size_t num_of_all_reduce,
+                         const platform::BKCLCommunicator *ctxs);
 #else
 struct FusedAllReduceOpHandle : public AllReduceOpHandle {
   FusedAllReduceOpHandle(ir::Node *node,
paddle/fluid/framework/details/fused_broadcast_op_handle.h
@@ -52,11 +52,18 @@ struct FusedBroadcastOpHandle : public BroadcastOpHandle {
       const std::vector<platform::Place> &places,
       const platform::NCCLContextMap *nccl_ctx)
       : BroadcastOpHandle(node, local_scopes, places, nccl_ctx) {}
-#else
-  FusedBroadcastOpHandle(ir::Node *node,
-                         const std::vector<Scope *> local_scopes,
-                         const std::vector<platform::Place>& places)
-      : BroadcastOpHandle(node, local_scopes, places) {}
 #endif
+#if defined(PADDLE_WITH_XPU_BKCL)
+  FusedBroadcastOpHandle(ir::Node *node,
+                         const std::vector<Scope *> local_scopes,
+                         const std::vector<platform::Place> &places,
+                         const platform::BKCLContextMap *bkcl_ctx)
+      : BroadcastOpHandle(node, local_scopes, places, bkcl_ctx) {}
+#endif
+  FusedBroadcastOpHandle(ir::Node *node,
+                         const std::vector<Scope *> local_scopes,
+                         const std::vector<platform::Place> &places)
+      : BroadcastOpHandle(node, local_scopes, places) {}

   std::string Name() const override;

  protected:
paddle/fluid/framework/details/fused_broadcast_op_handle_test.cc
@@ -32,7 +32,7 @@ namespace framework {
 namespace details {

 struct VarHandle;

-using UseDevice = paddle::framework::details::ExecutionStrategy::UseDevice;
+using DeviceType = paddle::platform::DeviceType;

 struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
   std::vector<std::string> out_varnames_;
@@ -56,7 +56,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
       // create op handle node
       nodes_.emplace_back(
           ir::CreateNodeForTest("fused_broadcast", ir::Node::Type::kOperation));
-      if (use_gpu_) {
+      if (use_device_ == p::kCUDA) {
 #if defined(PADDLE_WITH_NCCL)
         op_handle_ = new FusedBroadcastOpHandle(
             nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
@@ -64,14 +64,17 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
         PADDLE_THROW(
             platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
-      } else {
-#if defined(PADDLE_WITH_NCCL)
+      } else if (use_device_ == p::kXPU) {
+#if defined(PADDLE_WITH_XPU_BKCL)
         op_handle_ = new FusedBroadcastOpHandle(
-            nodes_.back().get(), local_scopes_, place_list_, nccl_ctxs_.get());
+            nodes_.back().get(), local_scopes_, place_list_, bkcl_ctxs_.get());
 #else
+        PADDLE_THROW(
+            platform::errors::PreconditionNotMet("Not compiled with XPU."));
+#endif
+      } else {
         op_handle_ = new FusedBroadcastOpHandle(nodes_.back().get(),
                                                 local_scopes_, place_list_);
-#endif
       }

       op_handle_->SetLocalExecScopes(scope_map);
@@ -109,7 +112,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
           InitLoDTensor(varname, input_scope_idxes[i], lod, val_scalar));
     }

-    UseDevice use_device = UseDevice::kCPU;
+    DeviceType use_device = p::kCPU;
     op_handle_->Run(use_device);

     WaitAll();
@@ -133,7 +136,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
                                rows, height, val_scalar));
     }

-    UseDevice use_device = UseDevice::kCPU;
+    DeviceType use_device = p::kCPU;
     op_handle_->Run(use_device);

     WaitAll();
@@ -150,7 +153,7 @@ struct TestFusedBroadcastOpHandle : TestBroadcastOpHandle {
 TEST(FusedBroadcastTester, CPULodTensor) {
   TestFusedBroadcastOpHandle test_op;
   std::vector<size_t> input_scope_idxes = {0, 1};
-  test_op.InitCtxOnGpu(false);
+  test_op.InitCtxOnDevice(p::kCPU);
   test_op.InitFusedBroadcastOp(input_scope_idxes);
   test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);
 }
@@ -158,7 +161,7 @@ TEST(FusedBroadcastTester, CPULodTensor) {
 TEST(FusedBroadcastTester, CPUSelectedRows) {
   TestFusedBroadcastOpHandle test_op;
   std::vector<size_t> input_scope_idxes = {0, 1};
-  test_op.InitCtxOnGpu(false);
+  test_op.InitCtxOnDevice(p::kCPU);
   test_op.InitFusedBroadcastOp(input_scope_idxes);
   test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
 }
@@ -167,7 +170,7 @@ TEST(FusedBroadcastTester, CPUSelectedRows) {
 TEST(FusedBroadcastTester, GPULodTensor) {
   TestFusedBroadcastOpHandle test_op;
   std::vector<size_t> input_scope_idxes = {0, 1};
-  test_op.InitCtxOnGpu(true);
+  test_op.InitCtxOnDevice(p::kCUDA);
   test_op.InitFusedBroadcastOp(input_scope_idxes);
   test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);
 }
@@ -175,12 +178,22 @@ TEST(FusedBroadcastTester, GPULodTensor) {
 TEST(FusedBroadcastTester, GPUSelectedRows) {
   TestFusedBroadcastOpHandle test_op;
   std::vector<size_t> input_scope_idxes = {0, 1};
-  test_op.InitCtxOnGpu(true);
+  test_op.InitCtxOnDevice(p::kCUDA);
   test_op.InitFusedBroadcastOp(input_scope_idxes);
   test_op.TestFusedBroadcastSelectedRows(input_scope_idxes);
 }
 #endif

+#if defined(PADDLE_WITH_XPU_BKCL)
+TEST(FusedBroadcastTester, XPULodTensor) {
+  TestFusedBroadcastOpHandle test_op;
+  std::vector<size_t> input_scope_idxes = {0, 1};
+  test_op.InitCtxOnDevice(p::kXPU);
+  test_op.InitFusedBroadcastOp(input_scope_idxes);
+  test_op.TestFusedBroadcastLoDTensor(input_scope_idxes);
+}
+#endif
+
 }  // namespace details
 }  // namespace framework
 }  // namespace paddle
paddle/fluid/framework/details/gather_op_handle_test.cc
@@ -27,7 +27,7 @@ struct DummyVarHandle;
 namespace f = paddle::framework;
 namespace p = paddle::platform;

-using UseDevice = paddle::framework::details::ExecutionStrategy::UseDevice;
+using DeviceType = paddle::platform::DeviceType;

 // test data amount
 const f::DDim kDims = {20, 20};
@@ -173,7 +173,7 @@ struct TestGatherOpHandle {
     out_selected_rows->mutable_value()->ShareDataWith(
         in_selected_rows->value());

-    UseDevice use_device = UseDevice::kCPU;
+    DeviceType use_device = p::kCPU;
     op_handle_->Run(use_device);

     WaitAll();
paddle/fluid/framework/details/multi_devices_helper.h
@@ -55,6 +55,7 @@ constexpr char kPlaces[] = "places";
 constexpr char kGlobalScope[] = "global_scope";
 constexpr char kLocalScopes[] = "local_scopes";
 constexpr char kNCCLCtxs[] = "nccl_ctxs";
+constexpr char kBKCLCtxs[] = "bkcl_ctxs";
 constexpr char kUseHierarchicalAllReduce[] = "use_hierarchical_allreduce";

 // aux variables to represent dependency. Useful to resolve data hazard.
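The new kBKCLCtxs key mirrors kNCCLCtxs: BuildStrategy::Apply attaches the BKCL communicator to a pass under this key, and graph passes retrieve it. The consuming side, as it appears in fuse_all_reduce_op_pass.cc later in this commit:

#if defined(PADDLE_WITH_XPU_BKCL)
    auto *multi_bkcl_ctxs =
        &Get<platform::BKCLCommunicator>(details::kBKCLCtxs);
#endif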
paddle/fluid/framework/details/op_handle_base.cc
@@ -82,21 +82,74 @@ void OpHandleBase::InitCUDA() {
       }
     }
   }
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "Paddle can't use CUDA device since it's not compiled with CUDA,"
+      "Please recompile or reinstall Paddle with GPU support."));
 #endif
 }

-void OpHandleBase::Run(ExecutionStrategy::UseDevice use_device) {
+void OpHandleBase::InitXPU() {
+#ifdef PADDLE_WITH_XPU
+  if (IsMultiDeviceTransfer() && dev_ctxes_.size() > 0) {
+    for (auto &out_var : outputs_) {
+      auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
+      if (out_var_handle) {
+        // TODO(liuyuhui): XPU now don't support sync events, add later.
+      }
+    }
+  } else {
+    PADDLE_ENFORCE_EQ(dev_ctxes_.size(), 1UL,
+                      platform::errors::InvalidArgument(
+                          "%s should have only one dev_ctx.", Name()));
+    auto &place = dev_ctxes_.begin()->first;
+    int dev_id = BOOST_GET_CONST(platform::XPUPlace, place).device;
+    PADDLE_ENFORCE_EQ(
+        xpu_set_device(dev_id), XPU_SUCCESS,
+        platform::errors::PreconditionNotMet("xpu_set_device failed"));
+    for (auto &out_var : outputs_) {
+      auto *out_var_handle = dynamic_cast<VarHandle *>(out_var);
+      if (out_var_handle) {
+        PADDLE_ENFORCE_EQ(
+            platform::is_same_place(place, out_var_handle->place()), true,
+            platform::errors::InvalidArgument(
+                "The place of output(%s) is not consistent with the "
+                "place of current op(%s).",
+                out_var_handle->Name(), Name()));
+      }
+    }
+  }
+#else
+  PADDLE_THROW(platform::errors::PermissionDenied(
+      "Paddle can't use XPU device since it's not compiled with XPU,"
+      "Please recompile or reinstall Paddle with XPU support."));
+#endif
+}
+
+void OpHandleBase::Run(DeviceType use_device) {
 #ifdef PADDLE_WITH_CUDA
-  if (events_.empty() && use_device == ExecutionStrategy::UseDevice::kCUDA &&
-      dev_ctxes_.size() > 0) {
+  if (events_.empty() && use_device == p::kCUDA && dev_ctxes_.size() > 0) {
     InitCUDA();
   }
 #else
-  PADDLE_ENFORCE_NE(use_device, ExecutionStrategy::UseDevice::kCUDA,
-                    platform::errors::InvalidArgument(
-                        "Argument use_cuda should be false when Paddle is not "
-                        "compiled with CUDA."));
+  PADDLE_ENFORCE_NE(
+      use_device, p::kCUDA,
+      platform::errors::InvalidArgument(
+          "Argument use_device should not be kCUDA when Paddle is not "
+          "compiled with CUDA."));
 #endif

+  if (use_device == p::kXPU && dev_ctxes_.size() > 0) {
+#ifdef PADDLE_WITH_XPU
+    InitXPU();
+#else
+    PADDLE_ENFORCE_NE(
+        use_device, p::kXPU,
+        platform::errors::InvalidArgument(
+            "Argument use_device should not be kXPU when Paddle is not "
+            "compiled with XPU."));
+#endif
+  }
+
   // skip running current op, used with inplace_addto_op_pass
   if (skip_running_) {
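Caller-side, the signature change means tests and executors now pass a platform::DeviceType straight through, for example (as the updated tests elsewhere in this diff do):

DeviceType use_device = p::kCPU;  // or p::kCUDA / p::kXPU
op_handle_->Run(use_device);      // previously Run(ExecutionStrategy::UseDevice)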
paddle/fluid/framework/details/op_handle_base.h
@@ -43,7 +43,8 @@ class Node;
 }  // namespace ir

 namespace details {

+using DeviceType = paddle::platform::DeviceType;
+namespace p = paddle::platform;
+
 // Wraps ir::Node and provide helper utilities.
 // It's responsible for populating necessary fields of ir::Node.
 class OpHandleBase {
@@ -72,7 +73,7 @@ class OpHandleBase {

   virtual std::string Name() const = 0;

-  void Run(ExecutionStrategy::UseDevice use_device);
+  void Run(DeviceType use_device);

   virtual void RecordWaitEventOnCtx(platform::DeviceContext *waited_ctx);
@@ -145,6 +146,7 @@ class OpHandleBase {
   virtual void RunImpl() = 0;

   virtual void InitCUDA();
+  virtual void InitXPU();

   ir::Node *node_;
   std::vector<VarHandleBase *> inputs_;
paddle/fluid/framework/details/reduce_op_handle.cc
@@ -212,10 +212,64 @@ void ReduceOpHandle::RunImpl() {
 #else
     PADDLE_THROW(
         platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
+  } else if (paddle::platform::is_xpu_place(lod_tensors[0]->place())) {
+#if defined(PADDLE_WITH_XPU_BKCL)
+    auto pre_in = pre_in_var->Get<framework::LoDTensor>();
+    VariableVisitor::ShareDimsAndLoD(*pre_in_var, out_var);
+    VariableVisitor::GetMutableTensor(out_var).mutable_data(
+        out_var_handle->place(), pre_in.type());
+
+    auto out_p = out_var_handle->place();
+    int root_id = BOOST_GET_CONST(platform::XPUPlace, out_p).device;
+    std::vector<std::function<void()>> all_reduce_calls;
+    for (size_t i = 0; i < var_scopes.size(); ++i) {
+      auto &p = in_places[i];
+      auto &lod_tensor = *lod_tensors[i];
+
+      int dev_id = BOOST_GET_CONST(platform::XPUPlace, p).device;
+      auto &bkcl_ctx = bkcl_ctxs_->at(dev_id);
+
+      void *buffer = const_cast<void *>(lod_tensor.data<void>());
+      void *recvbuffer = nullptr;
+      if (root_id == dev_id) {
+        recvbuffer =
+            out_var->GetMutable<framework::LoDTensor>()->mutable_data(
+                out_var_handle->place());
+      }
+
+      int type = platform::ToBKCLDataType(lod_tensor.type());
+      size_t numel = static_cast<size_t>(lod_tensor.numel());
+      all_reduce_calls.emplace_back(
+          [buffer, recvbuffer, type, numel, root_id, &bkcl_ctx] {
+            PADDLE_ENFORCE_EQ(
+                bkcl_reduce(bkcl_ctx.comm(), buffer, recvbuffer, numel,
+                            static_cast<BKCLDataType>(type), BKCL_ADD,
+                            root_id, nullptr),
+                BKCL_SUCCESS,
+                platform::errors::Unavailable("bkcl_all_reduce failed"));
+          });
+    }
+
+    WaitInputVarGenerated();
+    this->RunAndRecordEvent([&] {
+      PADDLE_ENFORCE_EQ(
+          bkcl_group_start(), BKCL_SUCCESS,
+          platform::errors::Unavailable("bkcl_group_start failed"));
+      for (auto &call : all_reduce_calls) {
+        call();
+      }
+      PADDLE_ENFORCE_EQ(
+          bkcl_group_end(), BKCL_SUCCESS,
+          platform::errors::Unavailable("bkcl_group_end failed"));
+    });
+#else
+    PADDLE_THROW(
+        platform::errors::PreconditionNotMet("Not compiled with XPU."));
+#endif
   } else {
     PADDLE_THROW(platform::errors::InvalidArgument(
-        "The place of tensor should be CPUPlace or CUDAPlace, but got %s.",
+        "The place of tensor should be CPUPlace, CUDAPlace or XPUPlace, but "
+        "got %s.",
         lod_tensors[0]->place()));
   }
 }
paddle/fluid/framework/details/reduce_op_handle.h
@@ -41,6 +41,8 @@ struct NCCLContextMap;
 }  // namespace paddle

 #if defined(PADDLE_WITH_NCCL)
 #include "paddle/fluid/platform/nccl_helper.h"
+#elif defined(PADDLE_WITH_XPU_BKCL)
+#include "paddle/fluid/platform/bkcl_helper.h"
 #endif

 namespace paddle {
@@ -93,6 +95,22 @@ struct ReduceOpHandle : public OpHandleBase {
       }
     }
   }
+#elif defined(PADDLE_WITH_XPU_BKCL)
+  const platform::BKCLContextMap *bkcl_ctxs_;
+  ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
+                 const std::vector<platform::Place> &places,
+                 const platform::BKCLContextMap *bkcl_ctxs)
+      : OpHandleBase(node),
+        local_scopes_(local_scopes),
+        places_(places),
+        bkcl_ctxs_(bkcl_ctxs) {
+    if (bkcl_ctxs_) {
+      for (auto &p_ctx : bkcl_ctxs_->contexts_) {
+        this->SetDeviceContext(platform::XPUPlace(p_ctx.first),
+                               p_ctx.second.ctx_.get());
+      }
+    }
+  }
 #else
   ReduceOpHandle(ir::Node *node, const std::vector<Scope *> &local_scopes,
                  const std::vector<platform::Place> &places)
paddle/fluid/framework/details/reduce_op_handle_test.cc
@@ -25,7 +25,7 @@ namespace details {
 namespace f = paddle::framework;
 namespace p = paddle::platform;

-using UseDevice = paddle::framework::details::ExecutionStrategy::UseDevice;
+using DeviceType = paddle::platform::DeviceType;

 // test data amount
 const f::DDim kDims = {20, 20};
@@ -198,7 +198,7 @@ struct TestReduceOpHandle {
     out_selected_rows->mutable_value()->ShareDataWith(
         in_selected_rows->value());

-    UseDevice use_device = UseDevice::kCPU;
+    DeviceType use_device = p::kCPU;
     op_handle_->Run(use_device);

     WaitAll();
@@ -263,7 +263,7 @@ struct TestReduceOpHandle {
     out_lodtensor->ShareDataWith(in_lodtensor);

-    UseDevice use_device = UseDevice::kCPU;
+    DeviceType use_device = p::kCPU;
     op_handle_->Run(use_device);

     WaitAll();
paddle/fluid/framework/ir/memory_optimize_pass/test_reference_count_pass_last_lived_ops.cc
@@ -30,6 +30,7 @@ DECLARE_double(eager_delete_tensor_gb);
 namespace paddle {
 namespace framework {

+namespace p = paddle::platform;
+
 static std::vector<platform::Place> CreatePlaces(size_t num, bool use_cuda) {
   std::vector<platform::Place> result;
@@ -88,8 +89,7 @@ class ReferenceCountPassTestHelper {
     FLAGS_eager_delete_tensor_gb = -1;

     details::ExecutionStrategy exec_strategy;
-    exec_strategy.use_device_ = use_cuda ? (ExecutionStrategy::kCUDA)
-                                         : (ExecutionStrategy::kCPU);
+    exec_strategy.use_device_ = use_cuda ? p::kCUDA : p::kCPU;

     executor_.reset(new ParallelExecutor(CreatePlaces(1, use_cuda), {}, "",
                                          &scope_, {}, exec_strategy,
paddle/fluid/framework/ir/multi_devices_graph_pass/fuse_all_reduce_op_pass.cc
...
...
@@ -41,6 +41,9 @@ class FuseAllReduceOpPass : public ir::Pass {
 #if defined(PADDLE_WITH_NCCL)
     auto *multi_nccl_ctxs =
         &Get<platform::NCCLCommunicator>(details::kNCCLCtxs);
+#elif defined(PADDLE_WITH_XPU_BKCL)
+    auto *multi_bkcl_ctxs =
+        &Get<platform::BKCLCommunicator>(details::kBKCLCtxs);
 #endif
     ir::Graph &result = *graph;
...
...
@@ -92,6 +95,9 @@ class FuseAllReduceOpPass : public ir::Pass {
 #if defined(PADDLE_WITH_NCCL)
       InsertFusedAllReduce(places, local_scopes, group_size,
                            group_all_reduce_ops, multi_nccl_ctxs, &result);
+#elif defined(PADDLE_WITH_XPU_BKCL)
+      InsertFusedAllReduce(places, local_scopes, group_size,
+                           group_all_reduce_ops, multi_bkcl_ctxs, &result);
 #else
       InsertFusedAllReduce(places, local_scopes, group_size,
                            group_all_reduce_ops, &result);
...
...
@@ -154,6 +160,8 @@ class FuseAllReduceOpPass : public ir::Pass {
       const std::vector<ir::Node *> &all_reduce_ops,
 #if defined(PADDLE_WITH_NCCL)
       const platform::NCCLCommunicator *multi_nccl_ctxs,
+#elif defined(PADDLE_WITH_XPU_BKCL)
+      const platform::BKCLCommunicator *multi_bkcl_ctxs,
 #endif
       ir::Graph *result) const {
     std::vector<details::VarHandleBase *> inputs;
...
...
@@ -182,6 +190,9 @@ class FuseAllReduceOpPass : public ir::Pass {
 #if defined(PADDLE_WITH_NCCL)
     CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
                            local_scopes, multi_nccl_ctxs, result);
+#elif defined(PADDLE_WITH_XPU_BKCL)
+    CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
+                           local_scopes, multi_bkcl_ctxs, result);
 #else
     CreateFusedAllReduceOp(inputs, outputs, num_of_all_reduce, places,
                            local_scopes, result);
...
...
@@ -197,12 +208,18 @@ class FuseAllReduceOpPass : public ir::Pass {
       const std::vector<Scope *> &local_scopes,
 #if defined(PADDLE_WITH_NCCL)
       const platform::NCCLCommunicator *multi_nccl_ctxs,
+#elif defined(PADDLE_WITH_XPU_BKCL)
+      const platform::BKCLCommunicator *multi_bkcl_ctxs,
 #endif
       ir::Graph *result) const {
 #if defined(PADDLE_WITH_NCCL)
     auto *op_handle = new details::FusedAllReduceOpHandle(
         result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
         local_scopes, places, num_of_all_reduce, multi_nccl_ctxs);
+#elif defined(PADDLE_WITH_XPU_BKCL)
+    auto *op_handle = new details::FusedAllReduceOpHandle(
+        result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
+        local_scopes, places, num_of_all_reduce, multi_bkcl_ctxs);
 #else
     auto *op_handle = new details::FusedAllReduceOpHandle(
         result->CreateEmptyNode("fused_all_reduce", ir::Node::Type::kOperation),
...
...
@@ -221,6 +238,10 @@ class FuseAllReduceOpPass : public ir::Pass {
     if (!multi_nccl_ctxs) {
       SetCommunicationContext(places, op_handle);
     }
+#elif defined(PADDLE_WITH_XPU_BKCL)
+    if (!multi_bkcl_ctxs) {
+      SetCommunicationContext(places, op_handle);
+    }
 #else
     SetCommunicationContext(places, op_handle);
 #endif
...
...
paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.cc
...
...
@@ -162,6 +162,12 @@ void MultiDevSSAGraphBuilderBase::Init() const {
   if (multi_nccl_ctxs_) {
     nccl_ctxs_ = multi_nccl_ctxs_->DefaultFlatCtx();
   }
+#elif defined(PADDLE_WITH_XPU_BKCL)
+  multi_bkcl_ctxs_ = &Get<platform::BKCLCommunicator>(details::kBKCLCtxs);
+  bkcl_ctxs_ = nullptr;
+  if (multi_bkcl_ctxs_) {
+    bkcl_ctxs_ = multi_bkcl_ctxs_->DefaultFlatCtx();
+  }
 #endif
   PADDLE_ENFORCE_EQ(places_.size(), local_scopes_.size(),
...
...
@@ -371,6 +377,11 @@ void MultiDevSSAGraphBuilderBase::SetCommunicationContext(
     op_handle->SetDeviceContext(p,
                                 platform::DeviceContextPool::Instance().Get(p));
   }
+#elif defined(PADDLE_WITH_XPU_BKCL)
+  if (bkcl_ctxs_ == nullptr) {
+    op_handle->SetDeviceContext(p,
+                                platform::DeviceContextPool::Instance().Get(p));
+  }
 #else
   op_handle->SetDeviceContext(p,
                               platform::DeviceContextPool::Instance().Get(p));
...
...
@@ -384,6 +395,10 @@ void MultiDevSSAGraphBuilderBase::CreateBroadcastOp(ir::Graph *result,
   auto *op_handle = new details::BroadcastOpHandle(
       result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_);
+#elif defined(PADDLE_WITH_XPU_BKCL)
+  auto *op_handle = new details::BroadcastOpHandle(
+      result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation),
+      local_scopes_, places_, bkcl_ctxs_);
 #else
   auto *op_handle = new details::BroadcastOpHandle(
       result->CreateEmptyNode("broadcast", ir::Node::Type::kOperation),
...
...
@@ -417,6 +432,10 @@ void MultiDevSSAGraphBuilderBase::CreateFusedBroadcastOp(
   auto *op_handle = new details::FusedBroadcastOpHandle(
       result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_);
+#elif defined(PADDLE_WITH_XPU_BKCL)
+  auto *op_handle = new details::FusedBroadcastOpHandle(
+      result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation),
+      local_scopes_, places_, bkcl_ctxs_);
 #else
   auto *op_handle = new details::FusedBroadcastOpHandle(
       result->CreateEmptyNode("fused_broadcast", ir::Node::Type::kOperation),
...
...
@@ -487,6 +506,11 @@ void MultiDevSSAGraphBuilderBase::CreateAllReduceOp(ir::Graph *result,
       new details::AllReduceOpHandle(
          result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
          scopes, places, multi_nccl_ctxs_));
+#elif defined(PADDLE_WITH_XPU_BKCL)
+  result->Get<GraphOps>(kGraphOps).emplace_back(
+      new details::AllReduceOpHandle(
+          result->CreateEmptyNode("allreduce", ir::Node::Type::kOperation),
+          scopes, places, multi_bkcl_ctxs_));
 #else
   result->Get<GraphOps>(kGraphOps).emplace_back(
       new details::AllReduceOpHandle(
...
...
@@ -565,6 +589,10 @@ details::VarHandle *MultiDevSSAGraphBuilderBase::CreateReduceOp(
   result->Get<GraphOps>(kGraphOps).emplace_back(new details::ReduceOpHandle(
       result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
       local_scopes_, places_, nccl_ctxs_));
+#elif defined(PADDLE_WITH_XPU_BKCL)
+  result->Get<GraphOps>(kGraphOps).emplace_back(new details::ReduceOpHandle(
+      result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
+      local_scopes_, places_, bkcl_ctxs_));
 #else
   result->Get<GraphOps>(kGraphOps).emplace_back(new details::ReduceOpHandle(
       result->CreateEmptyNode("reduce", ir::Node::Type::kOperation),
...
...
paddle/fluid/framework/ir/multi_devices_graph_pass/multi_devices_graph_pass.h
...
...
@@ -41,6 +41,8 @@ namespace paddle {
 namespace platform {
 class NCCLContextMap;
 class NCCLCommunicator;
+class BKCLContextMap;
+class BKCLCommunicator;
 }
 namespace framework {
...
...
@@ -114,6 +116,9 @@ class MultiDevSSAGraphBuilderBase : public ir::Pass {
 #if defined(PADDLE_WITH_NCCL)
   mutable platform::NCCLContextMap *nccl_ctxs_{nullptr};
   mutable platform::NCCLCommunicator *multi_nccl_ctxs_{nullptr};
+#elif defined(PADDLE_WITH_XPU_BKCL)
+  mutable platform::BKCLContextMap *bkcl_ctxs_{nullptr};
+  mutable platform::BKCLCommunicator *multi_bkcl_ctxs_{nullptr};
 #endif
   mutable std::string loss_var_name_;
...
...
paddle/fluid/framework/parallel_executor.cc
...
...
@@ -63,8 +63,6 @@ static bool gProfileStarted = false;
 std::once_flag p2p_init_flag;
 #endif
-using UseDevice = paddle::framework::details::ExecutionStrategy::UseDevice;
-
 class ParallelExecutorPrivate {
  public:
  ParallelExecutorPrivate(const std::vector<platform::Place> &places,
...
...
@@ -95,7 +93,7 @@ class ParallelExecutorPrivate {
     }
   }
-  bool IsUseCUDA(UseDevice use_device);
+  bool IsUseCUDA(DeviceType use_device);
   void SetHasFeed(size_t dev_idx, bool has_feed = true);
...
...
@@ -272,6 +270,90 @@ class ParallelExecutorPrivate {
   }
 #endif
+#if defined(PADDLE_WITH_XPU_BKCL)
+  void InitBKCLCtxs(framework::Scope *scope, const BuildStrategy &bst) {
+    VLOG(1) << "bkcl comm num:" << bst.bkcl_comm_num_ << ", nranks:" << nranks_
+            << ", num_trainers:" << bst.num_trainers_
+            << ", trainer_id:" << bst.trainer_id_;
+
+    PADDLE_ENFORCE_EQ(bst.use_hierarchical_allreduce_, false,
+                      platform::errors::Unimplemented(
+                          "xpu doesn't support use_hierarchical_allreduce"));
+
+    std::vector<BKCLUniqueId *> flat_bkcl_ids;
+    if (nranks_ == 1) {
+      // FIXME(gongwb): need not to create bkclid when nranks == 1
+      bkcl_ctxs_->InitFlatCtxs(places_, flat_bkcl_ids, bst.num_trainers_,
+                               bst.trainer_id_);
+      return;
+    }
+
+    if (bst.enable_parallel_graph_) {
+      VLOG(1) << "use only one bkclid in pg model";
+
+      BKCLUniqueId *bkcl_id = nullptr;
+
+      std::string var_name = platform::GetFlatBKCLVarName(0);
+      auto bkcl_id_var = scope->FindVar(var_name);
+      std::unique_ptr<BKCLUniqueId> id(new BKCLUniqueId());
+      if (bkcl_id_var) {
+        bkcl_id = bkcl_id_var->GetMutable<BKCLUniqueId>();
+      } else {
+        PADDLE_ENFORCE_EQ(
+            bkcl_get_unique_id(id.get()), BKCL_SUCCESS,
+            platform::errors::Unavailable("bkcl get unique id failed"));
+        bkcl_id = id.get();
+      }
+
+      flat_bkcl_ids.push_back(bkcl_id);
+
+      bkcl_ctxs_->InitFlatCtxs(places_, flat_bkcl_ids, bst.num_trainers_,
+                               bst.trainer_id_);
+      VLOG(1) << "init bst bkcl context complete!";
+      return;
+    }
+
+    // num_trainers == 1 && places > 1
+    if (bst.num_trainers_ == 1) {
+      bkcl_ctxs_->InitFlatCtxs(places_, flat_bkcl_ids, bst.num_trainers_,
+                               bst.trainer_id_);
+      return;
+    }
+
+    for (int i = 0; i < static_cast<int>(bst.bkcl_comm_num_); i++) {
+      std::string var_name = platform::GetFlatBKCLVarName(i);
+      auto bkcl_id_var = scope->FindVar(var_name);
+      PADDLE_ENFORCE_NOT_NULL(
+          bkcl_id_var,
+          platform::errors::NotFound("can't find %s bkcl_id_var", var_name));
+      auto bkcl_id = bkcl_id_var->GetMutable<BKCLUniqueId>();
+      flat_bkcl_ids.push_back(bkcl_id);
+    }
+
+    bkcl_ctxs_->InitFlatCtxs(places_, flat_bkcl_ids, bst.num_trainers_,
+                             bst.trainer_id_);
+  }
+
+  void InitOrGetBKCLCommunicator(framework::Scope *scope,
+                                 const BuildStrategy &bst) {
+    const std::string var_name = "BKCLCommunicator";
+    auto var = scope->FindVar(var_name);
+    if (var != nullptr) {
+      PADDLE_ENFORCE_EQ(var->IsInitialized(), true,
+                        platform::errors::PreconditionNotMet(
+                            "if %s exists, it must be initialized", var_name));
+      VLOG(1) << "find " << var_name
+              << " in scope, so use it and does not recreate!";
+      bkcl_ctxs_ = var->GetMutable<platform::BKCLCommunicator>();
+      return;
+    }
+
+    VLOG(1) << "not find " << var_name << " in scope, so recreate it!";
+    bkcl_ctxs_ = scope->Var(var_name)->GetMutable<platform::BKCLCommunicator>();
+    InitBKCLCtxs(scope, bst);
+  }
+#endif
+
   inline bool IsPersistable(const std::string &name) const {
     auto iter = is_persistable_.find(name);
     return iter != is_persistable_.end() && iter->second;
...
...
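Editor's note: InitBKCLCtxs above fetches one BKCLUniqueId per communicator from the scope by position. The naming convention comes from GetFlatBKCLVarName in the new bkcl_helper.h further down; a tiny illustration:

// Positions map to scope variable names as follows:
//   GetFlatBKCLVarName(0) -> "BKCLID"
//   GetFlatBKCLVarName(i) -> "BKCLID_<i>"  for i > 0
std::string first = paddle::platform::GetFlatBKCLVarName(0);  // "BKCLID"
std::string third = paddle::platform::GetFlatBKCLVarName(2);  // "BKCLID_2"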
@@ -288,9 +370,11 @@ class ParallelExecutorPrivate {
 #if defined(PADDLE_WITH_NCCL)
   platform::NCCLCommunicator *nccl_ctxs_{nullptr};
+#elif defined(PADDLE_WITH_XPU_BKCL)
+  platform::BKCLCommunicator *bkcl_ctxs_{nullptr};
 #endif
   bool own_local_scope_;
-  UseDevice use_device_;
+  DeviceType use_device_;
   bool use_all_reduce_;
   size_t nranks_;
...
...
@@ -300,8 +384,8 @@ class ParallelExecutorPrivate {
   details::ParallelSSAGraphExecutor *inference_executor_{nullptr};
 };
-bool ParallelExecutorPrivate::IsUseCUDA(UseDevice use_device) {
-  return use_device == UseDevice::kCUDA;
+bool ParallelExecutorPrivate::IsUseCUDA(DeviceType use_device) {
+  return use_device == p::kCUDA;
 }
 void ParallelExecutorPrivate::SetHasFeed(size_t dev_idx, bool has_feed) {
...
...
@@ -348,7 +432,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
   auto addto_pass = ir::PassRegistry::Instance().Get("inplace_addto_op_pass");
   addto_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_);
   addto_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars);
-  addto_pass->Set(ir::kUseCuda, new bool(use_device_ == UseDevice::kCUDA));
+  addto_pass->Set(ir::kUseCuda, new bool(use_device_ == p::kCUDA));
   VLOG(10) << "Start to apply inplace_addto_op_pass";
   graph = addto_pass->Apply(graph);
   VLOG(10) << "inplace_addto_op_pass Applied";
...
...
@@ -359,7 +443,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
       ir::PassRegistry::Instance().Get("buffer_shared_inplace_pass");
   inplace_pass->SetNotOwned(ir::kMemOptVarInfoMapList, &mem_opt_var_infos_);
   inplace_pass->SetNotOwned(ir::kLastLiveOpsOfVars, &last_live_ops_of_vars);
-  inplace_pass->Set(ir::kUseCuda, new bool(use_device_ == UseDevice::kCUDA));
+  inplace_pass->Set(ir::kUseCuda, new bool(use_device_ == p::kCUDA));
   VLOG(10) << "Start to apply buffer_shared_inplace_pass";
   graph = inplace_pass->Apply(graph);
   VLOG(10) << "buffer_shared_inplace_pass Applied";
...
...
@@ -375,7 +459,7 @@ ir::Graph *ParallelExecutorPrivate::ApplyMemoryOptimizePass(ir::Graph *graph) {
   cross_op_memory_reuse_pass->SetNotOwned(ir::kLastLiveOpsOfVars,
                                           &last_live_ops_of_vars);
   cross_op_memory_reuse_pass->Set(ir::kUseCuda,
-                                  new bool(use_device_ == UseDevice::kCUDA));
+                                  new bool(use_device_ == p::kCUDA));
   VLOG(10) << "Start to apply buffer_shared_cross_op_memory_reuse_pass";
   graph = cross_op_memory_reuse_pass->Apply(graph);
   VLOG(10) << "buffer_shared_cross_op_memory_reuse_pass Applied";
...
...
@@ -564,9 +648,9 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
 #endif
   std::string device_name;
-  if (member_->use_device_ == UseDevice::kCPU) {
+  if (member_->use_device_ == p::kCPU) {
     device_name = "CPU";
-  } else if (member_->use_device_ == UseDevice::kCUDA) {
+  } else if (member_->use_device_ == p::kCUDA) {
     device_name = "CUDA";
   } else {
     device_name = "XPU";
...
...
@@ -642,6 +726,27 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
       auto &nccl_ctx = nccl_ctxs->at(member_->places_[dev_id]);
       dev_ctx->set_nccl_comm(nccl_ctx.comm());
     }
 #else
     PADDLE_THROW(
         platform::errors::PreconditionNotMet("Not compiled with CUDA."));
 #endif
   }
+  if (member_->use_device_ == p::kXPU && member_->nranks_ > 1) {
+#if defined(PADDLE_WITH_XPU_BKCL)
+    member_->InitOrGetBKCLCommunicator(scope, member_->build_strategy_);
+
+    auto *bkcl_ctxs =
+        member_->bkcl_ctxs_->GetSyncBatchNormCtx(scope, member_->places_);
+    auto &pool = platform::DeviceContextPool::Instance();
+    for (size_t dev_id = 0; dev_id < member_->places_.size(); ++dev_id) {
+      auto *dev_ctx = static_cast<platform::XPUDeviceContext *>(
+          pool.Get(member_->places_[dev_id]));
+      auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[dev_id]);
+      dev_ctx->set_bkcl_context(bkcl_ctx.comm());
+    }
+#else
+    PADDLE_THROW(
+        platform::errors::PreconditionNotMet("Not compiled with XPU."));
+#endif
+  }
   // broadcast parameters from the 0th device to others:
...
...
@@ -671,39 +776,55 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
     VLOG(3) << "use local async mode";
     graph = member_->build_strategy_.Apply(
         graph, {member_->places_[0]}, loss_var_name,
-        {member_->local_scopes_[0]}, 1,
-        member_->IsUseCUDA(member_->use_device_), member_->nccl_ctxs_);
+        {member_->local_scopes_[0]}, 1, member_->use_device_,
+        member_->nccl_ctxs_);
     for (size_t i = 1; i < member_->places_.size(); ++i) {
       graphs[i] = member_->build_strategy_.Apply(
           graphs[i], {member_->places_[i]}, loss_var_name,
-          {member_->local_scopes_[i]}, 1,
-          member_->IsUseCUDA(member_->use_device_), member_->nccl_ctxs_);
+          {member_->local_scopes_[i]}, 1, member_->use_device_,
+          member_->nccl_ctxs_);
       async_graphs[i] = graphs[i];
     }
   } else {
     graph = member_->build_strategy_.Apply(
         graph, member_->places_, loss_var_name, member_->local_scopes_,
-        member_->nranks_, member_->IsUseCUDA(member_->use_device_),
-        member_->nccl_ctxs_);
+        member_->nranks_, member_->use_device_, member_->nccl_ctxs_);
   }
+#elif defined(PADDLE_WITH_XPU_BKCL)
+  if (member_->build_strategy_.async_mode_) {
+    VLOG(3) << "use local async mode";
+    graph = member_->build_strategy_.Apply(
+        graph, {member_->places_[0]}, loss_var_name,
+        {member_->local_scopes_[0]}, 1, member_->use_device_,
+        member_->bkcl_ctxs_);
+    for (size_t i = 1; i < member_->places_.size(); ++i) {
+      graphs[i] = member_->build_strategy_.Apply(
+          graphs[i], {member_->places_[i]}, loss_var_name,
+          {member_->local_scopes_[i]}, 1, member_->use_device_,
+          member_->bkcl_ctxs_);
+      async_graphs[i] = graphs[i];
+    }
+  } else {
+    graph = member_->build_strategy_.Apply(
+        graph, member_->places_, loss_var_name, member_->local_scopes_,
+        member_->nranks_, member_->use_device_, member_->bkcl_ctxs_);
+  }
 #else
   if (member_->build_strategy_.async_mode_) {
     VLOG(3) << "use local async mode";
     graph = member_->build_strategy_.Apply(
         graph, {member_->places_[0]}, loss_var_name,
-        {member_->local_scopes_[0]}, 1,
-        member_->IsUseCUDA(member_->use_device_));
+        {member_->local_scopes_[0]}, 1, member_->use_device_);
     for (size_t i = 1; i < member_->places_.size(); ++i) {
       graphs[i] = member_->build_strategy_.Apply(
          graphs[i], {member_->places_[i]}, loss_var_name,
-          {member_->local_scopes_[i]}, 1,
-          member_->IsUseCUDA(member_->use_device_));
+          {member_->local_scopes_[i]}, 1, member_->use_device_);
       async_graphs[i] = graphs[i];
     }
   } else {
     graph = member_->build_strategy_.Apply(
         graph, member_->places_, loss_var_name, member_->local_scopes_,
-        member_->nranks_, member_->IsUseCUDA(member_->use_device_));
+        member_->nranks_, member_->use_device_);
   }
 #endif
...
...
@@ -847,6 +968,9 @@ void ParallelExecutor::BCastParamsToDevices(
       continue;
     }
     auto &dims = main_tensor.dims();
+
+    VLOG(1) << "bcast var=" << var;
+
     if (paddle::platform::is_gpu_place(main_tensor.place())) {
 #if defined(PADDLE_WITH_NCCL)
       std::vector<void *> buffers;
...
...
@@ -883,6 +1007,58 @@ void ParallelExecutor::BCastParamsToDevices(
         }
         nccl_ctxs->WaitAll();
       }
 #endif
+    } else if (paddle::platform::is_xpu_place(main_tensor.place())) {
+#if defined(PADDLE_WITH_XPU_BKCL)
+      std::vector<void *> buffers;
+      buffers.reserve(member_->places_.size());
+      size_t numel = main_tensor.numel();
+      BKCLDataType data_type = BKCL_FLOAT;
+      // BKCLDataType data_type = platform::ToBKCLDataType(main_tensor.type());
+      for (size_t i = 0; i < member_->places_.size(); ++i) {
+        auto place = member_->places_[i];
+        void *buffer;
+
+        if (i == 0 && trainer_id == 0) {
+          buffer = const_cast<void *>(main_tensor.data<void>());
+        } else {
+          auto local_scope = member_->local_scopes_[i];
+          auto *t = local_scope->Var(var)->GetMutable<LoDTensor>();
+          t->Resize(dims);
+          buffer = t->mutable_data(place, main_tensor.type());
+        }
+        buffers.push_back(buffer);
+      }
+
+      PADDLE_ENFORCE_EQ(member_->places_.size(), buffers.size(),
+                        platform::errors::PreconditionNotMet(
+                            "variables' buffer size to bcast is %d, which is "
+                            "NOT equal to places size %d",
+                            buffers.size(), member_->places_.size()));
+      {
+        auto *bkcl_ctxs = member_->bkcl_ctxs_->DefaultFlatCtx();
+
+        PADDLE_ENFORCE_EQ(
+            bkcl_group_start(), BKCL_SUCCESS,
+            platform::errors::Unavailable("bkcl_group_start failed"));
+        for (size_t i = 0; i < member_->places_.size(); ++i) {
+          auto &bkcl_ctx = bkcl_ctxs->at(member_->places_[i]);
+          if (main_tensor.type() == framework::proto::VarType::INT64) {
+            numel *= 2;
+          }
+          PADDLE_ENFORCE_EQ(
+              bkcl_broadcast(bkcl_ctx.comm(), buffers[i], buffers[i], numel,
+                             data_type, 0, NULL),
+              BKCL_SUCCESS,
+              platform::errors::Unavailable("bkcl_broadcast failed"));
+        }
+        PADDLE_ENFORCE_EQ(
+            bkcl_group_end(), BKCL_SUCCESS,
+            platform::errors::Unavailable("bkcl_group_end failed"));
+      }
+#else
+      PADDLE_THROW(
+          platform::errors::PreconditionNotMet("Not compiled with BKCL."));
+#endif
     } else {
       platform::CPUPlace cpu;
...
...
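Editor's note: the XPU branch above pins data_type to BKCL_FLOAT and doubles numel for INT64 tensors. This is sound for broadcast because the payload is copied bitwise and never summed, so one 64-bit element can travel as two 32-bit words; the same trick would corrupt an arithmetic reduction. A condensed sketch of the idea under that assumption, with illustrative names:

// Sketch: move an INT64 tensor over a 32-bit-only collective by doubling
// the word count. Valid for bitwise ops (broadcast), NOT for all-reduce.
void BroadcastMaybeInt64(BKCLContext_t comm, void *buf, size_t numel,
                         paddle::framework::proto::VarType::Type dtype) {
  size_t words = numel;
  if (dtype == paddle::framework::proto::VarType::INT64) {
    words *= 2;  // one int64 element == two 32-bit words
  }
  bkcl_broadcast(comm, buf, buf, words, BKCL_FLOAT, /*root=*/0, nullptr);
}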
paddle/fluid/framework/parallel_executor.h
...
...
@@ -43,6 +43,8 @@ class ParallelExecutorPrivate;
 using details::BuildStrategy;
 using details::ExecutionStrategy;
+namespace p = paddle::platform;
+using DeviceType = paddle::platform::DeviceType;
 class ParallelExecutor {
   DISABLE_COPY_AND_ASSIGN(ParallelExecutor);
...
...
paddle/fluid/framework/var_type_traits.cc
...
...
@@ -30,6 +30,10 @@
 #include "paddle/fluid/operators/cudnn_rnn_cache.h"
 #endif
+#if defined(PADDLE_WITH_XPU_BKCL)
+#include "paddle/fluid/platform/bkcl_helper.h"
+#endif
 namespace paddle {
 namespace framework {
...
...
paddle/fluid/framework/var_type_traits.h
...
...
@@ -31,6 +31,10 @@
 #endif
 #endif
+#if defined(PADDLE_WITH_XPU_BKCL)
+#include "xpu/bkcl.h"
+#endif
 // Users should add forward declarations here
 namespace paddle {
...
...
@@ -41,6 +45,10 @@ class Communicator;
 class NCCLCommunicator;
 #endif
 #endif
+#if defined(PADDLE_WITH_XPU_BKCL)
+class BKCLCommunicator;
+#endif
 }  // namespace platform
 namespace framework {
...
...
@@ -148,6 +156,9 @@ using VarTypeRegistry = detail::VarTypeRegistryImpl<
     ncclUniqueId, platform::Communicator, platform::NCCLCommunicator,
 #endif
     operators::CudnnRNNCache,
 #endif
+#if defined(PADDLE_WITH_XPU_BKCL)
+    BKCLUniqueId, platform::BKCLCommunicator,
+#endif
     int, float>;
...
...
paddle/fluid/framework/var_type_traits_test.cc
...
...
@@ -31,6 +31,9 @@
 #include "paddle/fluid/operators/conv_cudnn_op_cache.h"
 #include "paddle/fluid/operators/cudnn_rnn_cache.h"
 #endif
+#if defined(PADDLE_WITH_XPU_BKCL)
+#include "paddle/fluid/platform/bkcl_helper.h"
+#endif
 namespace paddle {
 namespace framework {
...
...
paddle/fluid/platform/bkcl_helper.h
new file mode 100644
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#ifndef _WIN32
#if defined(PADDLE_WITH_XPU_BKCL)
#pragma once

#include <stdio.h>
#include <memory>
#include <string>
#include <thread>  // NOLINT
#include <typeindex>
#include <unordered_map>
#include <vector>

#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/place.h"
#include "xpu/bkcl.h"
#include "xpu/runtime.h"

#define BKCL_ID_VARNAME "BKCLID"

namespace paddle {
namespace platform {

inline BKCLDataType ToBKCLDataType(framework::proto::VarType::Type type) {
  if (type == framework::proto::VarType::FP32) {
    return BKCL_FLOAT;
  } else {
    PADDLE_THROW(
        platform::errors::Unimplemented("BKCL currently only support FP32, "
                                        "other data types are not supported."));
  }
}

struct BKCLContext {
  std::unique_ptr<platform::XPUDeviceContext> ctx_;
  BKCLContext_t comm_;

  explicit BKCLContext(int dev_id)
      : ctx_(new platform::XPUDeviceContext(XPUPlace(dev_id))),
        comm_{nullptr} {}

  BKCLContext_t comm() const { return comm_; }

  int device_id() const {
    return BOOST_GET_CONST(platform::XPUPlace, ctx_->GetPlace()).device;
  }
};

struct InitBKCLPara {
  BKCLUniqueId *bkcl_id;
  int rank;
  int nranks;
  int dev_id;
  BKCLContext_t *ctx;
};

static void *init_bkcl_context_func(void *args) {
  struct InitBKCLPara *para = (struct InitBKCLPara *)args;

  PADDLE_ENFORCE_EQ(xpu_set_device(para->dev_id), XPU_SUCCESS,
                    platform::errors::PreconditionNotMet(
                        "xpu_set_device failed[%d]", para->dev_id));
  PADDLE_ENFORCE_EQ(
      bkcl_init_rank(para->ctx, para->rank, para->nranks, para->bkcl_id),
      BKCL_SUCCESS,
      platform::errors::PreconditionNotMet("bkcl_init_rank failed"));
  return nullptr;
}

struct BKCLContextMap {
  std::unordered_map<int, BKCLContext> contexts_;
  std::vector<int> order_;
  std::vector<platform::Place> places_;
  size_t num_trainers_;
  size_t trainer_id_;
  BKCLUniqueId *bkcl_id_;

  explicit BKCLContextMap(const std::vector<platform::Place> &places,
                          BKCLUniqueId *bkcl_id = nullptr,
                          size_t num_trainers = 1, size_t trainer_id = 0) {
    places_ = places;
    bkcl_id_ = bkcl_id;
    num_trainers_ = num_trainers;
    trainer_id_ = trainer_id;
  }

  // Synchronization is required and can only be initialized with
  // multithreading.
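  // (Editor's note: the constraint presumably comes from bkcl_init_rank
  // blocking until every rank of the communicator has joined; initializing
  // several local devices sequentially from one thread would then deadlock,
  // so init() spawns one pthread per device below and joins them all.)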
  int init() {
    PADDLE_ENFORCE_EQ(!places_.empty(), true,
                      platform::errors::InvalidArgument(
                          "The BKCL place should not be empty."));
    order_.reserve(places_.size());
    for (auto &p : places_) {
      int dev_id = BOOST_GET_CONST(platform::XPUPlace, p).device;
      order_.emplace_back(dev_id);
      contexts_.emplace(dev_id, BKCLContext(dev_id));
    }
    PADDLE_ENFORCE_EQ(
        order_.size(), contexts_.size(),
        platform::errors::Unavailable("BKCL Context Map does not support "
                                      "contain two or more same device"));

    std::unique_ptr<BKCLContext_t[]> comms(new BKCLContext_t[order_.size()]);
    std::unique_ptr<InitBKCLPara[]> paras(new InitBKCLPara[order_.size()]);
    std::unique_ptr<pthread_t[]> pids(new pthread_t[order_.size()]);
    BKCLResult_t ret;
    BKCLUniqueId id;
    // if num_trainers == 1, should create a new bkcl id for local comms.
    if (num_trainers_ == 1 && bkcl_id_ == nullptr) {
      ret = bkcl_get_unique_id(&id);
      PADDLE_ENFORCE_EQ(BKCL_SUCCESS, ret,
                        platform::errors::PreconditionNotMet(
                            "bkcl get unique id failed [%d]", ret));
      bkcl_id_ = &id;
    }
    PADDLE_ENFORCE_NOT_NULL(bkcl_id_, platform::errors::InvalidArgument(
                                          "The BKCL id should not be null."));
    {
      int nranks = num_trainers_ * order_.size();
      for (size_t i = 0; i < order_.size(); ++i) {
        int rank;
        if (order_.size() > 1) {
          rank = trainer_id_ * order_.size() + i;
        } else {
          rank = trainer_id_;
        }
        VLOG(1) << "init bkcl rank:" << rank << ", nranks:" << nranks
                << ", xpu_id:" << order_[i];
        paras[i].rank = rank;
        paras[i].nranks = nranks;
        paras[i].dev_id = order_[i];
        paras[i].bkcl_id = bkcl_id_;
        paras[i].ctx = &comms[i];
        PADDLE_ENFORCE_EQ(
            pthread_create(&pids[i], nullptr, init_bkcl_context_func,
                           reinterpret_cast<void *>(&paras[i])),
            0, platform::errors::External("pthread_create failed"));
      }
      for (size_t i = 0; i < order_.size(); i++) {
        pthread_join(pids[i], nullptr);
      }
    }
    int i = 0;
    for (auto &dev_id : order_) {
      contexts_.at(dev_id).comm_ = comms[i++];
    }
    return 0;
  }

  BKCLContextMap(const BKCLContextMap &other) = delete;
  BKCLContextMap &operator=(const BKCLContextMap &other) = delete;

  XPUDeviceContext *DevCtx(int dev_id) const { return at(dev_id).ctx_.get(); }

  XPUDeviceContext *DevCtx(platform::Place p) const {
    return DevCtx(BOOST_GET_CONST(platform::XPUPlace, p).device);
  }

  const BKCLContext &at(platform::Place p) const {
    return this->at(BOOST_GET_CONST(platform::XPUPlace, p).device);
  }

  const BKCLContext &at(int dev_id) const { return contexts_.at(dev_id); }

  void WaitAll() {
    for (auto &p : contexts_) {
      p.second.ctx_->Wait();
    }
  }
};

inline std::string GetFlatBKCLVarName(size_t pos) {
  if (pos == 0) {
    return BKCL_ID_VARNAME;
  }
  return string::Sprintf("%s_%d", BKCL_ID_VARNAME, static_cast<int>(pos));
}

class BKCLCommunicator {
 public:
  BKCLCommunicator() {}
  virtual ~BKCLCommunicator() {}

  BKCLContextMap *DefaultFlatCtx() const {
    if (flat_ctxs_.size() == 0) {
      return nullptr;
    }
    return flat_ctxs_[0].get();
  }

  std::vector<std::unique_ptr<BKCLContextMap>> *GetFlatCtxs() {
    return &flat_ctxs_;
  }

  BKCLContextMap *GetFlatCtx(size_t run_order) const {
    return flat_ctxs_[run_order % flat_ctxs_.size()].get();
  }

  BKCLContextMap *GetRunEnvBKCLCtx(size_t run_order,
                                   bool use_hierarchical_allreduce) const {
    PADDLE_ENFORCE_EQ(use_hierarchical_allreduce, false,
                      platform::errors::Unimplemented(
                          "Hierarchical all reduce is not support for XPU"));
    return GetFlatCtx(run_order);
  }

  /*
   * It meets errors when the allreduce op handle and sync_batch_norm_op use
   * bkcl_all_reduce in parallel, so create a new bkcl comm for
   * sync_batch_norm_op. These codes should be polished with a unified bkcl
   * management.
   */
  BKCLContextMap *GetSyncBatchNormCtx(
      framework::Scope *scope, const std::vector<platform::Place> &places) {
    auto *bkcl_id_var = scope->FindVar(BKCL_ID_VARNAME);
    if (bkcl_id_var != nullptr) {
      return DefaultFlatCtx();
    }

    if (sync_batch_norm_ctx_.get() == nullptr) {
      sync_batch_norm_ctx_.reset(new BKCLContextMap(places));
      sync_batch_norm_ctx_->init();
    }
    return sync_batch_norm_ctx_.get();
  }

  void InitFlatCtxs(const std::vector<platform::Place> &places,
                    const std::vector<BKCLUniqueId *> &bkcl_ids,
                    size_t trainers_num, size_t trainer_id) {
    if (bkcl_ids.size() == 0) {
      auto ptr = new platform::BKCLContextMap(places);
      ptr->init();
      VLOG(1) << "init local trainer";
      flat_ctxs_.emplace_back(ptr);
      return;
    }

    PADDLE_ENFORCE_EQ(bkcl_ids.size(), 1,
                      platform::errors::Unimplemented(
                          "Multi-all-reduce-ring is not support for XPU"));
    for (size_t i = 0; i < bkcl_ids.size(); i++) {
      auto ptr = new platform::BKCLContextMap(places, bkcl_ids[i],
                                              trainers_num, trainer_id);
      ptr->init();
      VLOG(1) << "init trainer_id:" << trainer_id << ", comm no:" << i;
      flat_ctxs_.emplace_back(ptr);
    }
  }

 protected:
  // Support multi bkcl comm on default bkcl ring while BKCLContextMap can't.
  std::vector<std::unique_ptr<BKCLContextMap>> flat_ctxs_;

  // just used for sync_batch_norm op.
  std::unique_ptr<BKCLContextMap> sync_batch_norm_ctx_;
};

}  // namespace platform
}  // namespace paddle

#endif  // PADDLE_WITH_XPU_BKCL
#endif
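Editor's note: a hypothetical usage sketch of the new helper, assuming two local XPUs, illustrative send/recv buffers, and the bkcl_all_reduce signature discussed earlier; nothing below appears in the patch itself:

#include "paddle/fluid/platform/bkcl_helper.h"

void DemoAllReduce(void *send[2], void *recv[2], size_t numel) {
  std::vector<paddle::platform::Place> places = {
      paddle::platform::XPUPlace(0), paddle::platform::XPUPlace(1)};
  paddle::platform::BKCLContextMap ctx_map(places);  // local single trainer
  ctx_map.init();  // one pthread per device runs bkcl_init_rank

  bkcl_group_start();
  for (size_t i = 0; i < places.size(); ++i) {
    auto &ctx = ctx_map.at(places[i]);
    bkcl_all_reduce(ctx.comm(), send[i], recv[i], numel, BKCL_FLOAT, BKCL_ADD,
                    /*stream=*/nullptr);
  }
  bkcl_group_end();
  ctx_map.WaitAll();  // wait on every per-device XPUDeviceContext
}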
paddle/fluid/platform/device_context.h
...
...
@@ -30,6 +30,10 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
+#if defined(PADDLE_WITH_XPU_BKCL)
+#include "xpu/bkcl.h"
+#endif
 #ifdef PADDLE_WITH_MKLDNN
 #include "mkldnn.hpp"
 #include "paddle/fluid/framework/data_layout.h"
...
...
@@ -52,6 +56,7 @@ struct GpuDevice;
 #ifdef PADDLE_WITH_XPU
 #include "paddle/fluid/platform/xpu_header.h"
 #include "paddle/fluid/platform/xpu_info.h"
 #endif
+
 namespace paddle {
...
...
@@ -64,6 +69,16 @@ void SetAllowTF32Cublas(bool active);
 bool AllowTF32Cublas();
 #endif  // PADDLE_WITH_CUDA
+
+enum DeviceType {
+  CPU = 0,
+  CUDA = 1,
+  XPU = 2,
+};
+
+constexpr DeviceType kCPU = DeviceType::CPU;
+constexpr DeviceType kCUDA = DeviceType::CUDA;
+constexpr DeviceType kXPU = DeviceType::XPU;
+
 class DeviceContext {
  public:
   virtual ~DeviceContext() PADDLE_MAY_THROW {}
...
...
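Editor's note: the DeviceType enum added above is the pivot of this refactor: it replaces the executor-private ExecutionStrategy::UseDevice so passes, op handles, pybind, and Python all share one device tag. A trivial dispatch sketch (the function name is illustrative):

void PickCommBackend(paddle::platform::DeviceType dev) {
  namespace p = paddle::platform;
  if (dev == p::kCUDA) {
    // NCCL path (PADDLE_WITH_NCCL builds)
  } else if (dev == p::kXPU) {
    // BKCL path (PADDLE_WITH_XPU_BKCL builds)
  } else {
    // plain CPU path; no communicator required
  }
}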
@@ -107,9 +122,20 @@ class XPUDeviceContext : public DeviceContext {
   /*! \brief  Wait for all operations completion in the stream. */
   void Wait() const override;
+
+#ifdef PADDLE_WITH_XPU_BKCL
+  /*! \brief  Return bkcl context. */
+  BKCLContext_t bkcl_context() const { return bkcl_context_; }
+
+  /*! \brief  Set bkcl context. */
+  void set_bkcl_context(BKCLContext_t context) { bkcl_context_ = context; }
+#endif
+
  private:
   XPUPlace place_;
   xpu::Context *context_;
+#ifdef PADDLE_WITH_XPU_BKCL
+  BKCLContext_t bkcl_context_;
+#endif
   // Need to be the same with other DeviceContext,
   // even though eigen_device_ is not used in XPU
...
...
@@ -552,8 +578,8 @@ class MKLDNNDeviceContext : public CPUDeviceContext {
   const std::string &GetKeySuffix(void) const { return key_suffix_; }
   // Disable adding thread ID to the key
-  void DisableThreadInfoInKey(void) { key_attach_thread_id_ = false; };
-  bool IsThreadIdUsedInKey(void) const { return key_attach_thread_id_; };
+  void DisableThreadInfoInKey(void) { key_attach_thread_id_ = false; }
+  bool IsThreadIdUsedInKey(void) const { return key_attach_thread_id_; }
   // Prevent next ResetBlobMap()
   void BlockNextCacheClearing();
...
...
paddle/fluid/pybind/pybind.cc
...
...
@@ -1308,6 +1308,7 @@ All parameter, weight, gradient are variables in Paddle.
        "The module will return special predefined variable name in Paddle")
       .def("empty", []() { return kEmptyVarName; })
       .def("temp", []() { return kTempVarName; });
+
   // clang-format off
   py::class_<paddle::platform::DeviceContext>(m, "DeviceContext")
       .def_static("create",
...
...
@@ -2080,10 +2081,10 @@ All parameter, weight, gradient are variables in Paddle.
         exec_strategy=exec_strategy)
       )DOC");
-  py::enum_<ExecutionStrategy::UseDevice>(exec_strategy, "UseDevice")
-      .value("CPU", ExecutionStrategy::UseDevice::kCPU)
-      .value("CUDA", ExecutionStrategy::UseDevice::kCUDA)
-      .value("XPU", ExecutionStrategy::UseDevice::kXPU);
+  py::enum_<paddle::platform::DeviceType>(m, "DeviceType", py::arithmetic())
+      .value("CPU", paddle::platform::DeviceType::CPU)
+      .value("CUDA", paddle::platform::DeviceType::CUDA)
+      .value("XPU", paddle::platform::DeviceType::XPU);
   exec_strategy.def(py::init())
       .def_property(
...
...
@@ -2117,7 +2118,7 @@ All parameter, weight, gradient are variables in Paddle.
       .def_property(
           "_use_device",
           [](const ExecutionStrategy &self) { return self.use_device_; },
-          [](ExecutionStrategy &self, ExecutionStrategy::UseDevice use_device) {
+          [](ExecutionStrategy &self, paddle::platform::DeviceType use_device) {
             self.use_device_ = use_device;
           })
           // NOTE(liuyuhui): Doesn't add doc for 'use_device', because
           // use_device isn't exposed to users.
...
...
python/paddle/fluid/compiler.py
...
...
@@ -28,6 +28,7 @@ ExecutionStrategy = core.ParallelExecutor.ExecutionStrategy
 BuildStrategy = core.ParallelExecutor.BuildStrategy
 InferNativeConfig = core.NativeConfig
 InferAnalysisConfig = core.AnalysisConfig
+DeviceType = core.DeviceType
 def _place_obj(place):
...
...
@@ -345,17 +346,17 @@ class CompiledProgram(object):
         self._exec_strategy._use_device = use_device

         if self._exec_strategy.num_threads == 0:
-            if self._exec_strategy._use_device == ExecutionStrategy.UseDevice.CUDA:
+            if self._exec_strategy._use_device == DeviceType.CUDA:
                 # Experiments on se-resnext shows that too many threads hurt
                 # performance. Worth tuning for other models in the future.
                 self._exec_strategy.num_threads = len(places) * 4
-            elif self._exec_strategy._use_device == ExecutionStrategy.UseDevice.XPU:
+            elif self._exec_strategy._use_device == DeviceType.XPU:
                 # Currently only single thread is supported in Kunlun XPU.
                 self._exec_strategy.num_threads = 1
             else:
                 self._exec_strategy.num_threads = len(places) * 2

-        if self._exec_strategy._use_device == ExecutionStrategy.UseDevice.XPU:
+        if self._exec_strategy._use_device == DeviceType.XPU:
             assert self._exec_strategy.num_threads == 1, \
                 "Currently only single thread is supported in Kunlun XPU."
...
...
@@ -384,7 +385,7 @@ class CompiledProgram(object):
             self._build_strategy.enable_sequential_execution = True

         if self._program is not None and self._program._enable_dgc:
-            assert self._exec_strategy._use_device == ExecutionStrategy.UseDevice.CUDA, "DGC only used under CUDA environment."
+            assert self._exec_strategy._use_device == DeviceType.CUDA, "DGC only used under CUDA environment."
             assert self._build_strategy.num_trainers * len(
                 places) > 1, "DGC is not avaliable for single card training."
             assert self._build_strategy.reduce_strategy == BuildStrategy.ReduceStrategy.AllReduce, "DGC \
...
...
@@ -455,11 +456,11 @@ class CompiledProgram(object):
                     "If optimizer is used in control flow, "
                     "training on multi-places is not supported now.")
         if isinstance(self._place, core.CUDAPlace):
-            use_device = ExecutionStrategy.UseDevice.CUDA
+            use_device = DeviceType.CUDA
         elif isinstance(self._place, core.XPUPlace):
-            use_device = ExecutionStrategy.UseDevice.XPU
+            use_device = DeviceType.XPU
         else:
-            use_device = ExecutionStrategy.UseDevice.CPU
+            use_device = DeviceType.CPU
         self._executor = self._compile_data_parallel(
             use_device=use_device, scope=self._scope, places=self._places)
         return self
...
...
python/paddle/fluid/framework.py
...
...
@@ -462,6 +462,7 @@ def xpu_places(device_ids=None):
         list of paddle.XPUPlace: Created XPU place list.
     Examples:
         .. code-block:: python
+
             import paddle
             import paddle.static as static
...
...
python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
...
...
@@ -30,11 +30,17 @@ from feed_data_reader import FeedDataReader
 __all__ = ['TestParallelExecutorBase']
+class DeviceType:
+    CPU = 1
+    GPU = 2
+    XPU = 3
+
+
 class TestParallelExecutorBase(unittest.TestCase):
     @classmethod
     def check_network_convergence(cls,
                                   method,
-                                  use_cuda=True,
+                                  use_device=DeviceType.GPU,
                                   iter=5,
                                   batch_size=None,
                                   feed_dict=None,
...
...
@@ -74,7 +80,9 @@ class TestParallelExecutorBase(unittest.TestCase):
             feed_dict, loss = cls.build_model(feed_dict, get_data_from_feeder,
                                               main, method, optimizer)
-            place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+            place = fluid.CUDAPlace(
+                0) if use_device == DeviceType.GPU else fluid.XPUPlace(
+                    0) if use_device == DeviceType.XPU else fluid.CPUPlace()
             exe = fluid.Executor(place)
             exe.run(startup)
...
...
@@ -82,7 +90,7 @@ class TestParallelExecutorBase(unittest.TestCase):
                 enable_inplace, enable_sequential_execution,
                 fuse_all_optimizer_ops, fuse_all_reduce_ops,
                 fuse_elewise_add_act_ops, fuse_relu_depthwise_conv,
                 use_fast_executor, use_ir_memory_optimize,
-                use_reduce, use_cuda)
+                use_reduce, use_device)
             if use_parallel_executor:
                 binary = compiler.CompiledProgram(main).with_data_parallel(
...
...
@@ -94,7 +102,8 @@ class TestParallelExecutorBase(unittest.TestCase):
             if batch_size is not None:
                 batch_size *= fluid.core.get_cuda_device_count(
-                ) if use_cuda else int(
+                ) if use_device == DeviceType.GPU else fluid.core.get_xpu_device_count(
+                ) if use_device == DeviceType.XPU else int(
                     os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
             begin = time.time()
...
...
@@ -123,7 +132,7 @@ class TestParallelExecutorBase(unittest.TestCase):
     @classmethod
     def check_pass_conflict(cls,
                             method,
-                            use_cuda=True,
+                            use_device=DeviceType.GPU,
                             feed_dict=None,
                             get_data_from_feeder=None,
                             use_reduce=False,
...
...
@@ -143,7 +152,9 @@ class TestParallelExecutorBase(unittest.TestCase):
         feed_dict, loss = cls.build_model(feed_dict, get_data_from_feeder,
                                           main, method, optimizer)
-        place = fluid.CUDAPlace(0) if use_cuda else fluid.CPUPlace()
+        place = fluid.CUDAPlace(
+            0) if use_device == DeviceType.GPU else fluid.XPUPlace(
+                0) if use_device == DeviceType.XPU else fluid.CPUPlace()
         exe = fluid.Executor(place)
         exe.run(startup)
...
...
@@ -151,7 +162,7 @@ class TestParallelExecutorBase(unittest.TestCase):
                 enable_inplace, enable_sequential_execution,
                 fuse_all_optimizer_ops, fuse_all_reduce_ops,
                 fuse_elewise_add_act_ops, fuse_relu_depthwise_conv,
                 use_fast_executor, use_ir_memory_optimize,
-                use_reduce, use_cuda)
+                use_reduce, use_device)
         binary = compiler.CompiledProgram(main).with_data_parallel(
             loss_name=loss.name,
...
...
@@ -165,7 +176,7 @@ class TestParallelExecutorBase(unittest.TestCase):
                                              fuse_all_optimizer_ops,
                                              fuse_all_reduce_ops,
                                              fuse_elewise_add_act_ops,
                                              fuse_relu_depthwise_conv,
                                              use_fast_executor,
                                              use_ir_memory_optimize,
-                                             use_reduce, use_cuda):
+                                             use_reduce, use_device):
         exec_strategy = fluid.ExecutionStrategy()
         if use_fast_executor:
             exec_strategy.use_experimental_executor = True
...
...
@@ -180,8 +191,17 @@ class TestParallelExecutorBase(unittest.TestCase):
         build_strategy.enable_inplace = enable_inplace
         build_strategy.enable_sequential_execution = enable_sequential_execution
-        if use_cuda and core.is_compiled_with_cuda():
+        if use_device == DeviceType.GPU and core.is_compiled_with_cuda():
             build_strategy.remove_unnecessary_lock = True
+        if use_device == DeviceType.XPU and core.is_compiled_with_xpu():
+            build_strategy.fuse_elewise_add_act_ops = False
+            build_strategy.fuse_relu_depthwise_conv = False
+            build_strategy.fuse_all_optimizer_ops = False
+            build_strategy.fuse_all_reduce_ops = False
+            build_strategy.memory_optimize = False
+            build_strategy.enable_inplace = False
+            build_strategy.enable_sequential_execution = False
+
         return build_strategy, exec_strategy

     @classmethod
...
...
python/paddle/fluid/tests/unittests/seresnext_net.py
...
...
@@ -19,6 +19,7 @@ fluid.core._set_eager_deletion_mode(-1, -1, False)
 import paddle.fluid.layers.ops as ops
 from paddle.fluid.layers.learning_rate_scheduler import cosine_decay
 from simple_nets import init_data
+from seresnext_test_base import DeviceType
 import math
 import os
 os.environ['CPU_NUM'] = str(4)
...
...
@@ -169,28 +170,32 @@ def optimizer(learning_rate=0.01):
 model = SE_ResNeXt50Small

-def batch_size(use_cuda):
-    if use_cuda:
+def batch_size(use_device):
+    if use_device == DeviceType.GPU:
         # Paddle uses 8GB P4 GPU for unittest so we decreased the batch size.
         return 8
     return 12

-def iter(use_cuda):
-    if use_cuda:
+def iter(use_device):
+    if use_device == DeviceType.GPU:
         return 10
     return 1

 gpu_img, gpu_label = init_data(
-    batch_size=batch_size(use_cuda=True), img_shape=img_shape, label_range=999)
+    batch_size=batch_size(use_device=DeviceType.GPU),
+    img_shape=img_shape,
+    label_range=999)
 cpu_img, cpu_label = init_data(
-    batch_size=batch_size(use_cuda=False), img_shape=img_shape, label_range=999)
+    batch_size=batch_size(use_device=DeviceType.CPU),
+    img_shape=img_shape,
+    label_range=999)
 feed_dict_gpu = {"image": gpu_img, "label": gpu_label}
 feed_dict_cpu = {"image": cpu_img, "label": cpu_label}

-def feed_dict(use_cuda):
-    if use_cuda:
+def feed_dict(use_device):
+    if use_device == DeviceType.GPU:
         return feed_dict_gpu
     return feed_dict_cpu
python/paddle/fluid/tests/unittests/seresnext_test_base.py
...
...
@@ -15,34 +15,35 @@
 from __future__ import print_function
 import seresnext_net
 import paddle.fluid.core as core
-from parallel_executor_test_base import TestParallelExecutorBase
+from parallel_executor_test_base import TestParallelExecutorBase, DeviceType
+from parallel_executor_test_base import DeviceType
 import numpy as np

 class TestResnetBase(TestParallelExecutorBase):
     def _compare_result_with_origin_model(self,
                                           check_func,
-                                          use_cuda,
+                                          use_device,
                                           delta2=1e-5,
                                           compare_seperately=True):
-        if use_cuda and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
             return

         func_1_first_loss, func_1_last_loss = self.check_network_convergence(
             seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_cuda),
-            iter=seresnext_net.iter(use_cuda),
-            batch_size=seresnext_net.batch_size(use_cuda),
-            use_cuda=use_cuda,
+            feed_dict=seresnext_net.feed_dict(use_device),
+            iter=seresnext_net.iter(use_device),
+            batch_size=seresnext_net.batch_size(use_device),
+            use_device=use_device,
             use_reduce=False,
             optimizer=seresnext_net.optimizer)
         func_2_first_loss, func_2_last_loss = check_func(
             seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_cuda),
-            iter=seresnext_net.iter(use_cuda),
-            batch_size=seresnext_net.batch_size(use_cuda),
-            use_cuda=use_cuda)
+            feed_dict=seresnext_net.feed_dict(use_device),
+            iter=seresnext_net.iter(use_device),
+            batch_size=seresnext_net.batch_size(use_device),
+            use_device=use_device)
         if compare_seperately:
             for loss in zip(func_1_first_loss, func_2_first_loss):
...
...
python/paddle/fluid/tests/unittests/test_fuse_all_reduce_pass.py
...
...
@@ -14,7 +14,7 @@
 from simple_nets import simple_fc_net, fc_with_batchnorm, init_data, bow_net
 from fake_reader import fake_imdb_reader
-from parallel_executor_test_base import TestParallelExecutorBase
+from parallel_executor_test_base import TestParallelExecutorBase, DeviceType
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 from functools import partial
...
...
@@ -30,12 +30,12 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
     def compare_fuse_all_reduce_ops(self,
                                     model,
-                                    use_cuda,
+                                    use_device,
                                     init_feed_dict=None,
                                     get_data_from_feeder=None,
                                     optimizer=None,
                                     fuse_all_optimizer_ops=False):
-        if use_cuda and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
             return
         feed_dict_data = None
...
...
@@ -47,7 +47,7 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
             model,
             feed_dict=feed_dict_data,
             get_data_from_feeder=get_data_from_feeder,
-            use_cuda=use_cuda,
+            use_device=use_device,
             fuse_all_reduce_ops=False,
             fuse_all_optimizer_ops=fuse_all_optimizer_ops,
             optimizer=optimizer)
...
...
@@ -55,7 +55,7 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
             model,
             feed_dict=feed_dict_data,
             get_data_from_feeder=get_data_from_feeder,
-            use_cuda=use_cuda,
+            use_device=use_device,
             fuse_all_reduce_ops=True,
             fuse_all_optimizer_ops=fuse_all_optimizer_ops,
             optimizer=optimizer)
...
...
@@ -73,28 +73,30 @@ class TestFuseAllReduceOpsBase(TestParallelExecutorBase):
 class TestFuseAllReduceOps(TestFuseAllReduceOpsBase):
-    def _decorate_compare_fused_all_reduce(self, model, use_cuda):
+    def _decorate_compare_fused_all_reduce(self, model, use_device):
         self.compare_fuse_all_reduce_ops(
             model,
-            use_cuda,
+            use_device,
             init_feed_dict=init_data,
             optimizer=self.optimizer,
             fuse_all_optimizer_ops=True)

     def test_simple_fc_with_fuse_all_reduce(self):
-        self._decorate_compare_fused_all_reduce(simple_fc_net, True)
-        self._decorate_compare_fused_all_reduce(simple_fc_net, False)
+        self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.GPU)
+        self._decorate_compare_fused_all_reduce(simple_fc_net, DeviceType.CPU)

     def test_batchnorm_fc_with_fuse_all_reduce(self):
-        self._decorate_compare_fused_all_reduce(fc_with_batchnorm, True)
-        self._decorate_compare_fused_all_reduce(fc_with_batchnorm, False)
+        self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
+                                                DeviceType.GPU)
+        self._decorate_compare_fused_all_reduce(fc_with_batchnorm,
+                                                DeviceType.CPU)

 class TestFuseAllReduceOpsAndOptiOps(TestFuseAllReduceOps):
-    def _decorate_compare_fused_all_reduce(self, model, use_cuda):
+    def _decorate_compare_fused_all_reduce(self, model, use_device):
         self.compare_fuse_all_reduce_ops(
             model,
-            use_cuda,
+            use_device,
             init_feed_dict=init_data,
             optimizer=self.optimizer,
             fuse_all_optimizer_ops=True)
...
...
@@ -115,17 +117,17 @@ class TestFuseAllReduceOpsWithSparseGrad(TestFuseAllReduceOpsBase):
         feeder = fluid.DataFeeder(feed_list=["words", "label"], place=place)
         return feeder.feed(self.train_data)

-    def _decorate_compare_fused_all_reduce(self, model, use_cuda):
+    def _decorate_compare_fused_all_reduce(self, model, use_device):
         self.compare_fuse_all_reduce_ops(
             model,
-            use_cuda,
+            use_device,
             get_data_from_feeder=self.get_data_from_feeder,
             optimizer=self.optimizer)

     def test_simple_bow_net_with_fuse_all_reduce(self):
         model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
-        self._decorate_compare_fused_all_reduce(model, True)
-        self._decorate_compare_fused_all_reduce(model, False)
+        self._decorate_compare_fused_all_reduce(model, DeviceType.GPU)
+        self._decorate_compare_fused_all_reduce(model, DeviceType.CPU)

 if __name__ == '__main__':
...
...
python/paddle/fluid/tests/unittests/test_fuse_elewise_add_act_pass.py
...
...
@@ -13,7 +13,7 @@
 # limitations under the License.
 from simple_nets import simple_fc_net, fc_with_batchnorm, init_data
-from parallel_executor_test_base import TestParallelExecutorBase
+from parallel_executor_test_base import TestParallelExecutorBase, DeviceType
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import unittest
...
...
@@ -25,8 +25,8 @@ class TestMNIST(TestParallelExecutorBase):
     def setUpClass(cls):
         os.environ['CPU_NUM'] = str(4)

-    def _compare_fuse_elewise_add_act_ops(self, model, use_cuda):
-        if use_cuda and not core.is_compiled_with_cuda():
+    def _compare_fuse_elewise_add_act_ops(self, model, use_device):
+        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
             return
         img, label = init_data()
...
...
@@ -45,7 +45,7 @@ class TestMNIST(TestParallelExecutorBase):
             model,
             feed_dict={"image": img, "label": label},
-            use_cuda=use_cuda,
+            use_device=use_device,
             fuse_elewise_add_act_ops=False,
             use_ir_memory_optimize=False,
             enable_inplace=False,
...
...
@@ -54,7 +54,7 @@ class TestMNIST(TestParallelExecutorBase):
             model,
             feed_dict={"image": img, "label": label},
-            use_cuda=use_cuda,
+            use_device=use_device,
             fuse_elewise_add_act_ops=True,
             use_ir_memory_optimize=False,
             enable_inplace=False,
...
...
@@ -66,12 +66,14 @@ class TestMNIST(TestParallelExecutorBase):
             self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)

     def test_simple_fc_with_fuse_op(self):
-        self._compare_fuse_elewise_add_act_ops(simple_fc_net, True)
-        self._compare_fuse_elewise_add_act_ops(simple_fc_net, False)
+        self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.GPU)
+        self._compare_fuse_elewise_add_act_ops(simple_fc_net, DeviceType.CPU)

     def test_batchnorm_fc_with_fuse_op(self):
-        self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm, True)
-        self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm, False)
+        self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm,
+                                               DeviceType.GPU)
+        self._compare_fuse_elewise_add_act_ops(fc_with_batchnorm,
+                                               DeviceType.CPU)

 if __name__ == '__main__':
...
...
python/paddle/fluid/tests/unittests/test_fuse_optimizer_pass.py
...
...
@@ -14,7 +14,7 @@
 from simple_nets import simple_fc_net, fc_with_batchnorm, init_data, bow_net
 from fake_reader import fake_imdb_reader
-from parallel_executor_test_base import TestParallelExecutorBase
+from parallel_executor_test_base import TestParallelExecutorBase, DeviceType
 from functools import partial
 import paddle
 import paddle.fluid as fluid
...
...
@@ -34,25 +34,25 @@ class TestFuseOptimizationOps(TestParallelExecutorBase):
     def _compare_fused_optimizer_ops(self,
                                      model,
-                                     use_cuda,
+                                     use_device,
                                      feed_dict=None,
                                      get_data_from_feeder=None,
                                      optimizer=fluid.optimizer.Adam):
-        if use_cuda and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
             return

         not_fuse_op_first_loss, not_fuse_op_last_loss = self.check_network_convergence(
             model,
             feed_dict=feed_dict,
             get_data_from_feeder=get_data_from_feeder,
-            use_cuda=use_cuda,
+            use_device=use_device,
             fuse_all_optimizer_ops=False,
             optimizer=optimizer)
         fuse_op_first_loss, fuse_op_last_loss = self.check_network_convergence(
             model,
             feed_dict=feed_dict,
             get_data_from_feeder=get_data_from_feeder,
-            use_cuda=use_cuda,
+            use_device=use_device,
             fuse_all_optimizer_ops=True,
             optimizer=optimizer)
...
...
@@ -61,10 +61,11 @@ class TestFuseOptimizationOps(TestParallelExecutorBase):
         for loss in zip(not_fuse_op_last_loss, fuse_op_last_loss):
             self.assertAlmostEquals(loss[0], loss[1], delta=1e-6)

-    def _decorate_compare_fused_optimizer_ops(self, model, use_cuda, optimizer):
+    def _decorate_compare_fused_optimizer_ops(self, model, use_device,
+                                              optimizer):
         self._compare_fused_optimizer_ops(
             model,
-            use_cuda,
+            use_device,
             feed_dict=self._get_feed_dict(),
             optimizer=optimizer)
...
...
@@ -75,9 +76,9 @@ class TestFuseAdamOps(TestFuseOptimizationOps):
     def test_batchnorm_fc_with_fuse_op(self):
         self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, True, optimizer=self.optimizer)
+            fc_with_batchnorm, DeviceType.GPU, optimizer=self.optimizer)
         self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, False, optimizer=self.optimizer)
+            fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer)

 class TestFuseSGDOps(TestFuseAdamOps):
...
...
@@ -106,10 +107,11 @@ class TestSpareFuseAdamOps(TestFuseOptimizationOps):
         feeder = fluid.DataFeeder(feed_list=["words", "label"], place=place)
         return feeder.feed(self.train_data)

-    def _decorate_compare_fused_optimizer_ops(self, model, use_cuda, optimizer):
+    def _decorate_compare_fused_optimizer_ops(self, model, use_device,
+                                              optimizer):
         self._compare_fused_optimizer_ops(
             model,
-            use_cuda,
+            use_device,
             get_data_from_feeder=self._get_data_from_feeder,
             optimizer=optimizer)
...
...
@@ -119,9 +121,9 @@ class TestSpareFuseAdamOps(TestFuseOptimizationOps):
     def test_simple_bow_net_with_fuse_op(self):
         model = partial(bow_net, dict_dim=self.word_dict_len, is_sparse=True)
         self._decorate_compare_fused_optimizer_ops(
-            model, True, optimizer=self.optimizer)
+            model, DeviceType.GPU, optimizer=self.optimizer)
         self._decorate_compare_fused_optimizer_ops(
-            model, False, optimizer=self.optimizer)
+            model, DeviceType.CPU, optimizer=self.optimizer)

 class TestSpareFuseSGDOps(TestSpareFuseAdamOps):
...
...
@@ -138,18 +140,18 @@ class TestSpareFuseMomentumOps(TestSpareFuseAdamOps):
 class TestPassConflictBase(TestFuseAdamOps):
     def _compare_fused_optimizer_ops(self,
                                      model,
-                                     use_cuda,
+                                     use_device,
                                      feed_dict=None,
                                      get_data_from_feeder=None,
                                      optimizer=fluid.optimizer.Adam):
-        if use_cuda and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
             return

         self.check_pass_conflict(
             model,
             feed_dict=feed_dict,
             get_data_from_feeder=get_data_from_feeder,
-            use_cuda=use_cuda,
+            use_device=use_device,
             fuse_all_optimizer_ops=True,
             optimizer=optimizer,
             enable_sequential_execution=True)
...
...
@@ -161,9 +163,9 @@ class TestFuseAdamOpsPassConflict(TestPassConflictBase):
     def test_batchnorm_fc_with_fuse_op(self):
         self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, True, optimizer=self.optimizer)
+            fc_with_batchnorm, DeviceType.CPU, optimizer=self.optimizer)
         self._decorate_compare_fused_optimizer_ops(
-            fc_with_batchnorm, False, optimizer=self.optimizer)
+            fc_with_batchnorm, DeviceType.GPU, optimizer=self.optimizer)

 class TestFuseSGDOpsPassConflict(TestFuseAdamOpsPassConflict):
...
...
python/paddle/fluid/tests/unittests/test_fuse_relu_depthwise_conv_pass.py
...
...
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-from parallel_executor_test_base import TestParallelExecutorBase
+from parallel_executor_test_base import TestParallelExecutorBase, DeviceType
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import numpy as np
...
...
@@ -72,8 +72,8 @@ class TestMNIST(TestParallelExecutorBase):
         label = np.ones(shape=[32, 1], dtype='int64')
         return img, label

-    def _compare(self, model, use_cuda, random_data=True, only_forward=False):
-        if use_cuda and not core.is_compiled_with_cuda():
+    def _compare(self, model, use_device, random_data=True, only_forward=False):
+        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
             return
         img, label = self._init_data(random_data)
...
...
@@ -90,7 +90,7 @@ class TestMNIST(TestParallelExecutorBase):
             model,
             feed_dict={"image": img, "label": label},
-            use_cuda=use_cuda,
+            use_device=use_device,
             fuse_relu_depthwise_conv=True,
             use_ir_memory_optimize=True,
             optimizer=_optimizer)
...
...
@@ -98,7 +98,7 @@ class TestMNIST(TestParallelExecutorBase):
model
,
feed_dict
=
{
"image"
:
img
,
"label"
:
label
},
use_
cuda
=
use_cuda
,
use_
device
=
use_device
,
fuse_relu_depthwise_conv
=
False
,
optimizer
=
_optimizer
)
...
...
@@ -108,12 +108,12 @@ class TestMNIST(TestParallelExecutorBase):
self
.
assertAlmostEquals
(
loss
[
0
],
loss
[
1
],
delta
=
1e-6
)
def
test_simple_depthwise_with_fuse_op
(
self
):
self
.
_compare
(
simple_depthwise_net
,
True
)
self
.
_compare
(
simple_depthwise_net
,
False
)
self
.
_compare
(
simple_depthwise_net
,
DeviceType
.
GPU
)
self
.
_compare
(
simple_depthwise_net
,
DeviceType
.
CPU
)
def
test_simple_depthwise_with_fuse_op_only_forward
(
self
):
self
.
_compare
(
simple_depthwise_net
,
True
,
only_forward
=
True
)
self
.
_compare
(
simple_depthwise_net
,
False
,
only_forward
=
True
)
self
.
_compare
(
simple_depthwise_net
,
DeviceType
.
GPU
,
only_forward
=
True
)
self
.
_compare
(
simple_depthwise_net
,
DeviceType
.
CPU
,
only_forward
=
True
)
if
__name__
==
'__main__'
:
...
...
python/paddle/fluid/tests/unittests/test_ir_inplace_pass.py
View file @ 4427df37
...
...
@@ -19,7 +19,7 @@ import unittest
 import numpy as np
 import paddle.fluid.core as core
 import paddle.fluid as fluid
-from parallel_executor_test_base import TestParallelExecutorBase
+from parallel_executor_test_base import TestParallelExecutorBase, DeviceType

 def fc_with_batchnorm(use_feed):
...
...
@@ -58,7 +58,7 @@ class TestIrInplace(TestParallelExecutorBase):
             fc_with_batchnorm,
             feed_dict={"image": img, "label": label},
-            use_cuda=True,
+            use_device=DeviceType.GPU,
             use_ir_memory_optimize=ir_memory_optimize,
             enable_inplace=enable_inplace)
...
...
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_ifelse_op.py
View file @ 4427df37
...
...
@@ -75,7 +75,7 @@ class TestIrMemoryOptimizeIfElseOp(unittest.TestCase):
         exe = Executor(place)

         exec_strategy = fluid.ExecutionStrategy()
-        exec_strategy._use_device = fluid.ExecutionStrategy.UseDevice.CUDA if use_cuda else fluid.ExecutionStrategy.UseDevice.CPU
+        exec_strategy._use_device = core.DeviceType.CUDA if use_cuda else core.DeviceType.CPU

         build_strategy = fluid.BuildStrategy()
         build_strategy.memory_optimize = use_mem_opt
...
...
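Note the API shift above: `ExecutionStrategy` no longer exposes its own nested `UseDevice` enum; `_use_device` is assigned a `core.DeviceType` value directly. A hedged usage sketch (the runtime CUDA probe is an illustrative way to pick a device, not part of the diff):

import paddle.fluid as fluid
import paddle.fluid.core as core

use_cuda = core.is_compiled_with_cuda()  # illustrative device selection
exec_strategy = fluid.ExecutionStrategy()
# After this commit the strategy consumes core.DeviceType directly; a
# Kunlun build would presumably pass core.DeviceType.XPU here instead.
exec_strategy._use_device = (core.DeviceType.CUDA
                             if use_cuda else core.DeviceType.CPU)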
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_pass.py
View file @ 4427df37
...
...
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-from parallel_executor_test_base import TestParallelExecutorBase
+from parallel_executor_test_base import TestParallelExecutorBase, DeviceType
 import paddle.fluid as fluid
 import paddle.fluid.core as core
 import numpy as np
...
...
@@ -60,8 +60,8 @@ class TestMNIST(TestParallelExecutorBase):
         label = np.ones(shape=[32, 1], dtype='int64')
         return img, label

-    def _compare_ir_memory_optimize(self, model, use_cuda):
-        if use_cuda and not core.is_compiled_with_cuda():
+    def _compare_ir_memory_optimize(self, model, use_device):
+        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
             return

         img, label = self._dummy_data()
...
...
@@ -69,13 +69,13 @@ class TestMNIST(TestParallelExecutorBase):
             model,
             feed_dict={"image": img, "label": label},
-            use_cuda=use_cuda,
+            use_device=use_device,
             use_ir_memory_optimize=False)
         first_loss1, last_loss1 = self.check_network_convergence(
             model,
             feed_dict={"image": img, "label": label},
-            use_cuda=use_cuda,
+            use_device=use_device,
             use_ir_memory_optimize=True)
         for loss in zip(first_loss0, first_loss1):
             self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)
...
...
@@ -83,12 +83,12 @@ class TestMNIST(TestParallelExecutorBase):
             self.assertAlmostEqual(loss[0], loss[1], delta=1e-6)

     def test_simple_fc_net(self):
-        self._compare_ir_memory_optimize(simple_fc_net, False)
-        self._compare_ir_memory_optimize(simple_fc_net, True)
+        self._compare_ir_memory_optimize(simple_fc_net, DeviceType.CPU)
+        self._compare_ir_memory_optimize(simple_fc_net, DeviceType.GPU)

     def test_fc_with_reshape_net(self):
-        self._compare_ir_memory_optimize(fc_with_inplace_net, False)
-        self._compare_ir_memory_optimize(fc_with_inplace_net, True)
+        self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.CPU)
+        self._compare_ir_memory_optimize(fc_with_inplace_net, DeviceType.GPU)

 if __name__ == '__main__':
...
...
python/paddle/fluid/tests/unittests/test_ir_memory_optimize_transformer.py
View file @ 4427df37
...
...
@@ -23,7 +23,7 @@ import paddle.dataset.wmt16 as wmt16
 os.environ['FLAGS_eager_delete_tensor_gb'] = "0.0"

-from parallel_executor_test_base import TestParallelExecutorBase
+from parallel_executor_test_base import TestParallelExecutorBase, DeviceType
 from test_parallel_executor_transformer import get_feed_data_reader, transformer
...
...
@@ -35,14 +35,14 @@ class TestTransformerWithIR(TestParallelExecutorBase):
             # check python transpiler
             self.check_network_convergence(
                 transformer,
-                use_cuda=True,
+                use_device=DeviceType.GPU,
                 feed_data_reader=get_feed_data_reader(),
                 use_ir_memory_optimize=False,
                 iter=2)
             # check IR memory optimize
             self.check_network_convergence(
                 transformer,
-                use_cuda=True,
+                use_device=DeviceType.GPU,
                 feed_data_reader=get_feed_data_reader(),
                 use_ir_memory_optimize=True,
                 iter=2)
...
...
python/paddle/fluid/tests/unittests/test_mix_precision_all_reduce_fuse.py
View file @ 4427df37
...
...
@@ -24,7 +24,7 @@ import numpy as np
 import paddle
 import paddle.fluid as fluid
 from simple_nets import init_data
-from parallel_executor_test_base import TestParallelExecutorBase
+from parallel_executor_test_base import TestParallelExecutorBase, DeviceType

 batch_size = 12
 img_shape = [1, 28, 28]
...
...
@@ -68,7 +68,7 @@ def _optimizer(learning_rate=1e-6):
 class TestResnet(TestParallelExecutorBase):
-    def check_model(self, use_cuda):
+    def check_model(self, use_device):
         img, label = init_data(
             batch_size=batch_size, img_shape=img_shape, label_range=9)
         img = np.float16(img)
...
...
@@ -78,13 +78,13 @@ class TestResnet(TestParallelExecutorBase):
             conv_net,
             feed_dict=feed_dict,
             iter=10,
-            use_cuda=use_cuda,
+            use_device=use_device,
             fuse_all_reduce_ops=True,
             optimizer=_optimizer)

     def test_model(self):
         if core.is_compiled_with_cuda():
-            self.check_model(True)
+            self.check_model(DeviceType.GPU)

 if __name__ == '__main__':
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
View file @ 4427df37
...
...
@@ -18,9 +18,11 @@ import unittest
 import numpy as np
 import paddle.fluid.core as core
+import paddle
 import os
 import paddle.fluid as fluid
-from parallel_executor_test_base import TestParallelExecutorBase
+from parallel_executor_test_base import TestParallelExecutorBase, DeviceType
+from parallel_executor_test_base import DeviceType

 def simple_fc_net(use_feed):
...
...
@@ -76,10 +78,13 @@ class TestMNIST(TestParallelExecutorBase):
     def _compare_reduce_and_allreduce(self,
                                       model,
-                                      use_cuda,
+                                      use_device,
                                       delta1=1e-6,
                                       delta2=1e-4):
-        if use_cuda and not core.is_compiled_with_cuda():
+        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
             return
+        if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
+            return

         img, label = self._init_data()
...
...
@@ -88,14 +93,14 @@ class TestMNIST(TestParallelExecutorBase):
             model,
             feed_dict={"image": img, "label": label},
-            use_cuda=use_cuda,
+            use_device=use_device,
             use_reduce=False)
         reduce_first_loss, reduce_last_loss = self.check_network_convergence(
             model,
             feed_dict={"image": img, "label": label},
-            use_cuda=use_cuda,
+            use_device=use_device,
             use_reduce=True)

         for loss in zip(all_reduce_first_loss, reduce_first_loss):
...
...
@@ -104,8 +109,11 @@ class TestMNIST(TestParallelExecutorBase):
             self.assertAlmostEqual(loss[0], loss[1], delta=delta2)

     # simple_fc
-    def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
-        if use_cuda and not core.is_compiled_with_cuda():
+    def check_simple_fc_convergence(self, use_device, use_reduce=False):
+        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
             return
+        if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
+            return

         img, label = self._init_data()
...
...
@@ -114,23 +122,26 @@ class TestMNIST(TestParallelExecutorBase):
             simple_fc_net,
             feed_dict={"image": img, "label": label},
-            use_cuda=use_cuda,
+            use_device=use_device,
             use_reduce=use_reduce)

     def test_simple_fc(self):
-        # use_cuda
-        self.check_simple_fc_convergence(True)
-        self.check_simple_fc_convergence(False)
+        # use_device
+        self.check_simple_fc_convergence(DeviceType.GPU)
+        self.check_simple_fc_convergence(DeviceType.CPU)
+        self.check_simple_fc_convergence(DeviceType.XPU)

     def test_simple_fc_with_new_strategy(self):
-        # use_cuda, use_reduce
+        # use_device, use_reduce
         # NOTE: the computation result of nccl_reduce is non-deterministic,
         # related issue: https://github.com/NVIDIA/nccl/issues/157
-        self._compare_reduce_and_allreduce(simple_fc_net, True, 1e-5, 1e-2)
-        self._compare_reduce_and_allreduce(simple_fc_net, False, 1e-5, 1e-2)
+        self._compare_reduce_and_allreduce(simple_fc_net, DeviceType.GPU,
+                                           1e-5, 1e-2)
+        self._compare_reduce_and_allreduce(simple_fc_net, DeviceType.CPU,
+                                           1e-5, 1e-2)

-    def check_simple_fc_parallel_accuracy(self, use_cuda):
-        if use_cuda and not core.is_compiled_with_cuda():
+    def check_simple_fc_parallel_accuracy(self, use_device):
+        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
             return

         img, label = self._init_data()
...
...
@@ -139,13 +150,13 @@ class TestMNIST(TestParallelExecutorBase):
             method=simple_fc_net,
             feed_dict={"image": img, "label": label},
-            use_cuda=use_cuda,
+            use_device=use_device,
             use_parallel_executor=False)
         parallel_first_loss, parallel_last_loss = self.check_network_convergence(
             method=simple_fc_net,
             feed_dict={"image": img, "label": label},
-            use_cuda=use_cuda,
+            use_device=use_device,
             use_parallel_executor=True)

         self.assertAlmostEquals(
...
...
@@ -156,33 +167,38 @@ class TestMNIST(TestParallelExecutorBase):
             np.mean(parallel_last_loss), single_last_loss, delta=1e-6)

     def test_simple_fc_parallel_accuracy(self):
-        self.check_simple_fc_parallel_accuracy(True)
-        self.check_simple_fc_parallel_accuracy(False)
+        self.check_simple_fc_parallel_accuracy(DeviceType.GPU)
+        self.check_simple_fc_parallel_accuracy(DeviceType.CPU)

-    def check_batchnorm_fc_convergence(self, use_cuda, use_fast_executor):
-        if use_cuda and not core.is_compiled_with_cuda():
+    def check_batchnorm_fc_convergence(self, use_device, use_fast_executor):
+        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
             return
+        if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
+            return

         img, label = self._init_data()
         self.check_network_convergence(
             fc_with_batchnorm,
             feed_dict={"image": img, "label": label},
-            use_cuda=use_cuda,
+            use_device=use_device,
             use_fast_executor=use_fast_executor)

     def test_batchnorm_fc(self):
-        for use_cuda in (False, True):
+        for use_device in (DeviceType.CPU, DeviceType.GPU):
             for use_fast_executor in (False, True):
-                self.check_batchnorm_fc_convergence(use_cuda, use_fast_executor)
+                self.check_batchnorm_fc_convergence(use_device,
+                                                    use_fast_executor)

     def test_batchnorm_fc_with_new_strategy(self):
         # NOTE: the computation result of nccl_reduce is non-deterministic,
         # related issue: https://github.com/NVIDIA/nccl/issues/157
-        self._compare_reduce_and_allreduce(fc_with_batchnorm, True, 1e-5, 1e-2)
-        self._compare_reduce_and_allreduce(fc_with_batchnorm, False, 1e-5, 1e-2)
+        self._compare_reduce_and_allreduce(fc_with_batchnorm, DeviceType.GPU,
+                                           1e-5, 1e-2)
+        self._compare_reduce_and_allreduce(fc_with_batchnorm, DeviceType.CPU,
+                                           1e-5, 1e-2)

 if __name__ == '__main__':
     paddle.enable_static()
     unittest.main()
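This file is where the Kunlun (XPU) leg of the test matrix actually lands: every entry point now returns early unless the requested device was compiled in, with `core.is_compiled_with_xpu()` playing the role `is_compiled_with_cuda()` plays for GPU. A compact sketch of that guard, factored into a helper purely for illustration (the helper itself is not in the diff):

import paddle.fluid.core as core
from parallel_executor_test_base import DeviceType

def device_available(use_device):
    # Mirrors the guards added above: skip GPU runs on a CUDA-less build
    # and XPU runs on a build without Kunlun support.
    if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
        return False
    if use_device == DeviceType.XPU and not core.is_compiled_with_xpu():
        return False
    return True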
python/paddle/fluid/tests/unittests/test_parallel_executor_pg.py
View file @ 4427df37
...
...
@@ -21,7 +21,7 @@ import os
 os.environ['FLAGS_enable_parallel_graph'] = str(1)
 import paddle.fluid.core as core
 import os
-from parallel_executor_test_base import TestParallelExecutorBase
+from parallel_executor_test_base import TestParallelExecutorBase, DeviceType
 from simple_nets import simple_fc_net, init_data
...
...
@@ -31,8 +31,8 @@ class TestMNIST(TestParallelExecutorBase):
         os.environ['CPU_NUM'] = str(4)

     # simple_fc
-    def check_simple_fc_convergence(self, use_cuda, use_reduce=False):
-        if use_cuda and not core.is_compiled_with_cuda():
+    def check_simple_fc_convergence(self, use_device, use_reduce=False):
+        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
             return

         img, label = init_data()
...
...
@@ -40,15 +40,15 @@ class TestMNIST(TestParallelExecutorBase):
             simple_fc_net,
             feed_dict={"image": img, "label": label},
-            use_cuda=use_cuda,
+            use_device=use_device,
             use_reduce=use_reduce)

     def test_simple_fc(self):
-        # use_cuda
+        # use_device
         self.check_simple_fc_convergence(True)

-    def check_simple_fc_parallel_accuracy(self, use_cuda):
-        if use_cuda and not core.is_compiled_with_cuda():
+    def check_simple_fc_parallel_accuracy(self, use_device):
+        if use_device and not core.is_compiled_with_cuda():
             return

         img, label = init_data()
...
...
@@ -56,13 +56,13 @@ class TestMNIST(TestParallelExecutorBase):
             method=simple_fc_net,
             feed_dict={"image": img, "label": label},
-            use_cuda=use_cuda,
+            use_device=use_device,
             use_parallel_executor=False)
         parallel_first_loss, parallel_last_loss = self.check_network_convergence(
             method=simple_fc_net,
             feed_dict={"image": img, "label": label},
-            use_cuda=use_cuda,
+            use_device=use_device,
             use_parallel_executor=True)

         self.assertAlmostEquals(
...
...
@@ -73,7 +73,7 @@ class TestMNIST(TestParallelExecutorBase):
             np.mean(parallel_last_loss), single_last_loss, delta=1e-6)

     def test_simple_fc_parallel_accuracy(self):
-        self.check_simple_fc_parallel_accuracy(True)
+        self.check_simple_fc_parallel_accuracy(DeviceType.GPU)

 if __name__ == '__main__':
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_cpu.py
View file @ 4427df37
...
...
@@ -15,7 +15,7 @@
 from __future__ import print_function
 import unittest
 import seresnext_net
-from seresnext_test_base import TestResnetBase
+from seresnext_test_base import TestResnetBase, DeviceType
 from functools import partial
...
...
@@ -30,7 +30,10 @@ class TestResnetCPU(TestResnetBase):
             optimizer=seresnext_net.optimizer,
             use_parallel_executor=False)
         self._compare_result_with_origin_model(
-            check_func, use_cuda=False, compare_seperately=False, delta2=1e-3)
+            check_func,
+            use_device=DeviceType.CPU,
+            compare_seperately=False,
+            delta2=1e-3)

 if __name__ == '__main__':
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_base_gpu.py
View file @ 4427df37
...
...
@@ -15,7 +15,7 @@
 from __future__ import print_function
 import unittest
 import seresnext_net
-from seresnext_test_base import TestResnetBase
+from seresnext_test_base import TestResnetBase, DeviceType
 from functools import partial
...
...
@@ -30,7 +30,7 @@ class TestResnetGPU(TestResnetBase):
             optimizer=seresnext_net.optimizer,
             use_parallel_executor=False)
         self._compare_result_with_origin_model(
-            check_func, use_cuda=True, compare_seperately=False)
+            check_func, use_device=DeviceType.GPU, compare_seperately=False)

 if __name__ == '__main__':
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_cpu.py
View file @ 4427df37
...
...
@@ -19,7 +19,7 @@ fluid.core._set_fuse_parameter_memory_size(131072)
 import unittest
 import seresnext_net
-from seresnext_test_base import TestResnetBase
+from seresnext_test_base import TestResnetBase, DeviceType
 from functools import partial
...
...
@@ -31,7 +31,8 @@ class TestResnetWithFuseAllReduceCPU(TestResnetBase):
             self.check_network_convergence,
             optimizer=seresnext_net.optimizer,
             fuse_all_reduce_ops=True)
-        self._compare_result_with_origin_model(check_func, use_cuda=False)
+        self._compare_result_with_origin_model(
+            check_func, use_device=DeviceType.CPU)

 if __name__ == '__main__':
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_fuse_all_reduce_gpu.py
View file @ 4427df37
...
...
@@ -19,7 +19,7 @@ fluid.core._set_fuse_parameter_memory_size(131072)
 import unittest
 import seresnext_net
-from seresnext_test_base import TestResnetBase
+from seresnext_test_base import TestResnetBase, DeviceType
 from functools import partial
...
...
@@ -32,7 +32,7 @@ class TestResnetWithFuseAllReduceGPU(TestResnetBase):
             optimizer=seresnext_net.optimizer,
             fuse_all_reduce_ops=True)
         self._compare_result_with_origin_model(
-            check_func, use_cuda=True, delta2=1e-2)
+            check_func, use_device=DeviceType.GPU, delta2=1e-2)

 if __name__ == '__main__':
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_cpu.py
View file @ 4427df37
...
...
@@ -14,30 +14,30 @@
 from __future__ import print_function
 import unittest
-from parallel_executor_test_base import TestParallelExecutorBase
+from parallel_executor_test_base import TestParallelExecutorBase, DeviceType
 import seresnext_net
 import paddle.fluid.core as core

 class TestResnetWithReduceBase(TestParallelExecutorBase):
-    def _compare_reduce_and_allreduce(self, use_cuda, delta2=1e-5):
-        if use_cuda and not core.is_compiled_with_cuda():
+    def _compare_reduce_and_allreduce(self, use_device, delta2=1e-5):
+        if use_device == DeviceType.GPU and not core.is_compiled_with_cuda():
             return

         all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence(
             seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_cuda),
-            iter=seresnext_net.iter(use_cuda),
-            batch_size=seresnext_net.batch_size(use_cuda),
-            use_cuda=use_cuda,
+            feed_dict=seresnext_net.feed_dict(use_device),
+            iter=seresnext_net.iter(use_device),
+            batch_size=seresnext_net.batch_size(use_device),
+            use_device=use_device,
             use_reduce=False,
             optimizer=seresnext_net.optimizer)
         reduce_first_loss, reduce_last_loss = self.check_network_convergence(
             seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_cuda),
-            iter=seresnext_net.iter(use_cuda),
-            batch_size=seresnext_net.batch_size(use_cuda),
-            use_cuda=use_cuda,
+            feed_dict=seresnext_net.feed_dict(use_device),
+            iter=seresnext_net.iter(use_device),
+            batch_size=seresnext_net.batch_size(use_device),
+            use_device=use_device,
             use_reduce=True,
             optimizer=seresnext_net.optimizer)
...
...
@@ -46,25 +46,25 @@ class TestResnetWithReduceBase(TestParallelExecutorBase):
         for loss in zip(all_reduce_last_loss, reduce_last_loss):
             self.assertAlmostEquals(loss[0], loss[1], delta=loss[0] * delta2)

-        if not use_cuda:
+        if not use_device:
             return

         all_reduce_first_loss_seq, all_reduce_last_loss_seq = self.check_network_convergence(
             seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_cuda),
-            iter=seresnext_net.iter(use_cuda),
-            batch_size=seresnext_net.batch_size(use_cuda),
-            use_cuda=use_cuda,
+            feed_dict=seresnext_net.feed_dict(use_device),
+            iter=seresnext_net.iter(use_device),
+            batch_size=seresnext_net.batch_size(use_device),
+            use_device=use_device,
             use_reduce=False,
             optimizer=seresnext_net.optimizer,
             enable_sequential_execution=True)

         reduce_first_loss_seq, reduce_last_loss_seq = self.check_network_convergence(
             seresnext_net.model,
-            feed_dict=seresnext_net.feed_dict(use_cuda),
-            iter=seresnext_net.iter(use_cuda),
-            batch_size=seresnext_net.batch_size(use_cuda),
-            use_cuda=use_cuda,
+            feed_dict=seresnext_net.feed_dict(use_device),
+            iter=seresnext_net.iter(use_device),
+            batch_size=seresnext_net.batch_size(use_device),
+            use_device=use_device,
             use_reduce=True,
             optimizer=seresnext_net.optimizer,
             enable_sequential_execution=True)
...
...
@@ -87,7 +87,8 @@ class TestResnetWithReduceBase(TestParallelExecutorBase):
 class TestResnetWithReduceCPU(TestResnetWithReduceBase):
     def test_seresnext_with_reduce(self):
-        self._compare_reduce_and_allreduce(use_cuda=False, delta2=1e-3)
+        self._compare_reduce_and_allreduce(
+            use_device=DeviceType.CPU, delta2=1e-3)

 if __name__ == '__main__':
...
...
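The `seresnext_net` helpers are threaded through the same change: `feed_dict`, `iter`, and `batch_size` are now keyed by the device enum rather than a CUDA flag, which keeps per-device batch sizes and iteration counts possible. A hedged call sketch; the returned values are whatever `seresnext_net` defines, and nothing beyond the signatures used above is assumed:

import seresnext_net
from parallel_executor_test_base import DeviceType

# Each helper takes the device enum after this commit.
feed = seresnext_net.feed_dict(DeviceType.CPU)
iters = seresnext_net.iter(DeviceType.CPU)
batch = seresnext_net.batch_size(DeviceType.CPU)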
python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext_with_reduce_gpu.py
View file @ 4427df37
...
...
@@ -14,12 +14,13 @@
 from __future__ import print_function
 import unittest
-from test_parallel_executor_seresnext_with_reduce_cpu import TestResnetWithReduceBase
+from test_parallel_executor_seresnext_with_reduce_cpu import TestResnetWithReduceBase, DeviceType

 class TestResnetWithReduceGPU(TestResnetWithReduceBase):
     def test_seresnext_with_reduce(self):
-        self._compare_reduce_and_allreduce(use_cuda=True, delta2=1e-2)
+        self._compare_reduce_and_allreduce(
+            use_device=DeviceType.GPU, delta2=1e-2)

 if __name__ == '__main__':
...
...
python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
View file @ 4427df37
...
...
@@ -17,7 +17,7 @@ from __future__ import print_function
 import paddle.fluid as fluid
 import transformer_model
 import numpy as np
-from parallel_executor_test_base import TestParallelExecutorBase
+from parallel_executor_test_base import TestParallelExecutorBase, DeviceType
 import unittest
 import paddle
 import paddle.fluid.core as core
...
...
@@ -191,16 +191,16 @@ class TestTransformer(TestParallelExecutorBase):
         if core.is_compiled_with_cuda():
             self.check_network_convergence(
                 transformer,
-                use_cuda=True,
+                use_device=DeviceType.GPU,
                 feed_data_reader=get_feed_data_reader())
             self.check_network_convergence(
                 transformer,
-                use_cuda=True,
+                use_device=DeviceType.GPU,
                 enable_sequential_execution=True,
                 feed_data_reader=get_feed_data_reader())
         self.check_network_convergence(
             transformer,
-            use_cuda=False,
+            use_device=DeviceType.CPU,
             iter=2,
             feed_data_reader=get_feed_data_reader())
...
...
python/paddle/fluid/tests/unittests/test_program_prune_backward.py
View file @ 4427df37
...
...
@@ -22,7 +22,7 @@ import paddle.fluid as fluid
 import paddle.fluid.core as core
 from simple_nets import init_data, simple_fc_net, fc_with_batchnorm
 import seresnext_net
-from test_parallel_executor_transformer import transformer, get_feed_data_reader
+from test_parallel_executor_transformer import transformer, get_feed_data_reader, DeviceType
 from fake_reader import fake_imdb_reader
...
...
@@ -219,7 +219,7 @@ class TestProgramPruneBackward(unittest.TestCase):
         with self.program_scope_guard():
             self.check_prune_correctness(
                 method=seresnext_net.model,
-                feed_dict=seresnext_net.feed_dict(use_cuda=False),
+                feed_dict=seresnext_net.feed_dict(use_device=DeviceType.CPU),
                 optimizer=seresnext_net.optimizer)

     def test_transformer(self):
...
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录