Commit 80dd1672 (unverified) — PaddlePaddle / Paddle
Authored on Apr 06, 2023 by 张春乔; committed by GitHub on Apr 06, 2023.
mv PADDLE_WITH_ASCEND_CL (#52535)
Parent: 29c28e2f
Showing 72 changed files with 47 additions and 1902 deletions (+47, -1902).
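The commit deletes every PADDLE_WITH_ASCEND_CL compile-time branch from the fluid sources; the same preprocessor-guard pattern recurs in each file below. A minimal, self-contained sketch of that pattern (illustrative names only, not the exact Paddle code) shows what removing one backend arm looks like:

// Sketch of the guard pattern this commit strips out. Names are
// illustrative; the real code selects NCCL/RCCL/BKCL/HCCL helpers
// behind the same kind of #if/#elif chain.
#include <stdexcept>

void SyncDeviceStream() {
#if defined(PADDLE_WITH_NCCL)
  // CUDA path: synchronize the GPU stream here.
#elif defined(PADDLE_WITH_ASCEND_CL)
  // NPU path: arms like this one are what the commit deletes, file by file.
#else
  throw std::runtime_error("no device backend compiled in");
#endif
}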
paddle/fluid/distributed/fleet_executor/message_bus.cc  +1 -1
paddle/fluid/framework/operator.cc  +2 -16
paddle/fluid/imperative/amp_auto_cast.cc  +0 -9
paddle/fluid/imperative/gradient_accumulator.cc  +0 -28
paddle/fluid/imperative/hccl_context.h  +0 -59
paddle/fluid/imperative/heter_ccl_context.cc  +0 -7
paddle/fluid/imperative/heter_ccl_context.h  +0 -5
paddle/fluid/imperative/prepared_operator.cc  +0 -11
paddle/fluid/imperative/reducer.cc  +5 -20
paddle/fluid/imperative/reducer.h  +1 -1
paddle/fluid/imperative/tracer.cc  +1 -10
paddle/fluid/operators/amp/check_finite_and_unscale_op.cc  +0 -6
paddle/fluid/operators/coalesce_tensor_op.cc  +1 -45
paddle/fluid/operators/collective/c_allgather_op.cc  +0 -4
paddle/fluid/operators/collective/c_allgather_op_npu.cc  +0 -47
paddle/fluid/operators/collective/c_allgather_op_npu_test.cc  +0 -4
paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc  +0 -4
paddle/fluid/operators/collective/c_allreduce_op.h  +2 -179
paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc  +0 -4
paddle/fluid/operators/collective/c_broadcast_op.cc  +1 -4
paddle/fluid/operators/collective/c_broadcast_op_npu.cc  +0 -54
paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc  +0 -4
paddle/fluid/operators/collective/c_comm_init_hccl_op.cc  +1 -49
paddle/fluid/operators/collective/c_gen_hccl_id_op.cc  +0 -79
paddle/fluid/operators/collective/c_reduce_op.h  +2 -86
paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc  +0 -4
paddle/fluid/operators/collective/c_reducescatter_op.cc  +1 -4
paddle/fluid/operators/collective/c_reducescatter_op_npu.cc  +0 -55
paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc  +0 -4
paddle/fluid/operators/collective/c_sync_calc_stream_op.h  +0 -11
paddle/fluid/operators/collective/c_sync_comm_stream_op.h  +1 -14
paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc  +0 -4
paddle/fluid/operators/collective/checknumeric_npu_test.cc  +0 -4
paddle/fluid/operators/collective/gen_hccl_id_op.cc  +0 -140
paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc  +0 -4
paddle/fluid/operators/collective/mp_allreduce_sum_op.cc  +1 -4
paddle/fluid/operators/collective/partial_allgather_op.cc  +1 -4
paddle/fluid/operators/collective/partial_allgather_op_npu.cc  +0 -59
paddle/fluid/operators/collective/partial_recv_op.cc  +1 -6
paddle/fluid/operators/collective/partial_recv_op_npu.cc  +0 -49
paddle/fluid/operators/collective/partial_send_op.cc  +1 -6
paddle/fluid/operators/collective/partial_send_op_npu.cc  +0 -44
paddle/fluid/operators/collective/recv_v2_op.cc  +1 -6
paddle/fluid/operators/collective/recv_v2_op_npu.cc  +0 -54
paddle/fluid/operators/collective/recv_v2_op_npu_test.cc  +0 -4
paddle/fluid/operators/collective/send_v2_op.cc  +1 -6
paddle/fluid/operators/collective/send_v2_op_npu.cc  +0 -51
paddle/fluid/operators/collective/send_v2_op_npu_test.cc  +0 -4
paddle/fluid/operators/controlflow/conditional_block_op.h  +0 -6
paddle/fluid/operators/controlflow/while_op_helper.cc  +2 -3
paddle/fluid/operators/math/concat_and_split.cc  +1 -88
paddle/fluid/operators/memcpy_h2d_op.cc  +0 -28
paddle/fluid/operators/memcpy_op.cc  +0 -16
paddle/fluid/operators/memcpy_op.h  +1 -8
paddle/fluid/operators/reader/buffered_reader.cc  +0 -65
paddle/fluid/operators/reader/buffered_reader.h  +1 -8
paddle/fluid/operators/run_program_op_npu.cc  +0 -16
paddle/fluid/operators/scatter_op_npu.cc  +0 -104
paddle/fluid/operators/softmax_with_cross_entropy_op.cc  +6 -9
paddle/fluid/operators/tensor_formatter.cc  +0 -5
paddle/fluid/operators/unsqueeze_op_npu.cc  +0 -47
paddle/fluid/operators/utils.h  +1 -1
paddle/fluid/platform/collective_helper.h  +0 -112
paddle/fluid/platform/device_context.cc  +0 -46
paddle/fluid/platform/gen_comm_id_helper.cc  +1 -11
paddle/fluid/platform/init.cc  +1 -3
paddle/fluid/platform/place.h  +0 -12
paddle/fluid/platform/stream_callback_manager.cc  +2 -16
paddle/fluid/pybind/eager_legacy_op_function_generator.cc  +1 -13
paddle/fluid/pybind/imperative.cc  +4 -4
paddle/phi/kernels/funcs/strided_memcpy.h  +1 -1
paddle/testing/paddle_gtest_main.cc  +0 -3
paddle/fluid/distributed/fleet_executor/message_bus.cc
@@ -52,7 +52,7 @@ void MessageBus::Init(
   }

 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
+    defined(PADDLE_WITH_XPU_BKCL)
   // NOTE: To make the brpc is compatible with collective,
   // need release the handler holding the ip address.
   if (addr_ != "") {
paddle/fluid/framework/operator.cc
@@ -2128,12 +2128,7 @@ OpKernelType OperatorWithKernel::InnerGetExpectedKernelType(
     // CPUKernel will be executed and a warning will be given at the same
     // time.
     expected_kernel_key.place_ = platform::CPUPlace();
-#ifdef PADDLE_WITH_ASCEND_CL
-    if (SupportNPU()) {
-      auto& dev_ctx = ctx.device_context();
-      expected_kernel_key.place_ = dev_ctx.GetPlace();
-    }
-#endif
     if (platform::is_cpu_place(expected_kernel_key.place_)) {
       LOG_FIRST_N(WARNING, 1) << "Op(" << type_
@@ -2305,16 +2300,7 @@ void OperatorWithKernel::ChooseKernel(const ExecutionContext& ctx) const {
     kernel_iter = kernels.find(expected_kernel_key);
   }
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (kernel_iter == kernels.end() &&
-      platform::is_npu_place(expected_kernel_key.place_)) {
-    VLOG(3) << "missing NPU kernel: " << type_
-            << ", expected_kernel_key:" << expected_kernel_key
-            << ", fallbacking to CPU one!";
-    expected_kernel_key.place_ = platform::CPUPlace();
-    kernel_iter = kernels.find(expected_kernel_key);
-  }
-#endif
 #ifdef PADDLE_WITH_MLU
   if (kernel_iter == kernels.end() &&
       platform::is_mlu_place(expected_kernel_key.place_)) {
paddle/fluid/imperative/amp_auto_cast.cc
@@ -150,15 +150,6 @@ AmpOperators::AmpOperators()
   unsupported_bf16_ops_->insert(unsupported_ops_gpu_bf16.begin(),
                                 unsupported_ops_gpu_bf16.end());
 // NOTE: GPU/NPU/XPU/MLU is compiled seperatly.
-#elif defined(PADDLE_WITH_ASCEND_CL)
-  auto unsupported_ops_npu_fp16 = std::get<2>(
-      OpSupportedInfos("NPU", paddle::framework::proto::VarType::FP16));
-  unsupported_fp16_ops_->insert(unsupported_ops_npu_fp16.begin(),
-                                unsupported_ops_npu_fp16.end());
-  auto unsupported_ops_npu_bf16 = std::get<2>(
-      OpSupportedInfos("NPU", paddle::framework::proto::VarType::BF16));
-  unsupported_bf16_ops_->insert(unsupported_ops_npu_bf16.begin(),
-                                unsupported_ops_npu_bf16.end());
 #elif defined(PADDLE_WITH_XPU)
   auto unsupported_ops_xpu_fp16 = std::get<2>(
       OpSupportedInfos("XPU", paddle::framework::proto::VarType::FP16));
paddle/fluid/imperative/gradient_accumulator.cc
@@ -34,8 +34,6 @@
 #include "paddle/phi/backends/xpu/enforce_xpu.h"
 #include "xpu/refactor/math.h"
 #endif
-#ifdef PADDLE_WITH_ASCEND_CL
-#endif
 #ifdef PADDLE_WITH_MLU
 #include "paddle/fluid/operators/mlu/mlu_baseop.h"
 #endif
@@ -270,32 +268,6 @@ void TensorAdd(const VarType& src, VarType* dst) {
 #endif
   }

-#ifdef PADDLE_WITH_ASCEND_CL
-  if (platform::is_npu_place(place)) {
-    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
-    platform::DeviceContext* ctx = pool.Get(place);
-    auto dev_ctx = dynamic_cast<platform::NPUDeviceContext*>(ctx);
-    if (data_type == framework::DataTypeTrait<float>::DataType()) {
-      dst_tensor->mutable_data<float>(place);
-    } else if (data_type == framework::DataTypeTrait<double>::DataType()) {
-      dst_tensor->mutable_data<double>(place);
-    } else if (data_type ==
-               framework::DataTypeTrait<platform::float16>::DataType()) {
-      dst_tensor->mutable_data<platform::float16>(place);
-    } else {
-      PADDLE_THROW(platform::errors::Unimplemented(
-          "Gradient accumulation of data type (%s) on place (%s) is not "
-          "supported in imperative mode",
-          framework::DataTypeToString(data_type),
-          place));
-    }
-    const auto& runner = operators::NpuOpRunner(
-        "Add", {*dst_tensor, src_tensor}, {*dst_tensor}, {});
-    runner.Run(dev_ctx->stream());
-    return;
-  }
-#endif
 #ifdef PADDLE_WITH_XPU
   if (platform::is_xpu_place(place)) {
     if (data_type == framework::DataTypeTrait<float>::DataType()) {
paddle/fluid/imperative/hccl_context.h
@@ -12,62 +12,3 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #pragma once
-
-#ifdef PADDLE_WITH_ASCEND_CL
-#include <memory>
-#include <string>
-#include <vector>
-
-#include "paddle/fluid/imperative/parallel_context.h"
-
-namespace paddle {
-namespace framework {
-class Variable;
-}  // namespace framework
-}  // namespace paddle
-
-namespace paddle {
-namespace imperative {
-
-class HCCLParallelContext : public ParallelContext {
- public:
-  explicit HCCLParallelContext(const ParallelStrategy& strategy,
-                               const platform::Place& place)
-      : ParallelContext(strategy, place) {}
-
-  ~HCCLParallelContext() override = default;
-
-  void BcastHCCLId(const std::vector<HcclRootInfo>& hccl_ids,
-                   int root,  // NOLINT
-                   int server_fd);
-
-  void Init() override;
-
-  void InitWithRingID(int ring_id) override;
-
-  void AllReduceByStream(const framework::Variable& src,
-                         framework::Variable* dst,
-                         int ring_id,
-                         bool use_calc_stream) override;
-
-  void Broadcast(framework::Variable* src, int ring_id) override;
-
-  paddle::platform::DeviceContext* GetDeviceContext(int ring_id) override;
-
-  void WaitCompute(int ring_id) override;
-
-  void WaitComm(int ring_id) override;
-
-  void SynchronizeCompute() override;
-
- private:
-  // used for comm wait compute, compute_stream-->event-->comm_stream[ring_id]
-  std::vector<std::shared_ptr<platform::NpuStreamObject>> compute_events_;
-  // used for compute wait comm, comm_stream[ring_id]-->event-->compute_stream
-  std::vector<std::shared_ptr<platform::NpuEventObject>> comm_events_;
-};
-
-}  // namespace imperative
-}  // namespace paddle
-#endif
paddle/fluid/imperative/heter_ccl_context.cc
@@ -42,8 +42,6 @@ HeterParallelContext::HeterParallelContext(const ParallelStrategy &strategy,
       : ParallelContext(strategy, platform::CUDAPlace(device_id))
 #elif PADDLE_WITH_XPU_BKCL
       : ParallelContext(strategy, platform::XPUPlace(device_id))
-#elif PADDLE_WITH_ASCEND_CL
-      : ParallelContext(strategy, platform::NPUPlace(device_id))
 #else
       : ParallelContext(strategy, platform::CPUPlace())
 #endif
@@ -112,11 +110,6 @@ HeterParallelContext::HeterParallelContext(const ParallelStrategy &strategy,
     node_parallel_ctx_ =
         std::make_shared<BKCLParallelContext>(node_strategy_, node_place_);
 #endif
-
-#ifdef PADDLE_WITH_ASCEND_CL
-    node_place_ = platform::NPUPlace(device_id);
-    node_parallel_ctx_ =
-        std::make_shared<HCCLParallelContext>(node_strategy_, node_place_);
-#endif
   }

 void HeterParallelContext::Init() {
paddle/fluid/imperative/heter_ccl_context.h
@@ -24,11 +24,6 @@
 #ifdef PADDLE_WITH_XPU_BKCL
 #include "paddle/fluid/imperative/bkcl_context.h"
 #endif
-
-#ifdef PADDLE_WITH_ASCEND_CL
-#include "paddle/fluid/imperative/hccl_context.h"
-#endif
-
 #include "paddle/fluid/imperative/gloo_context.h"
 #include "paddle/fluid/imperative/parallel_context.h"
paddle/fluid/imperative/prepared_operator.cc
@@ -458,17 +458,6 @@ PreparedOp PrepareImpl(
     }
   }
 #endif
-
-#ifdef PADDLE_WITH_ASCEND_CL
-  if (kernel_iter == kernels.end() &&
-      paddle::platform::is_npu_place(fluid_kernel_type.place_)) {
-    VLOG(3) << "missing NPU kernel: " << op.Type()
-            << ", expected_kernel_key:" << fluid_kernel_type
-            << ", fallbacking to CPU one!";
-    fluid_kernel_type.place_ = platform::CPUPlace();
-    kernel_iter = kernels.find(fluid_kernel_type);
-  }
-#endif
 #ifdef PADDLE_WITH_IPU
   if (kernel_iter == kernels.end() &&
       paddle::platform::is_ipu_place(fluid_kernel_type.place_)) {
paddle/fluid/imperative/reducer.cc
@@ -31,7 +31,7 @@ namespace imperative {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
     defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
-    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL)
+    defined(PADDLE_WITH_CNCL)
 // div the nranks
 void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) {
   phi::DenseTensor *tensor =
@@ -305,17 +305,10 @@ void Group::ConcatTensors(const platform::DeviceContext &context) {
         "Please recompile or reinstall Paddle with BKCL support."));
 #endif
-  } else if (platform::is_npu_place(place)) {
-#ifdef PADDLE_WITH_ASCEND_CL
-    ConcatTensorsWithType(
-        static_cast<const platform::NPUDeviceContext &>(context),
-        dense_tensors_,
-        &dense_contents_,
-        dtype_);
-#else
-    PADDLE_THROW(platform::errors::PermissionDenied(
-        "Paddle can't concat npu grads since it's not compiled with HCCL,"
-        "Please recompile or reinstall Paddle with HCCL support."));
-#endif
   } else if (platform::is_mlu_place(place)) {
 #ifdef PADDLE_WITH_CNCL
     ConcatTensorsWithType(
@@ -365,17 +358,10 @@ void Group::SplitTensors(const platform::DeviceContext &context) {
         "Please recompile or reinstall Paddle with BKCL support."));
 #endif
-  } else if (platform::is_npu_place(place)) {
-#ifdef PADDLE_WITH_ASCEND_CL
-    SplitTensorsWithType(
-        static_cast<const platform::NPUDeviceContext &>(context),
-        &dense_contents_,
-        &dense_tensors_,
-        dtype_);
-#else
-    PADDLE_THROW(platform::errors::PermissionDenied(
-        "Paddle can't split npu grad since it's not compiled with HCCL,"
-        "Please recompile or reinstall Paddle with HCCL support."));
-#endif
   } else if (platform::is_mlu_place(place)) {
 #ifdef PADDLE_WITH_CNCL
     SplitTensorsWithType(
@@ -1129,9 +1115,8 @@ void Reducer::FinalizeBackward() {
   if (find_unused_vars_each_step_) {
 // TODO(liuyuhui) support xpu about Tensorcopy/TensorFromVector/TensorToVector
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \
-    defined(PADDLE_WITH_CNCL)
+    defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_CNCL)
     ProcessUnusedDenseVars();
 #endif
     // Initialize local used vars
paddle/fluid/imperative/reducer.h
@@ -46,7 +46,7 @@ namespace imperative {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
     defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
-    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_CNCL)
+    defined(PADDLE_WITH_CNCL)
 template <typename T>
 struct DivNRanksFunctor {
paddle/fluid/imperative/tracer.cc
@@ -135,15 +135,10 @@ paddle::framework::GarbageCollector* Tracer::MutableGarbageCollectorIfNotExists(
     gc.reset(new framework::CPUGarbageCollector(place, 0));
     VLOG(10) << "Created GarbageCollector at " << place;
-  } else if (platform::is_npu_place(place)) {
-#if defined(PADDLE_WITH_ASCEND_CL)
-    // TODO(zhiqiu): fix bugs and enable NPUDefaultStreamGarbageCollector.
-    gc.reset(new framework::NPUUnsafeFastGarbageCollector(place, 0));
-    VLOG(10) << "Created GarbageCollector at " << place;
-#else
-    PADDLE_THROW(platform::errors::PermissionDenied(
-        "Paddle can't use NPU device since it's not compiled with NPU,"
-        "Please recompile or reinstall Paddle with NPU support."));
-#endif
   } else if (platform::is_ipu_place(place)) {
 #if defined(PADDLE_WITH_IPU)
     gc.reset(new framework::IPUGarbageCollector(place, 0));
@@ -303,12 +298,8 @@ void Tracer::TraceOpImpl(const std::string& type,
         "PaddlePaddle should compile with XPU if use XPUPlace."));
 #endif
-  } else if (platform::is_npu_place(place)) {
-#ifdef PADDLE_WITH_ASCEND_CL
-    platform::SetNPUDeviceId(place.device);
-#else
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddlePaddle should compile with NPU if use NPUPlace."));
-#endif
   } else if (platform::is_mlu_place(place)) {
 #ifdef PADDLE_WITH_MLU
     platform::SetMLUDeviceId(place.device);
paddle/fluid/operators/amp/check_finite_and_unscale_op.cc
@@ -49,12 +49,6 @@ class CheckFiniteAndUnscaleOpMaker : public framework::OpProtoAndCheckerMaker {
   AddInput("Scale",
            "(Tensor) 1-dim tensor, the scale of check_finite_and_unscale "
            "operator.");
-#ifdef PADDLE_WITH_ASCEND_CL
-  AddInput("FloatStatus",
-           "(Tensor) 1-dim tensor of shape [8], allocated by "
-           "alloc_float_status op")
-      .AsDispensable();
-#endif
   AddOutput("Out",
             "(Tensors) The scaled output tensor of "
             "check_finite_and_unscale operator.")
paddle/fluid/operators/coalesce_tensor_op.cc
@@ -57,26 +57,7 @@ struct FillConstantVisitor {
   void apply(typename std::enable_if<!(std::is_same<T, int8_t>::value ||
                                        std::is_same<T, int16_t>::value)>::type
                  * = nullptr) const {
-#ifdef PADDLE_WITH_ASCEND_CL
-    if (platform::is_npu_place(dev_ctx_.GetPlace())) {
-      phi::DenseTensor tensor_tmp(framework::TransToPhiDataType(dtype_));
-      tensor_tmp.mutable_data<T>({1}, context_.GetPlace());
-      FillNpuTensorWithConstant<T>(&tensor_tmp, static_cast<T>(value_));
-      const auto &runner =
-          NpuOpRunner("FillD",
-                      {tensor_tmp},
-                      {*tensor_},
-                      {{"dims", phi::vectorize(tensor_->dims())}});
-      auto stream =
-          context_.template device_context<paddle::platform::NPUDeviceContext>()
-              .stream();
-      runner.Run(stream);
-    } else {
-      phi::funcs::SetConstant<DeviceContext, T> set_constant;
-      set_constant(dev_ctx_, tensor_, static_cast<T>(value_));
-    }
-#elif defined(PADDLE_WITH_MLU)
+#if defined(PADDLE_WITH_MLU)
     if (platform::is_mlu_place(context_.GetPlace())) {
       FillMLUTensorWithHostValue<T>(context_, static_cast<T>(value_), tensor_);
     } else {
@@ -235,12 +216,6 @@ class CoalesceTensorOpKernel : public framework::OpKernel<T> {
     // Init the continuous space
     size_t offset = 0;
     if (context.Attr<bool>("copy_data")) {
-#ifdef PADDLE_WITH_ASCEND_CL
-      framework::VisitDataType(
-          dtype,
-          FillConstantVisitor<DeviceContext>(
-              dev_ctx, fused_tensor, static_cast<float>(0.0), dtype, context));
-#endif
       for (size_t i = 0; i < in_var_names.size(); ++i) {
         size_t len = static_cast<size_t>(in_tensors[i]->numel());
         auto sub_tensor = fused_tensor->Slice(
@@ -534,25 +509,6 @@ REGISTER_OPERATOR(coalesce_tensor,
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;

-#if defined(PADDLE_WITH_ASCEND_CL)
-REGISTER_OP_CUDA_KERNEL(
-    coalesce_tensor,
-    ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext,
-                                plat::float16>,
-    ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, int>,
-    ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, float>,
-    ops::CoalesceTensorOpKernel<paddle::platform::NPUDeviceContext, double>);
-#endif
-
-#if defined(PADDLE_WITH_ASCEND_CL)
-REGISTER_OP_NPU_KERNEL(
-    coalesce_tensor,
-    ops::CoalesceTensorOpKernel<phi::CPUContext, int>,
-    ops::CoalesceTensorOpKernel<phi::CPUContext, float>,
-    ops::CoalesceTensorOpKernel<phi::CPUContext, plat::float16>,
-    ops::CoalesceTensorOpKernel<phi::CPUContext, double>);
-#endif
-
 #if defined(PADDLE_WITH_MLU)
 REGISTER_OP_MLU_KERNEL(
     coalesce_tensor,
paddle/fluid/operators/collective/c_allgather_op.cc
@@ -44,10 +44,6 @@ class CAllGatherOpMaker : public framework::OpProtoAndCheckerMaker {
   AddOutput("Out", "(Tensor) the allgather result");
   AddAttr<int>("ring_id", "(int default 0) communication ring id.")
       .SetDefault(0);
-#if defined(PADDLE_WITH_ASCEND_CL)
-  AddAttr<std::string>("tag", "(string default tag) tag for all gather.")
-      .SetDefault("tag");
-#endif
   AddAttr<bool>(
       "use_calc_stream",
       "(bool default false) eject CUDA operations to calculation stream.")
paddle/fluid/operators/collective/c_allgather_op_npu.cc
@@ -16,10 +16,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/c_allgather_op.h"

-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#endif
-
 namespace paddle {
 namespace operators {
@@ -27,51 +23,8 @@ template <typename T>
 class CAllGatherOpASCENDKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_ASCEND_CL)
-    auto in = ctx.Input<phi::DenseTensor>("X");
-    auto out = ctx.Output<phi::DenseTensor>("Out");
-    HcclDataType dtype =
-        platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype()));
-
-    int ring_id = ctx.Attr<int>("ring_id");
-    std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
-    auto place = ctx.GetPlace();
-    auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
-    int nranks = comm->nranks();
-
-    framework::DDim out_dims = in->dims();
-    out_dims[0] *= nranks;
-    out->mutable_data<T>(out_dims, place);
-
-    uint64_t send_numel = in->numel();
-    void *send_buff = reinterpret_cast<void *>(const_cast<T *>(in->data<T>()));
-    void *recv_buff = reinterpret_cast<void *>(out->data<T>());
-
-    aclrtStream stream = nullptr;
-    if (ctx.Attr<bool>("use_calc_stream")) {
-      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-      stream = static_cast<platform::NPUDeviceContext *>(dev_ctx)->stream();
-    } else {
-      stream = comm->stream();
-    }
-
-    VLOG(3) << "begin hccl allgather, parameter is: "
-            << ", group is " << group << ", ring_id is " << ring_id
-            << ", nranks is " << nranks;
-
-    PADDLE_ENFORCE_NPU_SUCCESS(
-        platform::dynload::HcclAllGather(send_buff,
-                                         recv_buff,
-                                         send_numel,
-                                         dtype,
-                                         comm->comm(),
-                                         reinterpret_cast<void *>(stream)));
-#else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "PaddlePaddle should compile with NPU."));
-#endif
   }
 };
paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
@@ -34,10 +34,6 @@ limitations under the License. */
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#endif
-
 namespace f = paddle::framework;
 namespace p = paddle::platform;
paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
@@ -34,10 +34,6 @@ limitations under the License. */
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#endif
-
 namespace f = paddle::framework;
 namespace p = paddle::platform;
paddle/fluid/operators/collective/c_allreduce_op.h
@@ -24,9 +24,8 @@ limitations under the License. */
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/phi/api/include/tensor.h"

 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL) || \
-    defined(PADDLE_WITH_CNCL)
+    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_CNCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #endif
@@ -44,17 +43,10 @@ limitations under the License. */
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
 #endif

-#if defined(PADDLE_WITH_ASCEND_CL)
-#endif
-
 #if defined(PADDLE_WITH_CNCL)
 #include "paddle/fluid/platform/device/mlu/cncl_helper.h"
 #endif

-#if defined(PADDLE_WITH_ASCEND_CL)
-DECLARE_bool(hccl_check_nan);
-#endif
-
 namespace paddle {
 namespace operators {
@@ -150,177 +142,12 @@ class CAllReduceOpCPUKernel : public framework::OpKernel<T> {
   template <typename T, typename DeviceContext>                \
   class op_name##CPUKernel : public CAllReduceOpCPUKernel<red_type, T> {};

-#if defined(PADDLE_WITH_ASCEND_CL)
-// return true if found_nan or return false;
-inline bool ContainsNan(const paddle::platform::NPUDeviceContext& dev_ctx,
-                        aclrtStream stream,
-                        const phi::DenseTensor* in) {
-  phi::DenseTensor out(in->type());
-
-  phi::DenseTensor mean(in->type());
-  mean.Resize({1});
-  mean.mutable_data<float>(dev_ctx.GetPlace());
-  std::vector<int> axes;
-  for (int i = 0; i < in->dims().size(); ++i) {
-    axes.push_back(i);
-  }
-
-  std::vector<float> vec;
-  try {
-    const auto& runner_mean = paddle::operators::NpuOpRunner(
-        "ReduceMeanD", {*in}, {mean}, {{"axes", axes}, {"keep_dims", false}});
-    paddle::framework::TensorToVector(mean, dev_ctx, &vec);
-  } catch (...) {
-    LOG(WARNING) << "ContainsNan catch exception";
-    return true;
-  }
-
-  VLOG(4) << "reducemeand result:" << vec[0];
-  if (std::isnan(static_cast<float>(vec[0]))) {
-    LOG(WARNING) << "ContainsNan detects nan";
-    return true;
-  }
-
-  if (std::isinf(static_cast<float>(vec[0]))) {
-    LOG(WARNING) << "ContainsNan detects inf";
-  }
-
-  return false;
-}
-#endif
-
 template <ReduceType red_type, typename T>
 class CAllReduceOpASCENDKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_ASCEND_CL)
-    if (ctx.HasInput("Cond")) {
-      auto cond = ctx.Input<phi::DenseTensor>("Cond");
-      auto place = cond->place();
-      PADDLE_ENFORCE_EQ(platform::is_cpu_place(place),
-                        true,
-                        platform::errors::PreconditionNotMet(
-                            "The input `cond` tensor should be on cpu place"));
-      PADDLE_ENFORCE_EQ(cond->numel(),
-                        1,
-                        platform::errors::PreconditionNotMet(
-                            "The input `cond` should be shape [1]"));
-      if (!cond->data<bool>()[0]) {
-        VLOG(4) << "Skip all reduce Op since cond is 0";
-        return;
-      }
-    }
-
-    auto in = ctx.Input<phi::DenseTensor>("X");
-    auto out = ctx.Output<phi::DenseTensor>("Out");
-    auto place = ctx.GetPlace();
-    HcclDataType dtype =
-        platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype()));
-    int64_t numel = in->numel();
-
-    void* sendbuff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
-    out->mutable_data<T>(in->dims(), ctx.GetPlace());
-    void* recvbuff = reinterpret_cast<void*>(out->data<T>());
-
-    int ring_id = ctx.Attr<int>("ring_id");
-    std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
-    auto comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
-
-    aclrtStream stream = nullptr;
-    auto dev_ctx = static_cast<platform::NPUDeviceContext*>(
-        platform::DeviceContextPool::Instance().Get(place));
-    if (ctx.Attr<bool>("use_calc_stream")) {
-      stream = dev_ctx->stream();
-    } else {
-      stream = comm->stream();
-    }
-
-    HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM;
-    switch (red_type) {
-      case kRedSum:
-        hccl_red_type = HCCL_REDUCE_SUM;
-        break;
-      case kRedMax:
-        hccl_red_type = HCCL_REDUCE_MAX;
-        break;
-      case kRedMin:
-        hccl_red_type = HCCL_REDUCE_MIN;
-        break;
-      case kRedProd:
-        hccl_red_type = HCCL_REDUCE_PROD;
-        break;
-      default:
-        PADDLE_THROW(platform::errors::InvalidArgument(
-            "Invalid reduce type: %d", red_type));
-    }
-
-    VLOG(3) << "hccl allreduce, parameter is: "
-            << "input num: " << in->dims() << "dtype: " << dtype
-            << "hccl_red_type: " << hccl_red_type << ", group is: " << group
-            << ", sendbuff:" << sendbuff << ", recvbuff:" << recvbuff
-            << ", out_size:" << out->memory_size()
-            << ", use_calc_stream:" << ctx.Attr<bool>("use_calc_stream")
-            << ", stream:" << stream;
-
-    phi::DenseTensor tmp;
-    tmp.mutable_data<float>({8}, ctx.GetPlace());
-
-    bool found_nan = false;
-
-    auto d_type = framework::TransToProtoVarType(in->dtype());
-    switch (d_type) {
-      case framework::proto::VarType::FP16: {
-        break;
-      }
-      case framework::proto::VarType::FP32: {
-        if (FLAGS_hccl_check_nan) {
-          VLOG(3) << "prepare to FoundNanInf";
-          // NOTE: performance relating, DO NOT REMOVE!
-          ContainsNan(*dev_ctx, dev_ctx->stream(), in);
-        }
-        break;
-      }
-      default:
-        break;
-    }
-
-    if (found_nan) {
-      T inf = static_cast<T>(std::numeric_limits<float>::infinity());
-      VLOG(4) << "fill input data constant inf";
-      auto dims = in->dims();
-      auto mutable_in = const_cast<phi::DenseTensor*>(in);
-      FillNpuTensorWithConstant<T>(mutable_in, inf);
-      mutable_in->Resize(dims);
-    }
-
-    VLOG(3) << "hccl allreduce, parameter is: "
-            << "input num: " << numel << "dtype: " << dtype
-            << "hccl_red_type: " << hccl_red_type << ", group is: " << group
-            << ", sendbuff:" << sendbuff << ", recvbuff:" << recvbuff
-            << ", out_size:" << out->memory_size();
-
-    PADDLE_ENFORCE_NPU_SUCCESS(
-        platform::dynload::HcclAllReduce(sendbuff,
-                                         recvbuff,
-                                         numel,
-                                         dtype,
-                                         hccl_red_type,
-                                         comm->comm(),
-                                         reinterpret_cast<void*>(stream)));
-
-    out->Resize(in->dims());
-#else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "PaddlePaddle should compile with NPU."));
-#endif
   }
 };
@@ -616,10 +443,6 @@ class CAllReduceOpMaker : public framework::OpProtoAndCheckerMaker {
   AddOutput("Out", "(Tensor) the allreduced result.");
   AddAttr<int>("ring_id", "(int default 0) communication ring id.")
       .SetDefault(0);
-#if defined(PADDLE_WITH_ASCEND_CL)
-  AddAttr<std::string>("tag", "(string default tag) tag for all reduce.")
-      .SetDefault("tag");
-#endif
   AddAttr<bool>(
       "use_calc_stream",
       "(bool default false) eject CUDA operations to calculation stream.")
paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
@@ -31,10 +31,6 @@ limitations under the License. */
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#endif
-
 // Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1
 // DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
 // Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0
paddle/fluid/operators/collective/c_broadcast_op.cc
@@ -42,10 +42,7 @@ class CBroadcastOpMaker : public framework::OpProtoAndCheckerMaker {
       .SetDefault(0);
   AddAttr<int>("root", "(int default 0) root id for broadcasting.")
       .SetDefault(0);
-#if defined(PADDLE_WITH_ASCEND_CL)
-  AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
-      .SetDefault("tag");
-#endif
   AddAttr<bool>(
       "use_calc_stream",
       "(bool default false) eject CUDA operations to calculation stream.")
paddle/fluid/operators/collective/c_broadcast_op_npu.cc
@@ -14,10 +14,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/c_broadcast_op.h"

-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#endif
-
 namespace paddle {
 namespace operators {
@@ -25,58 +21,8 @@ template <typename T>
 class CBroadcastOpASCENDKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_ASCEND_CL)
-    auto x = ctx.Input<phi::DenseTensor>("X");
-    void* ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
-    int numel = x->numel();
-    HcclDataType dtype =
-        platform::ToHCCLDataType(framework::TransToProtoVarType(x->dtype()));
-
-    auto out = ctx.Output<phi::DenseTensor>("Out");
-
-    int ring_id = ctx.Attr<int>("ring_id");
-    auto place = ctx.GetPlace();
-    auto comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
-
-    aclrtStream stream = nullptr;
-    auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-    if (ctx.Attr<bool>("use_calc_stream")) {
-      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
-    } else {
-      stream = comm->stream();
-    }
-
-    int root = ctx.Attr<int>("root");
-    std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
-
-    VLOG(3) << "begin hccl broadcast, parameter is: "
-            << "root " << root << ", group is " << group
-            << ", comm: " << comm->comm() << ", stream: " << stream;
-
-    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
-        ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
-
-    VLOG(3) << "rank " << comm->rank() << " invoke Bcast. received "
-            << phi::product(out->dims());
-
-    dev_ctx->Wait();
-
-    if (out != x) {
-      framework::TensorCopy(
-          *static_cast<const phi::DenseTensor*>(x),
-          place,
-          *platform::DeviceContextPool::Instance().Get(place),
-          static_cast<phi::DenseTensor*>(out));
-    }
-    dev_ctx->Wait();
-
-    out->Resize(x->dims());
-    out->set_lod(x->lod());
-#else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "PaddlePaddle should compile with NPU."));
-#endif
   }
 };
paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
@@ -31,10 +31,6 @@ limitations under the License. */
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#endif
-
 namespace f = paddle::framework;
 namespace p = paddle::platform;
paddle/fluid/operators/collective/c_comm_init_hccl_op.cc
@@ -21,11 +21,6 @@ namespace framework {
 class Scope;
 }  // namespace framework
 }  // namespace paddle
-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "hccl/hccl.h"
-#include "hccl/hccl_types.h"
-#include "paddle/fluid/platform/collective_helper.h"
-#endif

 namespace paddle {
 namespace operators {
@@ -48,52 +43,9 @@ class CCommInitOpAscend : public framework::OperatorBase {
     auto var = scope.FindVar(Input("X"));
     PADDLE_ENFORCE_NOT_NULL(
         var, platform::errors::InvalidArgument("Input con not be empty."));
-#if defined(PADDLE_WITH_ASCEND_CL)
-    HcclRootInfo* hccl_id = var->GetMutable<HcclRootInfo>();
-
-    int rank_ids = Attr<int>("rank_ids");
-    int rank_id = Attr<int>("rank");
-    int rid = Attr<int>("ring_id");
-    int device_id = place.device;
-    if (Attr<int>("device_id") >= 0) {
-      device_id = Attr<int>("device_id");
-    }
-    platform::HCCLCommContext::Instance().CreateHCCLComm(
-        hccl_id, rank_ids, rank_id, device_id, rid);
-
-    // Build comm
-    float* buff;
-    int32_t size = 20;
-    std::vector<float> input(size, 0);
-    for (int32_t idx = 0; idx < size; idx++) {
-      input[idx] = 1.0;
-    }
-    PADDLE_ENFORCE_NPU_SUCCESS(platform::RecordedNPUMalloc(
-        reinterpret_cast<void**>(&buff), size * sizeof(float), device_id));
-    platform::NPUMemcpySync(reinterpret_cast<void*>(buff),
-                            input.data(),
-                            size * sizeof(float),
-                            ACL_MEMCPY_HOST_TO_DEVICE,
-                            size * sizeof(float));
-    VLOG(3) << "Build buff data successful.";
-
-    aclrtStream stream = nullptr;
-    auto comm = paddle::platform::HCCLCommContext::Instance().Get(rid, place);
-    if (rank_id == 0) {
-      stream = comm->stream();
-    } else {
-      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
-    }
-    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
-        buff, size, HCCL_DATA_TYPE_FP32, 0, comm->comm(), stream));
-    // Synchronize stream to find hccl error in time.
-    platform::NPUStreamSync(stream);
-    VLOG(3) << "Build connection successful.";
-#else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "PaddlePaddle should compile with NPU."));
-#endif
   }
 };
paddle/fluid/operators/collective/c_gen_hccl_id_op.cc
@@ -27,83 +27,6 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-#ifdef PADDLE_WITH_ASCEND_CL
-
-static void GenHCCLID(std::vector<HcclRootInfo>* hccl_ids) {
-  constexpr int timeout = 2 * 60 + 10;  // 2MSL+10s
-  constexpr int retry_time = 1;
-  for (size_t i = 0; i < hccl_ids->size(); ++i) {
-    bool failed = true;
-    for (auto retry_times = 0; retry_times * retry_time < timeout;
-         ++retry_times) {
-      auto err = platform::dynload::HcclGetRootInfo(&(*hccl_ids)[i]);
-      if (err == 0) {
-        failed = false;
-        break;
-      }
-      std::this_thread::sleep_for(std::chrono::seconds(retry_time));
-      LOG(WARNING) << "HcclGetRootInfo failed, err is: " << err << ", retry "
-                   << retry_times << " times";
-    }
-    if (failed) {
-      PADDLE_THROW(platform::errors::External("HcclGetRootInfo failed!"));
-    }
-  }
-}
-
-static void CopyHCCLIDToVar(const std::vector<HcclRootInfo>& hccl_ids,
-                            std::function<std::string(size_t)> func,
-                            const framework::Scope& scope) {
-  for (size_t i = 0; i < hccl_ids.size(); ++i) {
-    std::string var_name = func(i);
-    auto var = scope.FindVar(var_name);
-    PADDLE_ENFORCE_NOT_NULL(
-        var,
-        platform::errors::NotFound("Variable with name %s is not found",
-                                   var_name.c_str()));
-    auto hccl_id = var->GetMutable<HcclRootInfo>();
-    memcpy(hccl_id, &hccl_ids[i], sizeof(HcclRootInfo));
-  }
-}
-
-class CGenHCCLIdOp : public framework::OperatorBase {
- public:
-  CGenHCCLIdOp(const std::string& type,
-               const framework::VariableNameMap& inputs,
-               const framework::VariableNameMap& outputs,
-               const framework::AttributeMap& attrs)
-      : OperatorBase(type, inputs, outputs, attrs) {}
-
-  void RunImpl(const framework::Scope& scope,
-               const platform::Place& dev_place) const override {
-    int rank = Attr<int>("rank");
-    int ring_id = Attr<int>("ring_id");
-
-    std::function<std::string(size_t)> func = [&](size_t i) -> std::string {
-      return Output("Out");
-    };
-
-    std::string endpoint = Attr<std::string>("endpoint");
-    int server_fd = platform::SocketServer::GetInstance(endpoint).socket();
-
-    std::vector<HcclRootInfo> hccl_ids;
-    hccl_ids.resize(1);
-
-    if (rank == 0) {
-      GenHCCLID(&hccl_ids);
-      std::vector<std::string> endpoint_list =
-          Attr<std::vector<std::string>>("other_endpoints");
-      platform::SendBroadCastCommID(endpoint_list, &hccl_ids, ring_id);
-    } else {
-      platform::RecvBroadCastCommID(server_fd, endpoint, &hccl_ids, ring_id);
-    }
-
-    CopyHCCLIDToVar(hccl_ids, func, scope);
-  }
-};
-
-#else
-
 class CGenHCCLIdOp : public framework::OperatorBase {
  public:
   CGenHCCLIdOp(const std::string& type,
@@ -116,8 +39,6 @@ class CGenHCCLIdOp : public framework::OperatorBase {
                const platform::Place& dev_place) const override {}
 };

-#endif
-
 class CGenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
  public:
   void Make() override {
paddle/fluid/operators/collective/c_reduce_op.h
@@ -26,7 +26,7 @@ limitations under the License. */
 #include "paddle/phi/core/ddim.h"

 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
+    defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #endif
@@ -44,9 +44,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/fleet/gloo_wrapper.h"
 #endif

-#if defined(PADDLE_WITH_ASCEND_CL)
-#endif
-
 #if defined(PADDLE_WITH_CNCL)
 #include "paddle/fluid/platform/device/mlu/cncl_helper.h"
 #endif
@@ -134,86 +131,8 @@ template <ReduceType red_type, typename T>
 class CReduceOpASCENDKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_ASCEND_CL)
-    auto in = ctx.Input<phi::DenseTensor>("X");
-    auto out = ctx.Output<phi::DenseTensor>("Out");
-    auto place = ctx.GetPlace();
-    HcclDataType dtype =
-        platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype()));
-    int64_t numel = in->numel();
-
-    void* sendbuff = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
-    void* recvbuff = reinterpret_cast<void*>(out->data<T>());
-
-    int ring_id = ctx.Attr<int>("ring_id");
-    int root_id = ctx.Attr<int>("root_id");
-    std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
-    auto comm = paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);
-
-    aclrtStream stream = nullptr;
-    auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-    if (ctx.Attr<bool>("use_calc_stream")) {
-      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
-    } else {
-      stream = comm->stream();
-    }
-
-    int rank_id = comm->rank();
-
-    HcclReduceOp hccl_red_type = HCCL_REDUCE_SUM;
-    switch (red_type) {
-      case kRedSum:
-        hccl_red_type = HCCL_REDUCE_SUM;
-        break;
-      case kRedMax:
-        hccl_red_type = HCCL_REDUCE_MAX;
-        break;
-      case kRedMin:
-        hccl_red_type = HCCL_REDUCE_MIN;
-        break;
-      case kRedProd:
-        hccl_red_type = HCCL_REDUCE_PROD;
-        break;
-      default:
-        PADDLE_THROW(platform::errors::InvalidArgument(
-            "Invalid reduce type: %d", red_type));
-    }
-
-    VLOG(3) << "begin hccl reduce, parameter is: "
-            << "input num: " << numel << "root_id: " << root_id
-            << "dtype: " << dtype << "hccl_red_type: " << hccl_red_type
-            << ", group is: " << group;
-
-    PADDLE_ENFORCE_NPU_SUCCESS(
-        platform::dynload::HcclAllReduce(sendbuff,
-                                         recvbuff,
-                                         numel,
-                                         dtype,
-                                         hccl_red_type,
-                                         comm->comm(),
-                                         reinterpret_cast<void*>(stream)));
-
-    if (rank_id != root_id) {
-      auto npu_place = place;
-      memory::Copy(npu_place,
-                   reinterpret_cast<void*>(out->data<T>()),
-                   npu_place,
-                   reinterpret_cast<void*>(const_cast<T*>(in->data<T>())),
-                   numel * sizeof(T),
-                   stream);
-    }
-
-    out->Resize(in->dims());
-#else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "PaddlePaddle should compile with NPU."));
-#endif
   }
 };
@@ -433,10 +352,7 @@ class CReduceOpMaker : public framework::OpProtoAndCheckerMaker {
   AddOutput("Out", "(Tensor) the reduced result.");
   AddAttr<int>("ring_id", "(int default 0) communication ring id.")
       .SetDefault(0);
-#if defined(PADDLE_WITH_ASCEND_CL)
-  AddAttr<std::string>("tag", "(string default tag) tag for reduce.")
-      .SetDefault("tag");
-#endif
   AddAttr<int>("root_id", "(int default 0) root id.").SetDefault(0);
   AddAttr<bool>(
       "use_calc_stream",
paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
@@ -31,10 +31,6 @@ limitations under the License. */
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#endif
-
 namespace f = paddle::framework;
 namespace p = paddle::platform;
paddle/fluid/operators/collective/c_reducescatter_op.cc
@@ -50,10 +50,7 @@ class CReduceScatterOpMaker : public framework::OpProtoAndCheckerMaker {
   AddAttr<int>("nranks",
                "Total trainer count of the distributed training job")
       .SetDefault(1);
-#if defined(PADDLE_WITH_ASCEND_CL)
-  AddAttr<std::string>("tag", "(string default tag) tag for reduce scatter.")
-      .SetDefault("tag");
-#endif
   AddAttr<bool>(
       "use_calc_stream",
       "(bool default false) eject CUDA operations to calculation stream.")
paddle/fluid/operators/collective/c_reducescatter_op_npu.cc
@@ -14,10 +14,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/c_reducescatter_op.h"

-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#endif
-
 namespace paddle {
 namespace operators {
@@ -25,59 +21,8 @@ template <typename T>
 class CReduceScatterOpAscendKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
-#if defined(PADDLE_WITH_ASCEND_CL)
-    auto in = ctx.Input<phi::DenseTensor>("X");
-    auto out = ctx.Output<phi::DenseTensor>("Out");
-
-    int ring_id = ctx.Attr<int>("ring_id");
-    std::string group = std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
-    auto place = ctx.GetPlace();
-    auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
-    int nranks = comm->nranks();
-
-    auto out_dims = in->dims();
-    PADDLE_ENFORCE_EQ(out_dims[0] % nranks,
-                      0,
-                      platform::errors::InvalidArgument(
-                          "The input tensor X's "
-                          "dim[0] (%d) should be divisible by nranks(%d)",
-                          out_dims[0],
-                          nranks));
-
-    out_dims[0] = out_dims[0] / nranks;
-    out->mutable_data<T>(out_dims, place);
-
-    uint64_t recv_numel = in->numel() / nranks;
-
-    void* inputPtr = reinterpret_cast<void*>(const_cast<T*>(in->data<T>()));
-    void* outputPtr = reinterpret_cast<void*>(out->data<T>());
-    HcclDataType dtype =
-        platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype()));
-
-    aclrtStream stream = nullptr;
-    if (ctx.Attr<bool>("use_calc_stream")) {
-      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
-      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
-    } else {
-      stream = comm->stream();
-    }
-
-    VLOG(3) << "begin hccl reduce scatter, parameter is: "
-            << "recv_numel: " << recv_numel << "dtype: " << dtype
-            << "hccl_red_type: " << HCCL_REDUCE_SUM << ", group is: " << group;
-
-    PADDLE_ENFORCE_NPU_SUCCESS(
-        platform::dynload::HcclReduceScatter(inputPtr,
-                                             outputPtr,
-                                             recv_numel,
-                                             dtype,
-                                             HCCL_REDUCE_SUM,
-                                             comm->comm(),
-                                             reinterpret_cast<void*>(stream)));
-#else
     PADDLE_THROW(platform::errors::PreconditionNotMet(
         "PaddlePaddle should compile with NPU."));
-#endif
   }
 };
paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
@@ -34,10 +34,6 @@ limitations under the License. */
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#endif
-
 namespace f = paddle::framework;
 namespace p = paddle::platform;
paddle/fluid/operators/collective/c_sync_calc_stream_op.h
@@ -47,17 +47,6 @@ class CSyncCalcStreamKernel : public framework::OpKernel<T> {
     platform::GpuStreamSync(dev_ctx->stream());

-#elif defined(PADDLE_WITH_ASCEND_CL) && !defined(_WIN32)
-    auto place = ctx.GetPlace();
-    PADDLE_ENFORCE_EQ(platform::is_npu_place(place),
-                      true,
-                      platform::errors::PreconditionNotMet(
-                          "Sync stream op can run on npu place only for now."));
-
-    auto dev_ctx = static_cast<platform::NPUDeviceContext*>(
-        platform::DeviceContextPool::Instance().Get(place));
-    platform::NPUStreamSync(dev_ctx->stream());
-
 #elif defined(PADDLE_WITH_CNCL)
     auto place = ctx.GetPlace();
     PADDLE_ENFORCE_EQ(platform::is_mlu_place(place),
paddle/fluid/operators/collective/c_sync_comm_stream_op.h
@@ -26,7 +26,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/device/mlu/cncl_helper.h"
 #endif

-#if defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
+#if defined(PADDLE_WITH_XPU_BKCL)
 #include "paddle/fluid/platform/collective_helper.h"
 #endif
@@ -45,19 +45,6 @@ class CSyncCommStreamKernel : public framework::OpKernel<T> {
     platform::GpuStreamSync(stream);

-#elif defined(PADDLE_WITH_ASCEND_CL)
-    auto place = ctx.GetPlace();
-    PADDLE_ENFORCE_EQ(platform::is_npu_place(place),
-                      true,
-                      platform::errors::PreconditionNotMet(
-                          "Sync comm stream op can run on npu place only for "
-                          "now, but we got %s, please check the environment.",
-                          place.DebugString()));
-
-    int ring_id = ctx.Attr<int>("ring_id");
-    auto stream =
-        platform::HCCLCommContext::Instance().Get(ring_id, place)->stream();
-    platform::NPUStreamSync(stream);
-
 #elif defined(PADDLE_WITH_CNCL)
     auto place = ctx.GetPlace();
     PADDLE_ENFORCE_EQ(platform::is_mlu_place(place),
paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
@@ -31,10 +31,6 @@ limitations under the License. */
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#endif
-
 namespace f = paddle::framework;
 namespace p = paddle::platform;
paddle/fluid/operators/collective/checknumeric_npu_test.cc
@@ -32,10 +32,6 @@ limitations under the License. */
 #include "paddle/fluid/string/printf.h"
 #include "paddle/phi/kernels/funcs/math_function.h"

-#if defined(PADDLE_WITH_ASCEND_CL)
-#include "paddle/fluid/platform/collective_helper.h"
-#endif
-
 namespace f = paddle::framework;
 namespace p = paddle::platform;
paddle/fluid/operators/collective/gen_hccl_id_op.cc
浏览文件 @
80dd1672
...
...
@@ -30,144 +30,6 @@ limitations under the License. */
namespace
paddle
{
namespace
operators
{
#ifdef PADDLE_WITH_ASCEND_CL
class
GenHCCLIdOp
:
public
framework
::
OperatorBase
{
public:
GenHCCLIdOp
(
const
std
::
string
&
type
,
const
             framework::VariableNameMap& inputs,
             const framework::VariableNameMap& outputs,
             const framework::AttributeMap& attrs)
      : OperatorBase(type, inputs, outputs, attrs) {}

  void RunImpl(const framework::Scope& scope,
               const platform::Place& dev_place) const override {
    std::vector<std::string> trainers =
        Attr<std::vector<std::string>>("trainers");
    int trainer_id = Attr<int>("trainer_id");
    std::string endpoint = trainers[trainer_id];

    PADDLE_ENFORCE_GE(trainer_id,
                      0,
                      platform::errors::InvalidArgument(
                          "trainer_id %d is less than 0. Its "
                          "valid range is [0, trainer_size)"));
    PADDLE_ENFORCE_LT(trainer_id,
                      static_cast<int>(trainers.size()),
                      platform::errors::OutOfRange(
                          "trainer_id %d is out of range. Its valid "
                          "range is [0, trainer_size)",
                          trainer_id));

    int hccl_comm_num = Attr<int>("hccl_comm_num");
    int use_hierarchical_allreduce = Attr<bool>("use_hierarchical_allreduce");
    int inter_nranks = Attr<int>("hierarchical_allreduce_inter_nranks");
    int inter_trainer_id = -1;
    int exter_trainer_id = -1;

    if (use_hierarchical_allreduce) {
      PADDLE_ENFORCE_GT(
          trainers.size(),
          1,
          platform::errors::PreconditionNotMet(
              "The number of collective trainers %llu <= 1",
              trainers.size()));
      PADDLE_ENFORCE_GT(
          inter_nranks,
          1,
          platform::errors::PreconditionNotMet(
              "inter_nranks %d <= 1 while in hierarchical allreduce mode",
              inter_nranks));
      PADDLE_ENFORCE_EQ(
          trainers.size() % inter_nranks,
          0,
          platform::errors::PreconditionNotMet(
              "The number of trainers %llu mod inter_nranks %d is not equal 0",
              trainers.size(),
              inter_nranks));

      inter_trainer_id = trainer_id % inter_nranks;
      if (trainer_id % inter_nranks == 0) {
        exter_trainer_id = trainer_id / inter_nranks;
      }
    }

    std::ostringstream ss;
    for (size_t i = 0; i < trainers.size(); i++) {
      ss << trainers[i] << ",";
    }

    VLOG(1) << "trainer_id:" << trainer_id
            << ", use_hierarchical_allreduce:" << use_hierarchical_allreduce
            << ", hccl_comm_num:" << hccl_comm_num
            << ", inter_nranks:" << inter_nranks
            << ", inter_trainer_id:" << inter_trainer_id
            << ", exter_trainer_id:" << exter_trainer_id
            << ", trainers:" << ss.str();

    int server_fd = -1;

    /// 1. init flat
    std::function<std::string(size_t)> func = platform::GetFlatHCCLVarName;
    if (trainer_id == 0) {
      // server endpoints
      std::vector<std::string> flat_endpoints;
      flat_endpoints.insert(
          flat_endpoints.begin(), trainers.begin() + 1, trainers.end());
      SendBroadCastHCCLID(flat_endpoints, hccl_comm_num, func, scope);
    } else {
      server_fd = CreateListenSocket(endpoint);
      RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope);
    }

    /// 2. hierarchical inter ncclid
    func = platform::GetHierarchicalInterHCCLVarName;
    if (inter_trainer_id == 0) {
      std::ostringstream ss;
      ss << endpoint;
      std::vector<std::string> inter_endpoints;
      for (int i = trainer_id + 1;
           i < trainer_id + inter_nranks &&
           i < static_cast<int>(trainers.size());
           i++) {
        ss << ",";
        inter_endpoints.push_back(trainers[i]);
        ss << trainers[i];
      }
      VLOG(1) << "Hierarchical inter ring endpoints:" << ss.str();
      SendBroadCastHCCLID(inter_endpoints, hccl_comm_num, func, scope);
    } else if (inter_trainer_id > 0) {
      VLOG(1) << "Hierarchical inter ring";
      RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope);
    }

    /// 3. hierarchical exter ncclid
    func = platform::GetHierarchicalExterHCCLVarName;
    if (exter_trainer_id == 0) {
      std::ostringstream ss;
      std::vector<std::string> exter_endpoints;
      ss << endpoint;
      for (size_t i = inter_nranks; i < trainers.size(); i += inter_nranks) {
        ss << ",";
        exter_endpoints.push_back(trainers[i]);
        ss << trainers[i];
      }
      VLOG(1) << "Hierarchical exter ring endpoints:" << ss.str();
      SendBroadCastHCCLID(exter_endpoints, hccl_comm_num, func, scope);
    } else if (exter_trainer_id > 0) {
      VLOG(1) << "Hierarchical exter ring";
      RecvBroadCastHCCLID(server_fd, endpoint, hccl_comm_num, func, scope);
    }

    // close socket server
    if (trainer_id != 0) {
      CloseSocket(server_fd);
    }
  }
};
#else
class GenHCCLIdOp : public framework::OperatorBase {
 public:
  GenHCCLIdOp(const std::string& type,
...
@@ -180,8 +42,6 @@ class GenHCCLIdOp : public framework::OperatorBase {
               const platform::Place& dev_place) const override {}
};
#endif

class GenHCCLIdOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
...
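The removed RunImpl above derives each trainer's position in the two-level topology purely from integer arithmetic: every trainer joins an inner ring of inter_nranks neighbors, and the head of each inner ring additionally joins the outer ring. A minimal standalone sketch of that mapping, using made-up values (8 trainers, inner rings of 4) rather than anything read from the op's attributes:

#include <iostream>

int main() {
  // Assumed example sizes; RunImpl enforces trainers.size() % inter_nranks == 0.
  const int num_trainers = 8;
  const int inter_nranks = 4;
  for (int trainer_id = 0; trainer_id < num_trainers; ++trainer_id) {
    // Position inside the inner (intra-group) ring.
    int inter_trainer_id = trainer_id % inter_nranks;
    // Only the head of each inner ring participates in the outer ring.
    int exter_trainer_id =
        (trainer_id % inter_nranks == 0) ? trainer_id / inter_nranks : -1;
    std::cout << "trainer " << trainer_id << ": inter=" << inter_trainer_id
              << " exter=" << exter_trainer_id << "\n";
  }
  return 0;
}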
paddle/fluid/operators/collective/gen_hccl_id_op_helper.cc
...
@@ -30,10 +30,6 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/split.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
DECLARE_int32(get_host_by_name_time);
namespace paddle {
...
paddle/fluid/operators/collective/mp_allreduce_sum_op.cc
...
@@ -42,10 +42,7 @@ class MpAllReduceSumOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Out", "(Tensor) the allreduced result in model parallel.");
    AddAttr<int>("ring_id", "(int default 0) communication ring id.")
        .SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
    AddAttr<std::string>("tag", "(string default tag) tag for all reduce.")
        .SetDefault("tag");
#endif
    AddAttr<bool>(
        "use_calc_stream",
        "(bool default false) eject CUDA operations to calculation stream.")
...
paddle/fluid/operators/collective/partial_allgather_op.cc
...
@@ -50,10 +50,7 @@ class PartialAllGatherOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Out", "(Tensor) the allgather result");
    AddAttr<int>("ring_id", "(int default 0) communication ring id.")
        .SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
    AddAttr<std::string>("tag", "(string default tag) tag for all gather.")
        .SetDefault("tag");
#endif
    AddAttr<bool>(
        "use_calc_stream",
        "(bool default false) eject CUDA operations to calculation stream.")
...
paddle/fluid/operators/collective/partial_allgather_op_npu.cc
...
@@ -24,67 +24,8 @@ template <typename T>
class CallPartialGatherOpASCENDKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
    auto in = ctx.Input<phi::DenseTensor>("X");
    auto out = ctx.Output<phi::DenseTensor>("Out");
    int64_t numel = in->numel();
    HcclDataType dtype =
        platform::ToHCCLDataType(framework::TransToProtoVarType(in->dtype()));

    int rank = ctx.Attr<int>("rank");
    int ring_id = ctx.Attr<int>("ring_id");
    std::string group =
        std::string(HCOM_GROUP_PREFIX) + std::to_string(ring_id);
    auto place = ctx.GetPlace();
    auto comm = platform::HCCLCommContext::Instance().Get(ring_id, place);
    int nranks = comm->nranks();

    PADDLE_ENFORCE_EQ(rank,
                      comm->rank(),
                      platform::errors::InvalidArgument(
                          "rank: %s should equal to %s", rank, comm->rank()));
    PADDLE_ENFORCE_EQ(
        (numel % nranks),
        0,
        platform::errors::InvalidArgument(
            "The input numel (%d) must be divisible by nranks(%d)",
            numel,
            nranks));

    framework::DDim dims = in->dims();
    out->mutable_data<T>(dims, place);

    int64_t send_numel = numel / nranks;
    int offset = send_numel * rank;
    void* send_buff =
        reinterpret_cast<void*>(const_cast<T*>(in->data<T>()) + offset);
    void* recv_buff = reinterpret_cast<void*>(out->data<T>());

    aclrtStream stream = nullptr;
    if (ctx.Attr<bool>("use_calc_stream")) {
      auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
    } else {
      stream = comm->stream();
    }

    VLOG(3) << "begin hccl allgather, parameter is: "
            << ", group is " << group << ", ring_id is " << ring_id
            << ", nranks is " << nranks << ", rankid is " << rank;

    PADDLE_ENFORCE_NPU_SUCCESS(
        platform::dynload::HcclAllGather(send_buff,
                                         recv_buff,
                                         send_numel,
                                         dtype,
                                         comm->comm(),
                                         reinterpret_cast<void*>(stream)));
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with NPU."));
#endif
  }
};
...
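The deleted kernel partitions the flattened input evenly before gathering: each rank contributes send_numel = numel / nranks elements starting at offset send_numel * rank. A self-contained sketch of just that offset arithmetic, with illustrative sizes rather than real tensor data:

#include <cstdint>
#include <iostream>

int main() {
  // Assumed example values: 8 ranks gathering a 4096-element tensor.
  const int64_t numel = 4096;
  const int nranks = 8;
  // Mirrors the kernel: every rank sends numel / nranks elements,
  // starting at an offset proportional to its rank id.
  const int64_t send_numel = numel / nranks;  // 512 elements per rank
  for (int rank = 0; rank < nranks; ++rank) {
    const int64_t offset = send_numel * rank;
    std::cout << "rank " << rank << " sends [" << offset << ", "
              << offset + send_numel << ")\n";
  }
  return 0;
}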
paddle/fluid/operators/collective/partial_recv_op.cc
...
@@ -98,12 +98,7 @@ class PartialRecvOpMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<int>("peer", "(int default 0) rank id for sender.").SetDefault(0);
    AddAttr<int>("dtype", "(int default 5('float32')) data type of tensor.")
        .SetDefault(5);
#if defined(PADDLE_WITH_ASCEND_CL)
    AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
        .SetDefault("tag");
    AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
        .SetDefault(0);
#endif
    AddAttr<std::vector<int>>("out_shape", "shape of the output tensor.")
        .SetDefault(std::vector<int>());
    AddAttr<bool>(
...
paddle/fluid/operators/collective/partial_recv_op_npu.cc
...
@@ -22,57 +22,8 @@ template <typename T>
class PartialRecvOpASCENDKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
    auto out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<T>(out->dims(), ctx.GetPlace());
    int num = ctx.Attr<int>("num");
    int id = ctx.Attr<int>("id");
    int recv_numel = out->numel() / num;
    int offset = recv_numel * id;

    void* ptr =
        reinterpret_cast<void*>(const_cast<T*>(out->data<T>()) + offset);
    int numel = recv_numel;
    HcclDataType dtype =
        platform::ToHCCLDataType(framework::TransToProtoVarType(out->dtype()));

    int ring_id = ctx.Attr<int>("ring_id");
    auto place = ctx.GetPlace();
    auto comm =
        paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);

    aclrtStream stream = nullptr;
    auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
    if (ctx.Attr<bool>("use_calc_stream")) {
      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
    } else {
      stream = comm->stream();
    }
    int nranks = comm->nranks();
    int peer = ctx.Attr<int>("peer");

    PADDLE_ENFORCE_EQ(nranks,
                      2,
                      platform::errors::InvalidArgument(
                          "The nranks must be 2, but (%d)", nranks));

    int root = peer;

    VLOG(3) << "begin hccl recv, parameter is: "
            << "ring_id:" << ring_id << ", nranks:" << nranks
            << ", peer:" << peer << ", numel:" << numel << ", ptr:" << ptr
            << ", dtype:" << dtype << ", root:" << root
            << ", comm: " << comm->comm() << ", stream: " << stream;

    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
        ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with NPU."));
#endif
  }
};
...
paddle/fluid/operators/collective/partial_send_op.cc
...
@@ -65,12 +65,7 @@ class PartialSendMaker : public framework::OpProtoAndCheckerMaker {
    AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
        .SetDefault(0);
    AddAttr<int>("peer", "(int default 0) rank id for receiver.").SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
    AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
        .SetDefault("tag");
    AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
        .SetDefault(0);
#endif
    AddAttr<bool>(
        "use_calc_stream",
        "(bool default false) eject CUDA operations to calculation stream.")
...
paddle/fluid/operators/collective/partial_send_op_npu.cc
...
@@ -22,52 +22,8 @@ template <typename T>
class PartialSendOpASCENDKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
    auto x = ctx.Input<phi::DenseTensor>("X");
    int num = ctx.Attr<int>("num");
    int id = ctx.Attr<int>("id");
    int send_numel = x->numel() / num;
    int offset = send_numel * id;

    void* ptr =
        reinterpret_cast<void*>(const_cast<T*>(x->data<T>()) + offset);
    int numel = send_numel;
    HcclDataType dtype =
        platform::ToHCCLDataType(framework::TransToProtoVarType(x->dtype()));

    int ring_id = ctx.Attr<int>("ring_id");
    auto place = ctx.GetPlace();
    auto comm =
        paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);

    aclrtStream stream = nullptr;
    auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
    if (ctx.Attr<bool>("use_calc_stream")) {
      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
    } else {
      stream = comm->stream();
    }

    int nranks = comm->nranks();
    int rank = comm->rank();
    PADDLE_ENFORCE_EQ(nranks,
                      2,
                      platform::errors::InvalidArgument(
                          "The nranks must be 2, but (%d)", nranks));

    int root = rank;

    VLOG(3) << "begin hccl send, parameter is: "
            << "root " << root << ", comm: " << comm->comm()
            << ", stream: " << stream;

    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
        ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with NPU."));
#endif
  }
};
...
paddle/fluid/operators/collective/recv_v2_op.cc
...
@@ -87,12 +87,7 @@ class RecvOpV2Maker : public framework::OpProtoAndCheckerMaker {
    AddAttr<int>("peer", "(int default 0) rank id for sender.").SetDefault(0);
    AddAttr<int>("dtype", "(int default 5('float32')) data type of tensor.")
        .SetDefault(5);
#if defined(PADDLE_WITH_ASCEND_CL)
    AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
        .SetDefault("tag");
    AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
        .SetDefault(0);
#endif
    AddAttr<std::vector<int>>("out_shape", "shape of the output tensor.")
        .SetDefault(std::vector<int>());
    AddAttr<bool>(
...
paddle/fluid/operators/collective/recv_v2_op_npu.cc
...
@@ -14,9 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/recv_v2_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
#include "paddle/fluid/distributed/collective/process_group.h"
#include "paddle/phi/api/include/tensor.h"
...
@@ -27,59 +24,8 @@ template <typename T>
class CRecvOpASCENDKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
    auto out = ctx.Output<phi::DenseTensor>("Out");
    out->mutable_data<T>(out->dims(), ctx.GetPlace());
    void* ptr = reinterpret_cast<void*>(const_cast<T*>(out->data<T>()));
    int numel = out->numel();
    HcclDataType dtype =
        platform::ToHCCLDataType(framework::TransToProtoVarType(out->dtype()));
    int ring_id = ctx.Attr<int>("ring_id");

    auto map = distributed::ProcessGroupMapFromGid::getInstance();
    if (map->has(ring_id)) {
      // Use ProcessGroup
      distributed::ProcessGroup* pg = map->get(ring_id);
      std::vector<phi::DenseTensor> out_tensor;
      out_tensor.emplace_back(*out);
      auto task = pg->Recv(out_tensor, 0);
      return;
    }

    auto place = ctx.GetPlace();
    auto comm =
        paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);

    aclrtStream stream = nullptr;
    auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
    if (ctx.Attr<bool>("use_calc_stream")) {
      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
    } else {
      stream = comm->stream();
    }

    int nranks = comm->nranks();
    int peer = ctx.Attr<int>("peer");
    PADDLE_ENFORCE_EQ(nranks,
                      2,
                      platform::errors::InvalidArgument(
                          "The nranks must be 2, but (%d)", nranks));
    int root = peer;

    VLOG(3) << "begin hccl recv, parameter is: "
            << "ring_id:" << ring_id << ", nranks:" << nranks
            << ", peer:" << peer << ", numel:" << numel << ", ptr:" << ptr
            << ", dtype:" << dtype << ", root:" << root
            << ", comm: " << comm->comm() << ", stream: " << stream;

    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
        ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with NPU."));
#endif
  }
};
...
paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
...
@@ -31,10 +31,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
...
paddle/fluid/operators/collective/send_v2_op.cc
...
@@ -61,12 +61,7 @@ class SendOpV2Maker : public framework::OpProtoAndCheckerMaker {
    AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
        .SetDefault(0);
    AddAttr<int>("peer", "(int default 0) rank id for receiver.").SetDefault(0);
#if defined(PADDLE_WITH_ASCEND_CL)
    AddAttr<std::string>("tag", "(string default tag) tag for broadcasting.")
        .SetDefault("tag");
    AddAttr<int>("srTag", "(string default tag) tag for broadcasting.")
        .SetDefault(0);
#endif
    AddAttr<bool>(
        "use_calc_stream",
        "(bool default false) eject CUDA operations to calculation stream.")
...
paddle/fluid/operators/collective/send_v2_op_npu.cc
...
@@ -14,9 +14,6 @@ limitations under the License. */
#include "paddle/fluid/operators/collective/send_v2_op.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
#include "paddle/fluid/distributed/collective/process_group.h"
#include "paddle/phi/api/include/tensor.h"
...
@@ -27,56 +24,8 @@ template <typename T>
class CSendOpASCENDKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
#if defined(PADDLE_WITH_ASCEND_CL)
    auto x = ctx.Input<phi::DenseTensor>("X");
    void* ptr = reinterpret_cast<void*>(const_cast<T*>(x->data<T>()));
    int numel = x->numel();
    HcclDataType dtype =
        platform::ToHCCLDataType(framework::TransToProtoVarType(x->dtype()));
    int ring_id = ctx.Attr<int>("ring_id");

    auto map = distributed::ProcessGroupMapFromGid::getInstance();
    if (map->has(ring_id)) {
      // Use ProcessGroup
      distributed::ProcessGroup* pg = map->get(ring_id);
      std::vector<phi::DenseTensor> in_tensor;
      in_tensor.push_back(*x);
      auto task = pg->Send(in_tensor, 1);
      return;
    }

    auto place = ctx.GetPlace();
    auto comm =
        paddle::platform::HCCLCommContext::Instance().Get(ring_id, place);

    aclrtStream stream = nullptr;
    auto dev_ctx = platform::DeviceContextPool::Instance().Get(place);
    if (ctx.Attr<bool>("use_calc_stream")) {
      stream = static_cast<platform::NPUDeviceContext*>(dev_ctx)->stream();
    } else {
      stream = comm->stream();
    }

    int nranks = comm->nranks();
    int rank = comm->rank();
    PADDLE_ENFORCE_EQ(nranks,
                      2,
                      platform::errors::InvalidArgument(
                          "The nranks must be 2, but (%d)", nranks));
    int root = rank;

    VLOG(3) << "begin hccl send, parameter is: "
            << "root " << root << ", comm: " << comm->comm()
            << ", stream: " << stream;

    PADDLE_ENFORCE_NPU_SUCCESS(platform::dynload::HcclBroadcast(
        ptr, numel, dtype, (uint32_t)root, comm->comm(), stream));
#else
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with NPU."));
#endif
  }
};
...
paddle/fluid/operators/collective/send_v2_op_npu_test.cc
...
@@ -31,10 +31,6 @@ limitations under the License. */
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif
namespace f = paddle::framework;
namespace p = paddle::platform;
...
paddle/fluid/operators/controlflow/conditional_block_op.h
...
@@ -84,12 +84,6 @@ class ConditionalOp : public framework::OperatorBase {
        res = cpu_tensor.data<bool>()[0];
#endif
      } else if (platform::is_npu_place(ips[0]->place())) {
#ifdef PADDLE_WITH_ASCEND_CL
        phi::DenseTensor cpu_tensor;
        framework::TensorCopy(*ips[0], platform::CPUPlace(), &cpu_tensor);
        platform::DeviceContextPool::Instance().Get(ips[0]->place())->Wait();
        res = cpu_tensor.data<bool>()[0];
#endif
      } else if (platform::is_xpu_place(ips[0]->place())) {
#ifdef PADDLE_WITH_XPU
        phi::DenseTensor cpu_tensor;
...
paddle/fluid/operators/controlflow/while_op_helper.cc
...
@@ -228,9 +228,8 @@ bool GetCondData(const phi::DenseTensor &cond) {
  // platform::is_npu_place(cond.place()) or
  // platform::is_xpu_place(cond.place()) is true
  std::unique_ptr<phi::DenseTensor> cpu_cond{new phi::DenseTensor()};
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) ||     \
-    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU) || \
-    defined(PADDLE_WITH_CUSTOM_DEVICE)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_XPU) || defined(PADDLE_WITH_CUSTOM_DEVICE)
  framework::TensorCopySync(cond, platform::CPUPlace(), cpu_cond.get());
#else
  PADDLE_THROW(platform::errors::PreconditionNotMet(
...
paddle/fluid/operators/math/concat_and_split.cc
...
@@ -16,8 +16,7 @@ limitations under the License. */
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/kernels/funcs/concat_and_split_functor.h"
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
...
@@ -182,84 +181,6 @@ class SplitFunctor<platform::XPUDeviceContext, T> {
};
#endif

#ifdef PADDLE_WITH_ASCEND_CL
template <typename T>
class ConcatFunctor<platform::NPUDeviceContext, T> {
 public:
  void operator()(const platform::NPUDeviceContext& context,
                  const std::vector<phi::DenseTensor>& input,
                  int axis,
                  phi::DenseTensor* output) {
    int dev_id = context.GetPlace().GetDeviceId();
    platform::NPUDeviceGuard guard(dev_id);

    std::vector<std::string> names;
    for (size_t i = 0; i < input.size(); ++i) {
      names.push_back("x" + std::to_string(i));
    }
    NpuOpRunner runner{
        "ConcatD",
        {input},
        {*output},
        {{"concat_dim", axis}, {"N", static_cast<int>(input.size())}}};
    runner.AddInputNames(names);
    runner.Run(context.stream());
  }
};

template <typename T>
class SplitFunctor<platform::NPUDeviceContext, T> {
 public:
  void operator()(const platform::NPUDeviceContext& context,
                  const phi::DenseTensor& input,
                  const std::vector<const phi::DenseTensor*>& ref_inputs,
                  const int axis,
                  std::vector<phi::DenseTensor*>* outputs) {
    if (input.numel() == 0) {
      return;
    }

    size_t num = outputs->size();

    int input_rows = 1;
    auto dim_0 = ref_inputs[0]->dims();
    for (int i = 0; i < axis; ++i) {
      input_rows *= dim_0[i];
    }

    int input_cols = 0;

    std::vector<int64_t> output_cols(outputs->size());
    for (size_t i = 0; i < num; ++i) {
      int t_cols = ref_inputs[i]->numel() / input_rows;
      input_cols += t_cols;
      output_cols[i] = t_cols;
    }
    auto npu_place = context.GetPlace();

    // computation
    for (int k = 0; k < input_rows; ++k) {
      const T* src_ptr = input.data<T>() + k * input_cols;
      int col_idx = 0;
      for (size_t j = 0; j < num; ++j) {
        int col_len = output_cols[j];
        auto* out_tensor = outputs->at(j);
        if (out_tensor != nullptr) {
          T* dst_ptr = out_tensor->data<T>() + k * col_len;
          memory::Copy(npu_place,
                       dst_ptr,
                       npu_place,
                       src_ptr + col_idx,
                       sizeof(T) * col_len,
                       context.stream());
        }
        col_idx += col_len;
      }
    }
  }
};
#endif

#ifdef PADDLE_WITH_MLU
template <typename T>
class ConcatFunctor<platform::MLUDeviceContext, T> {
...
@@ -369,14 +290,6 @@ DEFINE_XPU_FUNCTOR(float)
DEFINE_XPU_FUNCTOR(platform::float16)
#endif

#ifdef PADDLE_WITH_ASCEND_CL
#define DEFINE_NPU_FUNCTOR(type)                           \
  template class ConcatFunctor<platform::NPUDeviceContext, type>; \
  template class SplitFunctor<platform::NPUDeviceContext, type>;
FOR_ALL_TYPES(DEFINE_NPU_FUNCTOR)
#endif

#ifdef PADDLE_WITH_MLU
#define DEFINE_MLU_FUNCTOR(type)                           \
  template class ConcatFunctor<platform::MLUDeviceContext, type>; \
...
paddle/fluid/operators/memcpy_h2d_op.cc
...
@@ -123,34 +123,6 @@ REGISTER_OPERATOR(
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
    MemcpyH2DInferShapeFunctor);

#ifdef PADDLE_WITH_ASCEND_CL
REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy_h2d,
                               float,
                               ops::MemcpyH2DKernel,
                               double,
                               ops::MemcpyH2DKernel,
                               int8_t,
                               ops::MemcpyH2DKernel,
                               uint8_t,
                               ops::MemcpyH2DKernel,
                               int,
                               ops::MemcpyH2DKernel,
                               int64_t,
                               ops::MemcpyH2DKernel,
                               bool,
                               ops::MemcpyH2DKernel,
                               paddle::platform::bfloat16,
                               ops::MemcpyH2DKernel,
                               paddle::platform::complex<float>,
                               ops::MemcpyH2DKernel,
                               paddle::platform::complex<double>,
                               ops::MemcpyH2DKernel,
                               plat::float16,
                               ops::MemcpyH2DKernel,
                               int16_t,
                               ops::MemcpyH2DKernel);
#endif

#ifdef PADDLE_WITH_IPU
REGISTER_OP_IPU_KERNEL_FUNCTOR(memcpy_h2d,
                               float,
...
paddle/fluid/operators/memcpy_op.cc
...
@@ -145,19 +145,3 @@ REGISTER_OPERATOR(
    paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
    paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
    MemcpyInferShapeFunctor);

#ifdef PADDLE_WITH_ASCEND_CL
REGISTER_OP_NPU_KERNEL_FUNCTOR(memcpy,
                               float,
                               ops::MemcpyKernel,
                               double,
                               ops::MemcpyKernel,
                               int,
                               ops::MemcpyKernel,
                               int64_t,
                               ops::MemcpyKernel,
                               bool,
                               ops::MemcpyKernel,
                               plat::float16,
                               ops::MemcpyKernel);
#endif
paddle/fluid/operators/memcpy_op.h
...
@@ -61,14 +61,7 @@ class MemcpyFunctor {
          lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor);
    } else if (dst_place_type_ == DeviceType::CPU) {
      framework::TensorCopySync(lod_tensor, platform::CPUPlace(), &out_tensor);
#ifdef PADDLE_WITH_ASCEND_CL
    } else if (dst_place_type_ == DeviceType::NPU) { /* npu_pin->npu */
      framework::TensorCopy(
          lod_tensor, dev_ctx_.GetPlace(), dev_ctx_, &out_tensor);
    } else if (dst_place_type_ == DeviceType::NPU_PINNED) { /* npu->npu_pin */
      framework::TensorCopy(
          lod_tensor, platform::NPUPinnedPlace(), dev_ctx_, &out_tensor);
#endif
#ifdef PADDLE_WTIH_CUSTOM_DEVICE
    } else if (dst_place_type_ == DeviceType::CUSTOM_DEVICE) {
      framework::TensorCopy(
...
paddle/fluid/operators/reader/buffered_reader.cc
...
@@ -63,21 +63,6 @@ BufferedReader::BufferedReader(
  }
#endif

#ifdef PADDLE_WITH_ASCEND_CL
  if (platform::is_npu_place(place_)) {
    int dev_idx = place_.device;
    compute_stream_ =
        ((platform::NPUDeviceContext *)(platform::DeviceContextPool::Instance()
                                            .Get(place_)))
            ->stream();
    events_.resize(buffer_size);
    for (auto &event : events_) {
      event = platform::NpuEventResourcePool::Instance().New(dev_idx);
    }
    stream_ = platform::NpuStreamResourcePool::Instance().New(dev_idx);
  }
#endif

#ifdef PADDLE_WITH_MLU
  if (platform::is_mlu_place(place_)) {
    int dev_idx = place_.device;
...
@@ -275,56 +260,6 @@ void BufferedReader::ReadAsync(size_t i) {
  }
#endif

#ifdef PADDLE_WITH_ASCEND_CL
  if (platform::is_npu_place(place_)) {
    TensorVec &npu = npu_buffer_[i];
    if (npu.empty()) {
      npu.resize(cpu.size());
    } else {
      PADDLE_ENFORCE_EQ(
          npu.size(),
          cpu.size(),
          platform::errors::InvalidArgument(
              "Input tensor number on NPU and CPU devices are not matched. "
              "The number on NPU is %d, on CPU is %d",
              npu.size(),
              cpu.size()));
    }

    std::vector<void *> npu_ptrs;
    npu_ptrs.reserve(cpu.size());
    for (size_t i = 0; i < cpu.size(); ++i) {
      npu[i].Resize(cpu[i].dims());
      npu[i].set_layout(cpu[i].layout());
      npu_ptrs.emplace_back(npu[i].mutable_data(place_, cpu[i].type()));
    }

    platform::SetNPUDeviceId(place_.device);
    platform::NPUEventRecord(events_[i].get(), compute_stream_);
    platform::NPUStreamWaitEvent(stream_.get(), events_[i].get());

    platform::RecordEvent record_event(
        "BufferedReader:MemoryCopy", platform::TracerEventType::UserDefined, 1);
    for (size_t i = 0; i < cpu.size(); ++i) {
      auto cpu_place = cpu[i].place();
      auto cpu_ptr = cpu[i].data();
      auto npu_ptr = npu_ptrs[i];
      auto size = cpu[i].numel() * phi::SizeOf(cpu[i].dtype());
      if ((platform::is_npu_place(cpu_place))) {
        memory::Copy(place_, npu_ptr, cpu_place, cpu_ptr, size, stream_.get());
      } else {
        memory::Copy(place_, npu_ptr, cpu_place, cpu_ptr, size, stream_.get());
        platform::NPUStreamSync(stream_.get());
      }
      npu[i].set_lod(cpu[i].lod());
    }
    platform::NPUStreamSync(stream_.get());
  }
#endif

#ifdef PADDLE_WITH_MLU
  if (platform::is_mlu_place(place_)) {
    TensorVec &mlu = mlu_buffer_[i];
...
paddle/fluid/operators/reader/buffered_reader.h
...
@@ -25,8 +25,7 @@
#include "paddle/fluid/platform/device/gpu/gpu_info.h"
#include "paddle/fluid/platform/device/gpu/gpu_resource_pool.h"
#endif
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/platform/device/mlu/mlu_info.h"
#include "paddle/fluid/platform/device/mlu/mlu_resource_pool.h"
...
@@ -93,12 +92,6 @@ class BufferedReader : public framework::DecoratedReader {
  std::vector<std::shared_ptr<platform::CudaEventObject>> events_;
#endif

#ifdef PADDLE_WITH_ASCEND_CL
  aclrtStream compute_stream_;
  std::shared_ptr<platform::NpuStreamObject> stream_;
  std::vector<std::shared_ptr<platform::NpuEventObject>> events_;
#endif

#ifdef PADDLE_WITH_MLU
  mluStream compute_stream_;
  std::shared_ptr<platform::MluStreamObject> stream_;
...
paddle/fluid/operators/run_program_op_npu.cc
...
@@ -11,19 +11,3 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/operators/run_program_op.h"
#include "paddle/fluid/platform/float16.h"

namespace ops = paddle::operators;
namespace plat = paddle::platform;

/* see [Why use single type kernel] */
REGISTER_OP_NPU_KERNEL(
    run_program,
    ops::RunProgramOpKernel<paddle::platform::NPUDeviceContext, float>);
REGISTER_OP_NPU_KERNEL(
    run_program_grad,
    ops::RunProgramGradOpKernel<paddle::platform::NPUDeviceContext, float>);
#endif
paddle/fluid/operators/scatter_op_npu.cc
...
@@ -11,107 +11,3 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>

#include "paddle/fluid/framework/op_registry.h"

namespace paddle {
namespace operators {

template <typename DeviceContext, typename T>
class ScatterNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* x = ctx.Input<phi::DenseTensor>("X");
    auto* index = ctx.Input<phi::DenseTensor>("Ids");
    auto* updates = ctx.Input<phi::DenseTensor>("Updates");
    bool overwrite = ctx.Attr<bool>("overwrite");
    auto* out = ctx.Output<phi::DenseTensor>("Out");
    auto place = ctx.GetPlace();
    out->mutable_data<T>(place);

    phi::DenseTensor tmp_tensor(index->type());
    const auto index_dims = index->dims();
    if (index_dims.size() == 1) {
      tmp_tensor.ShareDataWith(*index);
      std::vector<int64_t> new_dim = {index_dims[0], 1};
      tmp_tensor.Resize(phi::make_ddim(new_dim));
      index = &tmp_tensor;
    }

    const auto& dev_ctx =
        ctx.template device_context<paddle::platform::NPUDeviceContext>();
    auto op_func_update = [](const std::vector<phi::DenseTensor>& inputs,
                             const std::vector<phi::DenseTensor>& outputs,
                             const NPUAttributeMap& attrs,
                             const platform::NPUDeviceContext& dev_ctx) {
      const auto& runner =
          NpuOpRunner("TensorScatterUpdate", inputs, outputs, attrs);
      runner.Run(dev_ctx.stream());
    };
    auto op_func_add = [](const std::vector<phi::DenseTensor>& inputs,
                          const std::vector<phi::DenseTensor>& outputs,
                          const NPUAttributeMap& attrs,
                          const platform::NPUDeviceContext& dev_ctx) {
      const auto& runner =
          NpuOpRunner("TensorScatterAdd", inputs, outputs, attrs);
      runner.Run(dev_ctx.stream());
    };

    if (overwrite) {
      if (framework::TransToProtoVarType(x->dtype()) ==
          framework::proto::VarType::INT64) {
        NpuOpRunner::TypeAdapter({*x, *index, *updates},
                                 {*out},
                                 {},
                                 dev_ctx,
                                 op_func_update,
                                 {framework::proto::VarType::INT32,
                                  framework::proto::VarType::INT32,
                                  framework::proto::VarType::INT32},
                                 {framework::proto::VarType::INT32});
      } else {
        const auto& runner_update = NpuOpRunner(
            "TensorScatterUpdate", {*x, *index, *updates}, {*out}, {});
        runner_update.Run(dev_ctx.stream());
      }
    } else {
      if (framework::TransToProtoVarType(x->dtype()) ==
          framework::proto::VarType::INT64) {
        NpuOpRunner::TypeAdapter({*x, *index, *updates},
                                 {*out},
                                 {},
                                 dev_ctx,
                                 op_func_add,
                                 {framework::proto::VarType::INT32,
                                  framework::proto::VarType::INT32,
                                  framework::proto::VarType::INT32},
                                 {framework::proto::VarType::INT32});
      } else {
        const auto& runner_add =
            NpuOpRunner("TensorScatterAdd", {*x, *index, *updates}, {*out}, {});
        runner_add.Run(dev_ctx.stream());
      }
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_NPU_KERNEL(
    scatter,
    ops::ScatterNPUKernel<paddle::platform::NPUDeviceContext, float>,
#ifdef PADDLE_WITH_ASCEND_INT64
    ops::ScatterNPUKernel<paddle::platform::NPUDeviceContext, int64_t>,
#endif
    ops::ScatterNPUKernel<paddle::platform::NPUDeviceContext, int>,
    ops::ScatterNPUKernel<paddle::platform::NPUDeviceContext,
                          paddle::platform::float16>);
#endif
paddle/fluid/operators/softmax_with_cross_entropy_op.cc
...
@@ -41,7 +41,7 @@ class SoftmaxWithCrossEntropyOpMaker
              "The outputs value of softmax activation by given the input batch, "
              "which will be used in backward calculation.")
        .AsIntermediate();
-#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
+#if defined(PADDLE_WITH_MLU)
    AddOutput("Backprop",
              "(Tensor, default: Tensor<float>), A tensor in same shape with "
...
@@ -135,7 +135,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
        true,
        platform::errors::InvalidArgument(
            "Output(Softmax) should be not null."));
-#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
+#if defined(PADDLE_WITH_MLU)
    PADDLE_ENFORCE_EQ(
        ctx->HasOutput("Backprop"),
        true,
        platform::errors::InvalidArgument(
...
@@ -206,10 +206,7 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
    }

    ctx->SetOutputDim("Softmax", logits_dims);
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
    ctx->SetOutputDim("Backprop", logits_dims);
    ctx->ShareLoD("Logits", /*->*/ "Backprop");
#endif
    logits_dims[axis] = 1;
    ctx->SetOutputDim("Loss", logits_dims);
...
@@ -238,7 +235,7 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
        true,
        platform::errors::InvalidArgument(
            "Input(Softmax) should be not null."));
-#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
+#if defined(PADDLE_WITH_MLU)
    PADDLE_ENFORCE_EQ(
        ctx->HasInput("Backprop"),
        true,
        platform::errors::InvalidArgument(
...
@@ -327,7 +324,7 @@ class SoftmaxGradMaker : public framework::SingleGradOpMaker<T> {
    grad_op->SetType("softmax_with_cross_entropy_grad");
    grad_op->SetInput("Label", this->Input("Label"));
    grad_op->SetInput("Softmax", this->Output("Softmax"));
-#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
+#if defined(PADDLE_WITH_MLU)
    grad_op->SetInput("Backprop", this->Output("Backprop"));
#endif
    grad_op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss"));
...
@@ -359,7 +356,7 @@ REGISTER_OPERATOR(softmax_with_cross_entropy_grad,
                  ops::SoftmaxWithCrossEntropyGradInplaceInferer);

REGISTER_OP_VERSION(softmax_with_cross_entropy)
-#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
+#if defined(PADDLE_WITH_MLU)
    .AddCheckpoint(
        R"ROC(
Add a new attribute [use_softmax] )ROC",
...
paddle/fluid/operators/tensor_formatter.cc
...
@@ -127,11 +127,6 @@ void TensorFormatter::FormatData(const phi::DenseTensor& print_tensor,
  } else {
    platform::CPUPlace cpu_place;
    paddle::framework::TensorCopy(print_tensor, cpu_place, &cpu_tensor);
#ifdef PADDLE_WITH_ASCEND_CL
    if (platform::is_npu_place(print_tensor.place())) {
      platform::DeviceContextPool::Instance()
          .Get(print_tensor.place())
          ->Wait();
    }
#endif
    data = cpu_tensor.data<T>();
  }
...
paddle/fluid/operators/unsqueeze_op_npu.cc
...
@@ -11,50 +11,3 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#ifdef PADDLE_WITH_ASCEND_CL
#include <memory>
#include <string>

#include "paddle/fluid/operators/unsqueeze_op.h"

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    unsqueeze,
    ops::UnsqueezeKernel<plat::NPUDeviceContext, float>,
    ops::UnsqueezeKernel<plat::NPUDeviceContext, double>,
    ops::UnsqueezeKernel<plat::NPUDeviceContext, plat::float16>,
    ops::UnsqueezeKernel<plat::NPUDeviceContext, bool>,
    ops::UnsqueezeKernel<plat::NPUDeviceContext, int>,
    ops::UnsqueezeKernel<plat::NPUDeviceContext, int8_t>,
    ops::UnsqueezeKernel<plat::NPUDeviceContext, int64_t>);
REGISTER_OP_NPU_KERNEL(
    unsqueeze2,
    ops::UnsqueezeKernel<plat::NPUDeviceContext, float>,
    ops::UnsqueezeKernel<plat::NPUDeviceContext, double>,
    ops::UnsqueezeKernel<plat::NPUDeviceContext, plat::float16>,
    ops::UnsqueezeKernel<plat::NPUDeviceContext, bool>,
    ops::UnsqueezeKernel<plat::NPUDeviceContext, int>,
    ops::UnsqueezeKernel<plat::NPUDeviceContext, int8_t>,
    ops::UnsqueezeKernel<plat::NPUDeviceContext, int64_t>);
REGISTER_OP_NPU_KERNEL(
    unsqueeze_grad,
    ops::UnsqueezeGradKernel<plat::NPUDeviceContext, float>,
    ops::UnsqueezeGradKernel<plat::NPUDeviceContext, double>,
    ops::UnsqueezeGradKernel<plat::NPUDeviceContext, plat::float16>,
    ops::UnsqueezeGradKernel<plat::NPUDeviceContext, bool>,
    ops::UnsqueezeGradKernel<plat::NPUDeviceContext, int>,
    ops::UnsqueezeGradKernel<plat::NPUDeviceContext, int8_t>,
    ops::UnsqueezeGradKernel<plat::NPUDeviceContext, int64_t>);
REGISTER_OP_NPU_KERNEL(
    unsqueeze2_grad,
    ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, float>,
    ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, double>,
    ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, plat::float16>,
    ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, bool>,
    ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, int>,
    ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, int8_t>,
    ops::Unsqueeze2GradKernel<plat::NPUDeviceContext, int64_t>);
#endif
paddle/fluid/operators/utils.h
...
@@ -92,7 +92,7 @@ inline T GetValue(const phi::DenseTensor* x) {
  if (!platform::is_cpu_place(x->place())) {
    phi::DenseTensor cpu_x;
    framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x);
-#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
+#if defined(PADDLE_WITH_MLU)
    platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
    const platform::DeviceContext* dev_ctx = pool.Get(x->place());
    dev_ctx->Wait();
...
paddle/fluid/platform/collective_helper.h
...
@@ -147,118 +147,6 @@ class NCCLCommContext {
};
#endif

#if defined(PADDLE_WITH_ASCEND_CL)
// In order to apply hierarchical communication with HCCL, we need
// a communication ring contains HCCL communicators associated to a global
// HCCLUniqueId. E.g. for a hierarchical case,
//
//    11 - 12   21 - 22
//     |    |    |    |
//    13 - 14 - 23 - 24
//          |    |
//    31 - 32 - 41 - 42
//     |    |    |    |
//    33 - 34   43 - 44
//
// we group (14,23,32,41) as the top, and (11,12,13,14), (21,22,23,24),
// (31,32,33,34), (41,42,43,44) as bottoms respectively.
//
// We could also use a single communication ring for the flatten case
//
// The HCCLComm instance is created and reserved in the HCCLCommContext
// singleton with a global user specified group id.
class NPUDeviceContext;

#define ENV_RANK_TABLE_FILE "RANK_TABLE_FILE"
#define ENV_RANK_ID "PADDLE_TRAINER_ID"

class HCCLComm {
 public:
  virtual int ring_id() const = 0;
  virtual int nranks() const = 0;
  virtual int rank() const = 0;
  virtual int device_id() const = 0;
  virtual HcclComm comm() const = 0;
  virtual aclrtStream stream() const = 0;
  virtual NPUDeviceContext* dev_context() const = 0;
  virtual ~HCCLComm() = default;
};

// A singleton HCCL communicator context reserves communication ring ids
class HCCLCommContext {
 public:
  static HCCLCommContext& Instance() {
    static HCCLCommContext comm_ctx;
    return comm_ctx;
  }

  HCCLComm* CreateHCCLComm(
      HcclRootInfo* hccl_id, int nranks, int rank, int dev_id, int ring_id);
  // a latter comm with the same dev_id and the same ring_id
  // will override the former
  HCCLComm* AssignHCCLComm(
      HcclComm comm, int nranks, int rank, int dev_id, int ring_id);

  // retrieve a communicator by the ring id in multiprocessing mode
  HCCLComm* Get(int ring_id) const {
    PADDLE_ENFORCE_GT(
        comm_map_.count(ring_id),
        0,
        platform::errors::InvalidArgument(
            "Communicator in ring id %d has not been initialized.", ring_id));
    PADDLE_ENFORCE_EQ(comm_map_.at(ring_id).size(),
                      1,
                      platform::errors::InvalidArgument(
                          "One device id should be specified to retrieve from "
                          "multiple communicators."));
    return comm_map_.at(ring_id).begin()->second.get();
  }

  // retrieve a communicator by the ring id and the device id
  HCCLComm* Get(int ring_id, int dev_id) const {
    PADDLE_ENFORCE_GT(
        comm_map_.count(ring_id),
        0,
        platform::errors::InvalidArgument(
            "Communicator of ring id %d has not been initialized.", ring_id));
    PADDLE_ENFORCE_GT(
        comm_map_.at(ring_id).count(dev_id),
        0,
        platform::errors::InvalidArgument(
            "Communicator at device id %d has not been initialized in ring %d.",
            dev_id,
            ring_id));
    return comm_map_.at(ring_id).at(dev_id).get();
  }

  // retrieve a communicator by the ring id and place
  HCCLComm* Get(int ring_id, Place place) const {
    return Get(ring_id, place.device);
  }

 private:
  // Init global hcom
  HCCLCommContext() {}
  // we may use the group feature in the future
  // HCCLCommContext() { InitHcomWorldGroup(); }
  HcclComm comm_;

 public:
  ~HCCLCommContext() {}

  std::once_flag once_flag_;
  std::mutex comm_map_mutex_;
  // ring id to dev-HCCLComm
  std::map<int, std::map<int, std::unique_ptr<HCCLComm>>> comm_map_;

  // void InitHcomWorldGroup();
  void ReleaseHCCLComms();

  DISABLE_COPY_AND_ASSIGN(HCCLCommContext);
};
#endif

#if defined(PADDLE_WITH_XPU_BKCL)
// In order to apply hierarchical communication with BKCL, we need
// a communication ring contains BKCL communicators associated to a global
...
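The deleted HCCLCommContext is a classic singleton registry keyed by ring id (and, inside each ring, by device id). A self-contained miniature of the same pattern, with a FakeComm stand-in for HCCLComm; every name below is illustrative, not a Paddle API:

#include <iostream>
#include <map>
#include <memory>

// Stand-in for HCCLComm: just remembers its ring and rank.
struct FakeComm {
  int ring_id;
  int rank;
};

// Miniature of HCCLCommContext: a process-wide map from ring id to comm.
class CommRegistry {
 public:
  static CommRegistry& Instance() {
    static CommRegistry registry;  // constructed once, on first use
    return registry;
  }
  FakeComm* Assign(int ring_id, int rank) {
    // A later comm with the same ring_id overrides the former,
    // mirroring the comment on AssignHCCLComm above.
    comm_map_[ring_id] = std::make_unique<FakeComm>(FakeComm{ring_id, rank});
    return comm_map_[ring_id].get();
  }
  FakeComm* Get(int ring_id) const { return comm_map_.at(ring_id).get(); }

 private:
  CommRegistry() = default;
  std::map<int, std::unique_ptr<FakeComm>> comm_map_;
};

int main() {
  CommRegistry::Instance().Assign(/*ring_id=*/0, /*rank=*/3);
  FakeComm* comm = CommRegistry::Instance().Get(0);
  std::cout << "ring " << comm->ring_id << ", rank " << comm->rank << "\n";
  return 0;
}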
paddle/fluid/platform/device_context.cc
...
@@ -266,51 +266,5 @@ IPUDeviceContext::~IPUDeviceContext() {}
#endif

#ifdef PADDLE_WITH_ASCEND_CL
NPUDeviceContext::NPUDeviceContext(NPUPlace place) : place_(place) {
  NPUDeviceGuard guard(place_.device);
  // PADDLE_ENFORCE_NPU_SUCCESS(aclrtCreateContext(&context_, place_.device));
  // NOTE(zhiqiu): Usually, no need to create context explicitly,
  // ACL creates a default context which contains 1 default stream
  // and 1 sync stream after aclrtSetDevice.
  platform::GetCurrentNPUContext(&context_);
  stream_.reset(new stream::NPUStream(place));
}

NPUDeviceContext::~NPUDeviceContext() {
  // NPUDeviceGuard guard(place_.device);
  // PADDLE_ENFORCE_NPU_SUCCESS(aclrtDestroyContext(context_));
}

void NPUDeviceContext::Wait() const {
  platform::RecordEvent record_event(
      "NPUDeviceContext/wait", platform::TracerEventType::UserDefined, 2);
  VLOG(4) << "NPU context(" << this << ") Wait";
  stream_->Wait();
}

aclrtStream NPUDeviceContext::stream() const { return stream_->raw_stream(); }

const Place& NPUDeviceContext::GetPlace() const { return place_; }

aclrtContext NPUDeviceContext::context() const { return context_; }

NPUPinnedDeviceContext::NPUPinnedDeviceContext() {
  eigen_device_.reset(new Eigen::DefaultDevice());
}

NPUPinnedDeviceContext::NPUPinnedDeviceContext(NPUPinnedPlace place)
    : place_(place) {
  eigen_device_.reset(new Eigen::DefaultDevice());
}

Eigen::DefaultDevice* NPUPinnedDeviceContext::eigen_device() const {
  return eigen_device_.get();
}

const Place& NPUPinnedDeviceContext::GetPlace() const { return place_; }
#endif

}  // namespace platform
}  // namespace paddle
paddle/fluid/platform/gen_comm_id_helper.cc
...
@@ -34,10 +34,6 @@ limitations under the License. */
#include "xpu/bkcl.h"
#endif

#if defined(PADDLE_WITH_ASCEND_CL)
#include "paddle/fluid/platform/collective_helper.h"
#endif

#if defined(PADDLE_WITH_CNCL)
#include <cncl.h>
#endif
...
@@ -334,11 +330,7 @@ static int ConnectAddr(const std::string& ep, const CommHead head) {
}

// TODO(WANGXI): maybe need to unify this hard code
#ifdef PADDLE_WITH_ASCEND_CL
#define MAX_COMMUNIQUEID_LEN 4108
#else
#define MAX_COMMUNIQUEID_LEN 1024
#endif

template <typename CommUniqueId>
static void RecvCommID(int conn, CommUniqueId* nccl_id) {
...
@@ -456,9 +448,7 @@ INSTANT_TEMPLATE(ncclUniqueId)
#ifdef PADDLE_WITH_XPU_BKCL
INSTANT_TEMPLATE(BKCLUniqueId)
#endif
#ifdef PADDLE_WITH_ASCEND_CL
INSTANT_TEMPLATE(HcclRootInfo)
#endif
#ifdef PADDLE_WITH_CNCL
INSTANT_TEMPLATE(cnclCliqueId)
#endif
...
paddle/fluid/platform/init.cc
...
@@ -228,9 +228,7 @@ void InitDevices(const std::vector<int> devices) {
#ifdef PADDLE_WITH_IPU
    places.emplace_back(platform::IPUPlace(devices[i]));
#endif
#ifdef PADDLE_WITH_ASCEND_CL
    places.emplace_back(platform::NPUPlace(devices[i]));
#endif
#ifdef PADDLE_WITH_MLU
    places.emplace_back(platform::MLUPlace(devices[i]));
#endif
...
paddle/fluid/platform/place.h
...
@@ -19,8 +19,6 @@ limitations under the License. */
#include "paddle/fluid/platform/enforce.h"
//
#ifdef PADDLE_WITH_ASCEND_CL
#endif
#include "paddle/phi/common/place.h"

namespace paddle {
...
@@ -95,24 +93,14 @@ typename Visitor::result_type VisitPlace(const Place &place,
#endif
    }
    case phi::AllocationType::NPU: {
#ifdef PADDLE_WITH_ASCEND_CL
      platform::NPUPlace p(place.GetDeviceId());
      return visitor(p);
#else
      PADDLE_THROW(platform::errors::Unavailable(
          "Paddle is not compiled with NPU. Cannot visit npu_pinned"));
      return typename Visitor::result_type();
#endif
    }
    case phi::AllocationType::NPUPINNED: {
#ifdef PADDLE_WITH_ASCEND_CL
      platform::NPUPinnedPlace p;
      return visitor(p);
#else
      PADDLE_THROW(platform::errors::Unavailable(
          "Paddle is not compiled with NPU. Cannot visit npu_pinned"));
      return typename Visitor::result_type();
#endif
    }
    case phi::AllocationType::IPU: {
#ifdef PADDLE_WITH_IPU
...
paddle/fluid/platform/stream_callback_manager.cc
...
@@ -33,11 +33,8 @@ static void StreamCallbackFunc(gpuStream_t stream,
#endif
#endif
-#if PADDLE_WITH_ASCEND_CL
-static void StreamCallbackFunc(void *user_data)
-#endif
#if PADDLE_WITH_MLU
-    static void StreamCallbackFunc(void *user_data)
+static void StreamCallbackFunc(void *user_data)
#endif
{
  std::unique_ptr<std::function<void()>> func(
...
@@ -75,12 +72,6 @@ void StreamCallbackManager<Stream>::AddCallback(
#endif
#endif

#if PADDLE_WITH_ASCEND_CL
  VLOG(3) << "aclrtLaunchCallback at stream: " << stream_;
  // TODO(zhiqiu): failed to call aclrtLaunchCallback
  NPULaunchCallback(StreamCallbackFunc, func, ACL_CALLBACK_BLOCK, stream_);
#endif

#if PADDLE_WITH_MLU
  VLOG(3) << "MLULaunchCallback at stream: " << stream_;
  cnrtInvokeHostFunc(stream_, StreamCallbackFunc, func);
...
@@ -94,9 +85,6 @@ void StreamCallbackManager<Stream>::Wait() const {
#endif
#ifdef PADDLE_WITH_MLU
  PADDLE_ENFORCE_MLU_SUCCESS(cnrtQueueSync(stream_));
#endif
#ifdef PADDLE_WITH_ASCEND_CL
  NPUStreamSync(stream_);
#endif
  {
    std::lock_guard<std::mutex> lock(mtx_);
...
@@ -112,9 +100,7 @@ template struct StreamCallbackManager<gpuStream_t>;
#ifdef PADDLE_WITH_HIP
template struct StreamCallbackManager<hipStream_t>;
#endif
#ifdef PADDLE_WITH_ASCEND_CL
template struct StreamCallbackManager<aclrtStream>;
#endif
#ifdef PADDLE_WITH_MLU
template struct StreamCallbackManager<mluStream>;
#endif
...
paddle/fluid/pybind/eager_legacy_op_function_generator.cc
...
@@ -26,12 +26,9 @@
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/variable.h"
+#include "paddle/fluid/pybind/eager_generator.h"
#include "paddle/fluid/pybind/pybind.h"
#include "paddle/fluid/string/string_helper.h"
-#ifdef PADDLE_WITH_ASCEND_CL
-#include "paddle/fluid/framework/fleet/ascend_wrapper.h"
-#endif
-#include "paddle/fluid/pybind/eager_generator.h"
// phi
#include "paddle/phi/kernels/declarations.h"
...
@@ -485,11 +482,6 @@ int main(int argc, char* argv[]) {
    return -1;
  }

#ifdef PADDLE_WITH_ASCEND_CL
  auto ascend_ptr = paddle::framework::AscendInstance::GetInstance();
  ascend_ptr->InitGEForUT();
#endif

  std::vector<std::string> headers{
      "<Python.h>",
      "\"paddle/fluid/platform/enforce.h\"",
...
@@ -557,9 +549,5 @@ int main(int argc, char* argv[]) {
  out.close();

#ifdef PADDLE_WITH_ASCEND_CL
  ge::GEFinalize();
#endif

  return 0;
}
paddle/fluid/pybind/imperative.cc
...
@@ -2547,9 +2547,9 @@ void BindImperative(py::module *m_ptr) {
      },
      py::call_guard<py::gil_scoped_release>());

-#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) ||         \
-    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL) || \
-    defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_CNCL)
+#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
+    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
+    defined(PADDLE_WITH_CNCL)
  py::class_<imperative::ParallelContext,
             std::shared_ptr<imperative::ParallelContext>>(m,
                                                           "ParallelContext");
...
@@ -2630,7 +2630,7 @@ void BindImperative(py::module *m_ptr) {
#endif
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
-    defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_ASCEND_CL)
+    defined(PADDLE_WITH_XPU_BKCL)
  py::class_<imperative::HeterParallelContext,
             imperative::ParallelContext,
             std::shared_ptr<imperative::HeterParallelContext>>(
...
paddle/phi/kernels/funcs/strided_memcpy.h
...
@@ -57,7 +57,7 @@ inline void CopyWithContext(const Context& ctx,
                            const void* src,
                            size_t num) {
-#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
-    defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) || \
+    defined(PADDLE_WITH_MLU)
  memory_utils::Copy(dst_place, dst, src_place, src, num, ctx.stream());
#else
  PADDLE_THROW(
...
paddle/testing/paddle_gtest_main.cc
...
@@ -101,9 +101,6 @@ int main(int argc, char** argv) {
  int ret = RUN_ALL_TESTS();

#ifdef PADDLE_WITH_ASCEND_CL
  paddle::platform::AclInstance::Instance().Finalize();
#endif
  if (env_str) free(env_str);
  if (undefok_str) free(undefok_str);
  return ret;
...