Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
b4eb413e
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
b4eb413e
编写于
3月 07, 2022
作者:
Z
zn
提交者:
GitHub
3月 07, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[MLU]support reduce tensors on mlu (#40000)
* [MLU]support reduce tensors on mlu * [MLU]fix compiler options
上级
0ad25fb9
变更
10
隐藏空白更改
内联
并排
Showing
10 changed file
with
265 addition
and
9 deletion
+265
-9
paddle/fluid/imperative/CMakeLists.txt
paddle/fluid/imperative/CMakeLists.txt
+2
-1
paddle/fluid/imperative/reducer.cc
paddle/fluid/imperative/reducer.cc
+83
-4
paddle/fluid/imperative/reducer.h
paddle/fluid/imperative/reducer.h
+1
-1
paddle/fluid/imperative/tests/CMakeLists.txt
paddle/fluid/imperative/tests/CMakeLists.txt
+1
-1
paddle/fluid/imperative/tests/test_group.cc
paddle/fluid/imperative/tests/test_group.cc
+18
-2
paddle/fluid/operators/math/CMakeLists.txt
paddle/fluid/operators/math/CMakeLists.txt
+2
-0
paddle/fluid/operators/math/concat_and_split.cc
paddle/fluid/operators/math/concat_and_split.cc
+100
-0
paddle/fluid/operators/mlu/mlu_baseop.cc
paddle/fluid/operators/mlu/mlu_baseop.cc
+42
-0
paddle/fluid/operators/mlu/mlu_baseop.h
paddle/fluid/operators/mlu/mlu_baseop.h
+11
-0
paddle/fluid/operators/strided_memcpy.h
paddle/fluid/operators/strided_memcpy.h
+5
-0
未找到文件。
paddle/fluid/imperative/CMakeLists.txt
浏览文件 @
b4eb413e
...
...
@@ -33,6 +33,7 @@ if(NOT WIN32)
endif
()
if
(
WITH_CNCL
)
cc_library
(
cncl_context SRCS cncl_context.cc DEPS collective_helper device_context tensor var_type_traits
)
cc_library
(
reducer SRCS reducer.cc DEPS layer
)
endif
()
if
(
WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL
)
cc_library
(
heter_ccl_context SRCS heter_ccl_context.cc DEPS collective_helper device_context tensor var_type_traits
)
...
...
@@ -41,7 +42,7 @@ if(NOT WIN32)
endif
(
NOT WIN32
)
if
(
WITH_GLOO
)
cc_library
(
imperative_gloo_context SRCS gloo_context.cc DEPS collective_helper device_context tensor var_type_traits
)
if
(
WIN32
OR
(
NOT
(
WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL
)
))
if
(
WIN32
OR
(
NOT
(
WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL
OR WITH_CNCL
)
))
cc_library
(
reducer SRCS reducer.cc DEPS layer
)
endif
()
endif
()
...
...
paddle/fluid/imperative/reducer.cc
浏览文件 @
b4eb413e
...
...
@@ -31,7 +31,7 @@ namespace imperative {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
defined(PADDLE_WITH_ASCEND_CL)
defined(PADDLE_WITH_ASCEND_CL)
|| defined(PADDLE_WITH_CNCL)
// div the nranks
void
Group
::
DivNRanks
(
const
platform
::
DeviceContext
&
context
,
int64_t
nranks
)
{
framework
::
Tensor
*
tensor
=
...
...
@@ -67,6 +67,9 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) {
#ifdef PADDLE_WITH_XPU_BKCL
// TODO(liuyuhui) support xpu about div nranks in the future
#endif
}
else
if
(
platform
::
is_mlu_place
(
tensor
->
place
()))
{
// TODO(zhangna)
VLOG
(
4
)
<<
"divnrank for mlu not support yet"
;
}
}
...
...
@@ -222,6 +225,56 @@ void SplitTensorsWithType<platform::XPUDeviceContext>(
}
#endif
#ifdef PADDLE_WITH_CNCL
// MLU specialization. Forwards to ConcatTensorsForAllReduce instantiated with
// the element type that matches `type`; `context` is used to select the stream
// for the concat. FP16 and FP32 are handled, every other type raises
// Unimplemented.
template <>
void ConcatTensorsWithType<platform::MLUDeviceContext>(
    const platform::MLUDeviceContext &context,
    const std::vector<framework::Tensor> &dense_tensors_,
    framework::Variable *p_dense_contents,
    framework::proto::VarType::Type type) {
  using MLUContext = platform::MLUDeviceContext;

  if (type == framework::proto::VarType::FP16) {
    ConcatTensorsForAllReduce<MLUContext, platform::float16>(
        context, dense_tensors_, p_dense_contents);
    return;
  }
  if (type == framework::proto::VarType::FP32) {
    ConcatTensorsForAllReduce<MLUContext, float>(context, dense_tensors_,
                                                 p_dense_contents);
    return;
  }
  // Neither of the supported element types matched.
  PADDLE_THROW(platform::errors::Unimplemented(
      "Data type (%s) is not supported when it concats tensors for "
      "allreduce.",
      framework::DataTypeToString(type)));
}
// MLU specialization. Forwards to SplitTensorsForAllReduce instantiated with
// the element type that matches `type`; `context` is used to select the stream
// for the split. FP16 and FP32 are handled, every other type raises
// Unimplemented.
template <>
void SplitTensorsWithType<platform::MLUDeviceContext>(
    const platform::MLUDeviceContext &context,
    framework::Variable *p_dense_contents,
    std::vector<framework::Tensor> *p_dense_tensors,
    framework::proto::VarType::Type type) {
  using MLUContext = platform::MLUDeviceContext;

  if (type == framework::proto::VarType::FP16) {
    SplitTensorsForAllReduce<MLUContext, platform::float16>(
        context, p_dense_contents, p_dense_tensors);
    return;
  }
  if (type == framework::proto::VarType::FP32) {
    SplitTensorsForAllReduce<MLUContext, float>(context, p_dense_contents,
                                                p_dense_tensors);
    return;
  }
  // Neither of the supported element types matched.
  PADDLE_THROW(platform::errors::Unimplemented(
      "Data type (%s) is not supported when it splits tensors for "
      "allreduce.",
      framework::DataTypeToString(type)));
}
#endif
void
Group
::
ConcatTensors
(
const
platform
::
DeviceContext
&
context
)
{
auto
place
=
context
.
GetPlace
();
if
(
platform
::
is_gpu_place
(
place
))
{
...
...
@@ -253,6 +306,16 @@ void Group::ConcatTensors(const platform::DeviceContext &context) {
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't concat npu grads since it's not compiled with HCCL,"
"Please recompile or reinstall Paddle with HCCL support."
));
#endif
}
else
if
(
platform
::
is_mlu_place
(
place
))
{
#ifdef PADDLE_WITH_CNCL
ConcatTensorsWithType
(
static_cast
<
const
platform
::
MLUDeviceContext
&>
(
context
),
dense_tensors_
,
&
dense_contents_
,
dtype_
);
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't concat mlu grads since it's not compiled with CNCL,"
"Please recompile or reinstall Paddle with CNCL support."
));
#endif
}
else
if
(
platform
::
is_cpu_place
(
place
))
{
ConcatTensorsWithType
(
...
...
@@ -295,6 +358,16 @@ void Group::SplitTensors(const platform::DeviceContext &context) {
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't split npu grad since it's not compiled with HCCL,"
"Please recompile or reinstall Paddle with HCCL support."
));
#endif
}
else
if
(
platform
::
is_mlu_place
(
place
))
{
#ifdef PADDLE_WITH_CNCL
SplitTensorsWithType
(
static_cast
<
const
platform
::
MLUDeviceContext
&>
(
context
),
&
dense_contents_
,
&
dense_tensors_
,
dtype_
);
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't split mlu grad since it's not compiled with CNCL,"
"Please recompile or reinstall Paddle with CNCL support."
));
#endif
}
else
if
(
platform
::
is_cpu_place
(
place
))
{
SplitTensorsWithType
(
...
...
@@ -746,6 +819,11 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) {
// TODO(liuyuhui) support XPU set constant
VLOG
(
3
)
<<
"XPU doesn't support set_constant"
;
}
#elif defined(PADDLE_WITH_CNCL)
if
(
platform
::
is_mlu_place
(
group_tensor
.
place
()))
{
// TODO(liuyuhui) support MLU set constant
VLOG
(
3
)
<<
"MLU doesn't support set_constant"
;
}
#else
auto
*
dev_ctx
=
platform
::
DeviceContextPool
::
Instance
().
Get
(
place_
);
if
(
HasGrad
(
var_index
))
{
...
...
@@ -846,12 +924,13 @@ void Reducer::MarkGroupReady(size_t group_index) {
cv_
.
notify_all
();
}
});
#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \
defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL)
#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \
defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \
defined(PADDLE_WITH_CNCL)
FusedAllReduceSchedule
(
run_order
,
group
,
next_group_
);
#else
PADDLE_THROW
(
platform
::
errors
::
PreconditionNotMet
(
"Not compiled with BKCL or NCCL or GLOO."
));
"Not compiled with BKCL or NCCL or
CNCL or
GLOO."
));
#endif
}
}
...
...
paddle/fluid/imperative/reducer.h
浏览文件 @
b4eb413e
...
...
@@ -45,7 +45,7 @@ namespace imperative {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
defined(PADDLE_WITH_ASCEND_CL)
defined(PADDLE_WITH_ASCEND_CL)
|| defined(PADDLE_WITH_CNCL)
template
<
typename
T
>
struct
DivNRanksFunctor
{
...
...
paddle/fluid/imperative/tests/CMakeLists.txt
浏览文件 @
b4eb413e
...
...
@@ -21,6 +21,6 @@ cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info s
cc_test
(
test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy
)
cc_test
(
test_hooks SRCS test_hooks.cc DEPS tracer basic_engine layer proto_desc operator op_registry variable_helper mul_op elementwise_add_op memcpy
)
cc_test
(
test_eager SRCS test_eager.cc DEPS tracer layer prepared_operator mul_op
)
if
(
WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL
)
if
(
WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL
OR WITH_CNCL
)
cc_test
(
test_group SRCS test_group.cc DEPS reducer concat_and_split memcpy
)
endif
()
paddle/fluid/imperative/tests/test_group.cc
浏览文件 @
b4eb413e
...
...
@@ -72,8 +72,10 @@ void GroupConcatSplit(Place place, size_t size) {
value
.
push_back
(
static_cast
<
T
>
(
1.0
*
j
));
}
if
(
std
::
is_same
<
Place
,
platform
::
CUDAPlace
>::
value
)
{
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if
(
std
::
is_same
<
Place
,
platform
::
CUDAPlace
>::
value
||
std
::
is_same
<
Place
,
platform
::
MLUPlace
>::
value
)
{
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_CNCL)
paddle
::
memory
::
Copy
(
place
,
data
,
cpu_place
,
value
.
data
(),
sizeof
(
T
)
*
value
.
size
(),
0
);
#endif
...
...
@@ -180,5 +182,19 @@ TEST(TestGroup, TestXPUConcatSplit) {
}
#endif
#if defined(PADDLE_WITH_CNCL)
// Exercises GroupConcatSplit<float> on a CPU place and on MLU place 0, first
// with size 3 and then with size 15 (CPU before MLU for each size).
TEST(TestGroup, TestMLUConcatSplit) {
  platform::MLUPlace mlu_place(0);
  platform::CPUPlace cpu_place;

  const int sizes[] = {3, 15};
  for (int size : sizes) {
    GroupConcatSplit<float>(cpu_place, size);
    GroupConcatSplit<float>(mlu_place, size);
  }
}
#endif
}
// namespace imperative
}
// namespace paddle
paddle/fluid/operators/math/CMakeLists.txt
浏览文件 @
b4eb413e
...
...
@@ -5,6 +5,8 @@ endif()
# please add new math_library in alphabetical order
if
(
WITH_ASCEND_CL
)
math_library
(
concat_and_split DEPS concat_and_split_functor npu_op_runner
)
elseif
(
WITH_MLU
)
math_library
(
concat_and_split DEPS concat_and_split_functor mlu_baseop
)
else
()
math_library
(
concat_and_split DEPS concat_and_split_functor
)
endif
()
...
...
paddle/fluid/operators/math/concat_and_split.cc
浏览文件 @
b4eb413e
...
...
@@ -18,6 +18,9 @@ limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
...
...
@@ -226,6 +229,90 @@ class SplitFunctor<platform::NPUDeviceContext, T> {
};
#endif
#ifdef PADDLE_WITH_MLU
// MLU specialization of ConcatFunctor: concatenates the tensors in `input`
// along `axis` into `output` by calling MLUCnnl::Concat.
template <typename T>
class ConcatFunctor<platform::MLUDeviceContext, T> {
 public:
  // context: MLU device context; its place selects the guarded device and is
  //          where the output buffer is allocated.
  // input:   tensors to concatenate, passed to CNNL in this order.
  // axis:    concat axis, forwarded unchanged to MLUCnnl::Concat.
  // output:  destination tensor; its buffer is allocated here through
  //          mutable_data<T>.
  //          NOTE(review): output dims are not set in this functor, so they
  //          are presumably set by the caller beforehand - confirm.
  void operator()(const platform::MLUDeviceContext& context,
                  const std::vector<framework::Tensor>& input, int axis,
                  framework::Tensor* output) {
    const auto place = context.GetPlace();
    platform::MLUDeviceGuard guard(place.GetDeviceId());

    const size_t ins_size = input.size();
    output->mutable_data<T>(place);

    // Per-input data pointers and CNNL descriptors. `input_descs` owns the
    // descriptor wrappers; `desc_vector` holds the raw handles handed to
    // CNNL. Reserving up front avoids reallocating (and re-moving the
    // wrappers) while the raw handles are being collected.
    std::vector<const void*> inputs;
    std::vector<MLUCnnlTensorDesc> input_descs;
    std::vector<cnnlTensorDescriptor_t> desc_vector;
    inputs.reserve(ins_size);
    input_descs.reserve(ins_size);
    desc_vector.reserve(ins_size);
    for (const auto& tensor : input) {
      input_descs.emplace_back(tensor, CNNL_LAYOUT_ARRAY,
                               ToCnnlDataType(tensor.dtype()));
      desc_vector.push_back(input_descs.back().get());
      inputs.push_back(tensor.data());
    }

    MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY,
                                  ToCnnlDataType(output->dtype()));

    // MLUCnnl::Concat takes the input count as int; make the narrowing from
    // size_t explicit.
    MLUCnnl::Concat(context, static_cast<int>(ins_size), axis,
                    desc_vector.data(), inputs.data(), output_desc.get(),
                    GetBasePtr(output));
  }
};
// MLU specialization of SplitFunctor: splits `input` along `axis` into the
// tensors in `outputs` by calling MLUCnnl::Split.
template <typename T>
class SplitFunctor<platform::MLUDeviceContext, T> {
 public:
  // context:    MLU device context; its place selects the guarded device and
  //             is where the output buffers are allocated.
  // input:      tensor to split; nothing is done when it has zero elements.
  // ref_inputs: reference tensors; ref_inputs[i] supplies the extent of
  //             output i along `axis`. Indexed for every output, so it must
  //             hold at least outputs->size() entries.
  // axis:       split axis, forwarded unchanged to MLUCnnl::Split.
  // outputs:    destination tensors; each is resized and allocated here.
  void operator()(const platform::MLUDeviceContext& context,
                  const framework::Tensor& input,
                  const std::vector<const framework::Tensor*>& ref_inputs,
                  const int axis, std::vector<framework::Tensor*>* outputs) {
    if (input.numel() == 0) {
      return;
    }

    int dev_id = context.GetPlace().GetDeviceId();
    platform::MLUDeviceGuard guard(dev_id);

    auto in_dims = input.dims();
    const size_t out_size = outputs->size();

    // Every output starts with the input's dims; only the split axis is
    // taken from the matching reference tensor. All dims are computed before
    // any output is resized.
    std::vector<framework::DDim> outs_dims(out_size, in_dims);
    for (size_t i = 0; i < out_size; ++i) {
      outs_dims[i][axis] = ref_inputs[i]->dims()[axis];
    }

    // Per-output base pointers and CNNL descriptors. `output_descs` owns the
    // descriptor wrappers; `desc_vector` holds the raw handles handed to
    // CNNL. Reserving up front avoids reallocating (and re-moving the
    // wrappers) while the raw handles are being collected.
    std::vector<void*> vct_tensor;
    std::vector<MLUCnnlTensorDesc> output_descs;
    std::vector<cnnlTensorDescriptor_t> desc_vector;
    vct_tensor.reserve(out_size);
    output_descs.reserve(out_size);
    desc_vector.reserve(out_size);
    for (size_t i = 0; i < out_size; ++i) {
      framework::Tensor* out = (*outputs)[i];
      out->Resize(outs_dims[i]);
      out->mutable_data<T>(context.GetPlace());
      output_descs.emplace_back(*out, CNNL_LAYOUT_ARRAY,
                                ToCnnlDataType(out->dtype()));
      desc_vector.push_back(output_descs.back().get());
      vct_tensor.push_back(GetBasePtr(out));
    }

    MLUCnnlTensorDesc input_desc(input, CNNL_LAYOUT_ARRAY,
                                 ToCnnlDataType(input.dtype()));

    // MLUCnnl::Split takes the output count as int; make the narrowing from
    // size_t explicit.
    MLUCnnl::Split(context, static_cast<int>(out_size), axis, input_desc.get(),
                   input.data(), desc_vector.data(), vct_tensor.data());
  }
};
#endif
#define DEFINE_FUNCTOR(type) \
template class ConcatFunctor<platform::CPUDeviceContext, type>; \
template class SplitFunctor<platform::CPUDeviceContext, type>;
...
...
@@ -248,6 +335,19 @@ DEFINE_XPU_FUNCTOR(float)
FOR_ALL_TYPES
(
DEFINE_NPU_FUNCTOR
)
#endif
#ifdef PADDLE_WITH_MLU
#define DEFINE_MLU_FUNCTOR(type) \
template class ConcatFunctor<platform::MLUDeviceContext, type>; \
template class SplitFunctor<platform::MLUDeviceContext, type>;
DEFINE_MLU_FUNCTOR
(
float
)
DEFINE_MLU_FUNCTOR
(
platform
::
float16
)
DEFINE_MLU_FUNCTOR
(
int64_t
)
DEFINE_MLU_FUNCTOR
(
bool
)
DEFINE_MLU_FUNCTOR
(
int
)
DEFINE_MLU_FUNCTOR
(
int8_t
)
DEFINE_MLU_FUNCTOR
(
int16_t
)
DEFINE_MLU_FUNCTOR
(
uint8_t
)
#endif
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/mlu/mlu_baseop.cc
浏览文件 @
b4eb413e
...
...
@@ -499,6 +499,27 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() {
output_desc
,
output
));
}
/* static */
void
MLUCnnl
::
Concat
(
const
MLUDeviceContext
&
dev_ctx
,
const
int
pack_num
,
const
int
axis
,
const
cnnlTensorDescriptor_t
inputs_desc
[],
const
void
*
const
inputs
[],
const
cnnlTensorDescriptor_t
output_desc
,
void
*
output
)
{
cnnlHandle_t
handle
=
dev_ctx
.
cnnl_handle
();
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_MLU_SUCCESS
(
cnnlGetConcatWorkspaceSize
(
handle
,
pack_num
,
&
workspace_size
));
Tensor
workspace
(
paddle
::
experimental
::
DataType
::
INT8
);
workspace
.
Resize
(
framework
::
DDim
({
static_cast
<
int64_t
>
(
workspace_size
)}));
void
*
workspace_ptr
=
workspace
.
mutable_data
(
dev_ctx
.
GetPlace
());
PADDLE_ENFORCE_MLU_SUCCESS
(
cnnlConcat
(
handle
,
pack_num
,
axis
,
inputs_desc
,
inputs
,
workspace_ptr
,
workspace_size
,
output_desc
,
output
));
}
/* static */
void
MLUCnnl
::
Div
(
const
ExecutionContext
&
ctx
,
cnnlComputationPreference_t
prefer
,
const
cnnlTensorDescriptor_t
in0_desc
,
const
void
*
in0
,
...
...
@@ -977,6 +998,27 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() {
output_descs
,
output_ptrs
));
}
// Splits the tensor at `input_ptr` along `axis` into `split_num` outputs
// using the CNNL handle of `dev_ctx`. The workspace size reported by CNNL is
// allocated as an INT8 tensor on the context's place and passed to cnnlSplit.
//
// dev_ctx:      MLU device context providing the CNNL handle and place.
// split_num:    number of outputs.
// axis:         split axis, forwarded to cnnlSplit.
// input_desc /
// input_ptr:    descriptor and data pointer of the tensor to split.
// output_descs /
// output_ptrs:  per-output descriptors and data pointers.
/* static */ void MLUCnnl::Split(const MLUDeviceContext& dev_ctx,
                                 int split_num, int axis,
                                 const cnnlTensorDescriptor_t input_desc,
                                 const void* input_ptr,
                                 const cnnlTensorDescriptor_t output_descs[],
                                 void* output_ptrs[]) {
  cnnlHandle_t handle = dev_ctx.cnnl_handle();

  // Zero-initialized, matching the Concat overload, so the value is defined
  // even if the query does not write it.
  size_t workspace_size = 0;
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlGetSplitWorkspaceSize(handle, split_num, &workspace_size));

  // One INT8 element per workspace byte.
  Tensor workspace(paddle::experimental::DataType::INT8);
  workspace.Resize(framework::DDim({static_cast<int64_t>(workspace_size)}));
  void* workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace());

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSplit(handle, split_num, axis, input_desc,
                                       input_ptr, workspace_ptr, workspace_size,
                                       output_descs, output_ptrs));
}
/* static */
void
MLUCnnl
::
GatherFunctor
(
const
ExecutionContext
&
ctx
,
const
int
axis
,
const
int
batch_dims
,
const
cnnlTensorDescriptor_t
params_desc
,
const
void
*
params
,
...
...
paddle/fluid/operators/mlu/mlu_baseop.h
浏览文件 @
b4eb413e
...
...
@@ -403,6 +403,11 @@ class MLUCnnl {
const
void
*
const
inputs
[],
const
cnnlTensorDescriptor_t
output_desc
,
void
*
output
);
// Concat overload that takes an MLUDeviceContext directly.
// pack_num is the number of inputs; inputs_desc / inputs are the per-input
// descriptors and data pointers; output_desc / output describe the
// destination. axis is the concat axis.
static
void
Concat
(
const
MLUDeviceContext
&
dev_ctx
,
const
int
pack_num
,
const
int
axis
,
const
cnnlTensorDescriptor_t
inputs_desc
[],
const
void
*
const
inputs
[],
const
cnnlTensorDescriptor_t
output_desc
,
void
*
output
);
static
void
Cast
(
const
ExecutionContext
&
ctx
,
cnnlCastDataType_t
cast_type
,
const
cnnlTensorDescriptor_t
input_desc
,
const
void
*
input
,
const
cnnlTensorDescriptor_t
output_desc
,
void
*
output
);
...
...
@@ -566,6 +571,12 @@ class MLUCnnl {
const
cnnlTensorDescriptor_t
output_descs
[],
void
*
output_ptrs
[]);
// Split overload that takes an MLUDeviceContext directly.
// split_num is the number of outputs; input_desc / input_ptr describe the
// tensor to split; output_descs / output_ptrs are the per-output descriptors
// and data pointers. axis is the split axis.
static
void
Split
(
const
MLUDeviceContext
&
dev_ctx
,
int
split_num
,
int
axis
,
const
cnnlTensorDescriptor_t
input_desc
,
const
void
*
input_ptr
,
const
cnnlTensorDescriptor_t
output_descs
[],
void
*
output_ptrs
[]);
static
void
Scale
(
const
ExecutionContext
&
ctx
,
const
int
axis
,
const
cnnlTensorDescriptor_t
input_desc
,
const
void
*
input
,
const
cnnlTensorDescriptor_t
alpha_desc
,
const
void
*
alpha
,
...
...
paddle/fluid/operators/strided_memcpy.h
浏览文件 @
b4eb413e
...
...
@@ -109,6 +109,11 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
auto
&
npu_ctx
=
reinterpret_cast
<
const
platform
::
NPUDeviceContext
&>
(
ctx
);
memory
::
Copy
(
npu_place
,
dst
+
i
*
dst_after
,
npu_place
,
src
+
i
*
src_after
,
sizeof
(
T
)
*
size
,
npu_ctx
.
stream
());
#elif defined(PADDLE_WITH_MLU)
auto
&
mlu_place
=
place
;
auto
&
mlu_ctx
=
reinterpret_cast
<
const
platform
::
MLUDeviceContext
&>
(
ctx
);
memory
::
Copy
(
mlu_place
,
dst
+
i
*
dst_after
,
mlu_place
,
src
+
i
*
src_after
,
sizeof
(
T
)
*
size
,
mlu_ctx
.
stream
());
#else
PADDLE_THROW
(
platform
::
errors
::
PreconditionNotMet
(
"Paddle is not compiled with GPU."
));
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录