Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
BaiXuePrincess
Paddle
提交
b4eb413e
P
Paddle
项目概览
BaiXuePrincess
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
0
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
0
Issue
0
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
b4eb413e
编写于
3月 07, 2022
作者:
Z
zn
提交者:
GitHub
3月 07, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[MLU]support reduce tensors on mlu (#40000)
* [MLU]support reduce tensors on mlu * [MLU]fix compiler options
上级
0ad25fb9
变更
10
隐藏空白更改
内联
并排
Showing
10 changed file
with
265 addition
and
9 deletion
+265
-9
paddle/fluid/imperative/CMakeLists.txt
paddle/fluid/imperative/CMakeLists.txt
+2
-1
paddle/fluid/imperative/reducer.cc
paddle/fluid/imperative/reducer.cc
+83
-4
paddle/fluid/imperative/reducer.h
paddle/fluid/imperative/reducer.h
+1
-1
paddle/fluid/imperative/tests/CMakeLists.txt
paddle/fluid/imperative/tests/CMakeLists.txt
+1
-1
paddle/fluid/imperative/tests/test_group.cc
paddle/fluid/imperative/tests/test_group.cc
+18
-2
paddle/fluid/operators/math/CMakeLists.txt
paddle/fluid/operators/math/CMakeLists.txt
+2
-0
paddle/fluid/operators/math/concat_and_split.cc
paddle/fluid/operators/math/concat_and_split.cc
+100
-0
paddle/fluid/operators/mlu/mlu_baseop.cc
paddle/fluid/operators/mlu/mlu_baseop.cc
+42
-0
paddle/fluid/operators/mlu/mlu_baseop.h
paddle/fluid/operators/mlu/mlu_baseop.h
+11
-0
paddle/fluid/operators/strided_memcpy.h
paddle/fluid/operators/strided_memcpy.h
+5
-0
未找到文件。
paddle/fluid/imperative/CMakeLists.txt
浏览文件 @
b4eb413e
...
...
@@ -33,6 +33,7 @@ if(NOT WIN32)
endif
()
# When the build enables CNCL, define the cncl_context library and the
# reducer library (reducer is built from reducer.cc and depends on layer).
if
(
WITH_CNCL
)
cc_library
(
cncl_context SRCS cncl_context.cc DEPS collective_helper device_context tensor var_type_traits
)
cc_library
(
reducer SRCS reducer.cc DEPS layer
)
endif
()
if
(
WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL
)
cc_library
(
heter_ccl_context SRCS heter_ccl_context.cc DEPS collective_helper device_context tensor var_type_traits
)
...
...
@@ -41,7 +42,7 @@ if(NOT WIN32)
endif
(
NOT WIN32
)
if
(
WITH_GLOO
)
cc_library
(
imperative_gloo_context SRCS gloo_context.cc DEPS collective_helper device_context tensor var_type_traits
)
if
(
WIN32
OR
(
NOT
(
WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL
)
))
if
(
WIN32
OR
(
NOT
(
WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL OR WITH_ASCEND_CL
OR WITH_CNCL
)
))
cc_library
(
reducer SRCS reducer.cc DEPS layer
)
endif
()
endif
()
...
...
paddle/fluid/imperative/reducer.cc
浏览文件 @
b4eb413e
...
...
@@ -31,7 +31,7 @@ namespace imperative {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
defined(PADDLE_WITH_ASCEND_CL)
defined(PADDLE_WITH_ASCEND_CL)
|| defined(PADDLE_WITH_CNCL)
// div the nranks
void
Group
::
DivNRanks
(
const
platform
::
DeviceContext
&
context
,
int64_t
nranks
)
{
framework
::
Tensor
*
tensor
=
...
...
@@ -67,6 +67,9 @@ void Group::DivNRanks(const platform::DeviceContext &context, int64_t nranks) {
#ifdef PADDLE_WITH_XPU_BKCL
// TODO(liuyuhui) support xpu about div nranks in the future
#endif
}
else
if
(
platform
::
is_mlu_place
(
tensor
->
place
()))
{
// TODO(zhangna)
VLOG
(
4
)
<<
"divnrank for mlu not support yet"
;
}
}
...
...
@@ -222,6 +225,56 @@ void SplitTensorsWithType<platform::XPUDeviceContext>(
}
#endif
#ifdef PADDLE_WITH_CNCL
// MLU specialization of ConcatTensorsWithType.
//
// Selects the element type from `type` and forwards `context`,
// `dense_tensors_` and `p_dense_contents` to ConcatTensorsForAllReduce.
// Per the original note, `context` is used to select the stream for the
// concat. Only FP16 and FP32 are handled; every other type raises an
// Unimplemented error.
template <>
void ConcatTensorsWithType<platform::MLUDeviceContext>(
    const platform::MLUDeviceContext &context,
    const std::vector<framework::Tensor> &dense_tensors_,
    framework::Variable *p_dense_contents,
    framework::proto::VarType::Type type) {
  if (type == framework::proto::VarType::FP16) {
    ConcatTensorsForAllReduce<platform::MLUDeviceContext, platform::float16>(
        context, dense_tensors_, p_dense_contents);
    return;
  }

  if (type == framework::proto::VarType::FP32) {
    ConcatTensorsForAllReduce<platform::MLUDeviceContext, float>(
        context, dense_tensors_, p_dense_contents);
    return;
  }

  // Anything else has no MLU concat path.
  PADDLE_THROW(platform::errors::Unimplemented(
      "Data type (%s) is not supported when it concats tensors for "
      "allreduce.",
      framework::DataTypeToString(type)));
}
// MLU specialization of SplitTensorsWithType.
//
// Selects the element type from `type` and forwards `context`,
// `p_dense_contents` and `p_dense_tensors` to SplitTensorsForAllReduce.
// Per the original note, `context` is used to select the stream for the
// split. Only FP16 and FP32 are handled; every other type raises an
// Unimplemented error.
template <>
void SplitTensorsWithType<platform::MLUDeviceContext>(
    const platform::MLUDeviceContext &context,
    framework::Variable *p_dense_contents,
    std::vector<framework::Tensor> *p_dense_tensors,
    framework::proto::VarType::Type type) {
  if (type == framework::proto::VarType::FP16) {
    SplitTensorsForAllReduce<platform::MLUDeviceContext, platform::float16>(
        context, p_dense_contents, p_dense_tensors);
    return;
  }

  if (type == framework::proto::VarType::FP32) {
    SplitTensorsForAllReduce<platform::MLUDeviceContext, float>(
        context, p_dense_contents, p_dense_tensors);
    return;
  }

  // Anything else has no MLU split path.
  PADDLE_THROW(platform::errors::Unimplemented(
      "Data type (%s) is not supported when it splits tensors for "
      "allreduce.",
      framework::DataTypeToString(type)));
}
#endif
void
Group
::
ConcatTensors
(
const
platform
::
DeviceContext
&
context
)
{
auto
place
=
context
.
GetPlace
();
if
(
platform
::
is_gpu_place
(
place
))
{
...
...
@@ -253,6 +306,16 @@ void Group::ConcatTensors(const platform::DeviceContext &context) {
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't concat npu grads since it's not compiled with HCCL,"
"Please recompile or reinstall Paddle with HCCL support."
));
#endif
}
else
if
(
platform
::
is_mlu_place
(
place
))
{
#ifdef PADDLE_WITH_CNCL
ConcatTensorsWithType
(
static_cast
<
const
platform
::
MLUDeviceContext
&>
(
context
),
dense_tensors_
,
&
dense_contents_
,
dtype_
);
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't concat mlu grads since it's not compiled with CNCL,"
"Please recompile or reinstall Paddle with CNCL support."
));
#endif
}
else
if
(
platform
::
is_cpu_place
(
place
))
{
ConcatTensorsWithType
(
...
...
@@ -295,6 +358,16 @@ void Group::SplitTensors(const platform::DeviceContext &context) {
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't split npu grad since it's not compiled with HCCL,"
"Please recompile or reinstall Paddle with HCCL support."
));
#endif
}
else
if
(
platform
::
is_mlu_place
(
place
))
{
#ifdef PADDLE_WITH_CNCL
SplitTensorsWithType
(
static_cast
<
const
platform
::
MLUDeviceContext
&>
(
context
),
&
dense_contents_
,
&
dense_tensors_
,
dtype_
);
#else
PADDLE_THROW
(
platform
::
errors
::
PermissionDenied
(
"Paddle can't split mlu grad since it's not compiled with CNCL,"
"Please recompile or reinstall Paddle with CNCL support."
));
#endif
}
else
if
(
platform
::
is_cpu_place
(
place
))
{
SplitTensorsWithType
(
...
...
@@ -746,6 +819,11 @@ void Reducer::MarkVarReady(const size_t var_index, const bool is_used_var) {
// TODO(liuyuhui) support XPU set constant
VLOG
(
3
)
<<
"XPU doesn't support set_constant"
;
}
#elif defined(PADDLE_WITH_CNCL)
if
(
platform
::
is_mlu_place
(
group_tensor
.
place
()))
{
// TODO(liuyuhui) support MLU set constant
VLOG
(
3
)
<<
"MLU doesn't support set_constant"
;
}
#else
auto
*
dev_ctx
=
platform
::
DeviceContextPool
::
Instance
().
Get
(
place_
);
if
(
HasGrad
(
var_index
))
{
...
...
@@ -846,12 +924,13 @@ void Reducer::MarkGroupReady(size_t group_index) {
cv_
.
notify_all
();
}
});
#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \
defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL)
#elif defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL) || \
defined(PADDLE_WITH_GLOO) || defined(PADDLE_WITH_ASCEND_CL) || \
defined(PADDLE_WITH_CNCL)
FusedAllReduceSchedule
(
run_order
,
group
,
next_group_
);
#else
PADDLE_THROW
(
platform
::
errors
::
PreconditionNotMet
(
"Not compiled with BKCL or NCCL or GLOO."
));
"Not compiled with BKCL or NCCL or
CNCL or
GLOO."
));
#endif
}
}
...
...
paddle/fluid/imperative/reducer.h
浏览文件 @
b4eb413e
...
...
@@ -45,7 +45,7 @@ namespace imperative {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_XPU_BKCL) || defined(PADDLE_WITH_GLOO) || \
defined(PADDLE_WITH_ASCEND_CL)
defined(PADDLE_WITH_ASCEND_CL)
|| defined(PADDLE_WITH_CNCL)
template
<
typename
T
>
struct
DivNRanksFunctor
{
...
...
paddle/fluid/imperative/tests/CMakeLists.txt
浏览文件 @
b4eb413e
...
...
@@ -21,6 +21,6 @@ cc_test(test_prepare_op SRCS test_prepare_op.cc DEPS prepared_operator op_info s
cc_test
(
test_tracer SRCS test_tracer.cc DEPS tracer layer proto_desc operator op_registry variable_helper mul_op reduce_sum_op elementwise_add_op memcpy
)
cc_test
(
test_hooks SRCS test_hooks.cc DEPS tracer basic_engine layer proto_desc operator op_registry variable_helper mul_op elementwise_add_op memcpy
)
cc_test
(
test_eager SRCS test_eager.cc DEPS tracer layer prepared_operator mul_op
)
if
(
WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL
)
if
(
WITH_NCCL OR WITH_RCCL OR WITH_XPU_BKCL
OR WITH_CNCL
)
cc_test
(
test_group SRCS test_group.cc DEPS reducer concat_and_split memcpy
)
endif
()
paddle/fluid/imperative/tests/test_group.cc
浏览文件 @
b4eb413e
...
...
@@ -72,8 +72,10 @@ void GroupConcatSplit(Place place, size_t size) {
value
.
push_back
(
static_cast
<
T
>
(
1.0
*
j
));
}
if
(
std
::
is_same
<
Place
,
platform
::
CUDAPlace
>::
value
)
{
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if
(
std
::
is_same
<
Place
,
platform
::
CUDAPlace
>::
value
||
std
::
is_same
<
Place
,
platform
::
MLUPlace
>::
value
)
{
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
defined(PADDLE_WITH_CNCL)
paddle
::
memory
::
Copy
(
place
,
data
,
cpu_place
,
value
.
data
(),
sizeof
(
T
)
*
value
.
size
(),
0
);
#endif
...
...
@@ -180,5 +182,19 @@ TEST(TestGroup, TestXPUConcatSplit) {
}
#endif
#if defined(PADDLE_WITH_CNCL)
// Calls GroupConcatSplit<float> on the CPU place and on MLU device 0, first
// with size 3 and then with size 15 (CPU before MLU for each size).
TEST(TestGroup, TestMLUConcatSplit) {
  platform::MLUPlace mlu_place(0);
  platform::CPUPlace cpu_place;

  const int sizes[] = {3, 15};
  for (int size : sizes) {
    GroupConcatSplit<float>(cpu_place, size);
    GroupConcatSplit<float>(mlu_place, size);
  }
}
#endif
}
// namespace imperative
}
// namespace paddle
paddle/fluid/operators/math/CMakeLists.txt
浏览文件 @
b4eb413e
...
...
@@ -5,6 +5,8 @@ endif()
# please add new math_library in alphabetical order
# concat_and_split always depends on concat_and_split_functor; the build
# flavor adds one backend helper on top: npu_op_runner for WITH_ASCEND_CL,
# mlu_baseop for WITH_MLU, and nothing extra otherwise.
if
(
WITH_ASCEND_CL
)
math_library
(
concat_and_split DEPS concat_and_split_functor npu_op_runner
)
elseif
(
WITH_MLU
)
math_library
(
concat_and_split DEPS concat_and_split_functor mlu_baseop
)
else
()
math_library
(
concat_and_split DEPS concat_and_split_functor
)
endif
()
...
...
paddle/fluid/operators/math/concat_and_split.cc
浏览文件 @
b4eb413e
...
...
@@ -18,6 +18,9 @@ limitations under the License. */
#ifdef PADDLE_WITH_ASCEND_CL
#include "paddle/fluid/platform/device/npu/npu_op_runner.h"
#endif
#ifdef PADDLE_WITH_MLU
#include "paddle/fluid/operators/mlu/mlu_baseop.h"
#endif
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/float16.h"
...
...
@@ -226,6 +229,90 @@ class SplitFunctor<platform::NPUDeviceContext, T> {
};
#endif
#ifdef PADDLE_WITH_MLU
/*
 * ConcatFunctor specialization for MLU.
 *
 * Concatenates the tensors in `input` along `axis` into `output` by calling
 * MLUCnnl::Concat on the given device context.
 *
 * context: MLU device context; its place selects the device and is where the
 *          output buffer is allocated.
 * input:   tensors to concatenate.
 * axis:    axis forwarded unchanged to MLUCnnl::Concat.
 * output:  destination tensor. It is not resized here, so its dims are
 *          presumably set by the caller before the call (TODO confirm).
 */
template <typename T>
class ConcatFunctor<platform::MLUDeviceContext, T> {
 public:
  void operator()(const platform::MLUDeviceContext& context,
                  const std::vector<framework::Tensor>& input, int axis,
                  framework::Tensor* output) {
    // Make the context's device current for the duration of this call.
    const int dev_id = context.GetPlace().GetDeviceId();
    platform::MLUDeviceGuard guard(dev_id);

    const size_t ins_size = input.size();

    // Allocate the output buffer on the context's place.
    auto place = context.GetPlace();
    output->mutable_data<T>(place);

    // Three parallel arrays describing the inputs. `input_descs` owns the
    // descriptor wrappers; `desc_vector` and `inputs` hold the raw handles
    // and data pointers in the array form MLUCnnl::Concat expects.
    // Reserving up front avoids reallocating while the arrays are filled.
    std::vector<const void*> inputs;
    std::vector<MLUCnnlTensorDesc> input_descs;
    std::vector<cnnlTensorDescriptor_t> desc_vector;
    inputs.reserve(ins_size);
    input_descs.reserve(ins_size);
    desc_vector.reserve(ins_size);
    for (size_t i = 0; i < ins_size; i++) {
      input_descs.emplace_back(MLUCnnlTensorDesc(
          input[i], CNNL_LAYOUT_ARRAY, ToCnnlDataType(input[i].dtype())));
      desc_vector.push_back(input_descs.back().get());
      inputs.push_back(input[i].data());
    }

    // Descriptor for the output tensor.
    MLUCnnlTensorDesc output_desc(*output, CNNL_LAYOUT_ARRAY,
                                  ToCnnlDataType(output->dtype()));

    // MLUCnnl::Concat takes the input count as int; make the narrowing from
    // size_t explicit.
    MLUCnnl::Concat(context, static_cast<int>(ins_size), axis,
                    desc_vector.data(), inputs.data(), output_desc.get(),
                    GetBasePtr(output));
  }
};
/*
 * SplitFunctor specialization for MLU.
 *
 * Splits `input` along `axis` into the tensors pointed to by `outputs` by
 * calling MLUCnnl::Split on the given device context. Returns immediately
 * when `input` has no elements.
 *
 * context:    MLU device context; its place selects the device and is where
 *             each output buffer is allocated.
 * input:      tensor to split.
 * ref_inputs: one tensor per output; only its extent along `axis` is read,
 *             to size the matching output.
 * axis:       axis forwarded unchanged to MLUCnnl::Split.
 * outputs:    destination tensors; each is resized and allocated here.
 */
template <typename T>
class SplitFunctor<platform::MLUDeviceContext, T> {
 public:
  void operator()(const platform::MLUDeviceContext& context,
                  const framework::Tensor& input,
                  const std::vector<const framework::Tensor*>& ref_inputs,
                  const int axis, std::vector<framework::Tensor*>* outputs) {
    if (input.numel() == 0) {
      return;
    }

    // Make the context's device current for the duration of this call.
    const int dev_id = context.GetPlace().GetDeviceId();
    platform::MLUDeviceGuard guard(dev_id);

    auto in_dims = input.dims();
    const size_t out_size = outputs->size();

    // Every output starts from the input's dims, with the extent along
    // `axis` taken from the matching reference tensor. All shapes are
    // computed before any output is resized, so the result does not depend
    // on whether `ref_inputs` and `outputs` point at the same tensors.
    std::vector<framework::DDim> outs_dims(out_size, in_dims);
    for (size_t i = 0; i < out_size; ++i) {
      outs_dims[i][axis] = ref_inputs[i]->dims()[axis];
    }

    // Three parallel arrays describing the outputs. `output_descs` owns the
    // descriptor wrappers; `desc_vector` and `vct_tensor` hold the raw
    // handles and data pointers in the array form MLUCnnl::Split expects.
    // Reserving up front avoids reallocating while the arrays are filled.
    std::vector<void*> vct_tensor;
    std::vector<MLUCnnlTensorDesc> output_descs;
    std::vector<cnnlTensorDescriptor_t> desc_vector;
    vct_tensor.reserve(out_size);
    output_descs.reserve(out_size);
    desc_vector.reserve(out_size);
    for (size_t i = 0; i < out_size; i++) {
      (*outputs)[i]->Resize(outs_dims[i]);
      (*outputs)[i]->mutable_data<T>(context.GetPlace());
      output_descs.emplace_back(
          MLUCnnlTensorDesc(*(*outputs)[i], CNNL_LAYOUT_ARRAY,
                            ToCnnlDataType((*outputs)[i]->dtype())));
      desc_vector.push_back(output_descs.back().get());
      vct_tensor.push_back(GetBasePtr((*outputs)[i]));
    }

    // Descriptor for the input tensor.
    MLUCnnlTensorDesc input_desc(input, CNNL_LAYOUT_ARRAY,
                                 ToCnnlDataType(input.dtype()));

    // MLUCnnl::Split takes the output count as int; make the narrowing from
    // size_t explicit.
    MLUCnnl::Split(context, static_cast<int>(out_size), axis,
                   input_desc.get(), input.data(), desc_vector.data(),
                   vct_tensor.data());
  }
};
#endif
#define DEFINE_FUNCTOR(type) \
template class ConcatFunctor<platform::CPUDeviceContext, type>; \
template class SplitFunctor<platform::CPUDeviceContext, type>;
...
...
@@ -248,6 +335,19 @@ DEFINE_XPU_FUNCTOR(float)
FOR_ALL_TYPES
(
DEFINE_NPU_FUNCTOR
)
#endif
#ifdef PADDLE_WITH_MLU
// Explicitly instantiates the MLU ConcatFunctor and SplitFunctor for one
// element type. The invocations below instantiate them for float,
// platform::float16, int64_t, bool, int, int8_t, int16_t and uint8_t.
#define DEFINE_MLU_FUNCTOR(type) \
template class ConcatFunctor<platform::MLUDeviceContext, type>; \
template class SplitFunctor<platform::MLUDeviceContext, type>;
DEFINE_MLU_FUNCTOR
(
float
)
DEFINE_MLU_FUNCTOR
(
platform
::
float16
)
DEFINE_MLU_FUNCTOR
(
int64_t
)
DEFINE_MLU_FUNCTOR
(
bool
)
DEFINE_MLU_FUNCTOR
(
int
)
DEFINE_MLU_FUNCTOR
(
int8_t
)
DEFINE_MLU_FUNCTOR
(
int16_t
)
DEFINE_MLU_FUNCTOR
(
uint8_t
)
#endif
}
// namespace math
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/mlu/mlu_baseop.cc
浏览文件 @
b4eb413e
...
...
@@ -499,6 +499,27 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() {
output_desc
,
output
));
}
/* static */
void
MLUCnnl
::
Concat
(
const
MLUDeviceContext
&
dev_ctx
,
const
int
pack_num
,
const
int
axis
,
const
cnnlTensorDescriptor_t
inputs_desc
[],
const
void
*
const
inputs
[],
const
cnnlTensorDescriptor_t
output_desc
,
void
*
output
)
{
cnnlHandle_t
handle
=
dev_ctx
.
cnnl_handle
();
size_t
workspace_size
=
0
;
PADDLE_ENFORCE_MLU_SUCCESS
(
cnnlGetConcatWorkspaceSize
(
handle
,
pack_num
,
&
workspace_size
));
Tensor
workspace
(
paddle
::
experimental
::
DataType
::
INT8
);
workspace
.
Resize
(
framework
::
DDim
({
static_cast
<
int64_t
>
(
workspace_size
)}));
void
*
workspace_ptr
=
workspace
.
mutable_data
(
dev_ctx
.
GetPlace
());
PADDLE_ENFORCE_MLU_SUCCESS
(
cnnlConcat
(
handle
,
pack_num
,
axis
,
inputs_desc
,
inputs
,
workspace_ptr
,
workspace_size
,
output_desc
,
output
));
}
/* static */
void
MLUCnnl
::
Div
(
const
ExecutionContext
&
ctx
,
cnnlComputationPreference_t
prefer
,
const
cnnlTensorDescriptor_t
in0_desc
,
const
void
*
in0
,
...
...
@@ -977,6 +998,27 @@ MLUCnnlTrigonDesc::~MLUCnnlTrigonDesc() {
output_descs
,
output_ptrs
));
}
// Splits the tensor at `input_ptr` along `axis` into `split_num` outputs via
// cnnlSplit, using the CNNL handle of `dev_ctx`.
//
// The workspace size is queried with cnnlGetSplitWorkspaceSize, and a
// temporary INT8 tensor of that many elements is allocated on the context's
// place to serve as the workspace. Both CNNL calls are checked with
// PADDLE_ENFORCE_MLU_SUCCESS.
//
// `output_descs` and `output_ptrs` are parallel arrays, passed straight
// through to cnnlSplit.
/* static */ void MLUCnnl::Split(const MLUDeviceContext& dev_ctx,
                                 int split_num, int axis,
                                 const cnnlTensorDescriptor_t input_desc,
                                 const void* input_ptr,
                                 const cnnlTensorDescriptor_t output_descs[],
                                 void* output_ptrs[]) {
  cnnlHandle_t handle = dev_ctx.cnnl_handle();

  // Zero-initialized so the value is well defined even if the query leaves
  // it untouched; this matches MLUCnnl::Concat.
  size_t workspace_size = 0;
  PADDLE_ENFORCE_MLU_SUCCESS(
      cnnlGetSplitWorkspaceSize(handle, split_num, &workspace_size));

  // Back the workspace with a temporary INT8 tensor on the device.
  Tensor workspace(paddle::experimental::DataType::INT8);
  workspace.Resize(
      framework::DDim({static_cast<int64_t>(workspace_size)}));
  void* workspace_ptr = workspace.mutable_data(dev_ctx.GetPlace());

  PADDLE_ENFORCE_MLU_SUCCESS(cnnlSplit(handle, split_num, axis, input_desc,
                                       input_ptr, workspace_ptr,
                                       workspace_size, output_descs,
                                       output_ptrs));
}
/* static */
void
MLUCnnl
::
GatherFunctor
(
const
ExecutionContext
&
ctx
,
const
int
axis
,
const
int
batch_dims
,
const
cnnlTensorDescriptor_t
params_desc
,
const
void
*
params
,
...
...
paddle/fluid/operators/mlu/mlu_baseop.h
浏览文件 @
b4eb413e
...
...
@@ -403,6 +403,11 @@ class MLUCnnl {
const
void
*
const
inputs
[],
const
cnnlTensorDescriptor_t
output_desc
,
void
*
output
);
// Concat taking the MLU device context directly.
//   dev_ctx:     device context supplying the CNNL handle and the place.
//   pack_num:    number of inputs, forwarded to cnnlConcat.
//   axis:        axis forwarded to cnnlConcat.
//   inputs_desc: array of input tensor descriptors.
//   inputs:      array of input data pointers, parallel to inputs_desc.
//   output_desc: descriptor of the output tensor.
//   output:      output data pointer.
static
void
Concat
(
const
MLUDeviceContext
&
dev_ctx
,
const
int
pack_num
,
const
int
axis
,
const
cnnlTensorDescriptor_t
inputs_desc
[],
const
void
*
const
inputs
[],
const
cnnlTensorDescriptor_t
output_desc
,
void
*
output
);
static
void
Cast
(
const
ExecutionContext
&
ctx
,
cnnlCastDataType_t
cast_type
,
const
cnnlTensorDescriptor_t
input_desc
,
const
void
*
input
,
const
cnnlTensorDescriptor_t
output_desc
,
void
*
output
);
...
...
@@ -566,6 +571,12 @@ class MLUCnnl {
const
cnnlTensorDescriptor_t
output_descs
[],
void
*
output_ptrs
[]);
// Split taking the MLU device context directly.
//   dev_ctx:      device context supplying the CNNL handle and the place.
//   split_num:    number of outputs, forwarded to cnnlSplit.
//   axis:         axis forwarded to cnnlSplit.
//   input_desc:   descriptor of the input tensor.
//   input_ptr:    input data pointer.
//   output_descs: array of output tensor descriptors.
//   output_ptrs:  array of output data pointers, parallel to output_descs.
static
void
Split
(
const
MLUDeviceContext
&
dev_ctx
,
int
split_num
,
int
axis
,
const
cnnlTensorDescriptor_t
input_desc
,
const
void
*
input_ptr
,
const
cnnlTensorDescriptor_t
output_descs
[],
void
*
output_ptrs
[]);
static
void
Scale
(
const
ExecutionContext
&
ctx
,
const
int
axis
,
const
cnnlTensorDescriptor_t
input_desc
,
const
void
*
input
,
const
cnnlTensorDescriptor_t
alpha_desc
,
const
void
*
alpha
,
...
...
paddle/fluid/operators/strided_memcpy.h
浏览文件 @
b4eb413e
...
...
@@ -109,6 +109,11 @@ inline void StridedNumelCopyWithAxis(const platform::DeviceContext& ctx,
auto
&
npu_ctx
=
reinterpret_cast
<
const
platform
::
NPUDeviceContext
&>
(
ctx
);
memory
::
Copy
(
npu_place
,
dst
+
i
*
dst_after
,
npu_place
,
src
+
i
*
src_after
,
sizeof
(
T
)
*
size
,
npu_ctx
.
stream
());
#elif defined(PADDLE_WITH_MLU)
auto
&
mlu_place
=
place
;
auto
&
mlu_ctx
=
reinterpret_cast
<
const
platform
::
MLUDeviceContext
&>
(
ctx
);
memory
::
Copy
(
mlu_place
,
dst
+
i
*
dst_after
,
mlu_place
,
src
+
i
*
src_after
,
sizeof
(
T
)
*
size
,
mlu_ctx
.
stream
());
#else
PADDLE_THROW
(
platform
::
errors
::
PreconditionNotMet
(
"Paddle is not compiled with GPU."
));
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录