PaddlePaddle / Paddle
Commit b451aff8 (unverified)
Author: jjyaoao, authored Apr 10, 2023; committed via GitHub on Apr 10, 2023
Parent: c1cad896

    delete paddle/fluid/operators/collective/*_npu.* (#52677)
Showing 30 changed files with 0 additions and 2688 deletions (+0 -2688)
paddle/fluid/operators/collective/c_allgather_op_npu.cc               +0 -42
paddle/fluid/operators/collective/c_allgather_op_npu_test.cc          +0 -186
paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc           +0 -31
paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc      +0 -182
paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc           +0 -31
paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc          +0 -31
paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc           +0 -31
paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc      +0 -193
paddle/fluid/operators/collective/c_broadcast_op_npu.cc               +0 -39
paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc          +0 -175
paddle/fluid/operators/collective/c_embedding_op_npu.cc               +0 -270
paddle/fluid/operators/collective/c_identity_op_npu.cc                +0 -22
paddle/fluid/operators/collective/c_reduce_max_op_npu.cc              +0 -30
paddle/fluid/operators/collective/c_reduce_min_op_npu.cc              +0 -30
paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc             +0 -30
paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc              +0 -30
paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc         +0 -186
paddle/fluid/operators/collective/c_reducescatter_op_npu.cc           +0 -39
paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc      +0 -183
paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc   +0 -107
paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc   +0 -184
paddle/fluid/operators/collective/checknumeric_npu_test.cc            +0 -91
paddle/fluid/operators/collective/mp_allreduce_sum_op_npu.cc          +0 -31
paddle/fluid/operators/collective/partial_allgather_op_npu.cc         +0 -42
paddle/fluid/operators/collective/partial_recv_op_npu.cc              +0 -40
paddle/fluid/operators/collective/partial_send_op_npu.cc              +0 -40
paddle/fluid/operators/collective/recv_v2_op_npu.cc                   +0 -42
paddle/fluid/operators/collective/recv_v2_op_npu_test.cc              +0 -159
paddle/fluid/operators/collective/send_v2_op_npu.cc                   +0 -42
paddle/fluid/operators/collective/send_v2_op_npu_test.cc              +0 -149
paddle/fluid/operators/collective/c_allgather_op_npu.cc
deleted, 100644 → 0 (parent c1cad896)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include "paddle/fluid/operators/collective/c_allgather_op.h"
namespace paddle {
namespace operators {

template <typename T>
class CAllGatherOpASCENDKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with NPU."));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(c_allgather,
                       ops::CAllGatherOpASCENDKernel<int8_t>,
                       ops::CAllGatherOpASCENDKernel<int>,
                       ops::CAllGatherOpASCENDKernel<int64_t>,
                       ops::CAllGatherOpASCENDKernel<float>,
                       ops::CAllGatherOpASCENDKernel<plat::float16>);
paddle/fluid/operators/collective/c_allgather_op_npu_test.cc
deleted, 100644 → 0 (parent c1cad896)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/collective/c_allgather_op.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace f = paddle::framework;
namespace p = paddle::platform;

USE_OP(c_allgather);
USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(c_allgather, NPU);

DECLARE_string(selected_npus);

template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
  std::string debugstring = "";
  for (auto ele : data) {
    debugstring += std::to_string(ele) + std::string(",");
  }
  VLOG(2) << preStr << ":" << std::endl << debugstring;
}

void PrepareUniqueId(f::Scope* scope,
                     const p::DeviceContext& ctx,
                     HcclRootInfo* hccl_id) {
  int rank_id = atoi(getenv("RANK_ID"));
  int device_id = atoi(getenv("DEVICE_ID"));

  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
          << "; rank_id = " << rank_id
          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

  std::vector<int> rank_ids{0, 1};
  f::AttributeMap gen_hccl_id;

  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
  gen_hccl_id["rank"] = rank_id;
  gen_hccl_id["endpoint"] = endpointList[rank_id];
  std::vector<std::string> other_endpoints = {
      endpointList[rank_id == 0 ? 1 : 0]};
  gen_hccl_id["other_endpoints"] = other_endpoints;

  auto out = scope->Var("Out");
  auto id = out->GetMutable<HcclRootInfo>();

  VLOG(3) << "break";
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
  VLOG(3) << "break";
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();

  memcpy(hccl_id, id, 1024);
}

void Prepare(f::Scope* scope,
             const p::DeviceContext& ctx,
             HcclRootInfo* hccl_id) {
  auto x = scope->Var("X");
  auto id = x->GetMutable<HcclRootInfo>();

  memcpy(id, hccl_id, 1024);

  int rank_id = atoi(getenv("RANK_ID"));
  int device_id = atoi(getenv("DEVICE_ID"));

  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
          << "; rank_id = " << rank_id
          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

  // std::vector<int> rank_ids{0, 1};
  f::AttributeMap comm_init_attrs;
  comm_init_attrs["ring_id"] = 0;
  comm_init_attrs["rank_ids"] = 2;
  comm_init_attrs["rank"] = rank_id;
  comm_init_attrs["device_id"] = device_id;
  // comm_init_attrs["rank_ids"] = rank_ids;
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();
}

void TestHCCLAllGatherOp(f::Scope* scope, const p::DeviceContext& ctx) {
  // init
  auto x = scope->Var("Data");
  auto tensor_x = x->GetMutable<phi::DenseTensor>();

  std::vector<float> init;
  int rank_id = atoi(getenv("RANK_ID"));

  int num1 = 1;
  int num2 = 4;

  for (int64_t i = 0; i < num1 * num2; ++i) {
    init.push_back(1.0 + rank_id);
  }
  PrintDebugInfo("input data", init);

  paddle::framework::TensorFromVector(init, ctx, tensor_x);
  tensor_x->Resize({num1, num2});
  ctx.Wait();

  auto place = ctx.GetPlace();
  auto out = scope->Var("OutData");
  auto tensor_out = out->GetMutable<phi::DenseTensor>();
  tensor_out->Resize({num1, num2});
  tensor_out->mutable_data<float>(place);  // allocate
  ctx.Wait();

  // run
  f::AttributeMap attrs;
  attrs["tag"] = std::string("tagx");
  attrs["ring_id"] = 0;
  attrs["nranks"] = 2;

  auto op = f::OpRegistry::CreateOp(
      "c_allgather", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);

  for (int i = 0; i < 10; i++) {
    op->Run(*scope, place);
  }
  ctx.Wait();

  std::vector<float> out_vec;
  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
  ctx.Wait();

  PrintDebugInfo("output data", out_vec);

  EXPECT_EQ(out_vec.size(), init.size() * 2);
  for (uint32_t i = 0; i < out_vec.size() / 2; i++) {
    EXPECT_EQ(out_vec[i], 1.0);
  }
  for (uint32_t i = out_vec.size() / 2; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], 2.0);
  }
}

TEST(c_allgather, NPU) {
  f::Scope scope;
  HcclRootInfo hccl_id;

  // only support one device, if more than one device, use first default
  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));

  PrepareUniqueId(&scope, ctx, &hccl_id);
  Prepare(&scope, ctx, &hccl_id);
  TestHCCLAllGatherOp(&scope, ctx);
}
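The expectations at the end of TestHCCLAllGatherOp above are just the definition of allgather: with nranks = 2, each rank contributes num1 * num2 = 4 elements of value 1.0 + rank_id, and the gathered result is rank 0's block followed by rank 1's. A standalone sketch of that arithmetic, with no Paddle or HCCL dependency (function and variable names here are illustrative only):

#include <cassert>
#include <vector>

// Simulate an allgather across `nranks` ranks, each contributing one block
// filled with 1.0 + rank_id, as in the deleted test.
std::vector<float> SimulatedAllGather(int nranks, int block_size) {
  std::vector<float> gathered;
  for (int rank = 0; rank < nranks; ++rank) {
    gathered.insert(gathered.end(), block_size, 1.0f + rank);
  }
  return gathered;
}

int main() {
  const int nranks = 2;
  const int block_size = 1 * 4;  // num1 * num2 in the deleted test
  auto out = SimulatedAllGather(nranks, block_size);

  assert(out.size() == static_cast<size_t>(nranks * block_size));
  for (int i = 0; i < block_size; ++i) {
    assert(out[i] == 1.0f);  // rank 0's block
  }
  for (int i = block_size; i < 2 * block_size; ++i) {
    assert(out[i] == 2.0f);  // rank 1's block
  }
  return 0;
}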
paddle/fluid/operators/collective/c_allreduce_max_op_npu.cc
deleted, 100644 → 0 (parent c1cad896)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
}  // namespace platform
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    c_allreduce_max,
    ops::CAllReduceOpASCENDKernel<ops::kRedMax, int>,
    ops::CAllReduceOpASCENDKernel<ops::kRedMax, int8_t>,
    ops::CAllReduceOpASCENDKernel<ops::kRedMax, float>,
    ops::CAllReduceOpASCENDKernel<ops::kRedMax, plat::float16>)
paddle/fluid/operators/collective/c_allreduce_max_op_npu_test.cc
deleted, 100644 → 0 (parent c1cad896)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/collective/c_allgather_op.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace f = paddle::framework;
namespace p = paddle::platform;

USE_OP(c_allreduce_max);
USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(c_allreduce_max, NPU);

DECLARE_string(selected_npus);

template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
  std::string debugstring = "";
  for (auto ele : data) {
    debugstring += std::to_string(ele) + std::string(",");
  }
  VLOG(2) << preStr << ":" << std::endl << debugstring;
}

void PrepareUniqueId(f::Scope* scope,
                     const p::DeviceContext& ctx,
                     HcclRootInfo* hccl_id) {
  int rank_id = atoi(getenv("RANK_ID"));
  int device_id = atoi(getenv("DEVICE_ID"));

  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
          << "; rank_id = " << rank_id
          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

  std::vector<int> rank_ids{0, 1};
  f::AttributeMap gen_hccl_id;

  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
  gen_hccl_id["rank"] = rank_id;
  gen_hccl_id["endpoint"] = endpointList[rank_id];
  std::vector<std::string> other_endpoints = {
      endpointList[rank_id == 0 ? 1 : 0]};
  gen_hccl_id["other_endpoints"] = other_endpoints;

  auto out = scope->Var("Out");
  auto id = out->GetMutable<HcclRootInfo>();

  VLOG(3) << "break";
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
  VLOG(3) << "break";
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();

  memcpy(hccl_id, id, 1024);
}

void Prepare(f::Scope* scope,
             const p::DeviceContext& ctx,
             HcclRootInfo* hccl_id) {
  auto x = scope->Var("X");
  auto id = x->GetMutable<HcclRootInfo>();

  memcpy(id, hccl_id, 1024);

  int rank_id = atoi(getenv("RANK_ID"));
  int device_id = atoi(getenv("DEVICE_ID"));

  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
          << "; rank_id = " << rank_id
          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

  // std::vector<int> rank_ids{0, 1};
  f::AttributeMap comm_init_attrs;
  comm_init_attrs["ring_id"] = 0;
  comm_init_attrs["rank_ids"] = 2;
  comm_init_attrs["rank"] = rank_id;
  comm_init_attrs["device_id"] = device_id;
  // comm_init_attrs["rank_ids"] = rank_ids;
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();
}

void TestHCCLAllReduceOp(f::Scope* scope, const p::DeviceContext& ctx) {
  // init
  auto x = scope->Var("Data");
  auto tensor_x = x->GetMutable<phi::DenseTensor>();

  std::vector<float> init;
  int rank_id = atoi(getenv("RANK_ID"));

  int num1 = 100;
  int num2 = 100;

  for (int64_t i = 0; i < num1 * num2; ++i) {
    init.push_back(1.0 + rank_id * 3);
  }
  PrintDebugInfo("input data", init);

  paddle::framework::TensorFromVector(init, ctx, tensor_x);
  tensor_x->Resize({num1, num2});
  ctx.Wait();

  auto place = ctx.GetPlace();
  auto out = scope->Var("OutData");
  auto tensor_out = out->GetMutable<phi::DenseTensor>();
  tensor_out->Resize({num1, num2});
  tensor_out->mutable_data<float>(place);  // allocate
  ctx.Wait();

  // run
  f::AttributeMap attrs;
  attrs["tag"] = std::string("tagx");
  attrs["ring_id"] = 0;

  auto op = f::OpRegistry::CreateOp(
      "c_allreduce_max", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);

  for (int i = 0; i < 10; i++) {
    op->Run(*scope, place);
  }
  ctx.Wait();

  std::vector<float> out_vec;
  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
  ctx.Wait();

  PrintDebugInfo("output data", out_vec);

  EXPECT_EQ(out_vec.size(), init.size());
  for (uint32_t i = 0; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], 4.0);
  }
}

TEST(c_allreduce_max, NPU) {
  f::Scope scope;
  HcclRootInfo hccl_id;

  // only support one device, if more than one device, use first default
  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));

  PrepareUniqueId(&scope, ctx, &hccl_id);
  Prepare(&scope, ctx, &hccl_id);
  TestHCCLAllReduceOp(&scope, ctx);
}
paddle/fluid/operators/collective/c_allreduce_min_op_npu.cc
deleted, 100644 → 0 (parent c1cad896)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
}  // namespace platform
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    c_allreduce_min,
    ops::CAllReduceOpASCENDKernel<ops::kRedMin, int>,
    ops::CAllReduceOpASCENDKernel<ops::kRedMin, int8_t>,
    ops::CAllReduceOpASCENDKernel<ops::kRedMin, float>,
    ops::CAllReduceOpASCENDKernel<ops::kRedMin, plat::float16>)
paddle/fluid/operators/collective/c_allreduce_prod_op_npu.cc
deleted, 100644 → 0 (parent c1cad896)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
}  // namespace platform
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    c_allreduce_prod,
    ops::CAllReduceOpASCENDKernel<ops::kRedProd, int>,
    ops::CAllReduceOpASCENDKernel<ops::kRedProd, int8_t>,
    ops::CAllReduceOpASCENDKernel<ops::kRedProd, float>,
    ops::CAllReduceOpASCENDKernel<ops::kRedProd, plat::float16>)
paddle/fluid/operators/collective/c_allreduce_sum_op_npu.cc
deleted, 100644 → 0 (parent c1cad896)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
}  // namespace platform
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    c_allreduce_sum,
    ops::CAllReduceOpASCENDKernel<ops::kRedSum, int>,
    ops::CAllReduceOpASCENDKernel<ops::kRedSum, int8_t>,
    ops::CAllReduceOpASCENDKernel<ops::kRedSum, float>,
    ops::CAllReduceOpASCENDKernel<ops::kRedSum, plat::float16>)
paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test.cc
deleted, 100644 → 0 (parent c1cad896)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
// Node1: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=1 GLOG_v=4 RANK_ID=1
// DEVICE_ID=1 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
// Node2: HCCL_WHITELIST_DISABLE=1 FLAGS_selected_npus=0 GLOG_v=4 RANK_ID=0
// DEVICE_ID=0 ./paddle/fluid/operators/collective/c_allreduce_sum_op_npu_test
namespace f = paddle::framework;
namespace p = paddle::platform;

USE_OP(c_allreduce_sum);
USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);

DECLARE_string(selected_npus);

template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
  std::string debugstring = "";
  std::cout << preStr << ":" << std::endl << debugstring;
  for (auto ele : data) {
    std::cout << ele << " ";
  }
  std::cout << std::endl;
}

void PrepareUniqueId(f::Scope* scope,
                     const p::DeviceContext& ctx,
                     HcclRootInfo* hccl_id) {
  int rank_id = atoi(getenv("RANK_ID"));
  int device_id = atoi(getenv("DEVICE_ID"));

  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
          << "; rank_id = " << rank_id
          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

  std::vector<int> rank_ids{0, 1};
  f::AttributeMap gen_hccl_id;

  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
  gen_hccl_id["rank"] = rank_id;
  gen_hccl_id["endpoint"] = endpointList[rank_id];
  std::vector<std::string> other_endpoints = {
      endpointList[rank_id == 0 ? 1 : 0]};
  gen_hccl_id["other_endpoints"] = other_endpoints;

  auto out = scope->Var("Out");
  auto id = out->GetMutable<HcclRootInfo>();

  VLOG(3) << "break";
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
  VLOG(3) << "break";
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();

  memcpy(hccl_id, id, 1024);
}

void Prepare(f::Scope* scope,
             const p::DeviceContext& ctx,
             HcclRootInfo* hccl_id) {
  auto x = scope->Var("X");
  auto id = x->GetMutable<HcclRootInfo>();

  memcpy(id, hccl_id, 1024);

  int rank_id = atoi(getenv("RANK_ID"));
  int device_id = atoi(getenv("DEVICE_ID"));

  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
          << "; rank_id = " << rank_id
          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

  // std::vector<int> rank_ids{0, 1};
  f::AttributeMap comm_init_attrs;
  comm_init_attrs["ring_id"] = 0;
  comm_init_attrs["rank_ids"] = 2;
  comm_init_attrs["rank"] = rank_id;
  comm_init_attrs["device_id"] = device_id;
  // comm_init_attrs["rank_ids"] = rank_ids;
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();
}

template <typename T>
void TestHCCLAllReduceOp(f::Scope* scope,
                         const p::DeviceContext& ctx,
                         int iter) {
  // init
  auto x = scope->Var("Data");
  auto tensor_x = x->GetMutable<phi::DenseTensor>();

  int rank_id = atoi(getenv("RANK_ID"));

  int num1 = 3;
  int num2 = 128;

  std::vector<T> init;
  for (int64_t i = 0; i < num1 * num2; ++i) {
    init.push_back(static_cast<T>(1.0 + rank_id));
  }
  init[0] = static_cast<T>(std::numeric_limits<float>::quiet_NaN());

  PrintDebugInfo("input data", init);

  auto place = ctx.GetPlace();

  paddle::framework::TensorFromVector(init, ctx, tensor_x);
  tensor_x->Resize({num1, num2});
  ctx.Wait();

  auto out = scope->Var("OutData");
  auto tensor_out = out->GetMutable<phi::DenseTensor>();
  tensor_out->Resize({num1, num2});
  tensor_out->mutable_data<T>(place);  // allocate
  ctx.Wait();

  // run
  f::AttributeMap attrs;
  attrs["tag"] = std::string("tagx_" + std::to_string(iter));
  attrs["ring_id"] = 0;
  attrs["use_calc_stream"] = 1;

  auto op = f::OpRegistry::CreateOp(
      "c_allreduce_sum", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);

  for (int i = 0; i < 1; i++) {
    op->Run(*scope, place);
  }
  ctx.Wait();

  std::vector<T> out_vec;
  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
  ctx.Wait();

  PrintDebugInfo("output data", out_vec);

  float diff = static_cast<float>(out_vec[0]) - 65504;
  EXPECT_TRUE(diff < 0.1 && diff > -0.1);
  EXPECT_EQ(out_vec.size(), init.size());
  for (uint32_t i = 1; i < 10; i++) {
    EXPECT_EQ(out_vec[i], static_cast<paddle::platform::float16>(3.0));
  }
}

TEST(c_allreduce_sum, NPU) {
  f::Scope scope;
  HcclRootInfo hccl_id;

  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));
  // only support one device, if more than one device, use first default
  PrepareUniqueId(&scope, ctx, &hccl_id);
  Prepare(&scope, ctx, &hccl_id);
  TestHCCLAllReduceOp<paddle::platform::float16>(&scope, ctx, 1);
  // TestHCCLAllReduceOp<float>(&scope, ctx, 0);
}
paddle/fluid/operators/collective/c_broadcast_op_npu.cc
deleted, 100644 → 0 (parent c1cad896)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
namespace paddle {
namespace operators {

template <typename T>
class CBroadcastOpASCENDKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with NPU."));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(c_broadcast,
                       ops::CBroadcastOpASCENDKernel<int>,
                       ops::CBroadcastOpASCENDKernel<int8_t>,
                       ops::CBroadcastOpASCENDKernel<float>,
                       ops::CBroadcastOpASCENDKernel<plat::float16>);
paddle/fluid/operators/collective/c_broadcast_op_npu_test.cc
deleted, 100644 → 0 (parent c1cad896)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace f = paddle::framework;
namespace p = paddle::platform;

USE_OP(c_broadcast);
USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(c_broadcast, NPU);

DECLARE_string(selected_npus);

template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
  std::string debugstring = "";
  for (auto ele : data) {
    debugstring += std::to_string(ele) + std::string(",");
  }
  VLOG(2) << preStr << ":" << std::endl << debugstring;
}

void PrepareUniqueId(f::Scope* scope,
                     const p::DeviceContext& ctx,
                     HcclRootInfo* hccl_id) {
  int rank_id = atoi(getenv("RANK_ID"));
  int device_id = atoi(getenv("DEVICE_ID"));

  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
          << "; rank_id = " << rank_id
          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

  std::vector<int> rank_ids{0, 1};
  f::AttributeMap gen_hccl_id;

  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
  gen_hccl_id["rank"] = rank_id;
  gen_hccl_id["endpoint"] = endpointList[rank_id];
  std::vector<std::string> other_endpoints = {
      endpointList[rank_id == 0 ? 1 : 0]};
  gen_hccl_id["other_endpoints"] = other_endpoints;

  auto out = scope->Var("Out");
  auto id = out->GetMutable<HcclRootInfo>();

  VLOG(3) << "break";
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
  VLOG(3) << "break";
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();

  memcpy(hccl_id, id, 1024);
}

void Prepare(f::Scope* scope,
             const p::DeviceContext& ctx,
             HcclRootInfo* hccl_id) {
  auto x = scope->Var("X");
  auto id = x->GetMutable<HcclRootInfo>();

  memcpy(id, hccl_id, 1024);

  int rank_id = atoi(getenv("RANK_ID"));
  int device_id = atoi(getenv("DEVICE_ID"));

  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
          << "; rank_id = " << rank_id
          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

  // std::vector<int> rank_ids{0, 1};
  f::AttributeMap comm_init_attrs;
  comm_init_attrs["ring_id"] = 0;
  comm_init_attrs["rank_ids"] = 2;
  comm_init_attrs["rank"] = rank_id;
  comm_init_attrs["device_id"] = device_id;
  // comm_init_attrs["rank_ids"] = rank_ids;
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();
}

void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
  // init
  auto x = scope->Var("Data");
  auto tensor_x = x->GetMutable<phi::DenseTensor>();
  int num = 2;
  std::vector<float> init;
  int rank_id = atoi(getenv("RANK_ID"));

  for (int64_t i = 0; i < num * num; ++i) {
    init.push_back(1.0 + rank_id);
  }
  PrintDebugInfo("input data", init);

  paddle::framework::TensorFromVector(init, ctx, tensor_x);
  tensor_x->Resize({num, num});
  ctx.Wait();

  auto place = ctx.GetPlace();
  auto out = scope->Var("OutData");
  auto tensor_out = out->GetMutable<phi::DenseTensor>();
  tensor_out->Resize({num, num});
  tensor_out->mutable_data<float>(place);  // allocate
  ctx.Wait();

  // run
  f::AttributeMap attrs;
  attrs["tag"] = std::string("tagx");
  attrs["root"] = 0;
  attrs["ring_id"] = 0;

  auto op = f::OpRegistry::CreateOp(
      "c_broadcast", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);

  for (int i = 0; i < 10; i++) {
    op->Run(*scope, place);
  }
  ctx.Wait();

  std::vector<float> out_vec;
  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
  ctx.Wait();

  PrintDebugInfo("output data", out_vec);
  EXPECT_EQ(out_vec.size(), init.size());
  for (uint32_t i = 0; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], 1.0);
  }
}

TEST(c_broadcast, NPU) {
  f::Scope scope;
  HcclRootInfo hccl_id;

  // only support one device, if more than one device, use first default
  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));

  PrepareUniqueId(&scope, ctx, &hccl_id);
  Prepare(&scope, ctx, &hccl_id);
  TestHCCLBroadcastOp(&scope, ctx);
}
paddle/fluid/operators/collective/c_embedding_op_npu.cc
deleted, 100644 → 0 (parent c1cad896)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include <string>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/operators/collective/c_embedding_op.h"
namespace paddle {
namespace operators {

template <typename T>
inline void FillNPU(Tensor* dst,
                    T val,
                    const framework::ExecutionContext& context) {
  Tensor value(dst->type());
  value.mutable_data<T>({1}, context.GetPlace());
  FillNpuTensorWithConstant<T>(&value, static_cast<T>(val));

  auto stream =
      context.template device_context<paddle::platform::NPUDeviceContext>()
          .stream();

  const auto& runner = NpuOpRunner(
      "FillD", {value}, {*dst}, {{"dims", phi::vectorize(dst->dims())}});
  runner.Run(stream);
}

template <typename T>
void shard_index(const Tensor& table_t,
                 const Tensor& ids_t,
                 int64_t start_idx,
                 const Tensor& id_t,
                 const framework::ExecutionContext& context) {
  const int height = table_t.dims()[0];

  auto stream =
      context.template device_context<paddle::platform::NPUDeviceContext>()
          .stream();
  phi::DenseTensor id_t_d;
  id_t_d.mutable_data<T>(ids_t.dims(), context.GetPlace());
  FillNPU(&id_t_d, static_cast<T>(0.0), context);
  id_t_d.Resize(ids_t.dims());

  phi::DenseTensor id_t_u;
  id_t_u.mutable_data<T>(ids_t.dims(), context.GetPlace());
  FillNPU(&id_t_u, static_cast<T>(height - 1), context);
  id_t_u.Resize(ids_t.dims());

  phi::DenseTensor id_matched_d;
  id_matched_d.mutable_data<bool>(ids_t.dims(), context.GetPlace());
  phi::DenseTensor id_matched_u;
  id_matched_u.mutable_data<bool>(ids_t.dims(), context.GetPlace());
  phi::DenseTensor ignore_tensor;
  ignore_tensor.mutable_data<T>(ids_t.dims(), context.GetPlace());
  FillNPU(&ignore_tensor, static_cast<T>(height), context);
  ignore_tensor.Resize(ids_t.dims());

  NpuOpRunner sub_runner;
#if (CANN_VERSION_CODE >= 503003)
  Tensor factor_tensor(ids_t.type());
  factor_tensor.mutable_data<T>({1}, context.GetPlace());
  paddle::framework::TensorFromVector(
      std::vector<T>{static_cast<T>(start_idx)},
      context.device_context(),
      &factor_tensor);
  sub_runner.SetType("Sub")
      .AddInput(ids_t)
      .AddInput(factor_tensor)
      .AddOutput(id_t);
#else
  sub_runner.SetType("Sub")
      .AddInput(ids_t)
      .AddInput(std::vector<T>{static_cast<T>(start_idx)})
      .AddOutput(id_t);
#endif
  sub_runner.Run();

  NpuOpRunner lessequal1_runner;
  lessequal1_runner.SetType("LessEqual")
      .AddInput(id_t)
      .AddInput(id_t_u)
      .AddOutput(id_matched_u);
  lessequal1_runner.Run();

  NpuOpRunner lessequal2_runner;
  lessequal2_runner.SetType("LessEqual")
      .AddInput(id_t_d)
      .AddInput(id_t)
      .AddOutput(id_matched_d);
  lessequal2_runner.Run();

  NpuOpRunner("Equal", {id_matched_u, id_matched_d}, {id_matched_d}, {})
      .Run(stream);
  NpuOpRunner("Select", {id_matched_d, id_t, ignore_tensor}, {id_t}, {})
      .Run(stream);
}

template <typename TIds, typename T>
void NPUGetIdsEmbedding(const framework::ExecutionContext& context) {
  auto* table_t = context.Input<phi::DenseTensor>("W");
  auto* ids_t = context.Input<phi::DenseTensor>("Ids");
  auto* output_t = context.Output<phi::DenseTensor>("Out");
  const int64_t start_idx = context.Attr<int64_t>("start_index");

  auto stream =
      context.template device_context<paddle::platform::NPUDeviceContext>()
          .stream();

  phi::DenseTensor ids_t_local;
  ids_t_local.mutable_data<TIds>(ids_t->dims(), context.GetPlace());
  shard_index<TIds>(*table_t, *ids_t, start_idx, ids_t_local, context);

  auto pad_shape =
      phi::make_ddim({table_t->dims()[0] + 1, table_t->dims()[1]});
  phi::DenseTensor table_t_pad;

  size_t mem_size = table_t->numel() * phi::SizeOf(table_t->dtype());
  size_t line_mem_size = table_t->dims()[1] * phi::SizeOf(table_t->dtype());
  PADDLE_ENFORCE_EQ(line_mem_size % 64,
                    0,
                    platform::errors::InvalidArgument(
                        "NPU only accept the second dim must align by 64"));

  VLOG(10) << "mem_size:" << mem_size << ",line_mem_size:" << line_mem_size
           << ", pad_shape:" << pad_shape
           << ", table_dims:" << table_t->dims();

  uint8_t* pad_data = reinterpret_cast<uint8_t*>(
      table_t_pad.mutable_data<T>(pad_shape, context.GetPlace()));
  platform::NPUMemcpyAsync(pad_data,
                           table_t->data<T>(),
                           mem_size,
                           ACL_MEMCPY_DEVICE_TO_DEVICE,
                           stream,
                           mem_size);
  platform::NPUMemsetAsync(
      pad_data + mem_size, 0, line_mem_size, stream, line_mem_size);

  output_t->mutable_data<T>(context.GetPlace());
  NpuOpRunner runner;
  runner.SetType("GatherV2")
      .AddInput(table_t_pad)
      .AddInput(ids_t_local)
      .AddInput(std::vector<int32_t>{0})
#if (CANN_VERSION_CODE >= 503003)
      .AddAttrs({{"batch_dims", 0}})
#endif
      .AddOutput(*output_t);
  runner.Run();
}

template <typename T>
class CEmbeddingNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* ids_t = context.Input<phi::DenseTensor>("Ids");

    const auto& index_type = framework::TransToProtoVarType(ids_t->dtype());
    if (index_type == framework::proto::VarType::INT32) {
      NPUGetIdsEmbedding<int32_t, T>(context);
    } else {
      PADDLE_THROW(platform::errors::Unavailable(
          "NPU c_embedding ids only support int32."));
    }
  }
};

template <typename TIds, typename T>
void NPUUpdateEmbedding(const framework::ExecutionContext& context) {
  // get inputs
  const int64_t start_idx = context.Attr<int64_t>("start_index");
  auto ids_t = context.Input<phi::DenseTensor>("Ids");
  auto d_output_t =
      context.Input<phi::DenseTensor>(framework::GradVarName("Out"));
  auto table_t = context.Input<phi::DenseTensor>("W");
  auto table_grad_t =
      context.Output<phi::DenseTensor>(framework::GradVarName("W"));

  VLOG(10) << "ids_t:" << ids_t << ", d_output_t:" << d_output_t
           << ", table_t:" << table_t << ", table_grad_t" << table_grad_t;

  auto stream =
      context.template device_context<paddle::platform::NPUDeviceContext>()
          .stream();

  // convert ids_t to local valid
  phi::DenseTensor ids_t_local;
  ids_t_local.mutable_data<TIds>(ids_t->dims(), context.GetPlace());
  shard_index<TIds>(*table_t, *ids_t, start_idx, ids_t_local, context);

  // padding table_t -> table_t_pad
  auto pad_shape =
      phi::make_ddim({table_t->dims()[0] + 1, table_t->dims()[1]});
  phi::DenseTensor table_t_pad;

  // set table_t_pad to zero
  uint8_t* pad_data = reinterpret_cast<uint8_t*>(
      table_t_pad.mutable_data<T>(pad_shape, context.GetPlace()));
  size_t table_t_pad_mem_size =
      table_t_pad.numel() *
      framework::SizeOfType(
          framework::TransToProtoVarType(table_t_pad.dtype()));
  platform::NPUMemsetAsync(
      pad_data, 0, table_t_pad_mem_size, stream, table_t_pad_mem_size);

  // NOTE(zhiqiu): It seems in cann 20.1, the first input and output
  // can be different tensor, but in cann 20.2+, it does inplace operation.
  // Thus, the first input and output should be same tensor.
  const auto& runner_scatter =
      NpuOpRunner("ScatterAdd",
                  {table_t_pad, ids_t_local, *d_output_t},
                  {table_t_pad},
                  {{"use_locking", true}});
  runner_scatter.Run(stream);

  // copy table_t_pad to table_t
  T* dst = table_grad_t->mutable_data<T>(table_t->dims(), context.GetPlace());
  const size_t mem_size =
      table_grad_t->numel() * phi::SizeOf(table_grad_t->dtype());

  // check align
  size_t line_mem_size =
      table_grad_t->dims()[1] * phi::SizeOf(table_grad_t->dtype());
  PADDLE_ENFORCE_EQ(line_mem_size % 64,
                    0,
                    platform::errors::InvalidArgument(
                        "NPU only accept the second dim must align by 64"));

  platform::NPUMemcpyAsync(
      dst, pad_data, mem_size, ACL_MEMCPY_DEVICE_TO_DEVICE, stream, mem_size);
}

template <typename T>
class CEmbeddingGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    auto* ids_t = context.Input<phi::DenseTensor>("Ids");

    const auto& index_type = framework::TransToProtoVarType(ids_t->dtype());
    if (index_type == framework::proto::VarType::INT32) {
      NPUUpdateEmbedding<int32_t, T>(context);
    } else {
      PADDLE_THROW(platform::errors::Unavailable(
          "c_embedding ids only support int32."));
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(c_embedding,
                       ops::CEmbeddingNPUKernel<float>,
                       ops::CEmbeddingNPUKernel<double>,
                       ops::CEmbeddingNPUKernel<plat::float16>);
REGISTER_OP_NPU_KERNEL(c_embedding_grad,
                       ops::CEmbeddingGradNPUKernel<float>,
                       ops::CEmbeddingGradNPUKernel<double>,
                       ops::CEmbeddingGradNPUKernel<plat::float16>);
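The shard_index helper above composes NPU ops (Sub, two LessEquals, Equal, Select) to implement the usual "map global ids into this shard, send everything else to a padding row" rule used by c_embedding: local_id = id - start_index, and ids outside [0, height) are redirected to row height, the extra zero row that NPUGetIdsEmbedding reserves in table_t_pad. A host-side sketch of that same rule in plain C++ (illustrative only, not the deleted kernel):

#include <cassert>
#include <cstdint>
#include <vector>

// Map global embedding ids onto a local shard of `height` rows starting at
// `start_index`. Out-of-shard ids go to the padding row `height`, which the
// caller keeps zero-initialized so out-of-shard lookups return zeros.
std::vector<int32_t> ShardIndex(const std::vector<int32_t>& ids,
                                int32_t start_index,
                                int32_t height) {
  std::vector<int32_t> local(ids.size());
  for (size_t i = 0; i < ids.size(); ++i) {
    int32_t local_id = ids[i] - start_index;                      // "Sub"
    bool in_shard = (0 <= local_id) && (local_id <= height - 1);  // "LessEqual" x2
    local[i] = in_shard ? local_id : height;                      // "Select" + ignore row
  }
  return local;
}

int main() {
  // A shard owning rows [100, 100 + 64) of the full table.
  auto local = ShardIndex({99, 100, 163, 164}, /*start_index=*/100, /*height=*/64);
  assert(local[0] == 64);  // below the shard -> padding row
  assert(local[1] == 0);
  assert(local[2] == 63);
  assert(local[3] == 64);  // above the shard -> padding row
  return 0;
}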
paddle/fluid/operators/collective/c_identity_op_npu.cc
deleted, 100644 → 0 (parent c1cad896)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_identity_op.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(c_identity,
                       ops::CIdentityOpKernel<float, plat::NPUPlace>,
                       ops::CIdentityOpKernel<double, plat::NPUPlace>,
                       ops::CIdentityOpKernel<int, plat::NPUPlace>,
                       ops::CIdentityOpKernel<int64_t, plat::NPUPlace>,
                       ops::CIdentityOpKernel<plat::float16, plat::NPUPlace>);
paddle/fluid/operators/collective/c_reduce_max_op_npu.cc
deleted, 100644 → 0 (parent c1cad896)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
}  // namespace platform
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    c_reduce_max,
    ops::CReduceOpASCENDKernel<ops::kRedMax, int>,
    ops::CReduceOpASCENDKernel<ops::kRedMax, int8_t>,
    ops::CReduceOpASCENDKernel<ops::kRedMax, float>,
    ops::CReduceOpASCENDKernel<ops::kRedMax, plat::float16>)
paddle/fluid/operators/collective/c_reduce_min_op_npu.cc
deleted, 100644 → 0 (parent c1cad896)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
}  // namespace platform
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    c_reduce_min,
    ops::CReduceOpASCENDKernel<ops::kRedMin, int>,
    ops::CReduceOpASCENDKernel<ops::kRedMin, int8_t>,
    ops::CReduceOpASCENDKernel<ops::kRedMin, float>,
    ops::CReduceOpASCENDKernel<ops::kRedMin, plat::float16>)
paddle/fluid/operators/collective/c_reduce_prod_op_npu.cc
deleted, 100644 → 0 (parent c1cad896)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
}  // namespace platform
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    c_reduce_prod,
    ops::CReduceOpASCENDKernel<ops::kRedProd, int>,
    ops::CReduceOpASCENDKernel<ops::kRedProd, int8_t>,
    ops::CReduceOpASCENDKernel<ops::kRedProd, float>,
    ops::CReduceOpASCENDKernel<ops::kRedProd, plat::float16>)
paddle/fluid/operators/collective/c_reduce_sum_op_npu.cc
deleted, 100644 → 0 (parent c1cad896)
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
}  // namespace platform
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    c_reduce_sum,
    ops::CReduceOpASCENDKernel<ops::kRedSum, int>,
    ops::CReduceOpASCENDKernel<ops::kRedSum, int8_t>,
    ops::CReduceOpASCENDKernel<ops::kRedSum, float>,
    ops::CReduceOpASCENDKernel<ops::kRedSum, plat::float16>)
paddle/fluid/operators/collective/c_reduce_sum_op_npu_test.cc
deleted, 100644 → 0 (parent c1cad896)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/collective/c_reduce_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace
f
=
paddle
::
framework
;
namespace
p
=
paddle
::
platform
;
USE_OP
(
c_reduce_sum
);
USE_NO_KERNEL_OP
(
c_gen_hccl_id
);
USE_NO_KERNEL_OP
(
c_comm_init_hccl
);
USE_OP_DEVICE_KERNEL
(
c_reduce_sum
,
NPU
);
DECLARE_string
(
selected_npus
);
template
<
typename
T
>
void
PrintDebugInfo
(
const
std
::
string
preStr
,
const
std
::
vector
<
T
>&
data
)
{
std
::
string
debugstring
=
""
;
for
(
auto
ele
:
data
)
{
debugstring
+=
std
::
to_string
(
ele
)
+
std
::
string
(
","
);
}
VLOG
(
3
)
<<
preStr
<<
":"
<<
std
::
endl
<<
debugstring
;
}
void
PrepareUniqueId
(
f
::
Scope
*
scope
,
const
p
::
DeviceContext
&
ctx
,
HcclRootInfo
*
hccl_id
)
{
int
rank_id
=
atoi
(
getenv
(
"RANK_ID"
));
int
device_id
=
atoi
(getenv("DEVICE_ID"));

  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
          << "; rank_id = " << rank_id
          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

  std::vector<int> rank_ids{0, 1};
  f::AttributeMap gen_hccl_id;

  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
  gen_hccl_id["rank"] = rank_id;
  gen_hccl_id["endpoint"] = endpointList[rank_id];
  std::vector<std::string> other_endpoints = {
      endpointList[rank_id == 0 ? 1 : 0]};
  gen_hccl_id["other_endpoints"] = other_endpoints;

  auto out = scope->Var("Out");
  auto id = out->GetMutable<HcclRootInfo>();

  VLOG(3) << "break";
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
  VLOG(3) << "break";
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();

  memcpy(hccl_id, id, 1024);
}

void Prepare(f::Scope* scope,
             const p::DeviceContext& ctx,
             HcclRootInfo* hccl_id) {
  auto x = scope->Var("X");
  auto id = x->GetMutable<HcclRootInfo>();

  memcpy(id, hccl_id, 1024);

  int rank_id = atoi(getenv("RANK_ID"));
  int device_id = atoi(getenv("DEVICE_ID"));

  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
          << "; rank_id = " << rank_id
          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

  // std::vector<int> rank_ids{0, 1};
  f::AttributeMap comm_init_attrs;
  comm_init_attrs["ring_id"] = 0;
  comm_init_attrs["rank_ids"] = 2;
  comm_init_attrs["rank"] = rank_id;
  comm_init_attrs["device_id"] = device_id;
  // comm_init_attrs["rank_ids"] = rank_ids;
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();
}

void TestHCCLReduceOp(f::Scope* scope, const p::DeviceContext& ctx, int iter) {
  // init
  auto x = scope->Var("Data");
  auto tensor_x = x->GetMutable<phi::DenseTensor>();

  int rank_id = atoi(getenv("RANK_ID"));

  int num1 = 3;
  int num2 = 128;

  std::vector<float> init;
  for (int64_t i = 0; i < num1 * num2; ++i) {
    init.push_back(1.0 + rank_id);
  }
  PrintDebugInfo("input data", init);

  auto place = ctx.GetPlace();

  paddle::framework::TensorFromVector(init, ctx, tensor_x);
  tensor_x->Resize({num1, num2});
  ctx.Wait();

  auto out = scope->Var("OutData");
  auto tensor_out = out->GetMutable<phi::DenseTensor>();
  tensor_out->Resize({num1, num2});
  tensor_out->mutable_data<float>(place);  // allocate
  ctx.Wait();

  // run
  f::AttributeMap attrs;
  attrs["tag"] = std::string("tagx_" + std::to_string(iter));
  attrs["ring_id"] = 0;
  int root_id = 0;
  attrs["root_id"] = root_id;

  auto op = f::OpRegistry::CreateOp(
      "c_reduce_sum", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);

  op->Run(*scope, place);
  ctx.Wait();

  std::vector<float> out_vec;
  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
  ctx.Wait();

  PrintDebugInfo("output data", out_vec);

  EXPECT_EQ(out_vec.size(), init.size());
  for (uint32_t i = 0; i < out_vec.size(); i++) {
    if (rank_id == root_id) {
      EXPECT_EQ(out_vec[i], 3.0);
    } else {
      EXPECT_EQ(out_vec[i], init[i]);
    }
  }
}

TEST(c_reduce_sum, NPU) {
  f::Scope scope;
  HcclRootInfo hccl_id;

  // only support one device, if more than one device, use first default
  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));

  PrepareUniqueId(&scope, ctx, &hccl_id);
  Prepare(&scope, ctx, &hccl_id);
  for (int i = 0; i < 2; i++) {
    VLOG(2) << "iter num: " << i;
    TestHCCLReduceOp(&scope, ctx, i);
  }
}
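A host-side sanity check of the expectation in this deleted test: each rank fills its input with 1.0 + rank_id, so with two ranks the root of c_reduce_sum should see 1.0 + 2.0 = 3.0 while a non-root rank keeps its own input. The helper below is a minimal, hypothetical sketch (not part of the deleted file) that models that expectation.

#include <cassert>

// Hypothetical reference for the deleted c_reduce_sum test: rank r fills its
// tensor with 1.0f + r; only the root receives the reduced sum, other ranks
// keep their local values.
float ExpectedReduceSumValue(int rank_id, int root_id, int nranks) {
  if (rank_id != root_id) return 1.0f + rank_id;  // non-root output unchanged
  float sum = 0.0f;
  for (int r = 0; r < nranks; ++r) sum += 1.0f + r;  // 1.0 + 2.0 = 3.0 for 2 ranks
  return sum;
}

int main() {
  assert(ExpectedReduceSumValue(/*rank_id=*/0, /*root_id=*/0, /*nranks=*/2) == 3.0f);
  assert(ExpectedReduceSumValue(/*rank_id=*/1, /*root_id=*/0, /*nranks=*/2) == 2.0f);
  return 0;
}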
paddle/fluid/operators/collective/c_reducescatter_op_npu.cc
deleted 100644 → 0
View file @ c1cad896
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
namespace paddle {
namespace operators {

template <typename T>
class CReduceScatterOpAscendKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with NPU."));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(c_reducescatter,
                       ops::CReduceScatterOpAscendKernel<int8_t>,
                       ops::CReduceScatterOpAscendKernel<int>,
                       ops::CReduceScatterOpAscendKernel<float>,
                       ops::CReduceScatterOpAscendKernel<plat::float16>);
paddle/fluid/operators/collective/c_reducescatter_op_npu_test.cc
deleted 100644 → 0
View file @ c1cad896
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/collective/c_allgather_op.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#include "paddle/fluid/operators/collective/c_reducescatter_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace f = paddle::framework;
namespace p = paddle::platform;

USE_OP(c_reducescatter);
USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(c_reducescatter, NPU);

DECLARE_string(selected_npus);

template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
  std::string debugstring = "";
  for (auto ele : data) {
    debugstring += std::to_string(ele) + std::string(",");
  }
  VLOG(2) << preStr << ":" << std::endl << debugstring;
}

void PrepareUniqueId(f::Scope* scope,
                     const p::DeviceContext& ctx,
                     HcclRootInfo* hccl_id) {
  int rank_id = atoi(getenv("RANK_ID"));
  int device_id = atoi(getenv("DEVICE_ID"));

  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
          << "; rank_id = " << rank_id
          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

  std::vector<int> rank_ids{0, 1};
  f::AttributeMap gen_hccl_id;

  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
  gen_hccl_id["rank"] = rank_id;
  gen_hccl_id["endpoint"] = endpointList[rank_id];
  std::vector<std::string> other_endpoints = {
      endpointList[rank_id == 0 ? 1 : 0]};
  gen_hccl_id["other_endpoints"] = other_endpoints;

  auto out = scope->Var("Out");
  auto id = out->GetMutable<HcclRootInfo>();

  VLOG(3) << "break";
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
  VLOG(3) << "break";
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();

  memcpy(hccl_id, id, 1024);
}

void Prepare(f::Scope* scope,
             const p::DeviceContext& ctx,
             HcclRootInfo* hccl_id) {
  auto x = scope->Var("X");
  auto id = x->GetMutable<HcclRootInfo>();

  memcpy(id, hccl_id, 1024);

  int rank_id = atoi(getenv("RANK_ID"));
  int device_id = atoi(getenv("DEVICE_ID"));

  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
          << "; rank_id = " << rank_id
          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

  // std::vector<int> rank_ids{0, 1};
  f::AttributeMap comm_init_attrs;
  comm_init_attrs["ring_id"] = 0;
  comm_init_attrs["rank_ids"] = 2;
  comm_init_attrs["rank"] = rank_id;
  comm_init_attrs["device_id"] = device_id;
  // comm_init_attrs["rank_ids"] = rank_ids;
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();
}

void TestHCCLReduceScatterOp(f::Scope* scope, const p::DeviceContext& ctx) {
  // init
  auto x = scope->Var("Data");
  auto tensor_x = x->GetMutable<phi::DenseTensor>();

  std::vector<float> init;
  int num1 = 4;
  int num2 = 1;

  for (int64_t i = 0; i < num1 * num2; ++i) {
    init.push_back(1.0);
  }
  PrintDebugInfo("input data", init);

  paddle::framework::TensorFromVector(init, ctx, tensor_x);
  tensor_x->Resize({num1, num2});
  ctx.Wait();

  auto place = ctx.GetPlace();
  auto out = scope->Var("OutData");
  auto tensor_out = out->GetMutable<phi::DenseTensor>();
  tensor_out->Resize({num1, num2});
  tensor_out->mutable_data<float>(place);  // allocate
  ctx.Wait();

  // run
  f::AttributeMap attrs;
  attrs["tag"] = std::string("tagx");
  attrs["ring_id"] = 0;
  attrs["nranks"] = 2;

  auto op = f::OpRegistry::CreateOp(
      "c_reducescatter", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);

  int iter_num = 10;
  for (int i = 0; i < iter_num; i++) {
    op->Run(*scope, place);
    ctx.Wait();
  }

  std::vector<float> out_vec;
  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
  ctx.Wait();

  PrintDebugInfo("output data", out_vec);

  EXPECT_EQ(out_vec.size(), init.size() / 2);
  for (uint32_t i = 0; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], 2.0);
  }
}

TEST(c_reducescatter, NPU) {
  f::Scope scope;
  HcclRootInfo hccl_id;

  // only support one device, if more than one device, use first default
  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));

  PrepareUniqueId(&scope, ctx, &hccl_id);
  Prepare(&scope, ctx, &hccl_id);
  TestHCCLReduceScatterOp(&scope, ctx);
}
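The deleted test above checks that reduce-scatter across two ranks halves the element count and sums the contributions. A minimal, hypothetical host-side model of that check (not part of the deleted file) is sketched below, assuming every rank contributes a tensor of ones.

#include <cassert>
#include <cstdint>
#include <vector>

// Hypothetical model of the c_reducescatter expectation: sum `nranks` tensors
// of ones elementwise, then keep a 1/nranks slice on each rank.
std::vector<float> ExpectedReduceScatter(int64_t n, int nranks) {
  // The elementwise sum of `nranks` all-ones tensors is all `nranks`,
  // and the scatter leaves n / nranks elements per rank.
  return std::vector<float>(n / nranks, static_cast<float>(nranks));
}

int main() {
  auto out = ExpectedReduceScatter(/*n=*/4, /*nranks=*/2);
  assert(out.size() == 2u);           // init.size() / 2
  for (float v : out) assert(v == 2.0f);
  return 0;
}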
paddle/fluid/operators/collective/c_sync_calc_stream_op_npu_test.cc
deleted 100644 → 0
View file @ c1cad896
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace f = paddle::framework;
namespace p = paddle::platform;

USE_OP_ITSELF(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, NPU);
USE_OP_DEVICE_KERNEL(c_sync_calc_stream, NPU);

template <typename T>
void Compare(f::Scope* scope, const p::DeviceContext& ctx) {
  // init
  auto x = scope->Var("X");
  auto tensor_x = x->GetMutable<phi::DenseTensor>();
  auto y = scope->Var("Y");
  auto tensor_y = y->GetMutable<phi::DenseTensor>();

  std::vector<T> init_x;
  for (int64_t i = 0; i < 10 * 10; ++i) {
    init_x.push_back(static_cast<T>(1.0));
  }

  std::vector<T> init_y;
  for (int64_t i = 0; i < 10 * 10; ++i) {
    init_y.push_back(static_cast<T>(2.0));
  }

  paddle::framework::TensorFromVector(init_x, ctx, tensor_x);
  tensor_x->Resize({10, 10});
  paddle::framework::TensorFromVector(init_y, ctx, tensor_y);
  tensor_y->Resize({10, 10});

  f::AttributeMap attrs;
  auto place = ctx.GetPlace();
  auto out = scope->Var("Out");
  auto tensor_out = out->GetMutable<phi::DenseTensor>();

  // sync data
  auto sync_op0 = f::OpRegistry::CreateOp(
      "c_sync_calc_stream", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);
  sync_op0->Run(*scope, place);

  // run
  auto op = f::OpRegistry::CreateOp("elementwise_add",
                                    {{"X", {"X"}}, {"Y", {"Y"}}},
                                    {{"Out", {"Out"}}},
                                    attrs);
  op->Run(*scope, place);

  // sync op run
  auto sync_op = f::OpRegistry::CreateOp(
      "c_sync_calc_stream", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);
  sync_op->Run(*scope, place);

  std::vector<T> out_vec;
  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);

  // sync op copy
  auto sync_op2 = f::OpRegistry::CreateOp(
      "c_sync_calc_stream", {{"X", {"X"}}}, {{"Out", {"Out"}}}, attrs);
  sync_op2->Run(*scope, place);

  float expected = 3.0;

  EXPECT_EQ(out_vec.size(), init_x.size());
  for (uint32_t i = 0; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], static_cast<T>(expected));
  }
}

TEST(c_sync_calc_stream, NPU_fp32) {
  f::Scope scope;
  p::NPUDeviceContext ctx(p::NPUPlace(0));
  Compare<float>(&scope, ctx);
}
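For reference, the value 3.0 asserted above simply comes from adding the two synchronized inputs. The snippet below is an illustrative host-side sketch (not part of the deleted file) of that arithmetic.

#include <cassert>
#include <vector>

// Hypothetical reference for the c_sync_calc_stream test: X is filled with
// 1.0, Y with 2.0, so the synchronized elementwise_add yields 3.0 everywhere.
std::vector<float> ExpectedElementwiseAdd(size_t n, float x, float y) {
  return std::vector<float>(n, x + y);
}

int main() {
  auto out = ExpectedElementwiseAdd(/*n=*/100, /*x=*/1.0f, /*y=*/2.0f);
  for (float v : out) assert(v == 3.0f);  // 1.0 + 2.0
  return 0;
}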
paddle/fluid/operators/collective/c_sync_comm_stream_op_npu_test.cc
deleted 100644 → 0
View file @ c1cad896
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/collective/c_broadcast_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace f = paddle::framework;
namespace p = paddle::platform;

USE_OP(c_broadcast);
USE_OP_DEVICE_KERNEL(c_sync_comm_stream, NPU);
USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(c_broadcast, NPU);

DECLARE_string(selected_npus);

template <typename T>
void PrintDebugInfo(const std::string preStr, const std::vector<T>& data) {
  std::string debugstring = "";
  for (auto ele : data) {
    debugstring += std::to_string(ele) + std::string(",");
  }
  VLOG(2) << preStr << ":" << std::endl << debugstring;
}

void PrepareUniqueId(f::Scope* scope,
                     const p::DeviceContext& ctx,
                     HcclRootInfo* hccl_id) {
  int rank_id = atoi(getenv("RANK_ID"));
  int device_id = atoi(getenv("DEVICE_ID"));

  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
          << "; rank_id = " << rank_id
          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

  std::vector<int> rank_ids{0, 1};
  f::AttributeMap gen_hccl_id;

  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
  gen_hccl_id["rank"] = rank_id;
  gen_hccl_id["endpoint"] = endpointList[rank_id];
  std::vector<std::string> other_endpoints = {
      endpointList[rank_id == 0 ? 1 : 0]};
  gen_hccl_id["other_endpoints"] = other_endpoints;

  auto out = scope->Var("Out");
  auto id = out->GetMutable<HcclRootInfo>();

  VLOG(3) << "break";
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
  VLOG(3) << "break";
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();

  memcpy(hccl_id, id, 1024);
}

void Prepare(f::Scope* scope,
             const p::DeviceContext& ctx,
             HcclRootInfo* hccl_id) {
  auto x = scope->Var("X");
  auto id = x->GetMutable<HcclRootInfo>();

  memcpy(id, hccl_id, 1024);

  int rank_id = atoi(getenv("RANK_ID"));
  int device_id = atoi(getenv("DEVICE_ID"));

  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
          << "; rank_id = " << rank_id
          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

  // std::vector<int> rank_ids{0, 1};
  f::AttributeMap comm_init_attrs;
  comm_init_attrs["ring_id"] = 0;
  comm_init_attrs["rank_ids"] = 2;
  comm_init_attrs["rank"] = rank_id;
  comm_init_attrs["device_id"] = device_id;
  // comm_init_attrs["rank_ids"] = rank_ids;
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();
}

void TestHCCLBroadcastOp(f::Scope* scope, const p::DeviceContext& ctx) {
  std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
  // init
  auto x = scope->Var("Data");
  auto tensor_x = x->GetMutable<phi::DenseTensor>();
  int num = 2;
  std::vector<float> init;
  int rank_id = atoi(getenv("RANK_ID"));
  std::cout << "rank_id:" << rank_id << std::endl;
  for (int64_t i = 0; i < num * num; ++i) {
    init.push_back(1.0 + rank_id);
    std::cout << init[0];
  }
  std::cout << std::endl;

  paddle::framework::TensorFromVector(init, ctx, tensor_x);
  tensor_x->Resize({num, num});
  ctx.Wait();

  auto place = ctx.GetPlace();
  auto out = scope->Var("OutData");
  auto tensor_out = out->GetMutable<phi::DenseTensor>();
  tensor_out->Resize({num, num});
  tensor_out->mutable_data<float>(place);  // allocate
  ctx.Wait();

  // run
  f::AttributeMap attrs;
  attrs["tag"] = std::string("tagx");
  attrs["root"] = 0;
  attrs["ring_id"] = 0;

  auto op = f::OpRegistry::CreateOp(
      "c_broadcast", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
  op->Run(*scope, place);

  // comm sync
  auto sync_op = f::OpRegistry::CreateOp(
      "c_sync_comm_stream", {{"X", {"Data"}}}, {{"Out", {"OutData"}}}, attrs);
  sync_op->Run(*scope, place);

  // ctx.Wait();

  std::vector<float> out_vec;
  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);

  EXPECT_EQ(out_vec.size(), init.size());
  for (uint32_t i = 0; i < out_vec.size(); i++) {
    EXPECT_EQ(out_vec[i], 1.0);
  }
}

TEST(c_sync_comm_stream_op, NPU) {
  f::Scope scope;
  HcclRootInfo hccl_id;

  // only support one device, if more than one device, use first default
  p::NPUDeviceContext ctx(p::NPUPlace(atoi(FLAGS_selected_npus.c_str())));

  PrepareUniqueId(&scope, ctx, &hccl_id);
  Prepare(&scope, ctx, &hccl_id);
  TestHCCLBroadcastOp(&scope, ctx);
}
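The broadcast check above expects 1.0 on every rank because rank r fills its buffer with 1.0 + r and the broadcast root is rank 0. A minimal, hypothetical host-side model of that expectation (not part of the deleted file) follows.

#include <cassert>
#include <vector>

// Hypothetical reference for the c_broadcast/c_sync_comm_stream test: after a
// broadcast from root_rank, every rank holds the root's value 1.0 + root_rank.
std::vector<float> ExpectedBroadcast(int num, int root_rank) {
  return std::vector<float>(static_cast<size_t>(num) * num, 1.0f + root_rank);
}

int main() {
  auto out = ExpectedBroadcast(/*num=*/2, /*root_rank=*/0);
  assert(out.size() == 4u);
  for (float v : out) assert(v == 1.0f);  // root's input, regardless of rank
  return 0;
}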
paddle/fluid/operators/collective/checknumeric_npu_test.cc
deleted 100644 → 0
View file @ c1cad896
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <cmath>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace f = paddle::framework;
namespace p = paddle::platform;

USE_OP(c_allreduce_sum);
USE_OP_DEVICE_KERNEL(c_allreduce_sum, NPU);
DECLARE_string(selected_npus);

template <typename T>
bool Check(T value, int size = 2 * 512 * 8192) {
  f::Scope scope;
  auto x = scope.Var("in");
  auto& ctx = *dynamic_cast<p::NPUDeviceContext*>(
      p::DeviceContextPool::Instance().Get(p::NPUPlace(0)));
  auto place = ctx.GetPlace();

  auto tensor_x = x->GetMutable<phi::DenseTensor>();
  tensor_x->Resize({size});
  tensor_x->mutable_data<T>(place);  // allocate

  std::vector<T> init;
  for (int64_t i = 0; i < size; ++i) {
    init.push_back(static_cast<T>(value));
  }

  paddle::framework::TensorFromVector(init, ctx, tensor_x);
  bool result = paddle::operators::ContainsNan(ctx, ctx.stream(), tensor_x);
  return result;
}

TEST(check_numeric, NPU) {
  auto inf = std::numeric_limits<float>::infinity();
  auto fp16_inf = static_cast<p::float16>(inf);
  auto nan = NAN;
  auto fp16_nan = static_cast<p::float16>(nan);

  bool result = false;
  // Normal
  VLOG(0) << "start normal";
  result = Check<p::float16>(static_cast<p::float16>(65546));
  ASSERT_FALSE(result);
  Check<float>(static_cast<float>(1.0));
  ASSERT_FALSE(result);

  // Inf
  VLOG(0) << "start inf";
  result = Check<p::float16>(fp16_inf);
  ASSERT_FALSE(result);
  result = Check<float>(inf);
  ASSERT_FALSE(result);

  // Nan
  VLOG(0) << "start nan";
  result = Check<p::float16>(fp16_nan);
  ASSERT_TRUE(result);
  result = Check<float>(nan);
  ASSERT_TRUE(result);
}
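Note what the deleted test asserts about the NPU ContainsNan helper: only NaN inputs are flagged, while Inf and ordinary finite values pass. The snippet below is an illustrative host-side reference of that contract (it is an assumption-labelled sketch, not code from the deleted file or the Paddle API).

#include <cassert>
#include <cmath>
#include <limits>
#include <vector>

// Hypothetical CPU reference for the behaviour the checknumeric test expects:
// report true only when a NaN is present; Inf is not treated as an error.
bool ReferenceContainsNan(const std::vector<float>& data) {
  for (float v : data) {
    if (std::isnan(v)) return true;  // only NaN trips the check
  }
  return false;
}

int main() {
  float inf = std::numeric_limits<float>::infinity();
  assert(!ReferenceContainsNan({1.0f, 65546.0f}));  // normal values pass
  assert(!ReferenceContainsNan({inf}));             // Inf is not reported
  assert(ReferenceContainsNan({NAN}));              // NaN is reported
  return 0;
}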
paddle/fluid/operators/collective/mp_allreduce_sum_op_npu.cc
deleted 100644 → 0
View file @ c1cad896
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/collective/c_allreduce_op.h"
namespace paddle {
namespace platform {
struct ASCENDPlace;
}  // namespace platform
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(
    mp_allreduce_sum,
    ops::CAllReduceOpASCENDKernel<ops::kRedSum, int>,
    ops::CAllReduceOpASCENDKernel<ops::kRedSum, int8_t>,
    ops::CAllReduceOpASCENDKernel<ops::kRedSum, float>,
    ops::CAllReduceOpASCENDKernel<ops::kRedSum, plat::float16>)
paddle/fluid/operators/collective/partial_allgather_op_npu.cc
deleted 100644 → 0
View file @ c1cad896
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <memory>
#include "paddle/fluid/operators/collective/partial_allgather_op.h"
#include "paddle/fluid/platform/collective_helper.h"
namespace paddle {
namespace operators {

template <typename T>
class CallPartialGatherOpASCENDKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with NPU."));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(partial_allgather,
                       ops::CallPartialGatherOpASCENDKernel<int8_t>,
                       ops::CallPartialGatherOpASCENDKernel<int>,
                       ops::CallPartialGatherOpASCENDKernel<float>,
                       ops::CallPartialGatherOpASCENDKernel<plat::float16>);
paddle/fluid/operators/collective/partial_recv_op_npu.cc
deleted 100644 → 0
View file @ c1cad896
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/partial_recv_op.h"
#include "paddle/fluid/platform/collective_helper.h"
namespace paddle {
namespace operators {

template <typename T>
class PartialRecvOpASCENDKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with NPU."));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(partial_recv,
                       ops::PartialRecvOpASCENDKernel<int>,
                       ops::PartialRecvOpASCENDKernel<int8_t>,
                       ops::PartialRecvOpASCENDKernel<float>,
                       ops::PartialRecvOpASCENDKernel<plat::float16>);
paddle/fluid/operators/collective/partial_send_op_npu.cc
deleted 100644 → 0
View file @ c1cad896
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/send_v2_op.h"
#include "paddle/fluid/platform/collective_helper.h"
namespace paddle {
namespace operators {

template <typename T>
class PartialSendOpASCENDKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with NPU."));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(partial_send,
                       ops::PartialSendOpASCENDKernel<int>,
                       ops::PartialSendOpASCENDKernel<int8_t>,
                       ops::PartialSendOpASCENDKernel<float>,
                       ops::PartialSendOpASCENDKernel<plat::float16>);
paddle/fluid/operators/collective/recv_v2_op_npu.cc
deleted 100644 → 0
View file @ c1cad896
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/recv_v2_op.h"
#include "paddle/fluid/distributed/collective/process_group.h"
#include "paddle/phi/api/include/tensor.h"
namespace paddle {
namespace operators {

template <typename T>
class CRecvOpASCENDKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with NPU."));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(recv_v2,
                       ops::CRecvOpASCENDKernel<int>,
                       ops::CRecvOpASCENDKernel<int8_t>,
                       ops::CRecvOpASCENDKernel<float>,
                       ops::CRecvOpASCENDKernel<plat::float16>);
paddle/fluid/operators/collective/recv_v2_op_npu_test.cc
deleted 100644 → 0
View file @ c1cad896
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#include "paddle/fluid/operators/collective/recv_v2_op.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace f = paddle::framework;
namespace p = paddle::platform;

USE_OP(recv_v2);
USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(recv_v2, NPU);

void PrepareUniqueId(f::Scope* scope,
                     const p::DeviceContext& ctx,
                     HcclRootInfo* hccl_id) {
  int rank_id = atoi(getenv("RANK_ID"));
  int device_id = atoi(getenv("DEVICE_ID"));

  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
          << "; rank_id = " << rank_id
          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

  std::vector<int> rank_ids{0, 1};
  f::AttributeMap gen_hccl_id;

  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
  gen_hccl_id["rank"] = rank_id;
  gen_hccl_id["endpoint"] = endpointList[rank_id];
  std::vector<std::string> other_endpoints = {
      endpointList[rank_id == 0 ? 1 : 0]};
  gen_hccl_id["other_endpoints"] = other_endpoints;

  auto out = scope->Var("Out");
  auto id = out->GetMutable<HcclRootInfo>();

  VLOG(3) << "break";
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
  VLOG(3) << "break";
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();

  memcpy(hccl_id, id, 1024);
}

void Prepare(f::Scope* scope,
             const p::DeviceContext& ctx,
             HcclRootInfo* hccl_id) {
  auto x = scope->Var("X");
  auto id = x->GetMutable<HcclRootInfo>();

  memcpy(id, hccl_id, 1024);

  int rank_id = atoi(getenv("RANK_ID"));
  int device_id = atoi(getenv("DEVICE_ID"));

  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
          << "; rank_id = " << rank_id
          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

  // std::vector<int> rank_ids{0, 1};
  f::AttributeMap comm_init_attrs;
  comm_init_attrs["ring_id"] = 0;
  comm_init_attrs["rank_ids"] = 2;
  comm_init_attrs["rank"] = rank_id;
  comm_init_attrs["device_id"] = device_id;
  // comm_init_attrs["rank_ids"] = rank_ids;
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();
}

void TestHcomRecvOp(f::Scope* scope, const p::DeviceContext& ctx) {
  std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;

  int num = atoi(getenv("DATA_SIZE"));
  EXPECT_GT(num, 0);
  EXPECT_LT(num, 1 << 15);
  int rank_id = atoi(getenv("RANK_ID"));
  VLOG(3) << "rank_id:" << rank_id << std::endl;

  ctx.Wait();
  auto place = ctx.GetPlace();
  auto out = scope->Var("Data");
  auto tensor_out = out->GetMutable<phi::DenseTensor>();
  tensor_out->Resize({num, num});
  tensor_out->mutable_data<float>(place);  // allocate
  ctx.Wait();

  f::AttributeMap attrs;
  attrs["tag"] = std::string("srtest");
  attrs["peer"] = atoi(getenv("SRC_RANK"));
  attrs["ring_id"] = 0;
  attrs["srTag"] = 0;
  std::vector<int> out_shape;
  out_shape.push_back(num);
  out_shape.push_back(num);
  attrs["out_shape"] = out_shape;

  auto op = f::OpRegistry::CreateOp("recv_v2", {}, {{"Out", {"Data"}}}, attrs);
  VLOG(3) << "CreateOp recv_v2";

  for (int i = 0; i < 10; i++) {
    op->Run(*scope, place);
  }
  VLOG(3) << "Run op recv_v2";
  std::vector<float> out_vec;
  paddle::framework::TensorToVector(*tensor_out, ctx, &out_vec);
  ctx.Wait();
  std::vector<float> init(num * num, 1.0 * atoi(getenv("DEST_RANK")));
  EXPECT_EQ(out_vec == init, true);
}

TEST(recv_v2, NPU) {
  f::Scope scope;
  HcclRootInfo hccl_id;

  char* npu_id = getenv("FLAGS_selected_npus");
  VLOG(3) << "Select npu:" << npu_id;
  p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));

  PrepareUniqueId(&scope, ctx, &hccl_id);
  Prepare(&scope, ctx, &hccl_id);
  TestHcomRecvOp(&scope, ctx);
}
paddle/fluid/operators/collective/send_v2_op_npu.cc
deleted 100644 → 0
View file @ c1cad896
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/send_v2_op.h"
#include "paddle/fluid/distributed/collective/process_group.h"
#include "paddle/phi/api/include/tensor.h"
namespace paddle {
namespace operators {

template <typename T>
class CSendOpASCENDKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_THROW(platform::errors::PreconditionNotMet(
        "PaddlePaddle should compile with NPU."));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_NPU_KERNEL(send_v2,
                       ops::CSendOpASCENDKernel<int>,
                       ops::CSendOpASCENDKernel<int8_t>,
                       ops::CSendOpASCENDKernel<float>,
                       ops::CSendOpASCENDKernel<plat::float16>);
paddle/fluid/operators/collective/send_v2_op_npu_test.cc
deleted 100644 → 0
View file @ c1cad896
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef _WIN32
#include <unistd.h>
#endif
#include <stdio.h>
#include <string>
#include <thread> // NOLINT
#include <vector>
#include "gtest/gtest.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/framework/program_desc.h"
#include "paddle/fluid/operators/collective/gen_hccl_id_op_helper.h"
#include "paddle/fluid/operators/collective/send_v2_op.h"
#include "paddle/fluid/string/printf.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace f = paddle::framework;
namespace p = paddle::platform;

USE_OP(send_v2);
USE_NO_KERNEL_OP(c_gen_hccl_id);
USE_NO_KERNEL_OP(c_comm_init_hccl);
USE_OP_DEVICE_KERNEL(send_v2, NPU);

void PrepareUniqueId(f::Scope* scope,
                     const p::DeviceContext& ctx,
                     HcclRootInfo* hccl_id) {
  int rank_id = atoi(getenv("RANK_ID"));
  int device_id = atoi(getenv("DEVICE_ID"));

  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
          << "; rank_id = " << rank_id
          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

  std::vector<int> rank_ids{0, 1};
  f::AttributeMap gen_hccl_id;

  std::vector<std::string> endpointList = {"127.0.0.1:6175", "127.0.0.1:6177"};
  gen_hccl_id["rank"] = rank_id;
  gen_hccl_id["endpoint"] = endpointList[rank_id];
  std::vector<std::string> other_endpoints = {
      endpointList[rank_id == 0 ? 1 : 0]};
  gen_hccl_id["other_endpoints"] = other_endpoints;

  auto out = scope->Var("Out");
  auto id = out->GetMutable<HcclRootInfo>();

  VLOG(3) << "break";
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_gen_hccl_id", {}, {{"Out", {"Out"}}}, gen_hccl_id);
  VLOG(3) << "break";
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();

  memcpy(hccl_id, id, 1024);
}

void Prepare(f::Scope* scope,
             const p::DeviceContext& ctx,
             HcclRootInfo* hccl_id) {
  auto x = scope->Var("X");
  auto id = x->GetMutable<HcclRootInfo>();

  memcpy(id, hccl_id, 1024);

  int rank_id = atoi(getenv("RANK_ID"));
  int device_id = atoi(getenv("DEVICE_ID"));

  VLOG(2) << "rank_id = " << rank_id << "; device_id = " << device_id
          << "; rank_id = " << rank_id
          << "; RANK_TABLE_FILE = " << atoi(getenv("DEVICE_ID"));

  // std::vector<int> rank_ids{0, 1};
  f::AttributeMap comm_init_attrs;
  comm_init_attrs["ring_id"] = 0;
  comm_init_attrs["rank_ids"] = 2;
  comm_init_attrs["rank"] = rank_id;
  comm_init_attrs["device_id"] = device_id;
  // comm_init_attrs["rank_ids"] = rank_ids;
  auto comm_init_op = f::OpRegistry::CreateOp(
      "c_comm_init_hccl", {{"X", {"X"}}}, {}, comm_init_attrs);
  auto place = ctx.GetPlace();
  comm_init_op->Run(*scope, place);
  ctx.Wait();
}

void TestHcomSendOp(f::Scope* scope, const p::DeviceContext& ctx) {
  std::cout << "BEGIN TEST:" << __FUNCTION__ << std::endl;
  auto x = scope->Var("Data");
  auto tensor_x = x->GetMutable<phi::DenseTensor>();

  int num = atoi(getenv("DATA_SIZE"));
  EXPECT_GT(num, 0);
  EXPECT_LT(num, 1 << 15);
  std::vector<float> init(num * num, 1.0 * atoi(getenv("DEST_RANK")));
  int rank_id = atoi(getenv("RANK_ID"));
  VLOG(3) << "rank id:" << rank_id;

  paddle::framework::TensorFromVector(init, ctx, tensor_x);
  tensor_x->Resize({num, num});
  ctx.Wait();
  auto place = ctx.GetPlace();
  ctx.Wait();

  f::AttributeMap attrs;
  attrs["tag"] = std::string("srtest");
  attrs["peer"] = atoi(getenv("DEST_RANK"));
  attrs["ring_id"] = 0;
  attrs["srTag"] = 0;

  auto op = f::OpRegistry::CreateOp("send_v2", {{"X", {"Data"}}}, {}, attrs);

  for (int i = 0; i < 10; i++) {
    op->Run(*scope, place);
  }
  VLOG(3) << "send run over";
  ctx.Wait();
}

TEST(send_v2, NPU) {
  f::Scope scope;
  HcclRootInfo hccl_id;

  char* npu_id = getenv("FLAGS_selected_npus");
  VLOG(3) << "Select npu:" << npu_id;
  p::NPUDeviceContext ctx(p::NPUPlace(atoi(npu_id)));

  PrepareUniqueId(&scope, ctx, &hccl_id);
  Prepare(&scope, ctx, &hccl_id);
  TestHcomSendOp(&scope, ctx);
}
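The deleted send_v2 and recv_v2 tests are meant to run as a pair: both read num from the DATA_SIZE environment variable, the sender fills a num x num tensor with 1.0 * DEST_RANK, and the receiver expects exactly that payload back. The snippet below is a hypothetical host-side sketch of that contract (not part of the deleted files), assuming a launcher that sets DATA_SIZE=4 and DEST_RANK=1.

#include <cassert>
#include <vector>

// Hypothetical model of the payload the paired send_v2/recv_v2 tests exchange:
// a num x num tensor filled with 1.0 * dest_rank.
std::vector<float> ExpectedPayload(int num, int dest_rank) {
  return std::vector<float>(static_cast<size_t>(num) * num, 1.0f * dest_rank);
}

int main() {
  auto payload = ExpectedPayload(/*num=*/4, /*dest_rank=*/1);
  assert(payload.size() == 16u);            // num * num elements
  for (float v : payload) assert(v == 1.0f);  // 1.0 * DEST_RANK
  return 0;
}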