Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle
提交
348d043e
P
Paddle
项目概览
PaddlePaddle
/
Paddle
1 年多 前同步成功
通知
2302
Star
20931
Fork
5422
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1423
列表
看板
标记
里程碑
合并请求
543
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1,423
Issue
1,423
列表
看板
标记
里程碑
合并请求
543
合并请求
543
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
348d043e
编写于
7月 13, 2021
作者:
W
WangXi
提交者:
GitHub
7月 13, 2021
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
[hybrid performance] Optimize tensor parallel plus pipeline parallel send recv size (#34110)
上级
651aad06
变更
12
隐藏空白更改
内联
并排
Showing
12 changed file
with
801 addition
and
30 deletion
+801
-30
paddle/fluid/operators/collective/partial_allgather_op.cc
paddle/fluid/operators/collective/partial_allgather_op.cc
+85
-0
paddle/fluid/operators/collective/partial_allgather_op.cu.cc
paddle/fluid/operators/collective/partial_allgather_op.cu.cc
+91
-0
paddle/fluid/operators/collective/partial_allgather_op.h
paddle/fluid/operators/collective/partial_allgather_op.h
+39
-0
paddle/fluid/operators/collective/partial_recv_op.cc
paddle/fluid/operators/collective/partial_recv_op.cc
+131
-0
paddle/fluid/operators/collective/partial_recv_op.cu.cc
paddle/fluid/operators/collective/partial_recv_op.cu.cc
+106
-0
paddle/fluid/operators/collective/partial_recv_op.h
paddle/fluid/operators/collective/partial_recv_op.h
+37
-0
paddle/fluid/operators/collective/partial_send_op.cc
paddle/fluid/operators/collective/partial_send_op.cc
+101
-0
paddle/fluid/operators/collective/partial_send_op.cu.cc
paddle/fluid/operators/collective/partial_send_op.cu.cc
+99
-0
paddle/fluid/operators/collective/partial_send_op.h
paddle/fluid/operators/collective/partial_send_op.h
+38
-0
python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
...e/distributed/fleet/meta_optimizers/pipeline_optimizer.py
+2
-0
python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
...e/distributed/fleet/meta_optimizers/sharding_optimizer.py
+19
-14
python/paddle/fluid/optimizer.py
python/paddle/fluid/optimizer.py
+53
-16
未找到文件。
paddle/fluid/operators/collective/partial_allgather_op.cc
0 → 100644
浏览文件 @
348d043e
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/partial_allgather_op.h"
namespace
paddle
{
namespace
operators
{
class
PartialAllGatherOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
OP_INOUT_CHECK
(
ctx
->
HasInput
(
"X"
),
"Input"
,
"X"
,
"PartialAllGather"
);
OP_INOUT_CHECK
(
ctx
->
HasOutput
(
"Out"
),
"Input"
,
"Out"
,
"PartialAllGather"
);
int
nranks
=
ctx
->
Attrs
().
Get
<
int
>
(
"nranks"
);
int
rank
=
ctx
->
Attrs
().
Get
<
int
>
(
"rank"
);
PADDLE_ENFORCE_GE
(
nranks
,
2
,
platform
::
errors
::
InvalidArgument
(
"The value of nranks should be >=2."
));
PADDLE_ENFORCE_EQ
(
(
rank
>=
0
&&
rank
<
nranks
),
true
,
platform
::
errors
::
InvalidArgument
(
"The rank (%d) for partial_allgather op must >=0 and <nranks (%d)"
,
rank
,
nranks
));
framework
::
DDim
dim
=
ctx
->
GetInputDim
(
"X"
);
ctx
->
SetOutputDim
(
"Out"
,
dim
);
}
};
class
PartialAllGatherOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
{
AddInput
(
"X"
,
"(Tensor) tensor to be partial allgather"
);
AddOutput
(
"Out"
,
"(Tensor) the allgather result"
);
AddAttr
<
int
>
(
"ring_id"
,
"(int default 0) communication ring id."
)
.
SetDefault
(
0
);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr
<
std
::
string
>
(
"tag"
,
"(string default tag) tag for all gather."
)
.
SetDefault
(
"tag"
);
#endif
AddAttr
<
bool
>
(
"use_calc_stream"
,
"(bool default false) eject CUDA operations to calculation stream."
)
.
SetDefault
(
false
);
AddAttr
<
int
>
(
"nranks"
,
"Total trainer count of the distributed training job"
);
AddAttr
<
int
>
(
"rank"
,
"Rand of the distributed training job"
);
AddComment
(
R"DOC(
PartialAllGather Operator.
Divide the Input into nranks copies and only use the rank part.
Each rank receives the aggregation of data from all ranks in the order of the ranks.
reference: https://docs.nvidia.com/deeplearning/sdk/nccl-developer-guide/docs/usage/operations.html#allgather
)DOC"
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_WITHOUT_GRADIENT
(
partial_allgather
,
ops
::
PartialAllGatherOp
,
ops
::
PartialAllGatherOpMaker
);
REGISTER_OP_CPU_KERNEL
(
partial_allgather
,
ops
::
PartialAllGatherOpCPUKernel
<
float
>
,
ops
::
PartialAllGatherOpCPUKernel
<
double
>
,
ops
::
PartialAllGatherOpCPUKernel
<
int
>
,
ops
::
PartialAllGatherOpCPUKernel
<
int64_t
>
,
ops
::
PartialAllGatherOpCPUKernel
<
plat
::
float16
>
);
paddle/fluid/operators/collective/partial_allgather_op.cu.cc
0 → 100644
浏览文件 @
348d043e
/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/partial_allgather_op.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace
paddle
{
namespace
operators
{
template
<
typename
T
>
class
PartialAllGatherOpCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
auto
in
=
ctx
.
Input
<
framework
::
Tensor
>
(
"X"
);
auto
out
=
ctx
.
Output
<
framework
::
Tensor
>
(
"Out"
);
int64_t
numel
=
in
->
numel
();
ncclDataType_t
dtype
=
platform
::
ToNCCLDataType
(
in
->
type
());
int
nranks
=
ctx
.
Attr
<
int
>
(
"nranks"
);
int
rank
=
ctx
.
Attr
<
int
>
(
"rank"
);
int
rid
=
ctx
.
Attr
<
int
>
(
"ring_id"
);
auto
place
=
ctx
.
GetPlace
();
auto
comm
=
platform
::
NCCLCommContext
::
Instance
().
Get
(
rid
,
place
);
PADDLE_ENFORCE_EQ
(
nranks
,
comm
->
nranks
(),
platform
::
errors
::
InvalidArgument
(
"nranks: %s should equal to %s"
,
nranks
,
comm
->
nranks
()));
PADDLE_ENFORCE_EQ
(
rank
,
comm
->
rank
(),
platform
::
errors
::
InvalidArgument
(
"rank: %s should equal to %s"
,
rank
,
comm
->
rank
()));
PADDLE_ENFORCE_EQ
(
(
numel
%
nranks
),
0
,
platform
::
errors
::
InvalidArgument
(
"The input numel (%d) must be divisible by nranks(%d)"
,
numel
,
nranks
));
framework
::
DDim
dims
=
in
->
dims
();
out
->
mutable_data
<
T
>
(
dims
,
place
);
int64_t
send_numel
=
numel
/
nranks
;
int
offset
=
send_numel
*
rank
;
const
T
*
send_buff
=
in
->
data
<
T
>
()
+
offset
;
T
*
recv_buff
=
out
->
data
<
T
>
();
gpuStream_t
stream
=
nullptr
;
if
(
ctx
.
Attr
<
bool
>
(
"use_calc_stream"
))
{
auto
dev_ctx
=
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
);
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
dev_ctx
)
->
stream
();
}
else
{
stream
=
comm
->
stream
();
}
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
ncclAllGather
(
send_buff
,
recv_buff
,
send_numel
,
static_cast
<
ncclDataType_t
>
(
dtype
),
comm
->
comm
(),
stream
));
#else
PADDLE_THROW
(
platform
::
errors
::
PreconditionNotMet
(
"PaddlePaddle should compile with GPU."
));
#endif
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
partial_allgather
,
ops
::
PartialAllGatherOpCUDAKernel
<
float
>
,
ops
::
PartialAllGatherOpCUDAKernel
<
double
>
,
ops
::
PartialAllGatherOpCUDAKernel
<
int
>
,
ops
::
PartialAllGatherOpCUDAKernel
<
int64_t
>
,
ops
::
PartialAllGatherOpCUDAKernel
<
plat
::
float16
>
);
paddle/fluid/operators/collective/partial_allgather_op.h
0 → 100644
浏览文件 @
348d043e
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/ddim.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
T
>
class
PartialAllGatherOpCPUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
PADDLE_THROW
(
platform
::
errors
::
Unavailable
(
"Do not support partial_allgather for cpu kernel now."
));
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/collective/partial_recv_op.cc
0 → 100644
浏览文件 @
348d043e
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/partial_recv_op.h"
#include <string>
namespace
paddle
{
namespace
operators
{
class
PartialRecvOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
OP_INOUT_CHECK
(
ctx
->
HasOutput
(
"Out"
),
"Output"
,
"Out"
,
"PartialRecv"
);
int
peer
=
ctx
->
Attrs
().
Get
<
int
>
(
"peer"
);
int
ring_id
=
ctx
->
Attrs
().
Get
<
int
>
(
"ring_id"
);
int
num
=
ctx
->
Attrs
().
Get
<
int
>
(
"num"
);
int
id
=
ctx
->
Attrs
().
Get
<
int
>
(
"id"
);
auto
out_shape
=
ctx
->
Attrs
().
Get
<
std
::
vector
<
int
>>
(
"out_shape"
);
PADDLE_ENFORCE_GE
(
peer
,
0
,
platform
::
errors
::
InvalidArgument
(
"The peer (%d) for partial_recv op must be non-negative."
,
peer
));
PADDLE_ENFORCE_GE
(
ring_id
,
0
,
platform
::
errors
::
InvalidArgument
(
"The ring_id (%d) for partial_recv op must be non-negative."
,
ring_id
));
PADDLE_ENFORCE_GE
(
num
,
1
,
platform
::
errors
::
InvalidArgument
(
"The num (%d) for partial_send op must >=1"
,
num
));
PADDLE_ENFORCE_EQ
(
(
id
>=
0
&&
id
<
num
),
true
,
platform
::
errors
::
InvalidArgument
(
"The id (%d) for partial_send op must >=0 and <num (%d)"
,
id
,
num
));
PADDLE_ENFORCE_GE
(
out_shape
.
size
(),
1
,
platform
::
errors
::
InvalidArgument
(
"The size of the output shape must be greater than 0 "
"but the value given is %d."
,
out_shape
.
size
()));
for
(
size_t
i
=
0
;
i
<
out_shape
.
size
();
++
i
)
{
PADDLE_ENFORCE_GE
(
out_shape
[
i
],
1
,
platform
::
errors
::
InvalidArgument
(
"The shape attribute for partial_recv must be set "
"explicitly, but the %dth element is %d which "
"is less than 1."
,
i
,
out_shape
[
i
]));
}
auto
out_dims
=
framework
::
make_ddim
(
out_shape
);
int
numel
=
framework
::
product
(
out_dims
);
PADDLE_ENFORCE_EQ
(
(
numel
%
num
),
0
,
platform
::
errors
::
InvalidArgument
(
"The output numel (%d) must be divisible by num(%d)"
,
numel
,
num
));
ctx
->
SetOutputDim
(
"Out"
,
framework
::
make_ddim
(
out_shape
));
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
int
dtype
=
ctx
.
Attr
<
int
>
(
"dtype"
);
framework
::
proto
::
VarType
::
Type
type
=
framework
::
proto
::
VarType
::
Type
(
dtype
);
return
framework
::
OpKernelType
(
type
,
ctx
.
GetPlace
());
}
};
class
PartialRecvOpMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
{
AddOutput
(
"Out"
,
"(Tensor) tensor to receive."
);
AddAttr
<
int
>
(
"ring_id"
,
"(int default 0) nccl communication ring id."
)
.
SetDefault
(
0
);
AddAttr
<
int
>
(
"peer"
,
"(int default 0) rank id for sender."
).
SetDefault
(
0
);
AddAttr
<
int
>
(
"dtype"
,
"(int default 5('float32')) data type of tensor."
)
.
SetDefault
(
5
);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr
<
std
::
string
>
(
"tag"
,
"(string default tag) tag for broadcasting."
)
.
SetDefault
(
"tag"
);
AddAttr
<
int
>
(
"srTag"
,
"(string default tag) tag for broadcasting."
)
.
SetDefault
(
0
);
#endif
AddAttr
<
std
::
vector
<
int
>>
(
"out_shape"
,
"shape of the output tensor."
)
.
SetDefault
(
std
::
vector
<
int
>
());
AddAttr
<
bool
>
(
"use_calc_stream"
,
"(bool default false) eject CUDA operations to calculation stream."
)
.
SetDefault
(
false
);
AddAttr
<
int
>
(
"num"
,
"(int default 1) The number of Output to be cut."
)
.
SetDefault
(
1
);
AddAttr
<
int
>
(
"id"
,
"(int default 0) ID of the part to be recv after Output cut."
)
.
SetDefault
(
0
);
AddComment
(
R"DOC(
Recv Operator.
Divide the Output into num copies and only recv the id part.
Reference: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html#sendrecv
)DOC"
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_WITHOUT_GRADIENT
(
partial_recv
,
ops
::
PartialRecvOp
,
ops
::
PartialRecvOpMaker
);
REGISTER_OP_CPU_KERNEL
(
partial_recv
,
ops
::
PartialRecvOpCPUKernel
<
float
>
,
ops
::
PartialRecvOpCPUKernel
<
double
>
,
ops
::
PartialRecvOpCPUKernel
<
int
>
,
ops
::
PartialRecvOpCPUKernel
<
int64_t
>
,
ops
::
PartialRecvOpCPUKernel
<
plat
::
float16
>
);
paddle/fluid/operators/collective/partial_recv_op.cu.cc
0 → 100644
浏览文件 @
348d043e
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/partial_recv_op.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace
paddle
{
namespace
operators
{
template
<
typename
T
>
class
PartialRecvOpCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \
NCCL_VERSION_CODE >= 2703
auto
out
=
ctx
.
Output
<
framework
::
LoDTensor
>
(
"Out"
);
auto
out_dims
=
out
->
dims
();
auto
numel
=
out
->
numel
();
int
rid
=
ctx
.
Attr
<
int
>
(
"ring_id"
);
int
peer
=
ctx
.
Attr
<
int
>
(
"peer"
);
int
data_type
=
ctx
.
Attr
<
int
>
(
"dtype"
);
int
num
=
ctx
.
Attr
<
int
>
(
"num"
);
int
id
=
ctx
.
Attr
<
int
>
(
"id"
);
framework
::
proto
::
VarType
::
Type
type
=
framework
::
proto
::
VarType
::
Type
(
data_type
);
PADDLE_ENFORCE_GE
(
rid
,
0
,
platform
::
errors
::
InvalidArgument
(
"The ring_id (%d) for partial_recv op must be non-negative."
,
rid
));
PADDLE_ENFORCE_GE
(
peer
,
0
,
platform
::
errors
::
InvalidArgument
(
"The peer (%d) for partial_recv op must be non-negative."
,
peer
));
PADDLE_ENFORCE_GE
(
num
,
1
,
platform
::
errors
::
InvalidArgument
(
"The num (%d) for partial_recv op must >=1"
,
num
));
PADDLE_ENFORCE_EQ
(
(
id
>=
0
&&
id
<
num
),
true
,
platform
::
errors
::
InvalidArgument
(
"The id (%d) for partial_recv op must >=0 and <num (%d)"
,
id
,
num
));
PADDLE_ENFORCE_EQ
(
(
numel
%
num
),
0
,
platform
::
errors
::
InvalidArgument
(
"The input numel (%d) must be divisible by num(%d)"
,
numel
,
num
));
gpuStream_t
stream
=
nullptr
;
auto
place
=
ctx
.
GetPlace
();
auto
comm
=
platform
::
NCCLCommContext
::
Instance
().
Get
(
rid
,
place
);
if
(
ctx
.
Attr
<
bool
>
(
"use_calc_stream"
))
{
auto
dev_ctx
=
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
);
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
dev_ctx
)
->
stream
();
}
else
{
stream
=
comm
->
stream
();
}
PADDLE_ENFORCE_LT
(
peer
,
comm
->
nranks
(),
platform
::
errors
::
InvalidArgument
(
"The value of peer (%d) you set must "
"be less than comm->nranks (%d)."
,
peer
,
comm
->
nranks
()));
out
->
mutable_data
<
T
>
(
out_dims
,
place
);
ncclDataType_t
dtype
=
platform
::
ToNCCLDataType
(
type
);
int
recv_numel
=
numel
/
num
;
int
offset
=
recv_numel
*
id
;
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
ncclRecv
(
out
->
data
<
T
>
()
+
offset
,
recv_numel
,
dtype
,
peer
,
comm
->
comm
(),
stream
));
VLOG
(
3
)
<<
"rank "
<<
comm
->
rank
()
<<
" recv "
<<
recv_numel
<<
" from offset["
<<
offset
<<
"] from "
<<
peer
;
#else
PADDLE_THROW
(
platform
::
errors
::
Unavailable
(
"PaddlePaddle should be compiled with NCCL and "
"NCCL version >= 2.7.3 is needed."
));
#endif
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
partial_recv
,
ops
::
PartialRecvOpCUDAKernel
<
float
>
,
ops
::
PartialRecvOpCUDAKernel
<
double
>
,
ops
::
PartialRecvOpCUDAKernel
<
int
>
,
ops
::
PartialRecvOpCUDAKernel
<
int64_t
>
,
ops
::
PartialRecvOpCUDAKernel
<
plat
::
float16
>
);
paddle/fluid/operators/collective/partial_recv_op.h
0 → 100644
浏览文件 @
348d043e
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
T
>
class
PartialRecvOpCPUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
PADDLE_THROW
(
platform
::
errors
::
Unavailable
(
"Do not support partial_recv for cpu kernel now."
));
}
};
}
// namespace operators
}
// namespace paddle
paddle/fluid/operators/collective/partial_send_op.cc
0 → 100644
浏览文件 @
348d043e
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/partial_send_op.h"
namespace
paddle
{
namespace
operators
{
class
PartialSendOp
:
public
framework
::
OperatorWithKernel
{
public:
using
framework
::
OperatorWithKernel
::
OperatorWithKernel
;
void
InferShape
(
framework
::
InferShapeContext
*
ctx
)
const
override
{
OP_INOUT_CHECK
(
ctx
->
HasInput
(
"X"
),
"Input"
,
"X"
,
"PartialSend"
);
int
peer
=
ctx
->
Attrs
().
Get
<
int
>
(
"peer"
);
int
ring_id
=
ctx
->
Attrs
().
Get
<
int
>
(
"ring_id"
);
int
num
=
ctx
->
Attrs
().
Get
<
int
>
(
"num"
);
int
id
=
ctx
->
Attrs
().
Get
<
int
>
(
"id"
);
PADDLE_ENFORCE_GE
(
peer
,
0
,
platform
::
errors
::
InvalidArgument
(
"The peer (%d) for partial_send op must be non-negative."
,
peer
));
PADDLE_ENFORCE_GE
(
ring_id
,
0
,
platform
::
errors
::
InvalidArgument
(
"The ring_id (%d) for partial_send op must be non-negative."
,
ring_id
));
PADDLE_ENFORCE_GE
(
num
,
1
,
platform
::
errors
::
InvalidArgument
(
"The num (%d) for partial_send op must >=1"
,
num
));
PADDLE_ENFORCE_EQ
(
(
id
>=
0
&&
id
<
num
),
true
,
platform
::
errors
::
InvalidArgument
(
"The id (%d) for partial_send op must >=0 and <num (%d)"
,
id
,
num
));
}
protected:
framework
::
OpKernelType
GetExpectedKernelType
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
return
framework
::
OpKernelType
(
OperatorWithKernel
::
IndicateVarDataType
(
ctx
,
"X"
),
ctx
.
GetPlace
());
}
};
class
PartialSendMaker
:
public
framework
::
OpProtoAndCheckerMaker
{
public:
void
Make
()
{
AddInput
(
"X"
,
"(Tensor) tensor to be sent."
);
AddAttr
<
int
>
(
"ring_id"
,
"(int default 0) nccl communication ring id."
)
.
SetDefault
(
0
);
AddAttr
<
int
>
(
"peer"
,
"(int default 0) rank id for receiver."
).
SetDefault
(
0
);
#if defined(PADDLE_WITH_ASCEND_CL)
AddAttr
<
std
::
string
>
(
"tag"
,
"(string default tag) tag for broadcasting."
)
.
SetDefault
(
"tag"
);
AddAttr
<
int
>
(
"srTag"
,
"(string default tag) tag for broadcasting."
)
.
SetDefault
(
0
);
#endif
AddAttr
<
bool
>
(
"use_calc_stream"
,
"(bool default false) eject CUDA operations to calculation stream."
)
.
SetDefault
(
false
);
AddAttr
<
int
>
(
"num"
,
"(int default 1) The number of Input to be cut."
)
.
SetDefault
(
1
);
AddAttr
<
int
>
(
"id"
,
"(int default 0) ID of the part to be sent after Input cut."
)
.
SetDefault
(
0
);
AddComment
(
R"DOC(
PartialSend Operator.
Divide the Input into num copies and only send the id part.
Reference: https://docs.nvidia.com/deeplearning/nccl/user-guide/docs/usage/p2p.html#sendrecv
)DOC"
);
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_WITHOUT_GRADIENT
(
partial_send
,
ops
::
PartialSendOp
,
ops
::
PartialSendMaker
);
REGISTER_OP_CPU_KERNEL
(
partial_send
,
ops
::
PartialSendOpCPUKernel
<
float
>
,
ops
::
PartialSendOpCPUKernel
<
double
>
,
ops
::
PartialSendOpCPUKernel
<
int
>
,
ops
::
PartialSendOpCPUKernel
<
int64_t
>
,
ops
::
PartialSendOpCPUKernel
<
plat
::
float16
>
);
paddle/fluid/operators/collective/partial_send_op.cu.cc
0 → 100644
浏览文件 @
348d043e
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/collective/partial_send_op.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace
paddle
{
namespace
operators
{
template
<
typename
T
>
class
PartialSendCUDAKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
#if (defined(PADDLE_WITH_RCCL) || defined(PADDLE_WITH_NCCL)) && \
NCCL_VERSION_CODE >= 2703
auto
x
=
ctx
.
Input
<
framework
::
LoDTensor
>
(
"X"
);
int
numel
=
x
->
numel
();
int
rid
=
ctx
.
Attr
<
int
>
(
"ring_id"
);
int
peer
=
ctx
.
Attr
<
int
>
(
"peer"
);
int
num
=
ctx
.
Attr
<
int
>
(
"num"
);
int
id
=
ctx
.
Attr
<
int
>
(
"id"
);
PADDLE_ENFORCE_GE
(
rid
,
0
,
platform
::
errors
::
InvalidArgument
(
"The ring_id (%d) for partial_send op must be non-negative."
,
rid
));
PADDLE_ENFORCE_GE
(
peer
,
0
,
platform
::
errors
::
InvalidArgument
(
"The peer (%d) for partial_send op must be non-negative."
,
peer
));
PADDLE_ENFORCE_GE
(
num
,
1
,
platform
::
errors
::
InvalidArgument
(
"The num (%d) for partial_send op must >=1"
,
num
));
PADDLE_ENFORCE_EQ
(
(
id
>=
0
&&
id
<
num
),
true
,
platform
::
errors
::
InvalidArgument
(
"The id (%d) for partial_send op must >=0 and <num (%d)"
,
id
,
num
));
PADDLE_ENFORCE_EQ
(
(
numel
%
num
),
0
,
platform
::
errors
::
InvalidArgument
(
"The input numel (%d) must be divisible by num(%d)"
,
numel
,
num
));
gpuStream_t
stream
=
nullptr
;
auto
place
=
ctx
.
GetPlace
();
auto
comm
=
platform
::
NCCLCommContext
::
Instance
().
Get
(
rid
,
place
);
if
(
ctx
.
Attr
<
bool
>
(
"use_calc_stream"
))
{
auto
dev_ctx
=
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
);
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
dev_ctx
)
->
stream
();
}
else
{
stream
=
comm
->
stream
();
}
PADDLE_ENFORCE_LT
(
peer
,
comm
->
nranks
(),
platform
::
errors
::
InvalidArgument
(
"The value of peer (%d) you set must "
"be less than comm->nranks (%d)."
,
peer
,
comm
->
nranks
()));
ncclDataType_t
dtype
=
platform
::
ToNCCLDataType
(
x
->
type
());
int
send_numel
=
numel
/
num
;
int
offset
=
send_numel
*
id
;
PADDLE_ENFORCE_CUDA_SUCCESS
(
platform
::
dynload
::
ncclSend
(
x
->
data
<
T
>
()
+
offset
,
send_numel
,
dtype
,
peer
,
comm
->
comm
(),
stream
));
VLOG
(
3
)
<<
"rank "
<<
comm
->
rank
()
<<
" send "
<<
send_numel
<<
" from offset["
<<
offset
<<
"] to "
<<
peer
;
#else
PADDLE_THROW
(
platform
::
errors
::
Unavailable
(
"PaddlePaddle should be compiled with NCCL "
"and NCCL version >= 2.7.3 is needed."
));
#endif
}
};
}
// namespace operators
}
// namespace paddle
namespace
ops
=
paddle
::
operators
;
namespace
plat
=
paddle
::
platform
;
REGISTER_OP_CUDA_KERNEL
(
partial_send
,
ops
::
PartialSendCUDAKernel
<
float
>
,
ops
::
PartialSendCUDAKernel
<
double
>
,
ops
::
PartialSendCUDAKernel
<
int
>
,
ops
::
PartialSendCUDAKernel
<
int64_t
>
,
ops
::
PartialSendCUDAKernel
<
plat
::
float16
>
);
paddle/fluid/operators/collective/partial_send_op.h
0 → 100644
浏览文件 @
348d043e
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
namespace
paddle
{
namespace
operators
{
template
<
typename
T
>
class
PartialSendOpCPUKernel
:
public
framework
::
OpKernel
<
T
>
{
public:
void
Compute
(
const
framework
::
ExecutionContext
&
ctx
)
const
override
{
PADDLE_THROW
(
platform
::
errors
::
Unavailable
(
"Do not support partial_send for cpu kernel now."
));
}
};
}
// namespace operators
}
// namespace paddle
python/paddle/distributed/fleet/meta_optimizers/pipeline_optimizer.py
浏览文件 @
348d043e
...
...
@@ -183,6 +183,8 @@ class PipelineOptimizer(MetaOptimizerBase):
program
.
_pipeline_opt
[
'micro_batch_size'
]
=
self
.
micro_batch_size
program
.
_pipeline_opt
[
'schedule_mode'
]
=
self
.
schedule_mode
program
.
_pipeline_opt
[
'use_sharding'
]
=
False
program
.
_pipeline_opt
[
'mp_degree'
]
=
1
program
.
_pipeline_opt
[
'mp_rank'
]
=
0
optimize_ops
,
params_grads
,
prog_list
,
pp_pair
,
ring_map
=
self
.
wrapped_opt
.
minimize
(
loss
,
startup_program
,
parameter_list
,
no_grad_set
)
self
.
startup_program
=
orig_startup_program
.
_pipeline_opt
[
...
...
python/paddle/distributed/fleet/meta_optimizers/sharding_optimizer.py
浏览文件 @
348d043e
...
...
@@ -195,23 +195,28 @@ class ShardingOptimizer(MetaOptimizerBase):
if
self
.
pp_degree
>
1
:
pp_optimizer
=
fluid
.
optimizer
.
PipelineOptimizer
(
self
.
inner_opt
,
self
.
_gradient_merge_acc_step
)
main_program
=
loss
.
block
.
program
main_program
.
_pipeline_opt
=
dict
()
self
.
schedule_mode
=
self
.
user_defined_strategy
.
pipeline_configs
[
'schedule_mode'
]
main_program
.
_pipeline_opt
[
'schedule_mode'
]
=
self
.
schedule_mode
main_program
.
_pipeline_opt
[
'micro_batch_size'
]
=
self
.
user_defined_strategy
.
pipeline_configs
[
'micro_batch_size'
]
strategy
=
self
.
user_defined_strategy
self
.
schedule_mode
=
strategy
.
pipeline_configs
[
'schedule_mode'
]
self
.
pp_rank_
=
self
.
role_maker
.
_worker_index
()
//
(
self
.
sharding_degree
*
self
.
mp_degree
)
%
self
.
pp_degree
main_program
.
_pipeline_opt
[
'local_rank'
]
=
self
.
pp_rank_
main_program
.
_pipeline_opt
[
'global_rank'
]
=
self
.
role_maker
.
_worker_index
()
main_program
.
_pipeline_opt
[
'use_sharding'
]
=
True
pipeline_opt
=
dict
()
pipeline_opt
[
'schedule_mode'
]
=
self
.
schedule_mode
pipeline_opt
[
'micro_batch_size'
]
=
strategy
.
pipeline_configs
[
'micro_batch_size'
]
pipeline_opt
[
'local_rank'
]
=
self
.
pp_rank_
pipeline_opt
[
'global_rank'
]
=
self
.
role_maker
.
_worker_index
()
pipeline_opt
[
'use_sharding'
]
=
True
# TODO (JZ-LIANG) should revise here for support mix parallelism with pipeline
main_program
.
_pipeline_opt
[
'ring_id'
]
=
20
main_program
.
_pipeline_opt
[
'global_ring_id'
]
=
3
pipeline_opt
[
'ring_id'
]
=
20
pipeline_opt
[
'global_ring_id'
]
=
3
pipeline_opt
[
'mp_degree'
]
=
self
.
mp_degree
pipeline_opt
[
'mp_rank'
]
=
self
.
role_maker
.
_worker_index
(
)
%
self
.
mp_degree
main_program
=
loss
.
block
.
program
main_program
.
_pipeline_opt
=
pipeline_opt
optimize_ops
,
params_grads
,
program_list
,
self
.
pipeline_pair
,
self
.
pp_ring_map
=
pp_optimizer
.
minimize
(
loss
,
startup_program
,
parameter_list
,
no_grad_set
)
...
...
python/paddle/fluid/optimizer.py
浏览文件 @
348d043e
...
...
@@ -4709,9 +4709,11 @@ class PipelineOptimizer(object):
if
interval
==
-
1
:
decrease_flag
=
True
if
interval
==
1
:
assert
decrease_flag
is
False
,
\
"Pipeline stage must be in order, "
\
"please check the stage of op={}"
.
format
(
op
)
# FIXME(wangxi): recompute failed
pass
#assert decrease_flag is False, \
# "Pipeline stage must be in order, " \
# "please check the stage of op={}".format(op)
pre_stage_id
=
stage_id
return
device_list
...
...
@@ -4844,7 +4846,8 @@ class PipelineOptimizer(object):
extra_index_info
[
'index'
]
+=
1
block
.
_insert_op_without_sync
(
index
=
index
+
extra_index_info
[
'index'
],
type
=
'send_v2'
,
type
=
'send_v2'
if
self
.
mp_degree
==
1
else
'partial_send'
,
inputs
=
{
'X'
:
var
},
attrs
=
{
self
.
_op_device_key
:
prev_dev
,
...
...
@@ -4852,6 +4855,9 @@ class PipelineOptimizer(object):
'use_calc_stream'
:
False
,
'ring_id'
:
ring_id
,
'peer'
:
1
,
# if send_v2, num&id attr is not in op_attrs, will not insert
'num'
:
self
.
mp_degree
,
'id'
:
self
.
mp_rank
,
})
extra_index_info
[
'index'
]
+=
1
insert_index
=
None
...
...
@@ -4882,9 +4888,14 @@ class PipelineOptimizer(object):
var_shape
=
list
(
var
.
shape
)
var_shape
[
0
]
=
self
.
micro_batch_size
if
var_shape
[
0
]
<
0
else
var_shape
[
0
]
numel
=
np
.
prod
(
var
.
shape
)
assert
numel
%
self
.
mp_degree
==
0
,
\
"The numel={} must be divisible by mp_degree={}"
.
format
(
numel
,
self
.
mp_degree
)
block
.
_insert_op_without_sync
(
index
=
index
+
extra_index_info
[
'index'
],
type
=
'recv_v2'
,
type
=
'recv_v2'
if
self
.
mp_degree
==
1
else
'partial_recv'
,
outputs
=
{
'Out'
:
[
var
]},
attrs
=
{
'out_shape'
:
var_shape
,
...
...
@@ -4893,9 +4904,28 @@ class PipelineOptimizer(object):
self
.
_op_role_key
:
op_role
,
'use_calc_stream'
:
True
,
'peer'
:
0
,
'ring_id'
:
ring_id
'ring_id'
:
ring_id
,
# if recv_v2, num&id attr is not in op_attrs, will not insert
'num'
:
self
.
mp_degree
,
'id'
:
self
.
mp_rank
,
})
extra_index_info
[
'index'
]
+=
1
if
self
.
mp_degree
>
1
:
block
.
_insert_op_without_sync
(
index
=
index
+
extra_index_info
[
'index'
],
type
=
'partial_allgather'
,
inputs
=
{
'X'
:
[
var
]},
outputs
=
{
'Out'
:
[
var
]},
attrs
=
{
self
.
_op_device_key
:
cur_dev
,
self
.
_op_role_key
:
op_role
,
'use_calc_stream'
:
True
,
'ring_id'
:
0
,
# if recv_v2, num&id attr is not in op_attrs, will not insert
'nranks'
:
self
.
mp_degree
,
'rank'
:
self
.
mp_rank
,
})
extra_index_info
[
'index'
]
+=
1
else
:
raise
ValueError
(
"Now only 'F-then-B' and '1F1B' are supported."
...
...
@@ -5207,9 +5237,10 @@ class PipelineOptimizer(object):
block
=
program
.
block
(
0
)
recv_type
=
'recv_v2'
if
self
.
mp_degree
==
1
else
'partial_recv'
backward_recv_index
=
None
for
index
,
op
in
enumerate
(
block
.
ops
):
if
op
.
type
==
'recv_v2'
and
self
.
_is_backward_op
(
op
):
if
op
.
type
==
recv_type
and
self
.
_is_backward_op
(
op
):
backward_recv_index
=
index
break
...
...
@@ -5248,7 +5279,8 @@ class PipelineOptimizer(object):
if
startup_program
is
None
:
startup_program
=
default_startup_program
()
assert
main_program
.
_pipeline_opt
,
'Please use pipeline with fleet.'
pipeline_opt
=
main_program
.
_pipeline_opt
assert
pipeline_opt
,
'Please use pipeline with fleet.'
required_keys
=
[
'local_rank'
,
'schedule_mode'
,
...
...
@@ -5256,17 +5288,22 @@ class PipelineOptimizer(object):
'ring_id'
,
'global_ring_id'
,
'use_sharding'
,
'mp_degree'
,
'mp_rank'
,
]
for
key
in
required_keys
:
assert
key
in
main_program
.
_
pipeline_opt
,
\
assert
key
in
pipeline_opt
,
\
'Please use pipeline with fleet to use {}.'
.
format
(
key
)
self
.
local_rank
=
main_block
.
program
.
_pipeline_opt
[
'local_rank'
]
self
.
schedule_mode
=
main_block
.
program
.
_pipeline_opt
[
'schedule_mode'
]
self
.
micro_batch_size
=
main_block
.
program
.
_pipeline_opt
[
'micro_batch_size'
]
self
.
use_sharding
=
main_block
.
program
.
_pipeline_opt
[
'use_sharding'
]
self
.
ring_id
=
main_block
.
program
.
_pipeline_opt
[
'ring_id'
]
self
.
global_ring_id
=
main_block
.
program
.
_pipeline_opt
[
'global_ring_id'
]
self
.
local_rank
=
pipeline_opt
[
'local_rank'
]
self
.
schedule_mode
=
pipeline_opt
[
'schedule_mode'
]
self
.
micro_batch_size
=
pipeline_opt
[
'micro_batch_size'
]
self
.
use_sharding
=
pipeline_opt
[
'use_sharding'
]
self
.
ring_id
=
pipeline_opt
[
'ring_id'
]
self
.
global_ring_id
=
pipeline_opt
[
'global_ring_id'
]
self
.
mp_degree
=
pipeline_opt
[
'mp_degree'
]
self
.
mp_rank
=
pipeline_opt
[
'mp_rank'
]
assert
self
.
mp_degree
>=
1
assert
0
<=
self
.
mp_rank
<
self
.
mp_degree
optimize_ops
,
params_grads
=
self
.
_optimizer
.
minimize
(
loss
,
startup_program
,
parameter_list
,
no_grad_set
)
...
...
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录