Crayon鑫 / Paddle (forked from PaddlePaddle / Paddle)
Commit e6fb6599 (unverified)
Authored on Apr 15, 2022 by Haohongxiang; committed via GitHub on Apr 15, 2022
Parent: ff818c77

[Dygraph] Refactor Model Parallel in eager mode (#41761)

* refactor mp in eager mode
* update
* update
* add uts
Showing 12 changed files with 305 additions and 45 deletions (+305 -45)
paddle/fluid/operators/class_center_sample_op.cu  +30 -13
paddle/fluid/operators/collective/c_allreduce_op.h  +40 -1
paddle/fluid/operators/collective/c_concat_op.cu.cc  +31 -16
paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu  +139 -0
paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h  +12 -0
python/paddle/distributed/collective.py  +26 -2
python/paddle/fluid/dygraph/parallel.py  +1 -1
python/paddle/fluid/layers/tensor.py  +3 -1
python/paddle/fluid/tests/unittests/test_parallel_class_center_sample.py  +4 -0
python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py  +8 -10
python/paddle/fluid/tests/unittests/test_parallel_dygraph_mp_layers.py  +3 -0
python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py  +8 -1
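The recurring change across the CUDA kernels below is a dispatch on distributed::ProcessGroupMapFromGid: when an eager-mode ProcessGroup is registered for the ring_id, the kernel hands the tensors to pg->AllReduce / pg->AllGather and waits on the returned task; otherwise it falls back to the existing NCCLCommContext path. A minimal sketch of how that eager path is exercised from Python follows; the script, the 2-GPU launch command, and the use of FLAGS_enable_eager_mode to pick the mode are illustrative assumptions, not part of this commit.

# Hedged sketch, assuming a 2-GPU launch via:
#   python -m paddle.distributed.launch --gpus "0,1" demo_eager_allreduce.py
import os
os.environ.setdefault("FLAGS_enable_eager_mode", "1")  # same switch the updated tests toggle

import paddle
import paddle.distributed as dist

dist.init_parallel_env()                       # registers a ProcessGroup for the default ring
x = paddle.to_tensor([float(dist.get_rank())])
dist.all_reduce(x)                             # in eager mode this goes through the ProcessGroup
print("rank", dist.get_rank(), "sum =", float(x))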
paddle/fluid/operators/class_center_sample_op.cu
View file @ e6fb6599

@@ -27,8 +27,10 @@ namespace cub = hipcub;
 #include <iterator>
 #include <random>
 #include "paddle/fluid/operators/class_center_sample_op.h"
+#include "paddle/phi/api/include/tensor.h"
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #endif

@@ -328,6 +330,20 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel<T> {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     if (nranks > 1) {
+      auto map = distributed::ProcessGroupMapFromGid::getInstance();
+      if (map->has(rid)) {
+        // Use ProcessGroup
+        distributed::ProcessGroup* pg = map->get(rid);
+        std::vector<phi::DenseTensor> in_tensor;
+        std::vector<phi::DenseTensor> out_tensor;
+        in_tensor.push_back(num_classes_per_device);
+        out_tensor.push_back(num_classes_per_device);
+        distributed::AllreduceOptions opts;
+        opts.reduce_op = distributed::ReduceOp::SUM;
+        auto task = pg->AllReduce(in_tensor, out_tensor, opts);
+        task->Wait();
+      } else {
       const auto& comm =
           platform::NCCLCommContext::Instance().Get(rid, ctx.GetPlace());
       // use global calculate stream

@@ -342,6 +358,7 @@ class ClassCenterSampleCUDAKernel : public framework::OpKernel<T> {
           framework::TransToProtoVarType(num_classes_per_device.dtype())),
           ncclSum, comm->comm(), calcu_stream));
+      }
     }
 #endif
     // step 2: Determine temporary device storage requirements
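For context, a hedged usage sketch of the Python API served by this kernel; the tensor shapes and values are assumptions for illustration only. Under model parallel (after paddle.distributed.init_parallel_env()), the per-device class counts are summed across ranks, which is the all-reduce rewritten above to prefer an eager-mode ProcessGroup.

import paddle

num_classes, num_samples = 20, 6
label = paddle.randint(low=0, high=num_classes, shape=[10], dtype='int64')
remapped_label, sampled_class_index = paddle.nn.functional.class_center_sample(
    label, num_classes, num_samples)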
paddle/fluid/operators/collective/c_allreduce_op.h
View file @ e6fb6599

@@ -16,12 +16,14 @@ limitations under the License. */
 #include <string>
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/fluid/memory/memory.h"
 #include "paddle/fluid/platform/device/npu/npu_op_runner.h"
+#include "paddle/phi/api/include/tensor.h"
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) || \
     defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_XPU_BKCL) || \

@@ -351,6 +353,7 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
     auto in = ctx.Input<framework::Tensor>("X");
     auto out = ctx.Output<framework::Tensor>("Out");
+    int rid = ctx.Attr<int>("ring_id");
     auto place = ctx.GetPlace();
     ncclDataType_t dtype =

@@ -360,7 +363,43 @@ class CAllReduceOpCUDAKernel : public framework::OpKernel<T> {
     out->Resize(in->dims());
     void* recvbuff = out->mutable_data<T>(place);
-    int rid = ctx.Attr<int>("ring_id");
+    auto map = distributed::ProcessGroupMapFromGid::getInstance();
+    if (map->has(rid)) {
+      // Use ProcessGroup
+      distributed::ProcessGroup* pg = map->get(rid);
+      std::vector<phi::DenseTensor> in_tensor;
+      std::vector<phi::DenseTensor> out_tensor;
+      in_tensor.push_back(*in);
+      out_tensor.push_back(*out);
+      distributed::AllreduceOptions opts;
+      switch (red_type) {
+        case kRedSum:
+          opts.reduce_op = distributed::ReduceOp::SUM;
+          break;
+        case kRedMax:
+          opts.reduce_op = distributed::ReduceOp::MAX;
+          break;
+        case kRedMin:
+          opts.reduce_op = distributed::ReduceOp::MIN;
+          break;
+        case kRedProd:
+          opts.reduce_op = distributed::ReduceOp::PRODUCT;
+          break;
+        default:
+          PADDLE_THROW(platform::errors::InvalidArgument(
+              "Invalid reduce type: %d", red_type));
+      }
+      auto task = pg->AllReduce(in_tensor, out_tensor, opts);
+      task->Wait();
+      return;
+    }
     auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
     gpuStream_t stream = nullptr;
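The new branch maps the kernel's internal red_type to distributed::ReduceOp before handing the tensors to the ProcessGroup. A hedged sketch of the corresponding public reduce ops, for orientation only (the script and launch setup are assumptions, not from this commit):

import paddle
import paddle.distributed as dist

dist.init_parallel_env()
x = paddle.to_tensor([1.0 + dist.get_rank()])
dist.all_reduce(x, op=dist.ReduceOp.SUM)   # kRedSum  -> distributed::ReduceOp::SUM
dist.all_reduce(x, op=dist.ReduceOp.MAX)   # kRedMax  -> distributed::ReduceOp::MAX
dist.all_reduce(x, op=dist.ReduceOp.MIN)   # kRedMin  -> distributed::ReduceOp::MIN
dist.all_reduce(x, op=dist.ReduceOp.PROD)  # kRedProd -> distributed::ReduceOp::PRODUCT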
paddle/fluid/operators/collective/c_concat_op.cu.cc
View file @ e6fb6599

@@ -16,8 +16,10 @@ limitations under the License. */
 #include "paddle/fluid/operators/collective/c_concat_op.h"
 #include "paddle/fluid/operators/math/concat_and_split.h"
+#include "paddle/phi/api/include/tensor.h"
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #endif

@@ -55,16 +57,28 @@ class CConcatOpCUDAKernel : public framework::OpKernel<T> {
                           rank, nranks));
 #if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
+    framework::Tensor temp_out;
+    framework::DDim temp_out_dims = x->dims();
+    temp_out_dims[0] *= nranks;
+    temp_out.mutable_data<T>(temp_out_dims, place);
+    auto map = distributed::ProcessGroupMapFromGid::getInstance();
+    if (map->has(rid)) {
+      // Use ProcessGroup
+      distributed::ProcessGroup* pg = map->get(rid);
+      std::vector<phi::DenseTensor> in_tensor;
+      std::vector<phi::DenseTensor> out_tensor;
+      in_tensor.push_back(*x);
+      out_tensor.push_back(temp_out);
+      auto task = pg->AllGather(in_tensor, out_tensor);
+      task->Wait();
+    } else {
     auto comm = platform::NCCLCommContext::Instance().Get(rid, place);
     PADDLE_ENFORCE_EQ(
         nranks, comm->nranks(),
         platform::errors::InvalidArgument("nranks: %s should equal to %s",
                                           nranks, comm->nranks()));
-    framework::Tensor temp_out;
-    framework::DDim temp_out_dims = x->dims();
-    temp_out_dims[0] *= nranks;
-    temp_out.mutable_data<T>(temp_out_dims, place);
     int64_t send_numel = x->numel();
     const T* send_buff = x->data<T>();
     T* recv_buff = temp_out.data<T>();

@@ -75,6 +89,7 @@ class CConcatOpCUDAKernel : public framework::OpKernel<T> {
     PADDLE_ENFORCE_GPU_SUCCESS(platform::dynload::ncclAllGather(
         send_buff, recv_buff, send_numel, static_cast<ncclDataType_t>(dtype),
         comm->comm(), stream));
+    }
     std::vector<framework::Tensor> inputs;
     int axis = x->dims().size() - 1;
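c_concat all-gathers each rank's shard into a temporary buffer and then concatenates along the last axis; the ProcessGroup branch above only changes how the gather is issued. A hedged Python equivalent of the computation, with an assumed shard shape, is:

import paddle
import paddle.distributed as dist

dist.init_parallel_env()
shard = paddle.rand([4, 8])              # this rank's [batch, hidden/nranks] slice (assumed shape)
gathered = []
dist.all_gather(gathered, shard)         # one tensor per rank
full = paddle.concat(gathered, axis=-1)  # [batch, hidden], matching c_concat's output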
paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.cu
View file @ e6fb6599

@@ -18,6 +18,7 @@ limitations under the License. */
 #include "paddle/fluid/platform/collective_helper.h"
 #include "paddle/fluid/platform/device/gpu/nccl_helper.h"
 #include "paddle/fluid/string/string_helper.h"
+#include "paddle/phi/backends/gpu/gpu_context.h"
 #include "paddle/phi/kernels/funcs/axis_utils.h"

 namespace paddle {

@@ -73,6 +74,21 @@ template <typename T>
 class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
+    const int rid = ctx.Attr<int>("ring_id");
+    auto map = distributed::ProcessGroupMapFromGid::getInstance();
+    if (map->has(rid)) {
+      CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> functor_;
+      functor_(ctx);
+    } else {
+      CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> functor_;
+      functor_(ctx);
+    }
+  }
+};
+
+template <typename T>
+struct CSoftmaxWithCrossEntropyFunctor<phi::GPUContext, T> {
+  void operator()(const framework::ExecutionContext& ctx) {
     const Tensor* logits = ctx.Input<Tensor>("Logits");
     const Tensor* labels = ctx.Input<Tensor>("Label");
     Tensor* softmax = ctx.Output<Tensor>("Softmax");

@@ -201,6 +217,129 @@ class CSoftmaxWithCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
   }
 };

+template <typename T>
+struct CSoftmaxWithCrossEntropyProcessGroupFunctor<phi::GPUContext, T> {
+  void operator()(const framework::ExecutionContext& ctx) {
+    const Tensor* logits = ctx.Input<Tensor>("Logits");
+    const Tensor* labels = ctx.Input<Tensor>("Label");
+    Tensor* softmax = ctx.Output<Tensor>("Softmax");
+    Tensor* loss = ctx.Output<Tensor>("Loss");
+
+    const int rid = ctx.Attr<int>("ring_id");
+    const int nranks = ctx.Attr<int>("nranks");
+    const int rank = ctx.Attr<int>("rank");
+
+    const auto& place = ctx.GetPlace();
+    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
+
+    auto map = distributed::ProcessGroupMapFromGid::getInstance();
+    distributed::ProcessGroup* pg = map->get(rid);
+    distributed::AllreduceOptions opts;
+    opts.reduce_op = distributed::ReduceOp::SUM;
+
+    // allocate memory on device.
+    softmax->mutable_data<T>(place);
+    loss->mutable_data<T>(place);
+
+    const auto& logits_dims = logits->dims();
+    const auto& labels_dims = labels->dims();
+
+    const int axis = logits_dims.size() - 1;
+    const int N = phi::funcs::SizeToAxis(axis, logits_dims);
+    const int D = phi::funcs::SizeFromAxis(axis, logits_dims);
+
+    Tensor logits_2d, softmax_2d, loss_2d;
+    logits_2d.ShareDataWith(*logits).Resize({N, D});
+    softmax_2d.ShareDataWith(*softmax).Resize({N, D});
+    loss_2d.ShareDataWith(*loss).Resize({N, 1});
+
+    auto eigen_logits = math::EigenMatrix<T>::From(logits_2d);
+    auto eigen_softmax = math::EigenMatrix<T>::From(softmax_2d);
+
+    // step 1, obtain logit_max
+    Tensor logits_max;
+    logits_max =
+        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
+    auto eigen_logits_max = math::EigenMatrix<T>::From(logits_max);
+
+    Eigen::DSizes<int, 1> along_axis(1);
+    eigen_logits_max.device(*dev_ctx.eigen_device()) =
+        eigen_logits.maximum(along_axis);
+
+    std::vector<phi::DenseTensor> in_out;
+    in_out.push_back(logits_max);
+    pg->AllReduce(in_out, in_out, opts)->Synchronize();
+
+    // step 2, obtain logit - logit_max
+    Eigen::DSizes<int, 2> batch_by_one(N, 1);
+    Eigen::DSizes<int, 2> one_by_class(1, D);
+    eigen_softmax.device(*dev_ctx.eigen_device()) =
+        (eigen_logits -
+         eigen_logits_max.reshape(batch_by_one).broadcast(one_by_class))
+            .unaryExpr(math::ValueClip<T>());
+
+    // step 3, obtain predict target
+    Tensor predicted_logits;
+    predicted_logits =
+        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
+    predicted_logits.mutable_data<T>(place);
+
+    auto t = framework::EigenVector<T>::Flatten(predicted_logits);
+    t.device(*dev_ctx.eigen_device()) = t.constant(static_cast<T>(0));
+
+    const int start_index = rank * D;
+    const int end_index = start_index + D;
+
+    int blocks = NumBlocks(N);
+    int threads = kNumCUDAThreads;
+    const auto& label_type = framework::TransToProtoVarType(labels->dtype());
+
+    if (label_type == framework::proto::VarType::INT32) {
+      MaskLabelByIndex<T, int32_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
+          predicted_logits.data<T>(), softmax_2d.data<T>(),
+          labels->data<int32_t>(), start_index, end_index, N, D, nranks);
+    } else if (label_type == framework::proto::VarType::INT64) {
+      MaskLabelByIndex<T, int64_t><<<blocks, threads, 0, dev_ctx.stream()>>>(
+          predicted_logits.data<T>(), softmax_2d.data<T>(),
+          labels->data<int64_t>(), start_index, end_index, N, D, nranks);
+    }
+
+    in_out.clear();
+    in_out.push_back(predicted_logits);
+    pg->AllReduce(in_out, in_out, opts)->Synchronize();
+
+    // step 4, obtain exp(logit)
+    eigen_softmax.device(*dev_ctx.eigen_device()) = eigen_softmax.exp();
+
+    // step 5, obtain sum_exp_logits
+    Tensor sum_exp_logits;
+    sum_exp_logits =
+        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
+    void* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
+
+    auto eigen_sum_exp_logits = math::EigenMatrix<T>::From(sum_exp_logits);
+    eigen_sum_exp_logits.device(*dev_ctx.eigen_device()) =
+        eigen_softmax.sum(along_axis);
+
+    in_out.clear();
+    in_out.push_back(sum_exp_logits);
+    pg->AllReduce(in_out, in_out, opts)->Synchronize();
+
+    auto eigen_loss = math::EigenMatrix<T>::From(loss_2d);
+    auto eigen_predicted_logits = math::EigenMatrix<T>::From(predicted_logits);
+
+    eigen_loss.device(*dev_ctx.eigen_device()) =
+        (eigen_sum_exp_logits.log().unaryExpr(math::TolerableValue<T>()) -
+         eigen_predicted_logits)
+            .unaryExpr(math::TolerableValue<T>());
+
+    eigen_softmax.device(*dev_ctx.eigen_device()) =
+        (eigen_softmax *
+         eigen_sum_exp_logits.inverse().broadcast(one_by_class));
+  }
+};
+
 template <typename T>
 class CSoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
  public:
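The new ProcessGroup functor follows the same five steps as the existing NCCL functor, with the class dimension sharded across ranks and three all-reduces stitching the shards together (row max, masked target logit, sum of exponentials). A hedged single-device reference in Python, which drops the sharding and replaces the all-reduces with local reductions, maps the Eigen expressions above to the underlying math; shapes and names are assumptions, not from the commit:

import paddle

N, D = 4, 10                                              # rows, total classes (assumed)
logits = paddle.randn([N, D])
label = paddle.randint(low=0, high=D, shape=[N, 1], dtype='int64')

logit_max = paddle.max(logits, axis=-1, keepdim=True)     # step 1 (combined across ranks above)
shifted = logits - logit_max                              # step 2
predicted = paddle.take_along_axis(shifted, label, 1)     # step 3 (masked per shard, then all-reduced)
exp = paddle.exp(shifted)                                 # step 4
sum_exp = paddle.sum(exp, axis=-1, keepdim=True)          # step 5 (all-reduced)
loss = paddle.log(sum_exp) - predicted
softmax = exp / sum_exp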
paddle/fluid/operators/collective/c_softmax_with_cross_entropy_op.h
View file @ e6fb6599

@@ -18,11 +18,13 @@ limitations under the License. */
 #include <utility>
 #include <vector>
+#include "paddle/fluid/distributed/collective/ProcessGroup.h"
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/cross_entropy.h"
 #include "paddle/fluid/operators/math/softmax.h"
+#include "paddle/phi/api/include/tensor.h"

 namespace paddle {
 namespace operators {

@@ -36,5 +38,15 @@ class CSoftmaxWithCrossEntropyOpCPUKernel : public framework::OpKernel<T> {
   }
 };

+template <typename Context, typename T>
+struct CSoftmaxWithCrossEntropyFunctor {
+  void operator()(const framework::ExecutionContext& ctx);
+};
+
+template <typename Context, typename T>
+struct CSoftmaxWithCrossEntropyProcessGroupFunctor {
+  void operator()(const framework::ExecutionContext& ctx);
+};
+
 }  // namespace operators
 }  // namespace paddle
python/paddle/distributed/collective.py
View file @ e6fb6599

@@ -20,6 +20,7 @@ from ..fluid.framework import Variable
 from ..fluid.framework import in_dygraph_mode
 from ..fluid.framework import OpProtoHolder
 from ..fluid.framework import _non_static_mode
+from ..fluid.framework import _in_legacy_dygraph
 from ..fluid.framework import convert_np_dtype_to_dtype_
 from ..fluid.framework import _varbase_creator
 from ..fluid.data_feeder import convert_dtype

@@ -1132,13 +1133,36 @@ def _mp_allreduce(tensor,
                   group=None,
                   use_calc_stream=True,
                   use_model_parallel=True):
-    """[it is same as allreduce above, but it suuports model parallel. And it support inplace startegy]
+    """[it is same as allreduce above, but it suports model parallel. And it support inplace startegy]
     """
     if group is not None and not group.is_member():
         return
     ring_id = 0 if group is None else group.id

-    if _non_static_mode():
+    if in_dygraph_mode():
+        assert op == ReduceOp.SUM, "Unknown parameter: {}.".format(op)
+
+        from paddle.autograd import EagerPyLayer
+
+        class mp_allreduce_eager(EagerPyLayer):
+            @staticmethod
+            def forward(ctx, tensor, use_calc_stream, ring_id,
+                        use_model_parallel):
+                ctx.ring_id = ring_id
+                return _C_ops.c_allreduce_sum_(
+                    tensor, 'use_calc_stream', use_calc_stream, 'ring_id',
+                    ring_id, "use_model_parallel", use_model_parallel)
+
+            @staticmethod
+            def backward(ctx, dy):
+                return _C_ops.c_identity(dy, 'use_calc_stream', True,
+                                         'ring_id', ctx.ring_id,
+                                         'use_model_parallel', True)
+
+        return mp_allreduce_eager.apply(tensor, use_calc_stream, ring_id,
+                                        use_model_parallel)
+
+    elif _in_legacy_dygraph():
         if op == ReduceOp.SUM:
             return _C_ops.c_allreduce_sum_(
                 tensor, 'use_calc_stream', use_calc_stream, 'ring_id', ring_id,
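In the eager branch, _mp_allreduce wraps the in-place allreduce in an EagerPyLayer whose backward is c_identity, so the gradient flows back unchanged instead of being reduced a second time under model parallel. A hedged sketch of the same autograd pattern written against the public paddle.autograd.PyLayer API (not the internal EagerPyLayer used above):

import paddle
import paddle.distributed as dist
from paddle.autograd import PyLayer

class MPAllReduce(PyLayer):
    @staticmethod
    def forward(ctx, x):
        dist.all_reduce(x)   # sum the partial results across model-parallel ranks
        return x

    @staticmethod
    def backward(ctx, dy):
        return dy            # identity: the incoming gradient is already complete

# usage, after dist.init_parallel_env():  y = MPAllReduce.apply(partial_output)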
python/paddle/fluid/dygraph/parallel.py
View file @ e6fb6599

@@ -378,7 +378,7 @@ def sync_params_buffers(model,
                                  param.name)

         # is_distributed param not need to sync when in mp mode
-        if isinstance(param, ParamBase):
+        if isinstance(param, (ParamBase, core.eager.Tensor)):
             if is_model_parallel and param.is_distributed:
                 continue
python/paddle/fluid/layers/tensor.py
View file @ e6fb6599

@@ -329,7 +329,9 @@ def concat(input, axis=0, name=None):
             axis = axis.item(0)
         if not isinstance(input, Variable):
             input = [t for t in input if t.shape.count(0) == 0]
-        return _C_ops.final_state_concat(input, axis)
+        out = _varbase_creator()
+        _C_ops.concat(input, out, 'axis', axis)
+        return out

     if _in_legacy_dygraph():
         if isinstance(axis, Variable):
python/paddle/fluid/tests/unittests/test_parallel_class_center_sample.py
View file @ e6fb6599

@@ -14,16 +14,20 @@
 from __future__ import print_function

+import os
 import unittest
 import paddle.fluid as fluid
 from test_parallel_dygraph_dataparallel import TestMultipleGpus
+from paddle.fluid.framework import _test_eager_guard


 class TestParallelClassCenterSample(TestMultipleGpus):
     def test_parallel_class_center_sample(self):
         self.run_mnist_2gpu('parallel_class_center_sample.py')
+        self.run_mnist_2gpu('parallel_class_center_sample.py', eager_mode=False)


 if __name__ == "__main__":
+    os.environ["FLAGS_enable_eager_mode"] = "1"
     unittest.main()
python/paddle/fluid/tests/unittests/test_parallel_dygraph_dataparallel.py
View file @ e6fb6599

@@ -100,6 +100,7 @@ def start_local_trainers(cluster,
                          pod,
                          training_script,
                          training_script_args,
+                         eager_mode=True,
                          log_dir=None):
     current_env = copy.copy(os.environ.copy())
     #paddle broadcast ncclUniqueId use socket, and

@@ -119,6 +120,9 @@ def start_local_trainers(cluster,
             "PADDLE_TRAINER_ENDPOINTS": ",".join(cluster.trainers_endpoints())
         }

+        if not eager_mode:
+            proc_env["FLAGS_enable_eager_mode"] = "%d" % 0
+
         current_env.update(proc_env)

         print("trainer proc env:{}".format(current_env))

@@ -145,15 +149,8 @@ def start_local_trainers(cluster,
     return procs


-def get_dist_port_from_flags():
-    DIST_UT_PORT = 6175
-    if os.getenv("PADDLE_DIST_UT_PORT"):
-        DIST_UT_PORT = int(os.getenv("PADDLE_DIST_UT_PORT"))
-    return DIST_UT_PORT
-
-
 class TestMultipleGpus(unittest.TestCase):
-    def run_mnist_2gpu(self, target_file_name):
+    def run_mnist_2gpu(self, target_file_name, eager_mode=True):
         if not fluid.core.is_compiled_with_cuda(
         ) or fluid.core.get_cuda_device_count() == 0:
             return

@@ -167,6 +164,7 @@ class TestMultipleGpus(unittest.TestCase):
         procs = start_local_trainers(
             cluster,
             pod,
+            eager_mode=eager_mode,
             training_script=target_file_name,
             training_script_args=[])

@@ -206,9 +204,9 @@ class TestDataParallelGradientCheck(TestMultipleGpus):
 class TestDataParallelWithPyLayer(TestMultipleGpus):
     def test_parallel_dygraph_dataparallel_with_pylayer(self):
-        with _test_eager_guard():
-            self.run_mnist_2gpu('parallel_dygraph_dataparallel_with_pylayer.py')
         self.run_mnist_2gpu('parallel_dygraph_dataparallel_with_pylayer.py')
+        self.run_mnist_2gpu(
+            'parallel_dygraph_dataparallel_with_pylayer.py', eager_mode=False)


 class TestGradientCheckInEagerMode(TestMultipleGpus):
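run_mnist_2gpu now takes an eager_mode flag that start_local_trainers forwards to the spawned trainers by exporting FLAGS_enable_eager_mode=0 when it is False. A hedged sketch of a hypothetical test file driving both code paths with the updated helper (the target script name is an assumption):

import os
import unittest
from test_parallel_dygraph_dataparallel import TestMultipleGpus

class TestMyParallelScript(TestMultipleGpus):
    def test_both_modes(self):
        self.run_mnist_2gpu('my_parallel_script.py')                    # eager mode (default)
        self.run_mnist_2gpu('my_parallel_script.py', eager_mode=False)  # legacy dygraph

if __name__ == "__main__":
    os.environ["FLAGS_enable_eager_mode"] = "1"
    unittest.main()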
python/paddle/fluid/tests/unittests/test_parallel_dygraph_mp_layers.py
View file @ e6fb6599

@@ -14,6 +14,7 @@
 from __future__ import print_function

+import os
 import unittest
 import paddle.fluid as fluid

@@ -23,7 +24,9 @@ from test_parallel_dygraph_dataparallel import TestMultipleGpus
 class TestModelParallelLayer(TestMultipleGpus):
     def test_hybrid_parallel_mp_layer(self):
         self.run_mnist_2gpu('hybrid_parallel_mp_layers.py')
+        self.run_mnist_2gpu('hybrid_parallel_mp_layers.py', eager_mode=False)


 if __name__ == "__main__":
+    os.environ["FLAGS_enable_eager_mode"] = "1"
     unittest.main()
python/paddle/fluid/tests/unittests/test_parallel_dygraph_tensor_parallel.py
View file @ e6fb6599

@@ -14,6 +14,7 @@
 from __future__ import print_function

+import os
 import unittest
 import paddle.fluid as fluid

@@ -22,20 +23,26 @@ from test_parallel_dygraph_dataparallel import TestMultipleGpus
 class TestHybridParallel(TestMultipleGpus):
     def test_hybrid_parallel_mp_random(self):
-        self.run_mnist_2gpu('hybrid_parallel_mp_random.py')
+        # self.run_mnist_2gpu('hybrid_parallel_mp_random.py')
+        self.run_mnist_2gpu('hybrid_parallel_mp_random.py', eager_mode=False)

     def test_hybrid_parallel_mp_model(self):
         self.run_mnist_2gpu('hybrid_parallel_mp_model.py')
+        self.run_mnist_2gpu('hybrid_parallel_mp_model.py', eager_mode=False)

     def test_hybrid_parallel_mp_amp(self):
         self.run_mnist_2gpu('hybrid_parallel_mp_amp.py')
+        self.run_mnist_2gpu('hybrid_parallel_mp_amp.py', eager_mode=False)

     def test_hybrid_parallel_mp_fp16(self):
         self.run_mnist_2gpu('hybrid_parallel_mp_fp16.py')
+        self.run_mnist_2gpu('hybrid_parallel_mp_fp16.py', eager_mode=False)

     def test_hybrid_parallel_mp_clip_grad(self):
         self.run_mnist_2gpu('hybrid_parallel_mp_clip_grad.py')
+        self.run_mnist_2gpu('hybrid_parallel_mp_clip_grad.py', eager_mode=False)


 if __name__ == "__main__":
+    os.environ["FLAGS_enable_eager_mode"] = "1"
     unittest.main()