Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
Crayon鑫
Paddle
提交
67163fb4
P
Paddle
项目概览
Crayon鑫
/
Paddle
与 Fork 源项目一致
Fork自
PaddlePaddle / Paddle
通知
1
Star
1
Fork
0
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
1
列表
看板
标记
里程碑
合并请求
0
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
1
Issue
1
列表
看板
标记
里程碑
合并请求
0
合并请求
0
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
未验证
提交
67163fb4
编写于
6月 02, 2022
作者:
G
Guoxia Wang
提交者:
GitHub
6月 02, 2022
浏览文件
操作
浏览文件
下载
电子邮件补丁
差异文件
fix the bug of margin cross entropy loss for eager mode (#43161)
上级
85baa3c0
变更
3
隐藏空白更改
内联
并排
Showing
3 changed files
with
99 additions
and
35 deletions
+99
-35
paddle/fluid/eager/api/utils/tensor_utils.h
paddle/fluid/eager/api/utils/tensor_utils.h
+1
-1
paddle/fluid/operators/margin_cross_entropy_op.cu
paddle/fluid/operators/margin_cross_entropy_op.cu
+94
-34
python/paddle/fluid/tests/unittests/test_parallel_margin_cross_entropy.py
...uid/tests/unittests/test_parallel_margin_cross_entropy.py
+4
-0
未找到文件。
paddle/fluid/eager/api/utils/tensor_utils.h
浏览文件 @
67163fb4
...
...
@@ -15,7 +15,7 @@
#pragma once
#include "paddle/fluid/eager/eager_tensor.h"
#include "paddle/phi/api/
all
.h"
#include "paddle/phi/api/
include/tensor
.h"
namespace
egr
{
namespace
egr_utils_api
{
...
...
paddle/fluid/operators/margin_cross_entropy_op.cu
浏览文件 @
67163fb4
...
...
@@ -26,10 +26,12 @@ namespace cub = hipcub;
#include "paddle/fluid/operators/reduce_ops/reduce_op.cu.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
#include "paddle/fluid/string/string_helper.h"
#include "paddle/phi/api/include/tensor.h"
#include "paddle/phi/kernels/funcs/axis_utils.h"
#include "paddle/phi/kernels/funcs/math_function.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/distributed/collective/ProcessGroup.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/device/gpu/nccl_helper.h"
#endif
...
...
@@ -63,19 +65,34 @@ void GetClassInterval(const gpuStream_t& stream, const platform::Place& place,
framework
::
TensorFromVector
(
shard_dim_vec
,
ctx
,
&
num_classes_per_device
);
int
*
num_classes_per_device_ptr
=
num_classes_per_device
.
data
<
int
>
();
const
auto
&
comm
=
platform
::
NCCLCommContext
::
Instance
().
Get
(
rid
,
place
);
// use global calculate stream
const
auto
calcu_stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
))
->
stream
();
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
ncclAllReduce
(
num_classes_per_device_ptr
,
num_classes_per_device_ptr
,
num_classes_per_device
.
numel
(),
platform
::
ToNCCLDataType
(
framework
::
TransToProtoVarType
(
num_classes_per_device
.
dtype
())),
ncclSum
,
comm
->
comm
(),
calcu_stream
));
auto
map
=
distributed
::
ProcessGroupMapFromGid
::
getInstance
();
if
(
map
->
has
(
rid
))
{
// Use ProcessGroup
distributed
::
ProcessGroup
*
pg
=
map
->
get
(
rid
);
std
::
vector
<
phi
::
DenseTensor
>
in_tensor
;
std
::
vector
<
phi
::
DenseTensor
>
out_tensor
;
in_tensor
.
push_back
(
num_classes_per_device
);
out_tensor
.
push_back
(
num_classes_per_device
);
distributed
::
AllreduceOptions
opts
;
opts
.
reduce_op
=
distributed
::
ReduceOp
::
SUM
;
auto
task
=
pg
->
AllReduce
(
in_tensor
,
out_tensor
,
opts
);
task
->
Wait
();
}
else
{
const
auto
&
comm
=
platform
::
NCCLCommContext
::
Instance
().
Get
(
rid
,
place
);
// use global calculate stream
const
auto
calcu_stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
))
->
stream
();
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
ncclAllReduce
(
num_classes_per_device_ptr
,
num_classes_per_device_ptr
,
num_classes_per_device
.
numel
(),
platform
::
ToNCCLDataType
(
framework
::
TransToProtoVarType
(
num_classes_per_device
.
dtype
())),
ncclSum
,
comm
->
comm
(),
calcu_stream
));
}
auto
class_interval_ptr
=
class_interval
->
mutable_data
<
int
>
({
nranks
+
1
},
place
);
...
...
@@ -228,14 +245,21 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
platform
::
NCCLComm
*
comm
;
distributed
::
ProcessGroup
*
pg
=
nullptr
;
gpuStream_t
stream
;
if
(
nranks
>
1
)
{
comm
=
platform
::
NCCLCommContext
::
Instance
().
Get
(
rid
,
place
);
// use global calculate stream
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
))
->
stream
();
auto
map
=
distributed
::
ProcessGroupMapFromGid
::
getInstance
();
if
(
map
->
has
(
rid
))
{
// Use ProcessGroup
pg
=
map
->
get
(
rid
);
}
else
{
comm
=
platform
::
NCCLCommContext
::
Instance
().
Get
(
rid
,
place
);
// use global calculate stream
stream
=
static_cast
<
platform
::
CUDADeviceContext
*>
(
platform
::
DeviceContextPool
::
Instance
().
Get
(
place
))
->
stream
();
}
}
#endif
...
...
@@ -306,11 +330,23 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if
(
nranks
>
1
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
ncclAllReduce
(
logits_max_buff
,
logits_max_buff
,
logits_max
.
numel
(),
platform
::
ToNCCLDataType
(
framework
::
TransToProtoVarType
(
logits_max
.
dtype
())),
ncclMax
,
comm
->
comm
(),
stream
));
if
(
pg
)
{
std
::
vector
<
phi
::
DenseTensor
>
in_tensor
;
std
::
vector
<
phi
::
DenseTensor
>
out_tensor
;
in_tensor
.
push_back
(
logits_max
);
out_tensor
.
push_back
(
logits_max
);
distributed
::
AllreduceOptions
opts
;
opts
.
reduce_op
=
distributed
::
ReduceOp
::
MAX
;
auto
task
=
pg
->
AllReduce
(
in_tensor
,
out_tensor
,
opts
);
task
->
Wait
();
}
else
{
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
ncclAllReduce
(
logits_max_buff
,
logits_max_buff
,
logits_max
.
numel
(),
platform
::
ToNCCLDataType
(
framework
::
TransToProtoVarType
(
logits_max
.
dtype
())),
ncclMax
,
comm
->
comm
(),
stream
));
}
}
#endif
...
...
@@ -329,11 +365,23 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if
(
nranks
>
1
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
ncclAllReduce
(
sum_exp_logits_buff
,
sum_exp_logits_buff
,
sum_exp_logits
.
numel
(),
platform
::
ToNCCLDataType
(
framework
::
TransToProtoVarType
(
sum_exp_logits
.
dtype
())),
ncclSum
,
comm
->
comm
(),
stream
));
if
(
pg
)
{
std
::
vector
<
phi
::
DenseTensor
>
in_tensor
;
std
::
vector
<
phi
::
DenseTensor
>
out_tensor
;
in_tensor
.
push_back
(
sum_exp_logits
);
out_tensor
.
push_back
(
sum_exp_logits
);
distributed
::
AllreduceOptions
opts
;
opts
.
reduce_op
=
distributed
::
ReduceOp
::
SUM
;
auto
task
=
pg
->
AllReduce
(
in_tensor
,
out_tensor
,
opts
);
task
->
Wait
();
}
else
{
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
ncclAllReduce
(
sum_exp_logits_buff
,
sum_exp_logits_buff
,
sum_exp_logits
.
numel
(),
platform
::
ToNCCLDataType
(
framework
::
TransToProtoVarType
(
sum_exp_logits
.
dtype
())),
ncclSum
,
comm
->
comm
(),
stream
));
}
}
#endif
...
...
@@ -363,11 +411,23 @@ class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if
(
nranks
>
1
)
{
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
ncclAllReduce
(
loss_ptr
,
loss_ptr
,
loss
->
numel
(),
platform
::
ToNCCLDataType
(
framework
::
TransToProtoVarType
(
loss
->
dtype
())),
ncclSum
,
comm
->
comm
(),
stream
));
if
(
pg
)
{
std
::
vector
<
phi
::
DenseTensor
>
in_tensor
;
std
::
vector
<
phi
::
DenseTensor
>
out_tensor
;
in_tensor
.
push_back
(
*
loss
);
out_tensor
.
push_back
(
*
loss
);
distributed
::
AllreduceOptions
opts
;
opts
.
reduce_op
=
distributed
::
ReduceOp
::
SUM
;
auto
task
=
pg
->
AllReduce
(
in_tensor
,
out_tensor
,
opts
);
task
->
Wait
();
}
else
{
PADDLE_ENFORCE_GPU_SUCCESS
(
platform
::
dynload
::
ncclAllReduce
(
loss_ptr
,
loss_ptr
,
loss
->
numel
(),
platform
::
ToNCCLDataType
(
framework
::
TransToProtoVarType
(
loss
->
dtype
())),
ncclSum
,
comm
->
comm
(),
stream
));
}
}
#endif
}
...
...
python/paddle/fluid/tests/unittests/test_parallel_margin_cross_entropy.py
浏览文件 @
67163fb4
...
...
@@ -14,6 +14,7 @@
from
__future__
import
print_function
import
os
import
unittest
import
paddle.fluid
as
fluid
...
...
@@ -23,7 +24,10 @@ from test_parallel_dygraph_dataparallel import TestMultipleGpus
class
TestParallelMarginSoftmaxWithCrossEntropy
(
TestMultipleGpus
):
def
test_parallel_margin_cross_entropy
(
self
):
self
.
run_mnist_2gpu
(
'parallel_margin_cross_entropy.py'
)
self
.
run_mnist_2gpu
(
'parallel_margin_cross_entropy.py'
,
eager_mode
=
False
)
if
__name__
==
"__main__"
:
os
.
environ
[
"FLAGS_enable_eager_mode"
]
=
"1"
unittest
.
main
()
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录