PaddlePaddle / Paddle — unverified commit b0cb4148
Authored on Aug 16, 2021 by Guoxia Wang; committed by GitHub on Aug 16, 2021.
support margin loss (arcface, cosface, sphereface) for single GPU and cross GPUs (#34247)
* support margin loss (arcface, cosface, sphereface)
Parent: dc439a12
Showing 10 changed files with 1,597 additions and 0 deletions (+1597 −0).
Changed files:
  paddle/fluid/operators/margin_cross_entropy_op.cc                          +203  −0
  paddle/fluid/operators/margin_cross_entropy_op.cu                          +483  −0
  paddle/fluid/operators/margin_cross_entropy_op.h                            +41  −0
  python/paddle/fluid/tests/unittests/CMakeLists.txt                           +3  −0
  python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py       +188  −0
  python/paddle/fluid/tests/unittests/test_margin_cross_entropy_op.py        +385  −0
  python/paddle/fluid/tests/unittests/test_parallel_margin_cross_entropy.py   +29  −0
  python/paddle/nn/functional/__init__.py                                      +2  −0
  python/paddle/nn/functional/loss.py                                        +262  −0
  tools/static_mode_white_list.py                                              +1  −0
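Before the per-file diffs, a minimal NumPy sketch (not part of the commit) of the combined margin formula the new operator implements. Here m1, m2, m3 and s correspond to the margin1, margin2, margin3 and scale attributes; setting them to (1.0, 0.5, 0.0) gives ArcFace, (1.0, 0.0, 0.35) gives CosFace, and (1.35, 0.0, 0.0) gives SphereFace, matching the combinations exercised by the tests below. The function name and arguments are illustrative only.

import numpy as np

def margin_softmax_loss(cos_theta, label, m1=1.0, m2=0.5, m3=0.0, s=64.0):
    """Reference forward pass; cos_theta is [N, C] = normalized_x @ normalized_w."""
    n = cos_theta.shape[0]
    target = cos_theta[np.arange(n), label]
    # cos(m1 * theta + m2) - m3, applied only to the target-class logit
    adjusted = np.cos(m1 * np.arccos(target) + m2) - m3
    logits = s * cos_theta.copy()
    logits[np.arange(n), label] = s * adjusted
    # numerically stable log-softmax, then pick out the target column
    logits -= logits.max(axis=1, keepdims=True)
    log_prob = logits - np.log(np.exp(logits).sum(axis=1, keepdims=True))
    return -log_prob[np.arange(n), label]   # shape [N], i.e. reduction='none'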
paddle/fluid/operators/margin_cross_entropy_op.cc — new file (mode 100644)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/margin_cross_entropy_op.h"
namespace paddle {
namespace operators {

class MarginCrossEntropyOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
    OP_INOUT_CHECK(ctx->HasInput("Logits"), "Input", "Logits",
                   "MarginCrossEntropyOp");
    OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label",
                   "MarginCrossEntropyOp");
    OP_INOUT_CHECK(ctx->HasOutput("Softmax"), "Output", "Softmax",
                   "MarginCrossEntropyOp");
    OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss",
                   "MarginCrossEntropyOp");

    auto logits_dims = ctx->GetInputDim("Logits");
    auto labels_dims = ctx->GetInputDim("Label");

    auto logits_rank = logits_dims.size();
    auto axis = logits_rank - 1;
    for (int i = 0; i < logits_rank; i++) {
      if (i != axis) {
        if (ctx->IsRuntime() || (logits_dims[i] > 0 && labels_dims[i] > 0)) {
          PADDLE_ENFORCE_EQ(logits_dims[i], labels_dims[i],
                            platform::errors::InvalidArgument(
                                "Input(Logits) and Input(Label) should in "
                                "same shape in dimensions except axis."));
        }
      }
    }

    if (labels_dims.size() > 1) {
      PADDLE_ENFORCE_EQ(
          labels_dims[logits_rank - 1], 1UL,
          platform::errors::InvalidArgument(
              "the last dimension of Input(Label) should be 1."
              "But received: the last dimension of Input(Label) is [%d],"
              "the last dimension is [%d]",
              labels_dims[logits_rank - 1], logits_rank - 1));
    }

    ctx->SetOutputDim("Softmax", logits_dims);

    logits_dims[axis] = 1;
    ctx->SetOutputDim("Loss", logits_dims);

    ctx->ShareLoD("Logits", /*->*/ "Softmax");
    ctx->ShareLoD("Logits", /*->*/ "Loss");
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        OperatorWithKernel::IndicateVarDataType(ctx, "Logits"),
        ctx.device_context());
  }
};

class MarginCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() {
    AddInput("Logits",
             "(Tensor, default: Tensor<float>), The input tensor of unscaled "
             "log probabilities, whose dimension :attr:`axis` should be scaled "
             "by softmax.");
    AddInput("Label",
             "(Tensor) The input tensor of groud truth label. Label is a "
             "Tensor<int64> in same shape with Input(Logits) except the shape "
             "in dimension :attr:`axis` as 1.");
    AddOutput("Softmax",
              "(Tensor, default: Tensor<float>), A tensor in same shape with "
              "Input(Logits). "
              "The outputs value of softmax activation by given the input "
              "batch, which will be used in backward calculation.");
    AddOutput("Loss",
              "(Tensor, default: Tensor<float>), A tensor in same shape with "
              "Input(Logits) "
              "except the shape in dimension :attr:`axis` as 1. The cross "
              "entropy loss.");
    AddAttr<bool>("return_softmax",
                  "(bool default false) A flag to indicate "
                  "whether to return softmax.")
        .SetDefault(false);
    AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
        .SetDefault(0);
    AddAttr<int>("rank", "(int default 0) rank id for MarginCrossEntropy.")
        .SetDefault(0);
    AddAttr<int>("nranks", "(int default 1) nranks id for MarginCrossEntropy.")
        .SetDefault(1);
    AddAttr<float>("margin1", "(float default 1.0) margin1 for MarginLoss.")
        .SetDefault(1.0);
    AddAttr<float>("margin2", "(float default 0.5) margin2 for MarginLoss.")
        .SetDefault(0.5);
    AddAttr<float>("margin3", "(float default 0.0) margin3 for MarginLoss.")
        .SetDefault(0.0);
    AddAttr<float>("scale", "(float default 64.0) scale for MarginLoss.")
        .SetDefault(64.0);
    AddComment(R"DOC(
MarginCrossEntropy Operator
.. math::

    L=-\frac{1}{N}\sum^N_{i=1}\log\frac{e^{s(cos(m_{1}\theta_{y_i}+m_{2})-m_{3})}}{e^{s(cos(m_{1}\theta_{y_i}+m_{2})-m_{3})}+\sum^n_{j=1,j\neq y_i} e^{scos\theta_{y_i}}}

where the :math: `\theta_{y_i}` is the angle between the feature :math: `x` and
the representation of class :math: `i`. The details of ArcFace loss
could be referred to https://arxiv.org/abs/1801.07698.
Note that the Op supports model parallel and single GPU. And Logits.shape[-1] can be different each rank.
)DOC");
  }
};

class MarginCrossEntropyOpGrad : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;
  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Loss")), true,
                      platform::errors::InvalidArgument(
                          "Input(Loss@Grad) should not be null."));
    PADDLE_ENFORCE_EQ(ctx->HasInput("Softmax"), true,
                      platform::errors::InvalidArgument(
                          "Input(Softmax) should be not null."));
    PADDLE_ENFORCE_EQ(ctx->HasInput("Label"), true,
                      platform::errors::InvalidArgument(
                          "Input(Label) should be not null."));
    PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Logits")), true,
                      platform::errors::InvalidArgument(
                          "Output(Logits@Grad) should be not null."));
    ctx->SetOutputDim(framework::GradVarName("Logits"),
                      ctx->GetInputDim("Softmax"));
  }

 protected:
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    return framework::OpKernelType(
        OperatorWithKernel::IndicateVarDataType(
            ctx, framework::GradVarName("Loss")),
        ctx.device_context());
  }
};

template <typename T>
class MarginCrossEntropyOpGradMaker : public framework::SingleGradOpMaker<T> {
 public:
  using framework::SingleGradOpMaker<T>::SingleGradOpMaker;

 protected:
  void Apply(GradOpPtr<T> op) const override {
    op->SetType("margin_cross_entropy_grad");
    op->SetInput("Softmax", this->Output("Softmax"));
    op->SetInput("Logits", this->Input("Logits"));
    op->SetInput("Label", this->Input("Label"));
    op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss"));
    op->SetAttrMap(this->Attrs());
    op->SetOutput(framework::GradVarName("Logits"), this->InputGrad("Logits"));
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OPERATOR(
    margin_cross_entropy, ops::MarginCrossEntropyOp,
    ops::MarginCrossEntropyOpMaker,
    ops::MarginCrossEntropyOpGradMaker<paddle::framework::OpDesc>,
    ops::MarginCrossEntropyOpGradMaker<paddle::imperative::OpBase>);

REGISTER_OPERATOR(margin_cross_entropy_grad, ops::MarginCrossEntropyOpGrad);

REGISTER_OP_CPU_KERNEL(margin_cross_entropy,
                       ops::MarginCrossEntropyOpCPUKernel<float>,
                       ops::MarginCrossEntropyOpCPUKernel<double>,
                       ops::MarginCrossEntropyOpCPUKernel<plat::float16>);
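For orientation, the shape contract established by InferShape above is: Logits and Softmax are [N, C] while Loss is [N, 1], and the op only has a real kernel on GPU in this commit. Below is a hedged sketch (not part of the commit) of how the registered op is reached from Python in static-graph mode, mirroring check_static_result in the unit test added later in this diff; the tensor names and shapes are illustrative.

import numpy as np
import paddle
import paddle.nn.functional as F

paddle.enable_static()
main_prog, start_prog = paddle.static.Program(), paddle.static.Program()
with paddle.static.program_guard(main_prog, start_prog):
    logits = paddle.static.data('logits', shape=[5, 37], dtype='float64')
    label = paddle.static.data('label', shape=[5], dtype='int64')
    # appends a margin_cross_entropy op; with reduction=None, loss comes back as [5, 1]
    loss, softmax = F.margin_cross_entropy(
        logits, label, margin1=1.0, margin2=0.5, margin3=0.0, scale=64.0,
        return_softmax=True, reduction=None)

exe = paddle.static.Executor(paddle.CUDAPlace(0))  # GPU-only kernel in this commit
exe.run(start_prog)
out_loss, out_softmax = exe.run(
    main_prog,
    feed={'logits': np.random.uniform(-0.9, 0.9, [5, 37]).astype('float64'),
          'label': np.random.randint(0, 37, [5]).astype('int64')},
    fetch_list=[loss, softmax])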
paddle/fluid/operators/margin_cross_entropy_op.cu — new file (mode 100644)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_HIP
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#else
#include <cub/cub.cuh>
#endif

#include <vector>

#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/margin_cross_entropy_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
#include "paddle/fluid/string/string_helper.h"

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif

namespace paddle {
namespace operators {

using Tensor = framework::Tensor;

static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096;

static inline int NumBlocks(const int N) {
  return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
                  kNumMaxinumNumBlocks);
}

void GetClassInterval(const gpuStream_t& stream, const platform::Place& place,
                      const platform::DeviceContext& ctx, const int rid,
                      const int rank, const int nranks, const int D,
                      Tensor* class_interval) {
  std::vector<int> shard_dim_vec(nranks + 1, 0);
  shard_dim_vec[rank + 1] = D;
  if (nranks <= 1) {
    framework::TensorFromVector(shard_dim_vec, ctx, class_interval);
    return;
  }

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
  Tensor num_classes_per_device;
  framework::TensorFromVector(shard_dim_vec, ctx, &num_classes_per_device);
  int* num_classes_per_device_ptr = num_classes_per_device.data<int>();

  const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place);
  // use global calculate stream
  const auto calcu_stream =
      static_cast<platform::CUDADeviceContext*>(
          platform::DeviceContextPool::Instance().Get(place))
          ->stream();
  PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
      num_classes_per_device_ptr, num_classes_per_device_ptr,
      num_classes_per_device.numel(),
      platform::ToNCCLDataType(num_classes_per_device.type()), ncclSum,
      comm->comm(), calcu_stream));

  auto class_interval_ptr =
      class_interval->mutable_data<int>({nranks + 1}, place);
  size_t cub_temp_storage_bytes = 0;
  cub::DeviceScan::InclusiveSum<int*, int*>(
      nullptr, cub_temp_storage_bytes, nullptr, nullptr, nranks + 1, stream);
  auto cub_temp_storage = memory::Alloc(place, cub_temp_storage_bytes);
  cub::DeviceScan::InclusiveSum<int*, int*>(
      cub_temp_storage->ptr(), cub_temp_storage_bytes,
      num_classes_per_device_ptr, class_interval_ptr, nranks + 1, stream);
  return;
#endif
}

template <typename T, typename IndexT>
__global__ void AddMarginToPositiveLogitsKernel(
    T* logit, const IndexT* label, const float margin1, const float margin2,
    const float margin3, const int rank, const int nranks, const int64_t N,
    const int64_t D, const int* class_interval_ptr) {
  using MPType = typename details::MPTypeTrait<T>::Type;
  int start_index = class_interval_ptr[rank];
  int end_index = class_interval_ptr[rank + 1];
  int num_classes = class_interval_ptr[nranks];
  CUDA_KERNEL_LOOP(i, N) {
    auto real_label = label[i];
    PADDLE_ENFORCE((real_label < num_classes) && (real_label >= 0),
                   "The index is out of bounds, "
                   "please check whether the value of label and "
                   "input meet the number of class. It should "
                   "be less than [%d], but received [%d]",
                   num_classes, real_label);

    if (real_label >= start_index && real_label < end_index) {
      int64_t offset = i * D + real_label - start_index;
      if (fabs(margin1 - 1.0) > 1e-8 || fabs(margin2) > 1e-8) {
        MPType x = static_cast<MPType>(logit[offset]);
        MPType theta = acos(x);
        if (fabs(margin1 - 1.0) > 1e-8) {
          theta *= static_cast<MPType>(margin1);
        }
        if (fabs(margin2) > 1e-8) {
          theta += static_cast<MPType>(margin2);
        }
        logit[offset] = static_cast<T>(cos(theta));
      }
      if (fabs(margin3) > 1e-8) {
        MPType y = static_cast<MPType>(logit[offset]);
        y -= static_cast<MPType>(margin3);
        logit[offset] = static_cast<T>(y);
      }
    }
  }
}

static __device__ __forceinline__ platform::float16 exp_on_device(
    platform::float16 x) {
  return ::Eigen::numext::exp(x);
}
static __device__ __forceinline__ float exp_on_device(float x) {
  return expf(x);
}
static __device__ __forceinline__ double exp_on_device(double x) {
  return exp(x);
}
static __device__ __forceinline__ platform::float16 log_on_device(
    platform::float16 x) {
  return ::Eigen::numext::log(x);
}
static __device__ __forceinline__ float log_on_device(float x) {
  return logf(x);
}
static __device__ __forceinline__ double log_on_device(double x) {
  return log(x);
}

template <typename Tx, typename Ty = Tx>
struct ExpLogitTransformer {
  HOSTDEVICE explicit inline ExpLogitTransformer(int n) {}

  HOSTDEVICE inline Ty operator()(const Tx& x) const {
    return static_cast<Ty>(exp_on_device(x));
  }
};

template <typename Tx, typename Ty = Tx>
struct ExpAndSum {
  using Transformer = ExpLogitTransformer<Tx>;

  inline Ty initial() { return static_cast<Ty>(0.0f); }

  __device__ __forceinline__ Ty operator()(const Ty& a, const Ty& b) const {
    return b + a;
  }
};

template <typename T>
__global__ void ScaleLogitKernel(T* logits, const float scale, const int64_t N,
                                 const int64_t D) {
  CUDA_KERNEL_LOOP(i, N * D) { logits[i] *= static_cast<T>(scale); }
}

template <typename T>
__global__ void LogitsMinusMaxKernel(T* logits, const T* logits_max_per_row,
                                     const int64_t N, const int64_t D) {
  CUDA_KERNEL_LOOP(i, N * D) {
    auto row = i / D;
    logits[i] -= logits_max_per_row[row];
  }
}

template <typename T>
__global__ void LogitsMinusLogSumKernel(T* logits, const T* logits_sum_per_row,
                                        const int64_t N, const int64_t D) {
  CUDA_KERNEL_LOOP(i, N * D) {
    auto row = i / D;
    logits[i] -= log_on_device(logits_sum_per_row[row]);
  }
}

template <typename T, typename IndexT>
__global__ void HardLabelSoftmaxWithCrossEntropyKernel(
    T* loss, T* log_softmax, const IndexT* labels, const int rank,
    const int64_t N, const int64_t D, const int* class_interval_ptr) {
  int start_index = class_interval_ptr[rank];
  CUDA_KERNEL_LOOP(i, N * D) {
    auto row = i / D;
    auto col = i % D;
    if ((col + start_index) == labels[row]) {
      auto softmax = log_softmax[i];
      loss[row] = -softmax;
      log_softmax[i] = exp_on_device(softmax);
    } else {
      log_softmax[i] = exp_on_device(log_softmax[i]);
    }
  }
}

template <typename T, typename IndexT>
__global__ void CalculateGrad(T* logits_grad, const T* loss_grad,
                              const T* logits, const IndexT* labels,
                              const float margin1, const float margin2,
                              const float scale, const int rank,
                              const int64_t N, const int64_t D,
                              const int* class_interval_ptr) {
  using MPType = typename details::MPTypeTrait<T>::Type;
  int start_index = class_interval_ptr[rank];
  CUDA_KERNEL_LOOP(i, N * D) {
    auto row = i / D;
    auto col = i % D;
    if ((col + start_index) == labels[row]) {
      logits_grad[i] = (logits_grad[i] - static_cast<T>(1.0)) * loss_grad[row];
      if (fabs(margin1 - 1.0) > 1e-8 || fabs(margin2) > 1e-8) {
        MPType dout = static_cast<MPType>(logits_grad[i]);
        MPType one = static_cast<MPType>(1.0f);
        MPType x = static_cast<MPType>(logits[i]);
        MPType m1 = static_cast<MPType>(margin1);
        MPType m2 = static_cast<MPType>(margin2);

        MPType d = m1 * sin(m1 * acos(x) + m2) / sqrt(one - x * x);
        logits_grad[i] = static_cast<T>(dout * d);
      }
    } else {
      logits_grad[i] *= loss_grad[row];
    }
    if (fabs(scale - 1.0) > 1e-8) {
      logits_grad[i] *= static_cast<T>(scale);
    }
  }
}

template <typename T>
class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const Tensor* logits = ctx.Input<Tensor>("Logits");
    const Tensor* labels = ctx.Input<Tensor>("Label");
    Tensor* softmax = ctx.Output<Tensor>("Softmax");
    Tensor* loss = ctx.Output<Tensor>("Loss");

    const int rid = ctx.Attr<int>("ring_id");
    const int nranks = ctx.Attr<int>("nranks");
    const int rank = ctx.Attr<int>("rank");

    const float margin1 = ctx.Attr<float>("margin1");
    const float margin2 = ctx.Attr<float>("margin2");
    const float margin3 = ctx.Attr<float>("margin3");
    const float scale = ctx.Attr<float>("scale");

    const auto& place = ctx.GetPlace();
    auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
    platform::NCCLComm* comm;
    gpuStream_t stream;
    if (nranks > 1) {
      comm = platform::NCCLCommContext::Instance().Get(rid, place);
      // use global calculate stream
      stream = static_cast<platform::CUDADeviceContext*>(
                   platform::DeviceContextPool::Instance().Get(place))
                   ->stream();
    }
#endif

    // allocate memory on device.
    T* softmax_ptr = softmax->mutable_data<T>(place);
    T* loss_ptr = loss->mutable_data<T>(place);

    const auto& logits_dims = logits->dims();
    const auto& labels_dims = labels->dims();

    const int axis = logits_dims.size() - 1;
    const int N = SizeToAxis(axis, logits_dims);
    const int D = SizeFromAxis(axis, logits_dims);

    int blocks = NumBlocks(N);
    int threads = kNumCUDAThreads;
    const auto& label_type = labels->type();

    // copy logits to softmax variable since we can't modify logits,
    // and it also be used when calculate grad
    framework::TensorCopy(*logits, ctx.GetPlace(), ctx.device_context(),
                          softmax);

    Tensor softmax_2d;
    softmax_2d.ShareDataWith(*softmax).Resize({N, D});
    T* logits_ptr = softmax_2d.data<T>();

    Tensor class_interval;
    GetClassInterval(dev_ctx.stream(), place, ctx.cuda_device_context(), rid,
                     rank, nranks, D, &class_interval);

    // step 1, preprocess logits
    // add margin for positive elements
    // theta = acos(x_i)
    // (cos(m1 * theta + m2) - m3)
    // save match_logits, used for gradient computation.
    if (label_type == framework::proto::VarType::INT32) {
      typedef int32_t LabelT;
      AddMarginToPositiveLogitsKernel<
          T><<<NumBlocks(N), threads, 0, dev_ctx.stream()>>>(
          logits_ptr, labels->data<LabelT>(), margin1, margin2, margin3, rank,
          nranks, N, D, class_interval.data<int>());
    } else if (label_type == framework::proto::VarType::INT64) {
      typedef int64_t LabelT;
      AddMarginToPositiveLogitsKernel<
          T><<<NumBlocks(N), threads, 0, dev_ctx.stream()>>>(
          logits_ptr, labels->data<LabelT>(), margin1, margin2, margin3, rank,
          nranks, N, D, class_interval.data<int>());
    }

    // scale by s
    ScaleLogitKernel<T><<<NumBlocks(N * D), threads, 0, dev_ctx.stream()>>>(
        logits_ptr, scale, N, D);

    // step 2, obtain logit_max
    Tensor logits_max;
    logits_max =
        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
    T* logits_max_buff = logits_max.mutable_data<T>(place);
    TensorReduceFunctorImpl<T, T, CustomMax>(softmax_2d, &logits_max, {1},
                                             dev_ctx.stream());

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
    if (nranks > 1) {
      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
          logits_max_buff, logits_max_buff, logits_max.numel(),
          platform::ToNCCLDataType(logits_max.type()), ncclMax, comm->comm(),
          stream));
    }
#endif

    // step 3, logit - logit_max
    LogitsMinusMaxKernel<T><<<NumBlocks(N * D), threads, 0, dev_ctx.stream()>>>(
        logits_ptr, logits_max_buff, N, D);

    // step 4, sum(exp(logit - logit_max))
    Tensor sum_exp_logits;
    sum_exp_logits =
        ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
    T* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
    TensorReduceFunctorImpl<T, T, ExpAndSum>(softmax_2d, &sum_exp_logits, {1},
                                             dev_ctx.stream());

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
    if (nranks > 1) {
      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
          sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(),
          platform::ToNCCLDataType(sum_exp_logits.type()), ncclSum,
          comm->comm(), stream));
    }
#endif

    // step 5, (logit - logit_max) - log(sum(exp(logit - logit_max)))
    LogitsMinusLogSumKernel<
        T><<<NumBlocks(N * D), threads, 0, dev_ctx.stream()>>>(
        logits_ptr, sum_exp_logits_buff, N, D);

    // step 6, prob = exp((logit - logit_max) - log(sum(exp(logit -
    // logit_max))))
    // loss = -((logit_i - logit_max) - log(sum(exp(logit - logit_max))))
    math::SetConstant<platform::CUDADeviceContext, T>()(dev_ctx, loss,
                                                        static_cast<T>(0.0));
    if (label_type == framework::proto::VarType::INT32) {
      typedef int32_t LabelT;
      HardLabelSoftmaxWithCrossEntropyKernel<
          T, LabelT><<<blocks, threads, 0, dev_ctx.stream()>>>(
          loss_ptr, logits_ptr, labels->data<LabelT>(), rank, N, D,
          class_interval.data<int>());
    } else if (label_type == framework::proto::VarType::INT64) {
      typedef int64_t LabelT;
      HardLabelSoftmaxWithCrossEntropyKernel<
          T, LabelT><<<blocks, threads, 0, dev_ctx.stream()>>>(
          loss_ptr, logits_ptr, labels->data<LabelT>(), rank, N, D,
          class_interval.data<int>());
    }

#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
    if (nranks > 1) {
      PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
          loss_ptr, loss_ptr, loss->numel(),
          platform::ToNCCLDataType(loss->type()), ncclSum, comm->comm(),
          stream));
    }
#endif
  }
};

template <typename T>
class MarginCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    const Tensor* labels = context.Input<Tensor>("Label");
    const Tensor* logits = context.Input<Tensor>("Logits");
    const Tensor* softmax = context.Input<Tensor>("Softmax");

    const Tensor* loss_grad =
        context.Input<Tensor>(framework::GradVarName("Loss"));
    Tensor* logit_grad =
        context.Output<Tensor>(framework::GradVarName("Logits"));

    const bool return_softmax = context.Attr<bool>("return_softmax");

    const int rid = context.Attr<int>("ring_id");
    const int nranks = context.Attr<int>("nranks");
    const int rank = context.Attr<int>("rank");

    const float margin1 = context.Attr<float>("margin1");
    const float margin2 = context.Attr<float>("margin2");
    const float margin3 = context.Attr<float>("margin3");
    const float scale = context.Attr<float>("scale");

    auto& dev_ctx =
        context.template device_context<platform::CUDADeviceContext>();

    const auto sofrmax_dims = softmax->dims();
    const int axis = sofrmax_dims.size() - 1;
    const int N = SizeToAxis(axis, sofrmax_dims);
    const int D = SizeFromAxis(axis, sofrmax_dims);

    if (return_softmax) {
      framework::TensorCopy(*softmax, context.GetPlace(),
                            context.device_context(), logit_grad);
    } else {
      logit_grad->ShareDataWith(*softmax);
    }

    int blocks = NumBlocks(N * D);
    int threads = kNumCUDAThreads;
    const auto& label_type = labels->type();

    Tensor class_interval;
    GetClassInterval(dev_ctx.stream(), context.GetPlace(),
                     context.cuda_device_context(), rid, rank, nranks, D,
                     &class_interval);

    if (label_type == framework::proto::VarType::INT32) {
      typedef int32_t LabelT;
      CalculateGrad<T, LabelT><<<blocks, threads, 0, dev_ctx.stream()>>>(
          logit_grad->data<T>(), loss_grad->data<T>(), logits->data<T>(),
          labels->data<LabelT>(), margin1, margin2, scale, rank, N, D,
          class_interval.data<int>());
    } else if (label_type == framework::proto::VarType::INT64) {
      typedef int64_t LabelT;
      CalculateGrad<T, LabelT><<<blocks, threads, 0, dev_ctx.stream()>>>(
          logit_grad->data<T>(), loss_grad->data<T>(), logits->data<T>(),
          labels->data<LabelT>(), margin1, margin2, scale, rank, N, D,
          class_interval.data<int>());
    }
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
namespace plat = paddle::platform;

REGISTER_OP_CUDA_KERNEL(margin_cross_entropy,
                        ops::MarginCrossEntropyOpCUDAKernel<float>,
                        ops::MarginCrossEntropyOpCUDAKernel<double>,
                        ops::MarginCrossEntropyOpCUDAKernel<plat::float16>);
REGISTER_OP_CUDA_KERNEL(margin_cross_entropy_grad,
                        ops::MarginCrossEntropyGradCUDAKernel<float>,
                        ops::MarginCrossEntropyGradCUDAKernel<double>,
                        ops::MarginCrossEntropyGradCUDAKernel<plat::float16>);
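The CUDA kernel computes the loss in the six numbered steps above, with one NCCL all-reduce per row statistic (max of the scaled logits, sum of their exponentials, and the per-row loss). The following is a rough NumPy sketch (not part of the commit) of that model-parallel reduction pattern; it assumes each rank already holds a [N, D_r] shard of margined, scaled logits (steps 1 and the scale), and the all-reduces are simulated by reducing over a Python list of shards. All names are illustrative.

import numpy as np

def sharded_softmax_loss(shards, labels, class_starts):
    """shards[r]: [N, D_r] scaled logits held by rank r; class_starts[r]: first global class id of that shard."""
    # steps 2-3: global per-row max (ncclAllReduce with ncclMax), then subtract it
    row_max = np.max([s.max(axis=1) for s in shards], axis=0)            # [N]
    shards = [s - row_max[:, None] for s in shards]
    # steps 4-5: global sum of exp (ncclAllReduce with ncclSum), then subtract its log
    row_sum = np.sum([np.exp(s).sum(axis=1) for s in shards], axis=0)    # [N]
    log_softmax = [s - np.log(row_sum)[:, None] for s in shards]
    # step 6: each rank contributes -log_softmax only where it owns the label column;
    # the per-row losses are then summed across ranks (ncclSum), softmax = exp(log_softmax)
    loss = np.zeros(labels.shape[0])
    for start, ls in zip(class_starts, log_softmax):
        local = labels - start
        owned = (local >= 0) & (local < ls.shape[1])
        loss[owned] -= ls[owned, local[owned]]
    return loss, [np.exp(ls) for ls in log_softmax]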
paddle/fluid/operators/margin_cross_entropy_op.h — new file (mode 100644)
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/softmax_op.h"
namespace paddle {
namespace operators {

template <typename T>
class MarginCrossEntropyOpCPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_THROW(platform::errors::Unavailable(
        "Do not support margin_cross_entropy for cpu kernel "
        "now."));
  }
};

}  // namespace operators
}  // namespace paddle
python/paddle/fluid/tests/unittests/CMakeLists.txt — modified (+3 lines, marked with "+")
@@ -28,6 +28,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_pipeline_parallel)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_tensor_parallel)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_sharding_parallel)
 list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers)
+list(APPEND DIST_TEST_OPS test_parallel_margin_cross_entropy)
 set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
 #remove distribute unittests.
 list(APPEND MIXED_DIST_TEST_OPS test_dgc_op)
@@ -195,6 +196,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM))
     LIST(REMOVE_ITEM TEST_OPS test_mixed_precision)
     LIST(REMOVE_ITEM TEST_OPS test_fleet_base_single)
     LIST(REMOVE_ITEM TEST_OPS test_dygraph_recompute)
+    LIST(REMOVE_ITEM TEST_OPS test_parallel_margin_cross_entropy)
 elseif(WITH_GPU)
     if (${CUDNN_VERSION} VERSION_LESS 7100)
         LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
@@ -906,6 +908,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL)
     set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200)
     set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120)
     set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120)
+    set_tests_properties(test_parallel_margin_cross_entropy PROPERTIES TIMEOUT 120)
     if (${NCCL_VERSION} VERSION_GREATER_EQUAL 2212)
         set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 120)
         set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT 120)
python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py — new file (mode 100644)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import print_function

import unittest
import paddle
import numpy as np
import random
import paddle.distributed as dist
import paddle.fluid as fluid
import paddle.distributed.fleet as fleet
from paddle import framework


def set_random_seed(seed):
    """Set random seed for reproducability."""
    random.seed(seed)
    np.random.seed(seed)
    paddle.seed(seed)
    fleet.meta_parallel.model_parallel_random_seed(seed)


class TestParallelMarginSoftmaxCrossEntropyOp(unittest.TestCase):
    def setUp(self):
        strategy = fleet.DistributedStrategy()
        fleet.init(is_collective=True, strategy=strategy)

    def test_parallel_margin_softmax_cross_entropy(self):
        margin1s = [1.0, 1.0, 1.35]
        margin2s = [0.5, 0.0, 0.0]
        margin3s = [0.0, 0.35, 0.0]
        scales = [64.0, 64.0, 64.0]

        rank_id = dist.get_rank()
        num_trainer = dist.get_world_size()
        batch_size = 2
        feature_length = 4
        seed = 1025
        set_random_seed(seed)
        paddle.seed(rank_id * 10)
        random.seed(seed)
        np.random.seed(seed)

        check_group = dist.new_group(list(range(num_trainer)))
        for dtype in ('float32', 'float64'):
            num_class_per_cards = [[4, 8], [2, 2], [4, 2], [3, 9]]
            for num_class_per_card in num_class_per_cards:
                num_class = np.sum(num_class_per_card)

                for margin1, margin2, margin3, scale in zip(
                        margin1s, margin2s, margin3s, scales):
                    for _ in range(5):
                        np_label = np.random.randint(0, num_class,
                                                     (batch_size, ))
                        label = paddle.to_tensor(np_label, dtype="int64")

                        input = paddle.randn(
                            shape=[batch_size, feature_length], dtype=dtype)
                        input.stop_gradient = False
                        input_l2 = paddle.sqrt(
                            paddle.sum(
                                paddle.square(input), axis=1, keepdim=True))
                        norm_input = paddle.divide(input, input_l2)

                        weight = paddle.randn(
                            shape=[
                                feature_length, num_class_per_card[rank_id]
                            ],
                            dtype=dtype)
                        weight.stop_gradient = False
                        weight_l2 = paddle.sqrt(
                            paddle.sum(
                                paddle.square(weight), axis=0, keepdim=True))
                        norm_weight = paddle.divide(weight, weight_l2)

                        data = paddle.matmul(norm_input, norm_weight)
                        data.stop_gradient = False

                        sta = np.sum(
                            num_class_per_card[:rank_id]) if rank_id > 0 else 0
                        end = np.sum(num_class_per_card[:rank_id + 1])

                        integral_data = np.zeros(
                            (batch_size, num_class), dtype=dtype)
                        integral_data[:, sta:end] = data.clone().detach(
                        ).numpy()
                        integral_data = paddle.to_tensor(
                            integral_data, dtype=dtype)
                        paddle.distributed.all_reduce(
                            integral_data,
                            op=paddle.distributed.ReduceOp.SUM,
                            group=check_group)
                        integral_data = integral_data.detach().clone()
                        integral_data.stop_gradient = False

                        # add arcface margin to logit
                        theta = paddle.acos(integral_data)
                        one_hot_label = paddle.nn.functional.one_hot(
                            label, num_classes=num_class)
                        one_hot_label.stop_gradient = False

                        if margin1 != 1.0:
                            theta = margin1 * theta
                        if margin2 != 0.0:
                            theta = theta + margin2
                        margin_cos = paddle.cos(theta)
                        if margin3 != 0.0:
                            margin_cos = margin_cos - margin3
                        diff = one_hot_label * (margin_cos - integral_data)
                        arc_data = (integral_data + diff) * scale

                        loss_a, softmax_a = paddle.nn.functional.margin_cross_entropy(
                            data,
                            label,
                            margin1=margin1,
                            margin2=margin2,
                            margin3=margin3,
                            scale=scale,
                            group=check_group,
                            return_softmax=True,
                            reduction=None)
                        loss_b, softmax_b = paddle.nn.functional.softmax_with_cross_entropy(
                            logits=arc_data,
                            label=paddle.reshape(label, (-1, 1)),
                            return_softmax=True)

                        np.testing.assert_allclose(
                            loss_a.numpy(), loss_b.numpy(), rtol=1e-5)

                        integral_prob = np.zeros(
                            (batch_size, num_class), dtype=dtype)
                        integral_prob[:, sta:end] = softmax_a.clone().detach(
                        ).numpy()
                        integral_prob = paddle.to_tensor(
                            integral_prob, dtype=dtype)
                        paddle.distributed.all_reduce(
                            integral_prob,
                            op=paddle.distributed.ReduceOp.SUM,
                            group=check_group)
                        integral_prob = integral_prob.detach().clone()
                        integral_prob.stop_gradient = False

                        np.testing.assert_allclose(
                            integral_prob.numpy(),
                            softmax_b.numpy(),
                            rtol=1e-5,
                            atol=1e-6)

                        loss_a = loss_a.sum() / batch_size
                        loss_b = loss_b.sum() / batch_size
                        loss_a.backward()
                        loss_b.backward()

                        integral_grad = np.zeros(
                            (batch_size, num_class), dtype=dtype)
                        integral_grad[:, sta:end] = data.grad.clone().detach()
                        integral_grad = paddle.to_tensor(
                            integral_grad, dtype=dtype)
                        paddle.distributed.all_reduce(
                            integral_grad,
                            op=paddle.distributed.ReduceOp.SUM,
                            group=check_group)

                        np.testing.assert_allclose(
                            integral_data.grad.numpy(),
                            integral_grad.numpy(),
                            rtol=1e-5)


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_margin_cross_entropy_op.py — new file (mode 100644)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
import numpy as np
from op_test import OpTest
import math
import random
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid import Program, program_guard


def stable_softmax_comm(x):
    shiftx = (x - np.max(x))
    deno = np.log(np.sum(np.exp(shiftx)))
    comm = shiftx - deno
    return comm


def margin_cross_entropy(logits, label, axis, margin1, margin2, margin3,
                         scale, reduction=None):
    one_hot_label = np.zeros_like(logits, dtype=logits.dtype)
    for i, lb in enumerate(label):
        one_hot_label[i, lb] = 1.0

    # add arcface margin to logit
    theta = np.arccos(logits)
    if margin1 != 1.0:
        theta = margin1 * theta
    if margin2 != 0.0:
        theta = theta + margin2
    margin_cos = np.cos(theta)
    if margin3 != 0.0:
        margin_cos = margin_cos - margin3
    diff = one_hot_label * (margin_cos - logits)
    arc_logits = (logits + diff) * scale

    comm = np.apply_along_axis(stable_softmax_comm, axis, arc_logits)
    loss = (-one_hot_label * comm).sum(axis=axis, keepdims=True)
    softmax = np.exp(comm)
    if reduction == 'mean':
        loss = np.mean(loss)
    elif reduction == 'sum':
        loss = np.sum(loss)
    return loss, softmax


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestMarginCrossEntropyOp(OpTest):
    def initParams(self):
        self.op_type = "margin_cross_entropy"
        self.axis = -1
        self.batch_dim = 5
        self.feat_dim = 41
        self.num_class = 37

    def init_loss_params(self):
        self.margin1 = 1.0
        self.margin2 = 0.5
        self.margin3 = 0.0
        self.scale = 2.0

    def init_dtype(self):
        self.dtype = np.float64

    def setUp(self):
        self.initParams()
        self.init_loss_params()
        self.init_dtype()
        datas = np.random.uniform(
            -0.99, 0.99, [self.batch_dim, self.feat_dim]).astype(self.dtype)
        datas = datas / np.sqrt(
            np.sum(np.square(datas), axis=1, keepdims=True))
        weights = np.random.uniform(
            -0.99, 0.99, [self.feat_dim, self.num_class]).astype(self.dtype)
        weights = weights / np.sqrt(
            np.sum(np.square(weights), axis=0, keepdims=True))
        logits = np.matmul(datas, weights)
        labels = np.random.randint(
            0, self.num_class, (self.batch_dim, ), dtype="int64")

        loss, softmax = margin_cross_entropy(logits, labels, self.axis,
                                             self.margin1, self.margin2,
                                             self.margin3, self.scale)

        self.inputs = {"Logits": logits, "Label": labels}
        self.outputs = {
            "Softmax": softmax.astype(self.dtype),
            "Loss": loss.astype(self.dtype)
        }
        self.attrs = {
            'margin1': self.margin1,
            'margin2': self.margin2,
            'margin3': self.margin3,
            'scale': self.scale,
        }

    def test_check_output(self):
        self.check_output_with_place(core.CUDAPlace(0), atol=1e-5)

    def test_check_grad(self):
        self.check_grad_with_place(core.CUDAPlace(0), ["Logits"], "Loss")


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestMarginCrossEntropyOpFP32(TestMarginCrossEntropyOp):
    def init_dtype(self):
        self.dtype = np.float32

    def test_check_grad(self):
        self.check_grad_with_place(
            core.CUDAPlace(0), ["Logits"],
            "Loss",
            numeric_grad_delta=5e-2,
            max_relative_error=5e-2)


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestMarginCrossEntropyOpFP16(TestMarginCrossEntropyOp):
    def init_dtype(self):
        self.dtype = np.float16

    def test_check_output(self):
        self.check_output_with_place(core.CUDAPlace(0), atol=5e-2)

    def test_check_grad(self):
        self.check_grad_with_place(
            core.CUDAPlace(0), ["Logits"],
            "Loss",
            numeric_grad_delta=6e-1,
            max_relative_error=6e-1)


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestMarginCrossEntropyOpCosFace(TestMarginCrossEntropyOp):
    def init_loss_params(self):
        self.margin1 = 1.0
        self.margin2 = 0.0
        self.margin3 = 0.35
        self.scale = 2.0


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestMarginCrossEntropyOpSphereFace(TestMarginCrossEntropyOp):
    def init_loss_params(self):
        self.margin1 = 1.35
        self.margin2 = 0.0
        self.margin3 = 0.0
        self.scale = 2.0


class TestMarginCrossEntropyOpCPU(TestMarginCrossEntropyOp):
    def test_check_output(self):
        try:
            self.check_output_with_place(core.CPUPlace(), atol=1e-5)
        except RuntimeError:
            pass

    def test_check_grad(self):
        try:
            self.check_grad_with_place(core.CPUPlace(), ["Logits"], "Loss")
        except RuntimeError:
            pass


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestMarginCrossEntropyOpV2(unittest.TestCase):
    def setUp(self):
        self.initParams()
        np.random.seed(self.seed)
        paddle.framework.random._manual_program_seed(self.seed)
        self.places = []
        if core.is_compiled_with_cuda():
            self.places.append(paddle.fluid.CUDAPlace(0))

    def initParams(self):
        self.seed = 2021
        self.axis = -1
        self.batch_dim = 5
        self.feat_dim = 41
        self.num_class = 37
        self.init_loss_params()
        self.init_dtype()
        self.init_reduction()

    def init_loss_params(self):
        self.margin1 = 1.0
        self.margin2 = 0.5
        self.margin3 = 0.0
        self.scale = 2.0

    def init_dtype(self):
        self.dtype = np.float64

    def init_reduction(self):
        self.reduction = None

    def test_static(self):
        for place in self.places:
            self.check_static_result(place=place)

    def check_static_result(self, place):
        with program_guard(Program(), Program()):
            datas = np.random.uniform(
                -0.99, 0.99,
                [self.batch_dim, self.feat_dim]).astype(self.dtype)
            datas = datas / np.sqrt(
                np.sum(np.square(datas), axis=1, keepdims=True))
            weights = np.random.uniform(
                -0.99, 0.99,
                [self.feat_dim, self.num_class]).astype(self.dtype)
            weights = weights / np.sqrt(
                np.sum(np.square(weights), axis=0, keepdims=True))
            logits_np = np.matmul(datas, weights)
            labels_np = np.random.randint(
                0, self.num_class, (self.batch_dim, ), dtype="int64")

            loss_np, softmax_np = margin_cross_entropy(
                logits_np, labels_np, self.axis, self.margin1, self.margin2,
                self.margin3, self.scale, self.reduction)

            logits = paddle.static.data(
                name='logits',
                shape=[self.batch_dim, self.num_class],
                dtype=self.dtype)
            label = paddle.static.data(
                name='label', shape=[self.batch_dim], dtype="int64")
            loss, softmax = paddle.nn.functional.margin_cross_entropy(
                logits,
                label,
                margin1=self.margin1,
                margin2=self.margin2,
                margin3=self.margin3,
                scale=self.scale,
                return_softmax=True,
                reduction=self.reduction)

            exe = paddle.fluid.Executor(place)
            [loss_res, softmax_res] = exe.run(
                paddle.fluid.default_main_program(),
                feed={'logits': logits_np,
                      'label': labels_np},
                fetch_list=[loss, softmax])
            np.testing.assert_allclose(loss_res, loss_np)
            np.testing.assert_allclose(softmax_res, softmax_np)

    def test_dynamic(self):
        for place in self.places:
            self.check_dynamic_result(place=place)

    def check_dynamic_result(self, place):
        with paddle.fluid.dygraph.guard(place):
            datas = np.random.uniform(
                -0.99, 0.99,
                [self.batch_dim, self.feat_dim]).astype(self.dtype)
            datas = datas / np.sqrt(
                np.sum(np.square(datas), axis=1, keepdims=True))
            weights = np.random.uniform(
                -0.99, 0.99,
                [self.feat_dim, self.num_class]).astype(self.dtype)
            weights = weights / np.sqrt(
                np.sum(np.square(weights), axis=0, keepdims=True))
            logits_np = np.matmul(datas, weights)
            labels_np = np.random.randint(
                0, self.num_class, (self.batch_dim, ), dtype="int64")

            loss_np, softmax_np = margin_cross_entropy(
                logits_np, labels_np, self.axis, self.margin1, self.margin2,
                self.margin3, self.scale, self.reduction)

            logits = paddle.to_tensor(logits_np, dtype=self.dtype)
            labels = paddle.to_tensor(labels_np, dtype="int64")
            loss, softmax = paddle.nn.functional.margin_cross_entropy(
                logits,
                labels,
                margin1=self.margin1,
                margin2=self.margin2,
                margin3=self.margin3,
                scale=self.scale,
                return_softmax=True,
                reduction=self.reduction)

            loss_res = loss.numpy()
            softmax_res = softmax.numpy()
            np.testing.assert_allclose(loss_res, loss_np)
            np.testing.assert_allclose(softmax_res, softmax_np)


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestMarginCrossEntropyOpV3(TestMarginCrossEntropyOpV2):
    def init_reduction(self):
        self.reduction = 'mean'


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestMarginCrossEntropyOpV4(TestMarginCrossEntropyOpV2):
    def init_reduction(self):
        self.reduction = 'sum'


@unittest.skipIf(not core.is_compiled_with_cuda(),
                 "core is not compiled with CUDA")
class TestMarginCrossEntropyOpAPIError(unittest.TestCase):
    def setUp(self):
        self.initParams()
        np.random.seed(self.seed)
        paddle.framework.random._manual_program_seed(self.seed)
        self.places = []
        if core.is_compiled_with_cuda():
            self.places.append(paddle.fluid.CUDAPlace(0))

    def initParams(self):
        self.seed = 2021
        self.axis = -1
        self.batch_dim = 10
        self.feat_dim = 41
        self.num_class = 37
        self.init_loss_params()
        self.init_dtype()

    def init_loss_params(self):
        self.margin1 = 1.0
        self.margin2 = 0.5
        self.margin3 = 0.0
        self.scale = 2.0

    def init_dtype(self):
        self.dtype = np.float64

    def test_dynamic_errors(self):
        def test_dim():
            for place in self.places:
                with paddle.fluid.dygraph.guard(place):
                    labels_np = np.random.randint(
                        0, self.num_class, (self.batch_dim, 2), dtype="int64")
                    logits_np = np.random.uniform(
                        -0.99, 0.99,
                        [self.batch_dim, self.num_class]).astype(self.dtype)
                    labels = paddle.to_tensor(labels_np)
                    logits = paddle.to_tensor(logits_np)

                    loss, softmax = paddle.nn.functional.margin_cross_entropy(
                        logits,
                        labels,
                        margin1=self.margin1,
                        margin2=self.margin2,
                        margin3=self.margin3,
                        scale=self.scale,
                        return_softmax=True,
                        reduction=None)

        self.assertRaises(ValueError, test_dim)


if __name__ == '__main__':
    unittest.main()
python/paddle/fluid/tests/unittests/test_parallel_margin_cross_entropy.py — new file (mode 100644)
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function

import unittest
import paddle.fluid as fluid

from test_parallel_dygraph_dataparallel import TestMultipleGpus


class TestParallelMarginSoftmaxWithCrossEntropy(TestMultipleGpus):
    def test_parallel_margin_cross_entropy(self):
        self.run_mnist_2gpu('parallel_margin_cross_entropy.py')


if __name__ == "__main__":
    unittest.main()
python/paddle/nn/functional/__init__.py — modified (+2 lines, marked with "+")
@@ -79,6 +79,7 @@ from .loss import npair_loss  # noqa: F401
 from .loss import sigmoid_focal_loss  # noqa: F401
 from .loss import smooth_l1_loss  # noqa: F401
 from .loss import softmax_with_cross_entropy  # noqa: F401
+from .loss import margin_cross_entropy  # noqa: F401
 from .loss import square_error_cost  # noqa: F401
 from .loss import ctc_loss  # noqa: F401
 from .norm import batch_norm  # noqa: F401
@@ -185,6 +186,7 @@ __all__ = [  #noqa
     'sigmoid_focal_loss',
     'smooth_l1_loss',
     'softmax_with_cross_entropy',
+    'margin_cross_entropy',
     'square_error_cost',
     'ctc_loss',
     'affine_grid',
python/paddle/nn/functional/loss.py — modified (+262 lines; the whole margin_cross_entropy function below is new, inserted after ctc_loss)
@@ -1092,6 +1092,268 @@ def ctc_loss(log_probs, ...)
     return loss_out


def margin_cross_entropy(logits,
                         label,
                         margin1=1.0,
                         margin2=0.5,
                         margin3=0.0,
                         scale=64.0,
                         group=None,
                         return_softmax=False,
                         reduction='mean'):
    """
    .. math::

        L=-\\frac{1}{N}\sum^N_{i=1}\log\\frac{e^{s(cos(m_{1}\\theta_{y_i}+m_{2})-m_{3})}}{e^{s(cos(m_{1}\\theta_{y_i}+m_{2})-m_{3})}+\sum^n_{j=1,j\\neq y_i} e^{scos\\theta_{y_i}}}

    where the :math:`\\theta_{y_i}` is the angle between the feature :math:`x` and
    the representation of class :math:`i`. The details of ArcFace loss
    could be referred to https://arxiv.org/abs/1801.07698.

    .. hint::
        The API supports model parallel and single GPU. And logits.shape[-1] can be different at each rank.

    Args:
        logits (Tensor): shape[N, local_num_classes], the output of the normalized X multiplied by the normalized W.
                The logits is shard_logits when using model parallel.
        label (Tensor): shape[N] or shape[N, 1], the ground truth label.
        margin1 (float, optional): m1 of margin loss, default value is `1.0`.
        margin2 (float, optional): m2 of margin loss, default value is `0.5`.
        margin3 (float, optional): m3 of margin loss, default value is `0.0`.
        scale (float, optional): s of margin loss, default value is `64.0`.
        group (Group, optional): The abstract representation of group, see paddle.distributed.collective.Group.
                Default `None`.
        return_softmax (bool, optional): Whether to return the softmax probability. Default value is `False`.
        reduction (str, optional): The candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
                If :attr:`reduction` is ``'mean'``, return the average of the loss;
                If :attr:`reduction` is ``'sum'``, return the sum of the loss;
                If :attr:`reduction` is ``'none'``, no reduction will be applied.
                Default value is `'mean'`.

    Returns:
        ``Tensor`` or Tuple of two ``Tensor`` : Return the cross entropy loss if \
            `return_softmax` is False, otherwise the tuple (loss, softmax). softmax is shard_softmax when \
            using model parallel, otherwise softmax is in the same shape as the input logits. If \
            ``reduction == None``, the shape of loss is ``[N, 1]``, otherwise the shape is ``[1]``.
Examples:
.. code-block:: python
# required: gpu
# Single GPU
import paddle
m1 = 1.0
m2 = 0.5
m3 = 0.0
s = 64.0
batch_size = 2
feature_length = 4
num_classes = 4
label = paddle.randint(low=0, high=num_classes, shape=[batch_size], dtype='int64')
X = paddle.randn(
shape=[batch_size, feature_length],
dtype='float64')
X_l2 = paddle.sqrt(paddle.sum(paddle.square(X), axis=1, keepdim=True))
X = paddle.divide(X, X_l2)
W = paddle.randn(
shape=[feature_length, num_classes],
dtype='float64')
W_l2 = paddle.sqrt(paddle.sum(paddle.square(W), axis=0, keepdim=True))
W = paddle.divide(W, W_l2)
logits = paddle.matmul(X, W)
loss, softmax = paddle.nn.functional.margin_cross_entropy(
logits, label, margin1=m1, margin2=m2, margin3=m3, scale=s, return_softmax=True, reduction=None)
print(logits)
print(label)
print(loss)
print(softmax)
#Tensor(shape=[2, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[ 0.85204151, -0.55557678, 0.04994566, 0.71986042],
# [-0.20198586, -0.35270476, -0.55182702, 0.09749021]])
#Tensor(shape=[2], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [2, 3])
#Tensor(shape=[2, 1], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[82.37059586],
# [12.13448420]])
#Tensor(shape=[2, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[0.99978819, 0.00000000, 0.00000000, 0.00021181],
# [0.99992995, 0.00006468, 0.00000000, 0.00000537]])
.. code-block:: python
# required: distributed
# Multi GPU, test_margin_cross_entropy.py
import paddle
import paddle.distributed as dist
strategy = dist.fleet.DistributedStrategy()
dist.fleet.init(is_collective=True, strategy=strategy)
rank_id = dist.get_rank()
m1 = 1.0
m2 = 0.5
m3 = 0.0
s = 64.0
batch_size = 2
feature_length = 4
num_class_per_card = [4, 8]
num_classes = paddle.sum(paddle.to_tensor(num_class_per_card))
label = paddle.randint(low=0, high=num_classes.item(), shape=[batch_size], dtype='int64')
label_list = []
dist.all_gather(label_list, label)
label = paddle.concat(label_list, axis=0)
X = paddle.randn(
shape=[batch_size, feature_length],
dtype='float64')
X_list = []
dist.all_gather(X_list, X)
X = paddle.concat(X_list, axis=0)
X_l2 = paddle.sqrt(paddle.sum(paddle.square(X), axis=1, keepdim=True))
X = paddle.divide(X, X_l2)
W = paddle.randn(
shape=[feature_length, num_class_per_card[rank_id]],
dtype='float64')
W_l2 = paddle.sqrt(paddle.sum(paddle.square(W), axis=0, keepdim=True))
W = paddle.divide(W, W_l2)
logits = paddle.matmul(X, W)
loss, softmax = paddle.nn.functional.margin_cross_entropy(
logits, label, margin1=m1, margin2=m2, margin3=m3, scale=s, return_softmax=True, reduction=None)
print(logits)
print(label)
print(loss)
print(softmax)
# python -m paddle.distributed.launch --gpus=0,1 test_margin_cross_entropy.py
## for rank0 input
#Tensor(shape=[4, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[ 0.32888934, 0.02408748, -0.02763289, 0.18173063],
# [-0.52893978, -0.10623845, -0.21596515, -0.06432517],
# [-0.00536345, -0.03924667, 0.66735314, -0.28640926],
# [-0.09907366, -0.48534973, -0.10365338, -0.39472322]])
#Tensor(shape=[4], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [11, 1 , 10, 11])
## for rank1 input
#Tensor(shape=[4, 8], dtype=float64, place=CUDAPlace(1), stop_gradient=True,
# [[ 0.68654754, 0.28137170, 0.69694954, -0.60923933, -0.57077653, 0.54576703, -0.38709028, 0.56028204],
# [-0.80360371, -0.03042448, -0.45107338, 0.49559349, 0.69998950, -0.45411693, 0.61927630, -0.82808600],
# [ 0.11457570, -0.34785879, -0.68819499, -0.26189226, -0.48241491, -0.67685711, 0.06510185, 0.49660849],
# [ 0.31604851, 0.52087884, 0.53124749, -0.86176582, -0.43426329, 0.34786144, -0.10850784, 0.51566383]])
#Tensor(shape=[4], dtype=int64, place=CUDAPlace(1), stop_gradient=True,
# [11, 1 , 10, 11])
## for rank0 output
#Tensor(shape=[4, 1], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[38.96608230],
# [81.28152394],
# [69.67229865],
# [31.74197251]])
#Tensor(shape=[4, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[0.00000000, 0.00000000, 0.00000000, 0.00000000],
# [0.00000000, 0.00000000, 0.00000000, 0.00000000],
# [0.00000000, 0.00000000, 0.99998205, 0.00000000],
# [0.00000000, 0.00000000, 0.00000000, 0.00000000]])
## for rank1 output
#Tensor(shape=[4, 1], dtype=float64, place=CUDAPlace(1), stop_gradient=True,
# [[38.96608230],
# [81.28152394],
# [69.67229865],
# [31.74197251]])
#Tensor(shape=[4, 8], dtype=float64, place=CUDAPlace(1), stop_gradient=True,
# [[0.33943993, 0.00000000, 0.66051859, 0.00000000, 0.00000000, 0.00004148, 0.00000000, 0.00000000],
# [0.00000000, 0.00000000, 0.00000000, 0.00000207, 0.99432097, 0.00000000, 0.00567696, 0.00000000],
# [0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00001795],
# [0.00000069, 0.33993085, 0.66006319, 0.00000000, 0.00000000, 0.00000528, 0.00000000, 0.00000000]])
"""
    assert reduction in ['mean', 'sum', 'none', None]
    if group is not None and not group.is_member():
        return

    ring_id = 0 if group is None else group.id
    rank = 0
    nranks = 1
    if core.is_compiled_with_dist():
        parallel_env = paddle.distributed.ParallelEnv()
        global_rank = parallel_env.rank
        rank = global_rank if group is None else group.get_group_rank(
            global_rank)
        nranks = parallel_env.world_size if group is None else group.nranks

    input_dims = len(list(logits.shape))
    label_dims = len(list(label.shape))
    if input_dims - 1 != label_dims and input_dims != label_dims:
        raise ValueError(
            'Expected input_dims - 1 = label_dims or input_dims == label_dims\
             (got input_dims{}, label_dims{})'.format(input_dims, label_dims))
    if input_dims - 1 == label_dims:
        label = paddle.unsqueeze(label, axis=-1)

    if in_dygraph_mode():
        softmax, loss = core.ops.margin_cross_entropy(
            logits, label, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks,
            'margin1', margin1, 'margin2', margin2, 'margin3', margin3,
            'scale', scale, 'return_softmax', return_softmax)
        if reduction == 'mean':
            loss = paddle.mean(loss)
        elif reduction == 'sum':
            loss = paddle.sum(loss)
        if not return_softmax:
            return loss
        else:
            return loss, softmax

    op_type = 'margin_cross_entropy'
    helper = LayerHelper(op_type, **locals())
    softmax = helper.create_variable_for_type_inference(dtype=logits.dtype)
    loss = helper.create_variable_for_type_inference(dtype=logits.dtype)

    check_variable_and_dtype(logits, 'logits',
                             ['float16', 'float32', 'float64'],
                             'margin_cross_entropy')
    check_variable_and_dtype(label, 'label', ['int32', 'int64'],
                             'margin_cross_entropy')

    helper.append_op(
        type=op_type,
        inputs={'Logits': logits,
                'Label': label},
        outputs={'Softmax': softmax,
                 'Loss': loss},
        attrs={
            'return_softmax': return_softmax,
            'ring_id': ring_id,
            'rank': rank,
            'nranks': nranks,
            'margin1': margin1,
            'margin2': margin2,
            'margin3': margin3,
            'scale': scale,
        })

    if reduction == 'mean':
        loss = paddle.mean(loss)
    elif reduction == 'sum':
        loss = paddle.sum(loss)

    if not return_softmax:
        return loss
    else:
        return loss, softmax


@deprecated(
    since="2.0.0",
    update_to="paddle.nn.functional.cross_entropy",
    ...
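One design note on the wrapper above: reduction is applied after the op itself, so 'mean' and 'sum' are simply paddle.mean/paddle.sum over the [N, 1] per-sample loss that reduction=None returns. A small hedged check of that equivalence, assuming a GPU build (only the CUDA kernel is registered in this commit); the tensor shapes are illustrative.

import paddle
import paddle.nn.functional as F

paddle.seed(0)
# row-normalized features and column-normalized class weights, as in the docstring example
x = F.normalize(paddle.randn([4, 8], dtype='float64'), axis=1)
w = F.normalize(paddle.randn([8, 16], dtype='float64'), axis=0)
logits = paddle.matmul(x, w)
label = paddle.randint(0, 16, [4], dtype='int64')

per_sample = F.margin_cross_entropy(logits, label, reduction=None)   # shape [4, 1]
mean_loss = F.margin_cross_entropy(logits, label, reduction='mean')  # shape [1]
assert abs(float(paddle.mean(per_sample)) - float(mean_loss)) < 1e-8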
tools/static_mode_white_list.py — modified (+1 line, marked with "+")
@@ -719,4 +719,5 @@ STATIC_MODE_TESTING_LIST = [
     'test_sgd_op_bf16',
     'test_marker_op',
     'test_c_embedding_op',
+    'test_margin_cross_entropy_op',
 ]