Unverified commit b0cb4148, authored by Guoxia Wang, committed by GitHub

support margin loss (arcface, cosface, sphereface) for single GPU and cross GPUs (#34247)

* support margin loss (arcface, cosface, sphereface)
Parent commit dc439a12
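For orientation before the diff: the three losses are instances of one combined margin formula, with ArcFace at (m1, m2, m3) = (1.0, 0.5, 0.0), CosFace at (1.0, 0.0, 0.35) and SphereFace at (1.35, 0.0, 0.0). Below is a minimal single-card NumPy sketch of that formula; the function name and arguments are illustrative only, not code from this PR.

# A minimal, single-card NumPy sketch of the combined margin softmax cross
# entropy (names are illustrative, not part of the PR). `logits` are the cosine
# similarities produced by normalized features times normalized weights.
import numpy as np

def margin_softmax_cross_entropy(logits, labels, m1=1.0, m2=0.5, m3=0.0, s=64.0):
    rows = np.arange(labels.shape[0])
    theta = np.arccos(np.clip(logits[rows, labels], -1.0, 1.0))
    target = logits.copy()
    # the margin cos(m1 * theta + m2) - m3 is applied to the target logit only
    target[rows, labels] = np.cos(m1 * theta + m2) - m3
    scaled = s * target
    # numerically stable log-softmax
    shifted = scaled - scaled.max(axis=1, keepdims=True)
    log_prob = shifted - np.log(np.exp(shifted).sum(axis=1, keepdims=True))
    return -log_prob[rows, labels]  # per-sample loss, shape [N]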
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/margin_cross_entropy_op.h"
namespace paddle {
namespace operators {
class MarginCrossEntropyOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
OP_INOUT_CHECK(ctx->HasInput("Logits"), "Input", "Logits",
"MarginCrossEntropyOp");
OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label",
"MarginCrossEntropyOp");
OP_INOUT_CHECK(ctx->HasOutput("Softmax"), "Output", "Softmax",
"MarginCrossEntropyOp");
OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss",
"MarginCrossEntropyOp");
auto logits_dims = ctx->GetInputDim("Logits");
auto labels_dims = ctx->GetInputDim("Label");
auto logits_rank = logits_dims.size();
auto axis = logits_rank - 1;
for (int i = 0; i < logits_rank; i++) {
if (i != axis) {
if (ctx->IsRuntime() || (logits_dims[i] > 0 && labels_dims[i] > 0)) {
PADDLE_ENFORCE_EQ(logits_dims[i], labels_dims[i],
platform::errors::InvalidArgument(
"Input(Logits) and Input(Label) should in "
"same shape in dimensions except axis."));
}
}
}
if (labels_dims.size() > 1) {
PADDLE_ENFORCE_EQ(
labels_dims[logits_rank - 1], 1UL,
platform::errors::InvalidArgument(
"the last dimension of Input(Label) should be 1."
"But received: the last dimension of Input(Label) is [%d],"
"the last dimension is [%d]",
labels_dims[logits_rank - 1], logits_rank - 1));
}
ctx->SetOutputDim("Softmax", logits_dims);
logits_dims[axis] = 1;
ctx->SetOutputDim("Loss", logits_dims);
ctx->ShareLoD("Logits", /*->*/ "Softmax");
ctx->ShareLoD("Logits", /*->*/ "Loss");
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "Logits"),
ctx.device_context());
}
};
class MarginCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("Logits",
"(Tensor, default: Tensor<float>), The input tensor of unscaled "
"log probabilities, whose dimension :attr:`axis` should be scaled "
"by softmax.");
AddInput(
"Label",
"(Tensor) The input tensor of groud truth label. Label is a "
"Tensor<int64> in same shape with Input(Logits) except the shape in "
"dimension :attr:`axis` as 1.");
AddOutput(
"Softmax",
"(Tensor, default: Tensor<float>), A tensor in same shape with "
"Input(Logits). "
"The outputs value of softmax activation by given the input batch, "
"which will be used in backward calculation.");
AddOutput("Loss",
"(Tensor, default: Tensor<float>), A tensor in same shape with "
"Input(Logits) "
"except the shape in dimension :attr:`axis` as 1. The cross "
"entropy loss.");
AddAttr<bool>("return_softmax",
"(bool default false) A flag to indicate "
"whether to return softmax.")
.SetDefault(false);
AddAttr<int>("ring_id", "(int default 0) nccl communication ring id.")
.SetDefault(0);
AddAttr<int>("rank", "(int default 0) rank id for MarginCrossEntropy.")
.SetDefault(0);
AddAttr<int>("nranks", "(int default 1) nranks id for MarginCrossEntropy.")
.SetDefault(1);
AddAttr<float>("margin1", "(float default 1.0) margin1 for MarginLoss.")
.SetDefault(1.0);
AddAttr<float>("margin2", "(float default 0.5) margin2 for MarginLoss.")
.SetDefault(0.5);
AddAttr<float>("margin3", "(float default 0.0) margin3 for MarginLoss.")
.SetDefault(0.0);
AddAttr<float>("scale", "(float default 64.0) scale for MarginLoss.")
.SetDefault(64.0);
AddComment(R"DOC(
MarginCrossEntropy Operator
.. math::
    L=-\frac{1}{N}\sum^N_{i=1}\log\frac{e^{s(\cos(m_{1}\theta_{y_i}+m_{2})-m_{3})}}{e^{s(\cos(m_{1}\theta_{y_i}+m_{2})-m_{3})}+\sum^n_{j=1,j\neq y_i} e^{s\cos\theta_{j}}}
where :math:`\theta_{y_i}` is the angle between the feature :math:`x_i` and
the weight representing class :math:`y_i`. The details of ArcFace loss
could be referred to https://arxiv.org/abs/1801.07698.
Note that this op supports both model parallel and single GPU, and Logits.shape[-1] can be different on each rank.
)DOC");
}
};
class MarginCrossEntropyOpGrad : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Loss")), true,
platform::errors::InvalidArgument(
"Input(Loss@Grad) should not be null."));
PADDLE_ENFORCE_EQ(ctx->HasInput("Softmax"), true,
platform::errors::InvalidArgument(
"Input(Softmax) should be not null."));
PADDLE_ENFORCE_EQ(
ctx->HasInput("Label"), true,
platform::errors::InvalidArgument("Input(Label) should be not null."));
PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Logits")), true,
platform::errors::InvalidArgument(
"Output(Logits@Grad) should be not null."));
ctx->SetOutputDim(framework::GradVarName("Logits"),
ctx->GetInputDim("Softmax"));
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
ctx, framework::GradVarName("Loss")),
ctx.device_context());
}
};
template <typename T>
class MarginCrossEntropyOpGradMaker : public framework::SingleGradOpMaker<T> {
public:
using framework::SingleGradOpMaker<T>::SingleGradOpMaker;
protected:
void Apply(GradOpPtr<T> op) const override {
op->SetType("margin_cross_entropy_grad");
op->SetInput("Softmax", this->Output("Softmax"));
op->SetInput("Logits", this->Input("Logits"));
op->SetInput("Label", this->Input("Label"));
op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss"));
op->SetAttrMap(this->Attrs());
op->SetOutput(framework::GradVarName("Logits"), this->InputGrad("Logits"));
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OPERATOR(
margin_cross_entropy, ops::MarginCrossEntropyOp,
ops::MarginCrossEntropyOpMaker,
ops::MarginCrossEntropyOpGradMaker<paddle::framework::OpDesc>,
ops::MarginCrossEntropyOpGradMaker<paddle::imperative::OpBase>);
REGISTER_OPERATOR(margin_cross_entropy_grad, ops::MarginCrossEntropyOpGrad);
REGISTER_OP_CPU_KERNEL(margin_cross_entropy,
ops::MarginCrossEntropyOpCPUKernel<float>,
ops::MarginCrossEntropyOpCPUKernel<double>,
ops::MarginCrossEntropyOpCPUKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_WITH_HIP
#include <hipcub/hipcub.hpp>
namespace cub = hipcub;
#else
#include <cub/cub.cuh>
#endif
#include <vector>
#include "paddle/fluid/operators/amp/fp16_type_traits.h"
#include "paddle/fluid/operators/margin_cross_entropy_op.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/softmax_impl.h"
#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h"
#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
#include "paddle/fluid/string/string_helper.h"
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/fluid/platform/nccl_helper.h"
#endif
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
static constexpr int kNumCUDAThreads = 512;
static constexpr int kNumMaxinumNumBlocks = 4096;
static inline int NumBlocks(const int N) {
return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads,
kNumMaxinumNumBlocks);
}
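// GetClassInterval builds the prefix sums of the per-rank class counts so each
// rank knows its global class-id offset: every rank contributes its local
// count D, the counts are summed with ncclAllReduce, and cub::DeviceScan turns
// them into [0, D_0, D_0 + D_1, ...]. With a single rank it returns {0, D}.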
void GetClassInterval(const gpuStream_t& stream, const platform::Place& place,
const platform::DeviceContext& ctx, const int rid,
const int rank, const int nranks, const int D,
Tensor* class_interval) {
std::vector<int> shard_dim_vec(nranks + 1, 0);
shard_dim_vec[rank + 1] = D;
if (nranks <= 1) {
framework::TensorFromVector(shard_dim_vec, ctx, class_interval);
return;
}
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
Tensor num_classes_per_device;
framework::TensorFromVector(shard_dim_vec, ctx, &num_classes_per_device);
int* num_classes_per_device_ptr = num_classes_per_device.data<int>();
const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place);
// use global calculate stream
const auto calcu_stream =
static_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
num_classes_per_device_ptr, num_classes_per_device_ptr,
num_classes_per_device.numel(),
platform::ToNCCLDataType(num_classes_per_device.type()), ncclSum,
comm->comm(), calcu_stream));
auto class_interval_ptr =
class_interval->mutable_data<int>({nranks + 1}, place);
size_t cub_temp_storage_bytes = 0;
cub::DeviceScan::InclusiveSum<int*, int*>(
nullptr, cub_temp_storage_bytes, nullptr, nullptr, nranks + 1, stream);
auto cub_temp_storage = memory::Alloc(place, cub_temp_storage_bytes);
cub::DeviceScan::InclusiveSum<int*, int*>(
cub_temp_storage->ptr(), cub_temp_storage_bytes,
num_classes_per_device_ptr, class_interval_ptr, nranks + 1, stream);
return;
#endif
}
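// Applies the margin to the target logit of each sample when its label lies in
// this rank's class interval [start_index, end_index):
//   theta = acos(x);  x' = cos(margin1 * theta + margin2) - margin3.
// Logits of non-target classes (and labels owned by other ranks) are untouched.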
template <typename T, typename IndexT>
__global__ void AddMarginToPositiveLogitsKernel(
T* logit, const IndexT* label, const float margin1, const float margin2,
const float margin3, const int rank, const int nranks, const int64_t N,
const int64_t D, const int* class_interval_ptr) {
using MPType = typename details::MPTypeTrait<T>::Type;
int start_index = class_interval_ptr[rank];
int end_index = class_interval_ptr[rank + 1];
int num_classes = class_interval_ptr[nranks];
CUDA_KERNEL_LOOP(i, N) {
auto real_label = label[i];
PADDLE_ENFORCE((real_label < num_classes) && (real_label >= 0),
"The index is out of bounds, "
"please check whether the value of label and "
"input meet the number of class. It should "
"be less than [%d], but received [%d]",
num_classes, real_label);
if (real_label >= start_index && real_label < end_index) {
int64_t offset = i * D + real_label - start_index;
if (fabs(margin1 - 1.0) > 1e-8 || fabs(margin2) > 1e-8) {
MPType x = static_cast<MPType>(logit[offset]);
MPType theta = acos(x);
if (fabs(margin1 - 1.0) > 1e-8) {
theta *= static_cast<MPType>(margin1);
}
if (fabs(margin2) > 1e-8) {
theta += static_cast<MPType>(margin2);
}
logit[offset] = static_cast<T>(cos(theta));
}
if (fabs(margin3) > 1e-8) {
MPType y = static_cast<MPType>(logit[offset]);
y -= static_cast<MPType>(margin3);
logit[offset] = static_cast<T>(y);
}
}
}
}
static __device__ __forceinline__ platform::float16 exp_on_device(
platform::float16 x) {
return ::Eigen::numext::exp(x);
}
static __device__ __forceinline__ float exp_on_device(float x) {
return expf(x);
}
static __device__ __forceinline__ double exp_on_device(double x) {
return exp(x);
}
static __device__ __forceinline__ platform::float16 log_on_device(
platform::float16 x) {
return ::Eigen::numext::log(x);
}
static __device__ __forceinline__ float log_on_device(float x) {
return logf(x);
}
static __device__ __forceinline__ double log_on_device(double x) {
return log(x);
}
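// The exp_on_device/log_on_device helpers above pick the right device function
// for each dtype. ExpAndSum below is the reduction functor handed to
// TensorReduceFunctorImpl to compute the row-wise sum of exp(logit) that forms
// the softmax denominator.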
template <typename Tx, typename Ty = Tx>
struct ExpLogitTransformer {
HOSTDEVICE explicit inline ExpLogitTransformer(int n) {}
HOSTDEVICE inline Ty operator()(const Tx& x) const {
return static_cast<Ty>(exp_on_device(x));
}
};
template <typename Tx, typename Ty = Tx>
struct ExpAndSum {
using Transformer = ExpLogitTransformer<Tx>;
inline Ty initial() { return static_cast<Ty>(0.0f); }
__device__ __forceinline__ Ty operator()(const Ty& a, const Ty& b) const {
return b + a;
}
};
template <typename T>
__global__ void ScaleLogitKernel(T* logits, const float scale, const int64_t N,
const int64_t D) {
CUDA_KERNEL_LOOP(i, N * D) { logits[i] *= static_cast<T>(scale); }
}
template <typename T>
__global__ void LogitsMinusMaxKernel(T* logits, const T* logits_max_per_row,
const int64_t N, const int64_t D) {
CUDA_KERNEL_LOOP(i, N * D) {
auto row = i / D;
logits[i] -= logits_max_per_row[row];
}
}
template <typename T>
__global__ void LogitsMinusLogSumKernel(T* logits, const T* logits_sum_per_row,
const int64_t N, const int64_t D) {
CUDA_KERNEL_LOOP(i, N * D) {
auto row = i / D;
logits[i] -= log_on_device(logits_sum_per_row[row]);
}
}
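// At this point the buffer holds row-wise log-probabilities. This kernel reads
// the target column (offset by this rank's class interval), writes
// loss[row] = -log_prob there, and converts the buffer to softmax in place via
// exp so it can be reused in the backward pass.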
template <typename T, typename IndexT>
__global__ void HardLabelSoftmaxWithCrossEntropyKernel(
T* loss, T* log_softmax, const IndexT* labels, const int rank,
const int64_t N, const int64_t D, const int* class_interval_ptr) {
int start_index = class_interval_ptr[rank];
CUDA_KERNEL_LOOP(i, N * D) {
auto row = i / D;
auto col = i % D;
if ((col + start_index) == labels[row]) {
auto softmax = log_softmax[i];
loss[row] = -softmax;
log_softmax[i] = exp_on_device(softmax);
} else {
log_softmax[i] = exp_on_device(log_softmax[i]);
}
}
}
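// Backward of the scaled margin softmax. For the target column the gradient is
// (softmax - 1) * dloss, multiplied, when margins are active, by the derivative
// of the margin mapping,
//   d/dx cos(m1 * acos(x) + m2) = m1 * sin(m1 * acos(x) + m2) / sqrt(1 - x^2);
// for other columns it is softmax * dloss. The result is finally scaled by s.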
template <typename T, typename IndexT>
__global__ void CalculateGrad(T* logits_grad, const T* loss_grad,
const T* logits, const IndexT* labels,
const float margin1, const float margin2,
const float scale, const int rank,
const int64_t N, const int64_t D,
const int* class_interval_ptr) {
using MPType = typename details::MPTypeTrait<T>::Type;
int start_index = class_interval_ptr[rank];
CUDA_KERNEL_LOOP(i, N * D) {
auto row = i / D;
auto col = i % D;
if ((col + start_index) == labels[row]) {
logits_grad[i] = (logits_grad[i] - static_cast<T>(1.0)) * loss_grad[row];
if (fabs(margin1 - 1.0) > 1e-8 || fabs(margin2) > 1e-8) {
MPType dout = static_cast<MPType>(logits_grad[i]);
MPType one = static_cast<MPType>(1.0f);
MPType x = static_cast<MPType>(logits[i]);
MPType m1 = static_cast<MPType>(margin1);
MPType m2 = static_cast<MPType>(margin2);
MPType d = m1 * sin(m1 * acos(x) + m2) / sqrt(one - x * x);
logits_grad[i] = static_cast<T>(dout * d);
}
} else {
logits_grad[i] *= loss_grad[row];
}
if (fabs(scale - 1.0) > 1e-8) {
logits_grad[i] *= static_cast<T>(scale);
}
}
}
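// Forward pass: (1) copy Logits into Softmax so the input stays untouched,
// (2) add the margin to the target logits and scale by s, (3) compute a
// numerically stable softmax, using ncclAllReduce for the row-wise max and the
// exp-sum when nranks > 1 so each rank only holds its shard of the classes,
// and (4) extract the per-sample loss and sum it across ranks.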
template <typename T>
class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const Tensor* logits = ctx.Input<Tensor>("Logits");
const Tensor* labels = ctx.Input<Tensor>("Label");
Tensor* softmax = ctx.Output<Tensor>("Softmax");
Tensor* loss = ctx.Output<Tensor>("Loss");
const int rid = ctx.Attr<int>("ring_id");
const int nranks = ctx.Attr<int>("nranks");
const int rank = ctx.Attr<int>("rank");
const float margin1 = ctx.Attr<float>("margin1");
const float margin2 = ctx.Attr<float>("margin2");
const float margin3 = ctx.Attr<float>("margin3");
const float scale = ctx.Attr<float>("scale");
const auto& place = ctx.GetPlace();
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
platform::NCCLComm* comm;
gpuStream_t stream;
if (nranks > 1) {
comm = platform::NCCLCommContext::Instance().Get(rid, place);
// use global calculate stream
stream = static_cast<platform::CUDADeviceContext*>(
platform::DeviceContextPool::Instance().Get(place))
->stream();
}
#endif
// allocate memory on device.
T* softmax_ptr = softmax->mutable_data<T>(place);
T* loss_ptr = loss->mutable_data<T>(place);
const auto& logits_dims = logits->dims();
const auto& labels_dims = labels->dims();
const int axis = logits_dims.size() - 1;
const int N = SizeToAxis(axis, logits_dims);
const int D = SizeFromAxis(axis, logits_dims);
int blocks = NumBlocks(N);
int threads = kNumCUDAThreads;
const auto& label_type = labels->type();
    // copy logits to the softmax variable since we can't modify logits,
    // and it will also be used when calculating the gradient
framework::TensorCopy(*logits, ctx.GetPlace(), ctx.device_context(),
softmax);
Tensor softmax_2d;
softmax_2d.ShareDataWith(*softmax).Resize({N, D});
T* logits_ptr = softmax_2d.data<T>();
Tensor class_interval;
GetClassInterval(dev_ctx.stream(), place, ctx.cuda_device_context(), rid,
rank, nranks, D, &class_interval);
// step 1, preprocess logits
// add margin for positive elements
// theta = acos(x_i)
// (cos(m1 * theta + m2) - m3)
// save match_logits, used for gradient computation.
if (label_type == framework::proto::VarType::INT32) {
typedef int32_t LabelT;
AddMarginToPositiveLogitsKernel<
T><<<NumBlocks(N), threads, 0, dev_ctx.stream()>>>(
logits_ptr, labels->data<LabelT>(), margin1, margin2, margin3, rank,
nranks, N, D, class_interval.data<int>());
} else if (label_type == framework::proto::VarType::INT64) {
typedef int64_t LabelT;
AddMarginToPositiveLogitsKernel<
T><<<NumBlocks(N), threads, 0, dev_ctx.stream()>>>(
logits_ptr, labels->data<LabelT>(), margin1, margin2, margin3, rank,
nranks, N, D, class_interval.data<int>());
}
// scale by s
ScaleLogitKernel<T><<<NumBlocks(N * D), threads, 0, dev_ctx.stream()>>>(
logits_ptr, scale, N, D);
// step 2, obtain logit_max
Tensor logits_max;
logits_max =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
T* logits_max_buff = logits_max.mutable_data<T>(place);
TensorReduceFunctorImpl<T, T, CustomMax>(softmax_2d, &logits_max, {1},
dev_ctx.stream());
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (nranks > 1) {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
logits_max_buff, logits_max_buff, logits_max.numel(),
platform::ToNCCLDataType(logits_max.type()), ncclMax, comm->comm(),
stream));
}
#endif
// step 3, logit - logit_max
LogitsMinusMaxKernel<T><<<NumBlocks(N * D), threads, 0, dev_ctx.stream()>>>(
logits_ptr, logits_max_buff, N, D);
// step 4, sum(exp(logit - logit_max))
Tensor sum_exp_logits;
sum_exp_logits =
ctx.AllocateTmpTensor<T, platform::CUDADeviceContext>({N, 1}, dev_ctx);
T* sum_exp_logits_buff = sum_exp_logits.mutable_data<T>(place);
TensorReduceFunctorImpl<T, T, ExpAndSum>(softmax_2d, &sum_exp_logits, {1},
dev_ctx.stream());
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (nranks > 1) {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(),
platform::ToNCCLDataType(sum_exp_logits.type()), ncclSum,
comm->comm(), stream));
}
#endif
// step 5, (logit - logit_max) - log(sum(exp(logit - logit_max)))
LogitsMinusLogSumKernel<
T><<<NumBlocks(N * D), threads, 0, dev_ctx.stream()>>>(
logits_ptr, sum_exp_logits_buff, N, D);
// step 6, prob = exp((logit - logit_max) - log(sum(exp(logit -
// logit_max))))
// loss = -((logit_i - logit_max) - log(sum(exp(logit - logit_max))))
math::SetConstant<platform::CUDADeviceContext, T>()(dev_ctx, loss,
static_cast<T>(0.0));
if (label_type == framework::proto::VarType::INT32) {
typedef int32_t LabelT;
HardLabelSoftmaxWithCrossEntropyKernel<
T, LabelT><<<blocks, threads, 0, dev_ctx.stream()>>>(
loss_ptr, logits_ptr, labels->data<LabelT>(), rank, N, D,
class_interval.data<int>());
} else if (label_type == framework::proto::VarType::INT64) {
typedef int64_t LabelT;
HardLabelSoftmaxWithCrossEntropyKernel<
T, LabelT><<<blocks, threads, 0, dev_ctx.stream()>>>(
loss_ptr, logits_ptr, labels->data<LabelT>(), rank, N, D,
class_interval.data<int>());
}
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
if (nranks > 1) {
PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce(
loss_ptr, loss_ptr, loss->numel(),
platform::ToNCCLDataType(loss->type()), ncclSum, comm->comm(),
stream));
}
#endif
}
};
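// Backward pass: start Logits@GRAD from the saved Softmax (copied when
// return_softmax is true, otherwise shared in place) and let CalculateGrad
// apply the chain rule through the margin and the scale.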
template <typename T>
class MarginCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* labels = context.Input<Tensor>("Label");
const Tensor* logits = context.Input<Tensor>("Logits");
const Tensor* softmax = context.Input<Tensor>("Softmax");
const Tensor* loss_grad =
context.Input<Tensor>(framework::GradVarName("Loss"));
Tensor* logit_grad =
context.Output<Tensor>(framework::GradVarName("Logits"));
const bool return_softmax = context.Attr<bool>("return_softmax");
const int rid = context.Attr<int>("ring_id");
const int nranks = context.Attr<int>("nranks");
const int rank = context.Attr<int>("rank");
const float margin1 = context.Attr<float>("margin1");
const float margin2 = context.Attr<float>("margin2");
const float margin3 = context.Attr<float>("margin3");
const float scale = context.Attr<float>("scale");
auto& dev_ctx =
context.template device_context<platform::CUDADeviceContext>();
    const auto softmax_dims = softmax->dims();
    const int axis = softmax_dims.size() - 1;
    const int N = SizeToAxis(axis, softmax_dims);
    const int D = SizeFromAxis(axis, softmax_dims);
if (return_softmax) {
framework::TensorCopy(*softmax, context.GetPlace(),
context.device_context(), logit_grad);
} else {
logit_grad->ShareDataWith(*softmax);
}
int blocks = NumBlocks(N * D);
int threads = kNumCUDAThreads;
const auto& label_type = labels->type();
Tensor class_interval;
GetClassInterval(dev_ctx.stream(), context.GetPlace(),
context.cuda_device_context(), rid, rank, nranks, D,
&class_interval);
if (label_type == framework::proto::VarType::INT32) {
typedef int32_t LabelT;
CalculateGrad<T, LabelT><<<blocks, threads, 0, dev_ctx.stream()>>>(
logit_grad->data<T>(), loss_grad->data<T>(), logits->data<T>(),
labels->data<LabelT>(), margin1, margin2, scale, rank, N, D,
class_interval.data<int>());
} else if (label_type == framework::proto::VarType::INT64) {
typedef int64_t LabelT;
CalculateGrad<T, LabelT><<<blocks, threads, 0, dev_ctx.stream()>>>(
logit_grad->data<T>(), loss_grad->data<T>(), logits->data<T>(),
labels->data<LabelT>(), margin1, margin2, scale, rank, N, D,
class_interval.data<int>());
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(margin_cross_entropy,
ops::MarginCrossEntropyOpCUDAKernel<float>,
ops::MarginCrossEntropyOpCUDAKernel<double>,
ops::MarginCrossEntropyOpCUDAKernel<plat::float16>);
REGISTER_OP_CUDA_KERNEL(margin_cross_entropy_grad,
ops::MarginCrossEntropyGradCUDAKernel<float>,
ops::MarginCrossEntropyGradCUDAKernel<double>,
ops::MarginCrossEntropyGradCUDAKernel<plat::float16>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/softmax_op.h"
namespace paddle {
namespace operators {
template <typename T>
class MarginCrossEntropyOpCPUKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
    PADDLE_THROW(platform::errors::Unavailable(
        "margin_cross_entropy is not supported on CPU "
        "for now."));
}
};
} // namespace operators
} // namespace paddle
@@ -28,6 +28,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_pipeline_parallel)
list(APPEND DIST_TEST_OPS test_parallel_dygraph_tensor_parallel)
list(APPEND DIST_TEST_OPS test_parallel_dygraph_sharding_parallel)
list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers)
list(APPEND DIST_TEST_OPS test_parallel_margin_cross_entropy)
set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS})
#remove distribute unittests.
list(APPEND MIXED_DIST_TEST_OPS test_dgc_op)
@@ -195,6 +196,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM))
    LIST(REMOVE_ITEM TEST_OPS test_mixed_precision)
    LIST(REMOVE_ITEM TEST_OPS test_fleet_base_single)
    LIST(REMOVE_ITEM TEST_OPS test_dygraph_recompute)
LIST(REMOVE_ITEM TEST_OPS test_parallel_margin_cross_entropy)
elseif(WITH_GPU)
    if (${CUDNN_VERSION} VERSION_LESS 7100)
        LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op)
@@ -906,6 +908,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL)
    set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200)
    set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120)
    set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120)
set_tests_properties(test_parallel_margin_cross_entropy PROPERTIES TIMEOUT 120)
    if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212)
        set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 120)
        set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT 120)
...
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import division
from __future__ import print_function
import unittest
import paddle
import numpy as np
import random
import paddle.distributed as dist
import paddle.fluid as fluid
import paddle.distributed.fleet as fleet
from paddle import framework
def set_random_seed(seed):
"""Set random seed for reproducability."""
random.seed(seed)
np.random.seed(seed)
paddle.seed(seed)
fleet.meta_parallel.model_parallel_random_seed(seed)
class TestParallelMarginSoftmaxCrossEntropyOp(unittest.TestCase):
def setUp(self):
strategy = fleet.DistributedStrategy()
fleet.init(is_collective=True, strategy=strategy)
def test_parallel_margin_softmax_cross_entropy(self):
margin1s = [1.0, 1.0, 1.35]
margin2s = [0.5, 0.0, 0.0]
margin3s = [0.0, 0.35, 0.0]
scales = [64.0, 64.0, 64.0]
rank_id = dist.get_rank()
num_trainer = dist.get_world_size()
batch_size = 2
feature_length = 4
seed = 1025
set_random_seed(seed)
paddle.seed(rank_id * 10)
random.seed(seed)
np.random.seed(seed)
check_group = dist.new_group(list(range(num_trainer)))
for dtype in ('float32', 'float64'):
num_class_per_cards = [[4, 8], [2, 2], [4, 2], [3, 9]]
for num_class_per_card in num_class_per_cards:
num_class = np.sum(num_class_per_card)
for margin1, margin2, margin3, scale in zip(margin1s, margin2s,
margin3s, scales):
for _ in range(5):
np_label = np.random.randint(0, num_class,
(batch_size, ))
label = paddle.to_tensor(np_label, dtype="int64")
input = paddle.randn(
shape=[batch_size, feature_length], dtype=dtype)
input.stop_gradient = False
input_l2 = paddle.sqrt(
paddle.sum(
paddle.square(input), axis=1, keepdim=True))
norm_input = paddle.divide(input, input_l2)
weight = paddle.randn(
shape=[
feature_length, num_class_per_card[rank_id]
],
dtype=dtype)
weight.stop_gradient = False
weight_l2 = paddle.sqrt(
paddle.sum(
paddle.square(weight), axis=0, keepdim=True))
norm_weight = paddle.divide(weight, weight_l2)
data = paddle.matmul(norm_input, norm_weight)
data.stop_gradient = False
sta = np.sum(
num_class_per_card[:rank_id]) if rank_id > 0 else 0
end = np.sum(num_class_per_card[:rank_id + 1])
integral_data = np.zeros(
(batch_size, num_class), dtype=dtype)
integral_data[:, sta:end] = data.clone().detach().numpy(
)
integral_data = paddle.to_tensor(
integral_data, dtype=dtype)
paddle.distributed.all_reduce(
integral_data,
op=paddle.distributed.ReduceOp.SUM,
group=check_group)
integral_data = integral_data.detach().clone()
integral_data.stop_gradient = False
# add arcface margin to logit
theta = paddle.acos(integral_data)
one_hot_label = paddle.nn.functional.one_hot(
label, num_classes=num_class)
one_hot_label.stop_gradient = False
if margin1 != 1.0:
theta = margin1 * theta
if margin2 != 0.0:
theta = theta + margin2
margin_cos = paddle.cos(theta)
if margin3 != 0.0:
margin_cos = margin_cos - margin3
diff = one_hot_label * (margin_cos - integral_data)
arc_data = (integral_data + diff) * scale
loss_a, softmax_a = paddle.nn.functional.margin_cross_entropy(
data,
label,
margin1=margin1,
margin2=margin2,
margin3=margin3,
scale=scale,
group=check_group,
return_softmax=True,
reduction=None)
loss_b, softmax_b = paddle.nn.functional.softmax_with_cross_entropy(
logits=arc_data,
label=paddle.reshape(label, (-1, 1)),
return_softmax=True)
np.testing.assert_allclose(
loss_a.numpy(), loss_b.numpy(), rtol=1e-5)
integral_prob = np.zeros(
(batch_size, num_class), dtype=dtype)
integral_prob[:, sta:end] = softmax_a.clone().detach(
).numpy()
integral_prob = paddle.to_tensor(
integral_prob, dtype=dtype)
paddle.distributed.all_reduce(
integral_prob,
op=paddle.distributed.ReduceOp.SUM,
group=check_group)
integral_prob = integral_prob.detach().clone()
integral_prob.stop_gradient = False
np.testing.assert_allclose(
integral_prob.numpy(),
softmax_b.numpy(),
rtol=1e-5,
atol=1e-6)
loss_a = loss_a.sum() / batch_size
loss_b = loss_b.sum() / batch_size
loss_a.backward()
loss_b.backward()
integral_grad = np.zeros(
(batch_size, num_class), dtype=dtype)
integral_grad[:, sta:end] = data.grad.clone().detach()
integral_grad = paddle.to_tensor(
integral_grad, dtype=dtype)
paddle.distributed.all_reduce(
integral_grad,
op=paddle.distributed.ReduceOp.SUM,
group=check_group)
np.testing.assert_allclose(
integral_data.grad.numpy(),
integral_grad.numpy(),
rtol=1e-5)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
from op_test import OpTest
import math
import random
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid import Program, program_guard
def stable_softmax_comm(x):
    # numerically stable log-softmax: shift by the max, then subtract log-sum-exp
    shiftx = (x - np.max(x))
    deno = np.log(np.sum(np.exp(shiftx)))
    comm = shiftx - deno
    return comm
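# Single-card NumPy reference of the combined margin cross entropy; it produces
# the expected Loss/Softmax used by the OpTest cases below.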
def margin_cross_entropy(logits,
label,
axis,
margin1,
margin2,
margin3,
scale,
reduction=None):
one_hot_label = np.zeros_like(logits, dtype=logits.dtype)
for i, lb in enumerate(label):
one_hot_label[i, lb] = 1.0
# add arcface margin to logit
theta = np.arccos(logits)
if margin1 != 1.0:
theta = margin1 * theta
if margin2 != 0.0:
theta = theta + margin2
margin_cos = np.cos(theta)
if margin3 != 0.0:
margin_cos = margin_cos - margin3
diff = one_hot_label * (margin_cos - logits)
arc_logits = (logits + diff) * scale
comm = np.apply_along_axis(stable_softmax_comm, axis, arc_logits)
loss = (-one_hot_label * comm).sum(axis=axis, keepdims=True)
softmax = np.exp(comm)
if reduction == 'mean':
loss = np.mean(loss)
elif reduction == 'sum':
loss = np.sum(loss)
return loss, softmax
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestMarginCrossEntropyOp(OpTest):
def initParams(self):
self.op_type = "margin_cross_entropy"
self.axis = -1
self.batch_dim = 5
self.feat_dim = 41
self.num_class = 37
def init_loss_params(self):
self.margin1 = 1.0
self.margin2 = 0.5
self.margin3 = 0.0
self.scale = 2.0
def init_dtype(self):
self.dtype = np.float64
def setUp(self):
self.initParams()
self.init_loss_params()
self.init_dtype()
datas = np.random.uniform(
-0.99, 0.99, [self.batch_dim, self.feat_dim]).astype(self.dtype)
datas = datas / np.sqrt(np.sum(np.square(datas), axis=1, keepdims=True))
weights = np.random.uniform(
-0.99, 0.99, [self.feat_dim, self.num_class]).astype(self.dtype)
weights = weights / np.sqrt(
np.sum(np.square(weights), axis=0, keepdims=True))
logits = np.matmul(datas, weights)
labels = np.random.randint(
0, self.num_class, (self.batch_dim, ), dtype="int64")
loss, softmax = margin_cross_entropy(logits, labels, self.axis,
self.margin1, self.margin2,
self.margin3, self.scale)
self.inputs = {"Logits": logits, "Label": labels}
self.outputs = {
"Softmax": softmax.astype(self.dtype),
"Loss": loss.astype(self.dtype)
}
self.attrs = {
'margin1': self.margin1,
'margin2': self.margin2,
'margin3': self.margin3,
'scale': self.scale,
}
def test_check_output(self):
self.check_output_with_place(core.CUDAPlace(0), atol=1e-5)
def test_check_grad(self):
self.check_grad_with_place(core.CUDAPlace(0), ["Logits"], "Loss")
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestMarginCrossEntropyOpFP32(TestMarginCrossEntropyOp):
def init_dtype(self):
self.dtype = np.float32
def test_check_grad(self):
self.check_grad_with_place(
core.CUDAPlace(0), ["Logits"],
"Loss",
numeric_grad_delta=5e-2,
max_relative_error=5e-2)
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestMarginCrossEntropyOpFP16(TestMarginCrossEntropyOp):
def init_dtype(self):
self.dtype = np.float16
def test_check_output(self):
self.check_output_with_place(core.CUDAPlace(0), atol=5e-2)
def test_check_grad(self):
self.check_grad_with_place(
core.CUDAPlace(0), ["Logits"],
"Loss",
numeric_grad_delta=6e-1,
max_relative_error=6e-1)
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestMarginCrossEntropyOpCosFace(TestMarginCrossEntropyOp):
def init_loss_params(self):
self.margin1 = 1.0
self.margin2 = 0.0
self.margin3 = 0.35
self.scale = 2.0
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestMarginCrossEntropyOpSphereFace(TestMarginCrossEntropyOp):
def init_loss_params(self):
self.margin1 = 1.35
self.margin2 = 0.0
self.margin3 = 0.0
self.scale = 2.0
class TestMarginCrossEntropyOpCPU(TestMarginCrossEntropyOp):
def test_check_output(self):
try:
self.check_output_with_place(core.CPUPlace(), atol=1e-5)
except RuntimeError:
pass
def test_check_grad(self):
try:
self.check_grad_with_place(core.CPUPlace(), ["Logits"], "Loss")
except RuntimeError:
pass
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestMarginCrossEntropyOpV2(unittest.TestCase):
def setUp(self):
self.initParams()
np.random.seed(self.seed)
paddle.framework.random._manual_program_seed(self.seed)
self.places = []
if core.is_compiled_with_cuda():
self.places.append(paddle.fluid.CUDAPlace(0))
def initParams(self):
self.seed = 2021
self.axis = -1
self.batch_dim = 5
self.feat_dim = 41
self.num_class = 37
self.init_loss_params()
self.init_dtype()
self.init_reduction()
def init_loss_params(self):
self.margin1 = 1.0
self.margin2 = 0.5
self.margin3 = 0.0
self.scale = 2.0
def init_dtype(self):
self.dtype = np.float64
def init_reduction(self):
self.reduction = None
def test_static(self):
for place in self.places:
self.check_static_result(place=place)
def check_static_result(self, place):
with program_guard(Program(), Program()):
datas = np.random.uniform(
-0.99, 0.99, [self.batch_dim, self.feat_dim]).astype(self.dtype)
datas = datas / np.sqrt(
np.sum(np.square(datas), axis=1, keepdims=True))
weights = np.random.uniform(
-0.99, 0.99, [self.feat_dim, self.num_class]).astype(self.dtype)
weights = weights / np.sqrt(
np.sum(np.square(weights), axis=0, keepdims=True))
logits_np = np.matmul(datas, weights)
labels_np = np.random.randint(
0, self.num_class, (self.batch_dim, ), dtype="int64")
loss_np, softmax_np = margin_cross_entropy(
logits_np, labels_np, self.axis, self.margin1, self.margin2,
self.margin3, self.scale, self.reduction)
logits = paddle.static.data(
name='logits',
shape=[self.batch_dim, self.num_class],
dtype=self.dtype)
label = paddle.static.data(
name='label', shape=[self.batch_dim], dtype="int64")
loss, softmax = paddle.nn.functional.margin_cross_entropy(
logits,
label,
margin1=self.margin1,
margin2=self.margin2,
margin3=self.margin3,
scale=self.scale,
return_softmax=True,
reduction=self.reduction)
exe = paddle.fluid.Executor(place)
[loss_res, softmax_res] = exe.run(
paddle.fluid.default_main_program(),
feed={'logits': logits_np,
'label': labels_np},
fetch_list=[loss, softmax])
np.testing.assert_allclose(loss_res, loss_np)
np.testing.assert_allclose(softmax_res, softmax_np)
def test_dynamic(self):
for place in self.places:
self.check_dynamic_result(place=place)
def check_dynamic_result(self, place):
with paddle.fluid.dygraph.guard(place):
datas = np.random.uniform(
-0.99, 0.99, [self.batch_dim, self.feat_dim]).astype(self.dtype)
datas = datas / np.sqrt(
np.sum(np.square(datas), axis=1, keepdims=True))
weights = np.random.uniform(
-0.99, 0.99, [self.feat_dim, self.num_class]).astype(self.dtype)
weights = weights / np.sqrt(
np.sum(np.square(weights), axis=0, keepdims=True))
logits_np = np.matmul(datas, weights)
labels_np = np.random.randint(
0, self.num_class, (self.batch_dim, ), dtype="int64")
loss_np, softmax_np = margin_cross_entropy(
logits_np, labels_np, self.axis, self.margin1, self.margin2,
self.margin3, self.scale, self.reduction)
logits = paddle.to_tensor(logits_np, dtype=self.dtype)
labels = paddle.to_tensor(labels_np, dtype="int64")
loss, softmax = paddle.nn.functional.margin_cross_entropy(
logits,
labels,
margin1=self.margin1,
margin2=self.margin2,
margin3=self.margin3,
scale=self.scale,
return_softmax=True,
reduction=self.reduction)
loss_res = loss.numpy()
softmax_res = softmax.numpy()
np.testing.assert_allclose(loss_res, loss_np)
np.testing.assert_allclose(softmax_res, softmax_np)
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestMarginCrossEntropyOpV3(TestMarginCrossEntropyOpV2):
def init_reduction(self):
self.reduction = 'mean'
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestMarginCrossEntropyOpV4(TestMarginCrossEntropyOpV2):
def init_reduction(self):
self.reduction = 'sum'
@unittest.skipIf(not core.is_compiled_with_cuda(),
"core is not compiled with CUDA")
class TestMarginCrossEntropyOpAPIError(unittest.TestCase):
def setUp(self):
self.initParams()
np.random.seed(self.seed)
paddle.framework.random._manual_program_seed(self.seed)
self.places = []
if core.is_compiled_with_cuda():
self.places.append(paddle.fluid.CUDAPlace(0))
def initParams(self):
self.seed = 2021
self.axis = -1
self.batch_dim = 10
self.feat_dim = 41
self.num_class = 37
self.init_loss_params()
self.init_dtype()
def init_loss_params(self):
self.margin1 = 1.0
self.margin2 = 0.5
self.margin3 = 0.0
self.scale = 2.0
def init_dtype(self):
self.dtype = np.float64
def test_dynamic_errors(self):
def test_dim():
for place in self.places:
with paddle.fluid.dygraph.guard(place):
labels_np = np.random.randint(
0, self.num_class, (self.batch_dim, 2), dtype="int64")
logits_np = np.random.uniform(
-0.99, 0.99,
[self.batch_dim, self.num_class]).astype(self.dtype)
labels = paddle.to_tensor(labels_np)
logits = paddle.to_tensor(logits_np)
loss, softmax = paddle.nn.functional.margin_cross_entropy(
logits,
labels,
margin1=self.margin1,
margin2=self.margin2,
margin3=self.margin3,
scale=self.scale,
return_softmax=True,
reduction=None)
self.assertRaises(ValueError, test_dim)
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import paddle.fluid as fluid
from test_parallel_dygraph_dataparallel import TestMultipleGpus
class TestParallelMarginSoftmaxWithCrossEntropy(TestMultipleGpus):
def test_parallel_margin_cross_entropy(self):
self.run_mnist_2gpu('parallel_margin_cross_entropy.py')
if __name__ == "__main__":
unittest.main()
@@ -79,6 +79,7 @@ from .loss import npair_loss  # noqa: F401
from .loss import sigmoid_focal_loss  # noqa: F401
from .loss import smooth_l1_loss  # noqa: F401
from .loss import softmax_with_cross_entropy  # noqa: F401
from .loss import margin_cross_entropy # noqa: F401
from .loss import square_error_cost  # noqa: F401
from .loss import ctc_loss  # noqa: F401
from .norm import batch_norm  # noqa: F401
@@ -185,6 +186,7 @@ __all__ = [  #noqa
    'sigmoid_focal_loss',
    'smooth_l1_loss',
    'softmax_with_cross_entropy',
'margin_cross_entropy',
    'square_error_cost',
    'ctc_loss',
    'affine_grid',
...
@@ -1092,6 +1092,268 @@ def ctc_loss(log_probs,
    return loss_out
def margin_cross_entropy(logits,
label,
margin1=1.0,
margin2=0.5,
margin3=0.0,
scale=64.0,
group=None,
return_softmax=False,
reduction='mean'):
"""
.. math::
        L=-\\frac{1}{N}\\sum^N_{i=1}\\log\\frac{e^{s(\\cos(m_{1}\\theta_{y_i}+m_{2})-m_{3})}}{e^{s(\\cos(m_{1}\\theta_{y_i}+m_{2})-m_{3})}+\\sum^n_{j=1,j\\neq y_i} e^{s\\cos\\theta_{j}}}
    where :math:`\\theta_{y_i}` is the angle between the feature :math:`x_i` and
    the weight representing class :math:`y_i`. The details of ArcFace loss
    could be referred to https://arxiv.org/abs/1801.07698.
    .. hint::
        The API supports both model parallel and single GPU, and logits.shape[-1] can be different on each rank.
Args:
        logits (Tensor): shape[N, local_num_classes], the output of the normalized X multiplied
            by the normalized W. The logits are the shard logits when using model parallel.
        label (Tensor): shape[N] or shape[N, 1], the ground truth label.
margin1 (float, optional): m1 of margin loss, default value is `1.0`.
margin2 (float, optional): m2 of margin loss, default value is `0.5`.
margin3 (float, optional): m3 of margin loss, default value is `0.0`.
scale (float, optional): s of margin loss, default value is `64.0`.
group (Group, optional): The abstract representation of group, see paddle.distributed.collective.Group.
Default `None`.
return_softmax (bool, optional): Whether return softmax probability. Default value is `False`.
        reduction (str, optional): The candidates are ``'none'`` | ``'mean'`` | ``'sum'``.
If :attr:`reduction` is ``'mean'``, return the average of loss;
If :attr:`reduction` is ``'sum'``, return the sum of loss;
If :attr:`reduction` is ``'none'``, no reduction will be applied.
Default value is `'mean'`.
Returns:
        ``Tensor`` or Tuple of two ``Tensor`` : Return the cross entropy loss if \
            `return_softmax` is False, otherwise the tuple \
            (loss, softmax). softmax is the shard softmax when \
            using model parallel, otherwise it has the \
            same shape as the input logits. If ``reduction == None``, \
            the shape of loss is ``[N, 1]``, otherwise the shape is ``[1]``.
Examples:
.. code-block:: python
# required: gpu
# Single GPU
import paddle
m1 = 1.0
m2 = 0.5
m3 = 0.0
s = 64.0
batch_size = 2
feature_length = 4
num_classes = 4
label = paddle.randint(low=0, high=num_classes, shape=[batch_size], dtype='int64')
X = paddle.randn(
shape=[batch_size, feature_length],
dtype='float64')
X_l2 = paddle.sqrt(paddle.sum(paddle.square(X), axis=1, keepdim=True))
X = paddle.divide(X, X_l2)
W = paddle.randn(
shape=[feature_length, num_classes],
dtype='float64')
W_l2 = paddle.sqrt(paddle.sum(paddle.square(W), axis=0, keepdim=True))
W = paddle.divide(W, W_l2)
logits = paddle.matmul(X, W)
loss, softmax = paddle.nn.functional.margin_cross_entropy(
logits, label, margin1=m1, margin2=m2, margin3=m3, scale=s, return_softmax=True, reduction=None)
print(logits)
print(label)
print(loss)
print(softmax)
#Tensor(shape=[2, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[ 0.85204151, -0.55557678, 0.04994566, 0.71986042],
# [-0.20198586, -0.35270476, -0.55182702, 0.09749021]])
#Tensor(shape=[2], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [2, 3])
#Tensor(shape=[2, 1], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[82.37059586],
# [12.13448420]])
#Tensor(shape=[2, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[0.99978819, 0.00000000, 0.00000000, 0.00021181],
# [0.99992995, 0.00006468, 0.00000000, 0.00000537]])
.. code-block:: python
# required: distributed
# Multi GPU, test_margin_cross_entropy.py
import paddle
import paddle.distributed as dist
strategy = dist.fleet.DistributedStrategy()
dist.fleet.init(is_collective=True, strategy=strategy)
rank_id = dist.get_rank()
m1 = 1.0
m2 = 0.5
m3 = 0.0
s = 64.0
batch_size = 2
feature_length = 4
num_class_per_card = [4, 8]
num_classes = paddle.sum(paddle.to_tensor(num_class_per_card))
label = paddle.randint(low=0, high=num_classes.item(), shape=[batch_size], dtype='int64')
label_list = []
dist.all_gather(label_list, label)
label = paddle.concat(label_list, axis=0)
X = paddle.randn(
shape=[batch_size, feature_length],
dtype='float64')
X_list = []
dist.all_gather(X_list, X)
X = paddle.concat(X_list, axis=0)
X_l2 = paddle.sqrt(paddle.sum(paddle.square(X), axis=1, keepdim=True))
X = paddle.divide(X, X_l2)
W = paddle.randn(
shape=[feature_length, num_class_per_card[rank_id]],
dtype='float64')
W_l2 = paddle.sqrt(paddle.sum(paddle.square(W), axis=0, keepdim=True))
W = paddle.divide(W, W_l2)
logits = paddle.matmul(X, W)
loss, softmax = paddle.nn.functional.margin_cross_entropy(
logits, label, margin1=m1, margin2=m2, margin3=m3, scale=s, return_softmax=True, reduction=None)
print(logits)
print(label)
print(loss)
print(softmax)
# python -m paddle.distributed.launch --gpus=0,1 test_margin_cross_entropy.py
## for rank0 input
#Tensor(shape=[4, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[ 0.32888934, 0.02408748, -0.02763289, 0.18173063],
# [-0.52893978, -0.10623845, -0.21596515, -0.06432517],
# [-0.00536345, -0.03924667, 0.66735314, -0.28640926],
# [-0.09907366, -0.48534973, -0.10365338, -0.39472322]])
#Tensor(shape=[4], dtype=int64, place=CUDAPlace(0), stop_gradient=True,
# [11, 1 , 10, 11])
## for rank1 input
#Tensor(shape=[4, 8], dtype=float64, place=CUDAPlace(1), stop_gradient=True,
# [[ 0.68654754, 0.28137170, 0.69694954, -0.60923933, -0.57077653, 0.54576703, -0.38709028, 0.56028204],
# [-0.80360371, -0.03042448, -0.45107338, 0.49559349, 0.69998950, -0.45411693, 0.61927630, -0.82808600],
# [ 0.11457570, -0.34785879, -0.68819499, -0.26189226, -0.48241491, -0.67685711, 0.06510185, 0.49660849],
# [ 0.31604851, 0.52087884, 0.53124749, -0.86176582, -0.43426329, 0.34786144, -0.10850784, 0.51566383]])
#Tensor(shape=[4], dtype=int64, place=CUDAPlace(1), stop_gradient=True,
# [11, 1 , 10, 11])
## for rank0 output
#Tensor(shape=[4, 1], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[38.96608230],
# [81.28152394],
# [69.67229865],
# [31.74197251]])
#Tensor(shape=[4, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True,
# [[0.00000000, 0.00000000, 0.00000000, 0.00000000],
# [0.00000000, 0.00000000, 0.00000000, 0.00000000],
# [0.00000000, 0.00000000, 0.99998205, 0.00000000],
# [0.00000000, 0.00000000, 0.00000000, 0.00000000]])
## for rank1 output
#Tensor(shape=[4, 1], dtype=float64, place=CUDAPlace(1), stop_gradient=True,
# [[38.96608230],
# [81.28152394],
# [69.67229865],
# [31.74197251]])
#Tensor(shape=[4, 8], dtype=float64, place=CUDAPlace(1), stop_gradient=True,
# [[0.33943993, 0.00000000, 0.66051859, 0.00000000, 0.00000000, 0.00004148, 0.00000000, 0.00000000],
# [0.00000000, 0.00000000, 0.00000000, 0.00000207, 0.99432097, 0.00000000, 0.00567696, 0.00000000],
# [0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00001795],
# [0.00000069, 0.33993085, 0.66006319, 0.00000000, 0.00000000, 0.00000528, 0.00000000, 0.00000000]])
"""
assert reduction in ['mean', 'sum', 'none', None]
if group is not None and not group.is_member():
return
ring_id = 0 if group is None else group.id
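    # Work out this card's rank and the number of cards taking part; with a
    # single card the op reduces to ordinary margin softmax cross entropy.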
rank = 0
nranks = 1
if core.is_compiled_with_dist():
parallel_env = paddle.distributed.ParallelEnv()
global_rank = parallel_env.rank
rank = global_rank if group is None else group.get_group_rank(
global_rank)
nranks = parallel_env.world_size if group is None else group.nranks
input_dims = len(list(logits.shape))
label_dims = len(list(label.shape))
if input_dims - 1 != label_dims and input_dims != label_dims:
raise ValueError(
            'Expected input_dims - 1 == label_dims or input_dims == label_dims '
            '(got input_dims {}, label_dims {})'.format(input_dims, label_dims))
if input_dims - 1 == label_dims:
label = paddle.unsqueeze(label, axis=-1)
if in_dygraph_mode():
softmax, loss = core.ops.margin_cross_entropy(
logits, label, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks,
'margin1', margin1, 'margin2', margin2, 'margin3', margin3, 'scale',
scale, 'return_softmax', return_softmax)
if reduction == 'mean':
loss = paddle.mean(loss)
elif reduction == 'sum':
loss = paddle.sum(loss)
if not return_softmax:
return loss
else:
return loss, softmax
op_type = 'margin_cross_entropy'
helper = LayerHelper(op_type, **locals())
softmax = helper.create_variable_for_type_inference(dtype=logits.dtype)
loss = helper.create_variable_for_type_inference(dtype=logits.dtype)
check_variable_and_dtype(logits, 'logits',
['float16', 'float32', 'float64'],
'margin_cross_entropy')
check_variable_and_dtype(label, 'label', ['int32', 'int64'],
'margin_cross_entropy')
helper.append_op(
type=op_type,
inputs={'Logits': logits,
'Label': label},
outputs={'Softmax': softmax,
'Loss': loss},
attrs={
'return_softmax': return_softmax,
'ring_id': ring_id,
'rank': rank,
'nranks': nranks,
'margin1': margin1,
'margin2': margin2,
'margin3': margin3,
'scale': scale,
})
if reduction == 'mean':
loss = paddle.mean(loss)
elif reduction == 'sum':
loss = paddle.sum(loss)
if not return_softmax:
return loss
else:
return loss, softmax
@deprecated(
    since="2.0.0",
    update_to="paddle.nn.functional.cross_entropy",
...
@@ -719,4 +719,5 @@ STATIC_MODE_TESTING_LIST = [
    'test_sgd_op_bf16',
    'test_marker_op',
    'test_c_embedding_op',
'test_margin_cross_entropy_op',
]