diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cc b/paddle/fluid/operators/margin_cross_entropy_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..94c58fa6970d902689680ca5552ea3d126c424c6 --- /dev/null +++ b/paddle/fluid/operators/margin_cross_entropy_op.cc @@ -0,0 +1,203 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/margin_cross_entropy_op.h" + +namespace paddle { +namespace operators { + +class MarginCrossEntropyOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Logits"), "Input", "Logits", + "MarginCrossEntropyOp"); + OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", + "MarginCrossEntropyOp"); + + OP_INOUT_CHECK(ctx->HasOutput("Softmax"), "Output", "Softmax", + "MarginCrossEntropyOp"); + OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss", + "MarginCrossEntropyOp"); + + auto logits_dims = ctx->GetInputDim("Logits"); + auto labels_dims = ctx->GetInputDim("Label"); + + auto logits_rank = logits_dims.size(); + auto axis = logits_rank - 1; + for (int i = 0; i < logits_rank; i++) { + if (i != axis) { + if (ctx->IsRuntime() || (logits_dims[i] > 0 && labels_dims[i] > 0)) { + PADDLE_ENFORCE_EQ(logits_dims[i], labels_dims[i], + platform::errors::InvalidArgument( + "Input(Logits) and Input(Label) should in " + "same shape in dimensions except axis.")); + } + } + } + + if (labels_dims.size() > 1) { + PADDLE_ENFORCE_EQ( + labels_dims[logits_rank - 1], 1UL, + platform::errors::InvalidArgument( + "the last dimension of Input(Label) should be 1." + "But received: the last dimension of Input(Label) is [%d]," + "the last dimension is [%d]", + labels_dims[logits_rank - 1], logits_rank - 1)); + } + + ctx->SetOutputDim("Softmax", logits_dims); + + logits_dims[axis] = 1; + ctx->SetOutputDim("Loss", logits_dims); + + ctx->ShareLoD("Logits", /*->*/ "Softmax"); + ctx->ShareLoD("Logits", /*->*/ "Loss"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Logits"), + ctx.device_context()); + } +}; + +class MarginCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("Logits", + "(Tensor, default: Tensor), The input tensor of unscaled " + "log probabilities, whose dimension :attr:`axis` should be scaled " + "by softmax."); + AddInput( + "Label", + "(Tensor) The input tensor of groud truth label. Label is a " + "Tensor in same shape with Input(Logits) except the shape in " + "dimension :attr:`axis` as 1."); + AddOutput( + "Softmax", + "(Tensor, default: Tensor), A tensor in same shape with " + "Input(Logits). 
" + "The outputs value of softmax activation by given the input batch, " + "which will be used in backward calculation."); + AddOutput("Loss", + "(Tensor, default: Tensor), A tensor in same shape with " + "Input(Logits) " + "except the shape in dimension :attr:`axis` as 1. The cross " + "entropy loss."); + AddAttr("return_softmax", + "(bool default false) A flag to indicate " + "whether to return softmax.") + .SetDefault(false); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr("rank", "(int default 0) rank id for MarginCrossEntropy.") + .SetDefault(0); + AddAttr("nranks", "(int default 1) nranks id for MarginCrossEntropy.") + .SetDefault(1); + AddAttr("margin1", "(float default 1.0) margin1 for MarginLoss.") + .SetDefault(1.0); + AddAttr("margin2", "(float default 0.5) margin2 for MarginLoss.") + .SetDefault(0.5); + AddAttr("margin3", "(float default 0.0) margin3 for MarginLoss.") + .SetDefault(0.0); + AddAttr("scale", "(float default 64.0) scale for MarginLoss.") + .SetDefault(64.0); + AddComment(R"DOC( +MarginCrossEntropy Operator +.. math:: + + L=-\frac{1}{N}\sum^N_{i=1}\log\frac{e^{s(cos(m_{1}\theta_{y_i}+m_{2})-m_{3})}}{e^{s(cos(m_{1}\theta_{y_i}+m_{2})-m_{3})}+\sum^n_{j=1,j\neq y_i} e^{scos\theta_{y_i}}} + +where the :math: `\theta_{y_i}` is the angle between the feature :math: `x` and +the representation of class :math: `i`. The details of ArcFace loss +could be referred to https://arxiv.org/abs/1801.07698. + +Note that the Op supports model parallel and single GPU. And Logits.shape[-1] can be different each rank. + +)DOC"); + } +}; + +class MarginCrossEntropyOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Loss")), true, + platform::errors::InvalidArgument( + "Input(Loss@Grad) should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Softmax"), true, + platform::errors::InvalidArgument( + "Input(Softmax) should be not null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Label"), true, + platform::errors::InvalidArgument("Input(Label) should be not null.")); + + PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Logits")), true, + platform::errors::InvalidArgument( + "Output(Logits@Grad) should be not null.")); + + ctx->SetOutputDim(framework::GradVarName("Logits"), + ctx->GetInputDim("Softmax")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Loss")), + ctx.device_context()); + } +}; + +template +class MarginCrossEntropyOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("margin_cross_entropy_grad"); + + op->SetInput("Softmax", this->Output("Softmax")); + op->SetInput("Logits", this->Input("Logits")); + op->SetInput("Label", this->Input("Label")); + op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss")); + op->SetAttrMap(this->Attrs()); + op->SetOutput(framework::GradVarName("Logits"), this->InputGrad("Logits")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR( + margin_cross_entropy, 
ops::MarginCrossEntropyOp, + ops::MarginCrossEntropyOpMaker, + ops::MarginCrossEntropyOpGradMaker, + ops::MarginCrossEntropyOpGradMaker); + +REGISTER_OPERATOR(margin_cross_entropy_grad, ops::MarginCrossEntropyOpGrad); + +REGISTER_OP_CPU_KERNEL(margin_cross_entropy, + ops::MarginCrossEntropyOpCPUKernel, + ops::MarginCrossEntropyOpCPUKernel, + ops::MarginCrossEntropyOpCPUKernel); diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ccdba43b0542dc82975d4c9a19af56304dc0173a --- /dev/null +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -0,0 +1,483 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_HIP +#include +namespace cub = hipcub; +#else +#include +#endif + +#include +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/margin_cross_entropy_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/softmax_impl.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include "paddle/fluid/string/string_helper.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +void GetClassInterval(const gpuStream_t& stream, const platform::Place& place, + const platform::DeviceContext& ctx, const int rid, + const int rank, const int nranks, const int D, + Tensor* class_interval) { + std::vector shard_dim_vec(nranks + 1, 0); + shard_dim_vec[rank + 1] = D; + if (nranks <= 1) { + framework::TensorFromVector(shard_dim_vec, ctx, class_interval); + return; + } + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + Tensor num_classes_per_device; + framework::TensorFromVector(shard_dim_vec, ctx, &num_classes_per_device); + int* num_classes_per_device_ptr = num_classes_per_device.data(); + + const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place); + // use global calculate stream + const auto calcu_stream = + static_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + num_classes_per_device_ptr, num_classes_per_device_ptr, + num_classes_per_device.numel(), + platform::ToNCCLDataType(num_classes_per_device.type()), ncclSum, + comm->comm(), calcu_stream)); + + auto class_interval_ptr = + class_interval->mutable_data({nranks + 1}, place); + size_t cub_temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum( + 
nullptr, cub_temp_storage_bytes, nullptr, nullptr, nranks + 1, stream); + auto cub_temp_storage = memory::Alloc(place, cub_temp_storage_bytes); + cub::DeviceScan::InclusiveSum( + cub_temp_storage->ptr(), cub_temp_storage_bytes, + num_classes_per_device_ptr, class_interval_ptr, nranks + 1, stream); + return; +#endif +} + +template +__global__ void AddMarginToPositiveLogitsKernel( + T* logit, const IndexT* label, const float margin1, const float margin2, + const float margin3, const int rank, const int nranks, const int64_t N, + const int64_t D, const int* class_interval_ptr) { + using MPType = typename details::MPTypeTrait::Type; + int start_index = class_interval_ptr[rank]; + int end_index = class_interval_ptr[rank + 1]; + int num_classes = class_interval_ptr[nranks]; + CUDA_KERNEL_LOOP(i, N) { + auto real_label = label[i]; + PADDLE_ENFORCE((real_label < num_classes) && (real_label >= 0), + "The index is out of bounds, " + "please check whether the value of label and " + "input meet the number of class. It should " + "be less than [%d], but received [%d]", + num_classes, real_label); + + if (real_label >= start_index && real_label < end_index) { + int64_t offset = i * D + real_label - start_index; + if (fabs(margin1 - 1.0) > 1e-8 || fabs(margin2) > 1e-8) { + MPType x = static_cast(logit[offset]); + MPType theta = acos(x); + if (fabs(margin1 - 1.0) > 1e-8) { + theta *= static_cast(margin1); + } + if (fabs(margin2) > 1e-8) { + theta += static_cast(margin2); + } + logit[offset] = static_cast(cos(theta)); + } + if (fabs(margin3) > 1e-8) { + MPType y = static_cast(logit[offset]); + y -= static_cast(margin3); + logit[offset] = static_cast(y); + } + } + } +} + +static __device__ __forceinline__ platform::float16 exp_on_device( + platform::float16 x) { + return ::Eigen::numext::exp(x); +} +static __device__ __forceinline__ float exp_on_device(float x) { + return expf(x); +} +static __device__ __forceinline__ double exp_on_device(double x) { + return exp(x); +} +static __device__ __forceinline__ platform::float16 log_on_device( + platform::float16 x) { + return ::Eigen::numext::log(x); +} +static __device__ __forceinline__ float log_on_device(float x) { + return logf(x); +} +static __device__ __forceinline__ double log_on_device(double x) { + return log(x); +} + +template +struct ExpLogitTransformer { + HOSTDEVICE explicit inline ExpLogitTransformer(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(exp_on_device(x)); + } +}; + +template +struct ExpAndSum { + using Transformer = ExpLogitTransformer; + + inline Ty initial() { return static_cast(0.0f); } + + __device__ __forceinline__ Ty operator()(const Ty& a, const Ty& b) const { + return b + a; + } +}; + +template +__global__ void ScaleLogitKernel(T* logits, const float scale, const int64_t N, + const int64_t D) { + CUDA_KERNEL_LOOP(i, N * D) { logits[i] *= static_cast(scale); } +} + +template +__global__ void LogitsMinusMaxKernel(T* logits, const T* logits_max_per_row, + const int64_t N, const int64_t D) { + CUDA_KERNEL_LOOP(i, N * D) { + auto row = i / D; + logits[i] -= logits_max_per_row[row]; + } +} + +template +__global__ void LogitsMinusLogSumKernel(T* logits, const T* logits_sum_per_row, + const int64_t N, const int64_t D) { + CUDA_KERNEL_LOOP(i, N * D) { + auto row = i / D; + logits[i] -= log_on_device(logits_sum_per_row[row]); + } +} + +template +__global__ void HardLabelSoftmaxWithCrossEntropyKernel( + T* loss, T* log_softmax, const IndexT* labels, const int rank, + const int64_t N, const int64_t D, 
const int* class_interval_ptr) { + int start_index = class_interval_ptr[rank]; + CUDA_KERNEL_LOOP(i, N * D) { + auto row = i / D; + auto col = i % D; + if ((col + start_index) == labels[row]) { + auto softmax = log_softmax[i]; + loss[row] = -softmax; + log_softmax[i] = exp_on_device(softmax); + } else { + log_softmax[i] = exp_on_device(log_softmax[i]); + } + } +} + +template +__global__ void CalculateGrad(T* logits_grad, const T* loss_grad, + const T* logits, const IndexT* labels, + const float margin1, const float margin2, + const float scale, const int rank, + const int64_t N, const int64_t D, + const int* class_interval_ptr) { + using MPType = typename details::MPTypeTrait::Type; + int start_index = class_interval_ptr[rank]; + CUDA_KERNEL_LOOP(i, N * D) { + auto row = i / D; + auto col = i % D; + if ((col + start_index) == labels[row]) { + logits_grad[i] = (logits_grad[i] - static_cast(1.0)) * loss_grad[row]; + if (fabs(margin1 - 1.0) > 1e-8 || fabs(margin2) > 1e-8) { + MPType dout = static_cast(logits_grad[i]); + MPType one = static_cast(1.0f); + MPType x = static_cast(logits[i]); + MPType m1 = static_cast(margin1); + MPType m2 = static_cast(margin2); + + MPType d = m1 * sin(m1 * acos(x) + m2) / sqrt(one - x * x); + logits_grad[i] = static_cast(dout * d); + } + } else { + logits_grad[i] *= loss_grad[row]; + } + if (fabs(scale - 1.0) > 1e-8) { + logits_grad[i] *= static_cast(scale); + } + } +} + +template +class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* logits = ctx.Input("Logits"); + const Tensor* labels = ctx.Input("Label"); + Tensor* softmax = ctx.Output("Softmax"); + Tensor* loss = ctx.Output("Loss"); + + const int rid = ctx.Attr("ring_id"); + const int nranks = ctx.Attr("nranks"); + const int rank = ctx.Attr("rank"); + + const float margin1 = ctx.Attr("margin1"); + const float margin2 = ctx.Attr("margin2"); + const float margin3 = ctx.Attr("margin3"); + const float scale = ctx.Attr("scale"); + + const auto& place = ctx.GetPlace(); + auto& dev_ctx = ctx.template device_context(); + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + platform::NCCLComm* comm; + gpuStream_t stream; + if (nranks > 1) { + comm = platform::NCCLCommContext::Instance().Get(rid, place); + + // use global calculate stream + stream = static_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + } +#endif + + // allocate memory on device. 
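+    // Softmax keeps the full (per-rank) probabilities that the backward pass reuses;
+    // Loss holds one scalar per sample. The logits are copied into Softmax below and
+    // transformed in place, so the original Logits tensor is never modified.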
+ T* softmax_ptr = softmax->mutable_data(place); + T* loss_ptr = loss->mutable_data(place); + + const auto& logits_dims = logits->dims(); + const auto& labels_dims = labels->dims(); + + const int axis = logits_dims.size() - 1; + const int N = SizeToAxis(axis, logits_dims); + const int D = SizeFromAxis(axis, logits_dims); + + int blocks = NumBlocks(N); + int threads = kNumCUDAThreads; + const auto& label_type = labels->type(); + + // copy logits to softmax variable since we can't modify logits, + // and it also be used when calculate grad + framework::TensorCopy(*logits, ctx.GetPlace(), ctx.device_context(), + softmax); + + Tensor softmax_2d; + softmax_2d.ShareDataWith(*softmax).Resize({N, D}); + T* logits_ptr = softmax_2d.data(); + + Tensor class_interval; + GetClassInterval(dev_ctx.stream(), place, ctx.cuda_device_context(), rid, + rank, nranks, D, &class_interval); + + // step 1, preprocess logits + // add margin for positive elements + // theta = acos(x_i) + // (cos(m1 * theta + m2) - m3) + // save match_logits, used for gradient computation. + if (label_type == framework::proto::VarType::INT32) { + typedef int32_t LabelT; + AddMarginToPositiveLogitsKernel< + T><<>>( + logits_ptr, labels->data(), margin1, margin2, margin3, rank, + nranks, N, D, class_interval.data()); + } else if (label_type == framework::proto::VarType::INT64) { + typedef int64_t LabelT; + AddMarginToPositiveLogitsKernel< + T><<>>( + logits_ptr, labels->data(), margin1, margin2, margin3, rank, + nranks, N, D, class_interval.data()); + } + + // scale by s + ScaleLogitKernel<<>>( + logits_ptr, scale, N, D); + + // step 2, obtain logit_max + Tensor logits_max; + logits_max = + ctx.AllocateTmpTensor({N, 1}, dev_ctx); + T* logits_max_buff = logits_max.mutable_data(place); + TensorReduceFunctorImpl(softmax_2d, &logits_max, {1}, + dev_ctx.stream()); + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (nranks > 1) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + logits_max_buff, logits_max_buff, logits_max.numel(), + platform::ToNCCLDataType(logits_max.type()), ncclMax, comm->comm(), + stream)); + } +#endif + + // step 3, logit - logit_max + LogitsMinusMaxKernel<<>>( + logits_ptr, logits_max_buff, N, D); + + // step 4, sum(exp(logit - logit_max)) + Tensor sum_exp_logits; + sum_exp_logits = + ctx.AllocateTmpTensor({N, 1}, dev_ctx); + T* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); + TensorReduceFunctorImpl(softmax_2d, &sum_exp_logits, {1}, + dev_ctx.stream()); + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (nranks > 1) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(), + platform::ToNCCLDataType(sum_exp_logits.type()), ncclSum, + comm->comm(), stream)); + } +#endif + + // step 5, (logit - logit_max) - log(sum(exp(logit - logit_max))) + LogitsMinusLogSumKernel< + T><<>>( + logits_ptr, sum_exp_logits_buff, N, D); + + // step 6, prob = exp((logit - logit_max) - log(sum(exp(logit - + // logit_max)))) + // loss = -((logit_i - logit_max) - log(sum(exp(logit - logit_max)))) + math::SetConstant()(dev_ctx, loss, + static_cast(0.0)); + if (label_type == framework::proto::VarType::INT32) { + typedef int32_t LabelT; + HardLabelSoftmaxWithCrossEntropyKernel< + T, LabelT><<>>( + loss_ptr, logits_ptr, labels->data(), rank, N, D, + class_interval.data()); + } else if (label_type == framework::proto::VarType::INT64) { + typedef int64_t LabelT; + HardLabelSoftmaxWithCrossEntropyKernel< + T, 
LabelT><<>>( + loss_ptr, logits_ptr, labels->data(), rank, N, D, + class_interval.data()); + } + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (nranks > 1) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + loss_ptr, loss_ptr, loss->numel(), + platform::ToNCCLDataType(loss->type()), ncclSum, comm->comm(), + stream)); + } +#endif + } +}; + +template +class MarginCrossEntropyGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* labels = context.Input("Label"); + const Tensor* logits = context.Input("Logits"); + const Tensor* softmax = context.Input("Softmax"); + + const Tensor* loss_grad = + context.Input(framework::GradVarName("Loss")); + Tensor* logit_grad = + context.Output(framework::GradVarName("Logits")); + + const bool return_softmax = context.Attr("return_softmax"); + + const int rid = context.Attr("ring_id"); + const int nranks = context.Attr("nranks"); + const int rank = context.Attr("rank"); + + const float margin1 = context.Attr("margin1"); + const float margin2 = context.Attr("margin2"); + const float margin3 = context.Attr("margin3"); + const float scale = context.Attr("scale"); + + auto& dev_ctx = + context.template device_context(); + + const auto sofrmax_dims = softmax->dims(); + const int axis = sofrmax_dims.size() - 1; + const int N = SizeToAxis(axis, sofrmax_dims); + const int D = SizeFromAxis(axis, sofrmax_dims); + + if (return_softmax) { + framework::TensorCopy(*softmax, context.GetPlace(), + context.device_context(), logit_grad); + } else { + logit_grad->ShareDataWith(*softmax); + } + + int blocks = NumBlocks(N * D); + int threads = kNumCUDAThreads; + const auto& label_type = labels->type(); + + Tensor class_interval; + GetClassInterval(dev_ctx.stream(), context.GetPlace(), + context.cuda_device_context(), rid, rank, nranks, D, + &class_interval); + + if (label_type == framework::proto::VarType::INT32) { + typedef int32_t LabelT; + CalculateGrad<<>>( + logit_grad->data(), loss_grad->data(), logits->data(), + labels->data(), margin1, margin2, scale, rank, N, D, + class_interval.data()); + } else if (label_type == framework::proto::VarType::INT64) { + typedef int64_t LabelT; + CalculateGrad<<>>( + logit_grad->data(), loss_grad->data(), logits->data(), + labels->data(), margin1, margin2, scale, rank, N, D, + class_interval.data()); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(margin_cross_entropy, + ops::MarginCrossEntropyOpCUDAKernel, + ops::MarginCrossEntropyOpCUDAKernel, + ops::MarginCrossEntropyOpCUDAKernel); + +REGISTER_OP_CUDA_KERNEL(margin_cross_entropy_grad, + ops::MarginCrossEntropyGradCUDAKernel, + ops::MarginCrossEntropyGradCUDAKernel, + ops::MarginCrossEntropyGradCUDAKernel); diff --git a/paddle/fluid/operators/margin_cross_entropy_op.h b/paddle/fluid/operators/margin_cross_entropy_op.h new file mode 100644 index 0000000000000000000000000000000000000000..fe0dab5d47d35a56e1806ecb2c47e9cfc8197cd0 --- /dev/null +++ b/paddle/fluid/operators/margin_cross_entropy_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/softmax_op.h" + +namespace paddle { +namespace operators { + +template +class MarginCrossEntropyOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support margin_cross_entropy for cpu kernel " + "now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 007221ca4f9ca30b93cf3661889d9244a1c8ade4..9d8b5fb699e33aa0220e3bd9e3d0d0de482b5589 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -28,6 +28,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_pipeline_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_tensor_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sharding_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers) +list(APPEND DIST_TEST_OPS test_parallel_margin_cross_entropy) set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS}) #remove distribute unittests. list(APPEND MIXED_DIST_TEST_OPS test_dgc_op) @@ -195,6 +196,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_mixed_precision) LIST(REMOVE_ITEM TEST_OPS test_fleet_base_single) LIST(REMOVE_ITEM TEST_OPS test_dygraph_recompute) + LIST(REMOVE_ITEM TEST_OPS test_parallel_margin_cross_entropy) elseif(WITH_GPU) if (${CUDNN_VERSION} VERSION_LESS 7100) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) @@ -906,6 +908,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_margin_cross_entropy PROPERTIES TIMEOUT 120) if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212) set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py b/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py new file mode 100644 index 0000000000000000000000000000000000000000..475a26ee94f37264dd9da0d7dbff9134d98d223b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py @@ -0,0 +1,188 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import unittest + +import paddle +import numpy as np +import random +import paddle.distributed as dist +import paddle.fluid as fluid +import paddle.distributed.fleet as fleet +from paddle import framework + + +def set_random_seed(seed): + """Set random seed for reproducability.""" + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) + fleet.meta_parallel.model_parallel_random_seed(seed) + + +class TestParallelMarginSoftmaxCrossEntropyOp(unittest.TestCase): + def setUp(self): + strategy = fleet.DistributedStrategy() + fleet.init(is_collective=True, strategy=strategy) + + def test_parallel_margin_softmax_cross_entropy(self): + margin1s = [1.0, 1.0, 1.35] + margin2s = [0.5, 0.0, 0.0] + margin3s = [0.0, 0.35, 0.0] + scales = [64.0, 64.0, 64.0] + + rank_id = dist.get_rank() + num_trainer = dist.get_world_size() + batch_size = 2 + feature_length = 4 + seed = 1025 + set_random_seed(seed) + paddle.seed(rank_id * 10) + random.seed(seed) + np.random.seed(seed) + + check_group = dist.new_group(list(range(num_trainer))) + for dtype in ('float32', 'float64'): + + num_class_per_cards = [[4, 8], [2, 2], [4, 2], [3, 9]] + for num_class_per_card in num_class_per_cards: + + num_class = np.sum(num_class_per_card) + for margin1, margin2, margin3, scale in zip(margin1s, margin2s, + margin3s, scales): + + for _ in range(5): + np_label = np.random.randint(0, num_class, + (batch_size, )) + label = paddle.to_tensor(np_label, dtype="int64") + + input = paddle.randn( + shape=[batch_size, feature_length], dtype=dtype) + input.stop_gradient = False + input_l2 = paddle.sqrt( + paddle.sum( + paddle.square(input), axis=1, keepdim=True)) + norm_input = paddle.divide(input, input_l2) + + weight = paddle.randn( + shape=[ + feature_length, num_class_per_card[rank_id] + ], + dtype=dtype) + weight.stop_gradient = False + weight_l2 = paddle.sqrt( + paddle.sum( + paddle.square(weight), axis=0, keepdim=True)) + norm_weight = paddle.divide(weight, weight_l2) + + data = paddle.matmul(norm_input, norm_weight) + data.stop_gradient = False + + sta = np.sum( + num_class_per_card[:rank_id]) if rank_id > 0 else 0 + end = np.sum(num_class_per_card[:rank_id + 1]) + + integral_data = np.zeros( + (batch_size, num_class), dtype=dtype) + integral_data[:, sta:end] = data.clone().detach().numpy( + ) + integral_data = paddle.to_tensor( + integral_data, dtype=dtype) + + paddle.distributed.all_reduce( + integral_data, + op=paddle.distributed.ReduceOp.SUM, + group=check_group) + integral_data = integral_data.detach().clone() + integral_data.stop_gradient = False + + # add arcface margin to logit + theta = paddle.acos(integral_data) + one_hot_label = paddle.nn.functional.one_hot( + label, num_classes=num_class) + one_hot_label.stop_gradient = False + + if margin1 != 1.0: + theta = margin1 * theta + if margin2 != 0.0: + theta = theta + margin2 + margin_cos = paddle.cos(theta) + if margin3 != 0.0: + margin_cos = margin_cos - margin3 + diff = one_hot_label * (margin_cos - integral_data) + arc_data = (integral_data + diff) * scale + + loss_a, softmax_a = 
paddle.nn.functional.margin_cross_entropy( + data, + label, + margin1=margin1, + margin2=margin2, + margin3=margin3, + scale=scale, + group=check_group, + return_softmax=True, + reduction=None) + loss_b, softmax_b = paddle.nn.functional.softmax_with_cross_entropy( + logits=arc_data, + label=paddle.reshape(label, (-1, 1)), + return_softmax=True) + + np.testing.assert_allclose( + loss_a.numpy(), loss_b.numpy(), rtol=1e-5) + + integral_prob = np.zeros( + (batch_size, num_class), dtype=dtype) + integral_prob[:, sta:end] = softmax_a.clone().detach( + ).numpy() + integral_prob = paddle.to_tensor( + integral_prob, dtype=dtype) + paddle.distributed.all_reduce( + integral_prob, + op=paddle.distributed.ReduceOp.SUM, + group=check_group) + integral_prob = integral_prob.detach().clone() + integral_prob.stop_gradient = False + + np.testing.assert_allclose( + integral_prob.numpy(), + softmax_b.numpy(), + rtol=1e-5, + atol=1e-6) + + loss_a = loss_a.sum() / batch_size + loss_b = loss_b.sum() / batch_size + loss_a.backward() + loss_b.backward() + + integral_grad = np.zeros( + (batch_size, num_class), dtype=dtype) + integral_grad[:, sta:end] = data.grad.clone().detach() + integral_grad = paddle.to_tensor( + integral_grad, dtype=dtype) + paddle.distributed.all_reduce( + integral_grad, + op=paddle.distributed.ReduceOp.SUM, + group=check_group) + + np.testing.assert_allclose( + integral_data.grad.numpy(), + integral_grad.numpy(), + rtol=1e-5) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_margin_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_margin_cross_entropy_op.py new file mode 100644 index 0000000000000000000000000000000000000000..85d74f379814cdea463e325cc942b98c629ba635 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_margin_cross_entropy_op.py @@ -0,0 +1,385 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
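+
+# The tests below compare the margin_cross_entropy op against a NumPy reference that
+# applies the combined margin (cos(m1 * theta + m2) - m3, scaled by s) to the target
+# logit and then evaluates a numerically stable log-softmax cross entropy.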
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import math +import random +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid import Program, program_guard + + +def stable_softmax_comm(x): + shiftx = (x - np.max(x)) + deno = np.log(np.sum(np.exp(shiftx))) + comm = shiftx - deno + return comm + + +def margin_cross_entropy(logits, + label, + axis, + margin1, + margin2, + margin3, + scale, + reduction=None): + one_hot_label = np.zeros_like(logits, dtype=logits.dtype) + for i, lb in enumerate(label): + one_hot_label[i, lb] = 1.0 + + # add arcface margin to logit + theta = np.arccos(logits) + if margin1 != 1.0: + theta = margin1 * theta + if margin2 != 0.0: + theta = theta + margin2 + margin_cos = np.cos(theta) + if margin3 != 0.0: + margin_cos = margin_cos - margin3 + diff = one_hot_label * (margin_cos - logits) + arc_logits = (logits + diff) * scale + + comm = np.apply_along_axis(stable_softmax_comm, axis, arc_logits) + loss = (-one_hot_label * comm).sum(axis=axis, keepdims=True) + softmax = np.exp(comm) + if reduction == 'mean': + loss = np.mean(loss) + elif reduction == 'sum': + loss = np.sum(loss) + return loss, softmax + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOp(OpTest): + def initParams(self): + self.op_type = "margin_cross_entropy" + self.axis = -1 + self.batch_dim = 5 + self.feat_dim = 41 + self.num_class = 37 + + def init_loss_params(self): + self.margin1 = 1.0 + self.margin2 = 0.5 + self.margin3 = 0.0 + self.scale = 2.0 + + def init_dtype(self): + self.dtype = np.float64 + + def setUp(self): + self.initParams() + self.init_loss_params() + self.init_dtype() + + datas = np.random.uniform( + -0.99, 0.99, [self.batch_dim, self.feat_dim]).astype(self.dtype) + datas = datas / np.sqrt(np.sum(np.square(datas), axis=1, keepdims=True)) + weights = np.random.uniform( + -0.99, 0.99, [self.feat_dim, self.num_class]).astype(self.dtype) + weights = weights / np.sqrt( + np.sum(np.square(weights), axis=0, keepdims=True)) + logits = np.matmul(datas, weights) + + labels = np.random.randint( + 0, self.num_class, (self.batch_dim, ), dtype="int64") + + loss, softmax = margin_cross_entropy(logits, labels, self.axis, + self.margin1, self.margin2, + self.margin3, self.scale) + + self.inputs = {"Logits": logits, "Label": labels} + self.outputs = { + "Softmax": softmax.astype(self.dtype), + "Loss": loss.astype(self.dtype) + } + self.attrs = { + 'margin1': self.margin1, + 'margin2': self.margin2, + 'margin3': self.margin3, + 'scale': self.scale, + } + + def test_check_output(self): + self.check_output_with_place(core.CUDAPlace(0), atol=1e-5) + + def test_check_grad(self): + self.check_grad_with_place(core.CUDAPlace(0), ["Logits"], "Loss") + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpFP32(TestMarginCrossEntropyOp): + def init_dtype(self): + self.dtype = np.float32 + + def test_check_grad(self): + self.check_grad_with_place( + core.CUDAPlace(0), ["Logits"], + "Loss", + numeric_grad_delta=5e-2, + max_relative_error=5e-2) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpFP16(TestMarginCrossEntropyOp): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(core.CUDAPlace(0), atol=5e-2) + + def test_check_grad(self): + 
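+        # FP16 numeric gradients are noisy, so both the finite-difference delta and
+        # the relative-error bound are relaxed for this dtype.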
self.check_grad_with_place( + core.CUDAPlace(0), ["Logits"], + "Loss", + numeric_grad_delta=6e-1, + max_relative_error=6e-1) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpCosFace(TestMarginCrossEntropyOp): + def init_loss_params(self): + self.margin1 = 1.0 + self.margin2 = 0.0 + self.margin3 = 0.35 + self.scale = 2.0 + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpSphereFace(TestMarginCrossEntropyOp): + def init_loss_params(self): + self.margin1 = 1.35 + self.margin2 = 0.0 + self.margin3 = 0.0 + self.scale = 2.0 + + +class TestMarginCrossEntropyOpCPU(TestMarginCrossEntropyOp): + def test_check_output(self): + try: + self.check_output_with_place(core.CPUPlace(), atol=1e-5) + except RuntimeError: + pass + + def test_check_grad(self): + try: + self.check_grad_with_place(core.CPUPlace(), ["Logits"], "Loss") + except RuntimeError: + pass + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpV2(unittest.TestCase): + def setUp(self): + self.initParams() + np.random.seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) + self.places = [] + if core.is_compiled_with_cuda(): + self.places.append(paddle.fluid.CUDAPlace(0)) + + def initParams(self): + self.seed = 2021 + self.axis = -1 + self.batch_dim = 5 + self.feat_dim = 41 + self.num_class = 37 + self.init_loss_params() + self.init_dtype() + self.init_reduction() + + def init_loss_params(self): + self.margin1 = 1.0 + self.margin2 = 0.5 + self.margin3 = 0.0 + self.scale = 2.0 + + def init_dtype(self): + self.dtype = np.float64 + + def init_reduction(self): + self.reduction = None + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def check_static_result(self, place): + with program_guard(Program(), Program()): + datas = np.random.uniform( + -0.99, 0.99, [self.batch_dim, self.feat_dim]).astype(self.dtype) + datas = datas / np.sqrt( + np.sum(np.square(datas), axis=1, keepdims=True)) + weights = np.random.uniform( + -0.99, 0.99, [self.feat_dim, self.num_class]).astype(self.dtype) + weights = weights / np.sqrt( + np.sum(np.square(weights), axis=0, keepdims=True)) + + logits_np = np.matmul(datas, weights) + labels_np = np.random.randint( + 0, self.num_class, (self.batch_dim, ), dtype="int64") + + loss_np, softmax_np = margin_cross_entropy( + logits_np, labels_np, self.axis, self.margin1, self.margin2, + self.margin3, self.scale, self.reduction) + + logits = paddle.static.data( + name='logits', + shape=[self.batch_dim, self.num_class], + dtype=self.dtype) + label = paddle.static.data( + name='label', shape=[self.batch_dim], dtype="int64") + loss, softmax = paddle.nn.functional.margin_cross_entropy( + logits, + label, + margin1=self.margin1, + margin2=self.margin2, + margin3=self.margin3, + scale=self.scale, + return_softmax=True, + reduction=self.reduction) + + exe = paddle.fluid.Executor(place) + [loss_res, softmax_res] = exe.run( + paddle.fluid.default_main_program(), + feed={'logits': logits_np, + 'label': labels_np}, + fetch_list=[loss, softmax]) + np.testing.assert_allclose(loss_res, loss_np) + np.testing.assert_allclose(softmax_res, softmax_np) + + def test_dynamic(self): + for place in self.places: + self.check_dynamic_result(place=place) + + def check_dynamic_result(self, place): + with paddle.fluid.dygraph.guard(place): + datas = np.random.uniform( + -0.99, 0.99, 
[self.batch_dim, self.feat_dim]).astype(self.dtype) + datas = datas / np.sqrt( + np.sum(np.square(datas), axis=1, keepdims=True)) + weights = np.random.uniform( + -0.99, 0.99, [self.feat_dim, self.num_class]).astype(self.dtype) + weights = weights / np.sqrt( + np.sum(np.square(weights), axis=0, keepdims=True)) + + logits_np = np.matmul(datas, weights) + labels_np = np.random.randint( + 0, self.num_class, (self.batch_dim, ), dtype="int64") + + loss_np, softmax_np = margin_cross_entropy( + logits_np, labels_np, self.axis, self.margin1, self.margin2, + self.margin3, self.scale, self.reduction) + + logits = paddle.to_tensor(logits_np, dtype=self.dtype) + labels = paddle.to_tensor(labels_np, dtype="int64") + + loss, softmax = paddle.nn.functional.margin_cross_entropy( + logits, + labels, + margin1=self.margin1, + margin2=self.margin2, + margin3=self.margin3, + scale=self.scale, + return_softmax=True, + reduction=self.reduction) + + loss_res = loss.numpy() + softmax_res = softmax.numpy() + np.testing.assert_allclose(loss_res, loss_np) + np.testing.assert_allclose(softmax_res, softmax_np) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpV3(TestMarginCrossEntropyOpV2): + def init_reduction(self): + self.reduction = 'mean' + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpV4(TestMarginCrossEntropyOpV2): + def init_reduction(self): + self.reduction = 'sum' + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpAPIError(unittest.TestCase): + def setUp(self): + self.initParams() + np.random.seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) + self.places = [] + if core.is_compiled_with_cuda(): + self.places.append(paddle.fluid.CUDAPlace(0)) + + def initParams(self): + self.seed = 2021 + self.axis = -1 + self.batch_dim = 10 + self.feat_dim = 41 + self.num_class = 37 + self.init_loss_params() + self.init_dtype() + + def init_loss_params(self): + self.margin1 = 1.0 + self.margin2 = 0.5 + self.margin3 = 0.0 + self.scale = 2.0 + + def init_dtype(self): + self.dtype = np.float64 + + def test_dynamic_errors(self): + def test_dim(): + for place in self.places: + with paddle.fluid.dygraph.guard(place): + labels_np = np.random.randint( + 0, self.num_class, (self.batch_dim, 2), dtype="int64") + logits_np = np.random.uniform( + -0.99, 0.99, + [self.batch_dim, self.num_class]).astype(self.dtype) + labels = paddle.to_tensor(labels_np) + logits = paddle.to_tensor(logits_np) + + loss, softmax = paddle.nn.functional.margin_cross_entropy( + logits, + labels, + margin1=self.margin1, + margin2=self.margin2, + margin3=self.margin3, + scale=self.scale, + return_softmax=True, + reduction=None) + + self.assertRaises(ValueError, test_dim) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_margin_cross_entropy.py b/python/paddle/fluid/tests/unittests/test_parallel_margin_cross_entropy.py new file mode 100644 index 0000000000000000000000000000000000000000..1b24889830ad873997fb803fb9baf574c657e2a9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_margin_cross_entropy.py @@ -0,0 +1,29 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestParallelMarginSoftmaxWithCrossEntropy(TestMultipleGpus): + def test_parallel_margin_cross_entropy(self): + self.run_mnist_2gpu('parallel_margin_cross_entropy.py') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index f3d9f9dde11a4e48bbabbdb03336021ebc3863ad..04e0b7c140d7fa2fa583ec871ca0b7f7ae19329d 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -79,6 +79,7 @@ from .loss import npair_loss # noqa: F401 from .loss import sigmoid_focal_loss # noqa: F401 from .loss import smooth_l1_loss # noqa: F401 from .loss import softmax_with_cross_entropy # noqa: F401 +from .loss import margin_cross_entropy # noqa: F401 from .loss import square_error_cost # noqa: F401 from .loss import ctc_loss # noqa: F401 from .norm import batch_norm # noqa: F401 @@ -185,6 +186,7 @@ __all__ = [ #noqa 'sigmoid_focal_loss', 'smooth_l1_loss', 'softmax_with_cross_entropy', + 'margin_cross_entropy', 'square_error_cost', 'ctc_loss', 'affine_grid', diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index ef2bfb3b8e0d3aeabf296cc8379e49d92c6dd6e3..d7b781c84767f216e86861f0363f77b5da2be1a3 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1092,6 +1092,268 @@ def ctc_loss(log_probs, return loss_out +def margin_cross_entropy(logits, + label, + margin1=1.0, + margin2=0.5, + margin3=0.0, + scale=64.0, + group=None, + return_softmax=False, + reduction='mean'): + """ + .. math:: + + L=-\\frac{1}{N}\sum^N_{i=1}\log\\frac{e^{s(cos(m_{1}\\theta_{y_i}+m_{2})-m_{3})}}{e^{s(cos(m_{1}\\theta_{y_i}+m_{2})-m_{3})}+\sum^n_{j=1,j\\neq y_i} e^{scos\\theta_{y_i}}} + + where the :math:`\\theta_{y_i}` is the angle between the feature :math:`x` and + the representation of class :math:`i`. The details of ArcFace loss + could be referred to https://arxiv.org/abs/1801.07698. + + .. hint:: + The API supports model parallel and single GPU. And logits.shape[-1] can be different at each rank. + + Args: + logits (Tensor): shape[N, local_num_classes], the output of the normalized X multiply the normalized W. + The logits is shard_logits when using model parallel. + label (Tensor): shape[N] or shape[N, 1], the groud truth label. + margin1 (float, optional): m1 of margin loss, default value is `1.0`. + margin2 (float, optional): m2 of margin loss, default value is `0.5`. + margin3 (float, optional): m3 of margin loss, default value is `0.0`. + scale (float, optional): s of margin loss, default value is `64.0`. + group (Group, optional): The abstract representation of group, see paddle.distributed.collective.Group. + Default `None`. + return_softmax (bool, optional): Whether return softmax probability. Default value is `False`. + reduction (str, optional): The candicates are ``'none'`` | ``'mean'`` | ``'sum'``. 
+ If :attr:`reduction` is ``'mean'``, return the average of loss; + If :attr:`reduction` is ``'sum'``, return the sum of loss; + If :attr:`reduction` is ``'none'``, no reduction will be applied. + Default value is `'mean'`. + + Returns: + ``Tensor`` or Tuple of two ``Tensor`` : Return the cross entropy loss if \ + `return_softmax` is False, otherwise the tuple \ + (loss, softmax), softmax is shard_softmax when \ + using model parallel, otherwise softmax is in \ + the same shape with input logits. If ``reduction == None``, \ + the shape of loss is ``[N, 1]``, otherwise the shape is ``[1]``. + + Examples: + + .. code-block:: python + + # required: gpu + # Single GPU + import paddle + m1 = 1.0 + m2 = 0.5 + m3 = 0.0 + s = 64.0 + batch_size = 2 + feature_length = 4 + num_classes = 4 + + label = paddle.randint(low=0, high=num_classes, shape=[batch_size], dtype='int64') + + X = paddle.randn( + shape=[batch_size, feature_length], + dtype='float64') + X_l2 = paddle.sqrt(paddle.sum(paddle.square(X), axis=1, keepdim=True)) + X = paddle.divide(X, X_l2) + + W = paddle.randn( + shape=[feature_length, num_classes], + dtype='float64') + W_l2 = paddle.sqrt(paddle.sum(paddle.square(W), axis=0, keepdim=True)) + W = paddle.divide(W, W_l2) + + logits = paddle.matmul(X, W) + loss, softmax = paddle.nn.functional.margin_cross_entropy( + logits, label, margin1=m1, margin2=m2, margin3=m3, scale=s, return_softmax=True, reduction=None) + + print(logits) + print(label) + print(loss) + print(softmax) + + #Tensor(shape=[2, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[ 0.85204151, -0.55557678, 0.04994566, 0.71986042], + # [-0.20198586, -0.35270476, -0.55182702, 0.09749021]]) + #Tensor(shape=[2], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [2, 3]) + #Tensor(shape=[2, 1], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[82.37059586], + # [12.13448420]]) + #Tensor(shape=[2, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[0.99978819, 0.00000000, 0.00000000, 0.00021181], + # [0.99992995, 0.00006468, 0.00000000, 0.00000537]]) + + .. 
code-block:: python + + # required: distributed + # Multi GPU, test_margin_cross_entropy.py + import paddle + import paddle.distributed as dist + strategy = dist.fleet.DistributedStrategy() + dist.fleet.init(is_collective=True, strategy=strategy) + rank_id = dist.get_rank() + m1 = 1.0 + m2 = 0.5 + m3 = 0.0 + s = 64.0 + batch_size = 2 + feature_length = 4 + num_class_per_card = [4, 8] + num_classes = paddle.sum(paddle.to_tensor(num_class_per_card)) + + label = paddle.randint(low=0, high=num_classes.item(), shape=[batch_size], dtype='int64') + label_list = [] + dist.all_gather(label_list, label) + label = paddle.concat(label_list, axis=0) + + X = paddle.randn( + shape=[batch_size, feature_length], + dtype='float64') + X_list = [] + dist.all_gather(X_list, X) + X = paddle.concat(X_list, axis=0) + X_l2 = paddle.sqrt(paddle.sum(paddle.square(X), axis=1, keepdim=True)) + X = paddle.divide(X, X_l2) + + W = paddle.randn( + shape=[feature_length, num_class_per_card[rank_id]], + dtype='float64') + W_l2 = paddle.sqrt(paddle.sum(paddle.square(W), axis=0, keepdim=True)) + W = paddle.divide(W, W_l2) + + logits = paddle.matmul(X, W) + loss, softmax = paddle.nn.functional.margin_cross_entropy( + logits, label, margin1=m1, margin2=m2, margin3=m3, scale=s, return_softmax=True, reduction=None) + + print(logits) + print(label) + print(loss) + print(softmax) + + # python -m paddle.distributed.launch --gpus=0,1 test_margin_cross_entropy.py + ## for rank0 input + #Tensor(shape=[4, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[ 0.32888934, 0.02408748, -0.02763289, 0.18173063], + # [-0.52893978, -0.10623845, -0.21596515, -0.06432517], + # [-0.00536345, -0.03924667, 0.66735314, -0.28640926], + # [-0.09907366, -0.48534973, -0.10365338, -0.39472322]]) + #Tensor(shape=[4], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [11, 1 , 10, 11]) + + ## for rank1 input + #Tensor(shape=[4, 8], dtype=float64, place=CUDAPlace(1), stop_gradient=True, + # [[ 0.68654754, 0.28137170, 0.69694954, -0.60923933, -0.57077653, 0.54576703, -0.38709028, 0.56028204], + # [-0.80360371, -0.03042448, -0.45107338, 0.49559349, 0.69998950, -0.45411693, 0.61927630, -0.82808600], + # [ 0.11457570, -0.34785879, -0.68819499, -0.26189226, -0.48241491, -0.67685711, 0.06510185, 0.49660849], + # [ 0.31604851, 0.52087884, 0.53124749, -0.86176582, -0.43426329, 0.34786144, -0.10850784, 0.51566383]]) + #Tensor(shape=[4], dtype=int64, place=CUDAPlace(1), stop_gradient=True, + # [11, 1 , 10, 11]) + + ## for rank0 output + #Tensor(shape=[4, 1], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[38.96608230], + # [81.28152394], + # [69.67229865], + # [31.74197251]]) + #Tensor(shape=[4, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[0.00000000, 0.00000000, 0.00000000, 0.00000000], + # [0.00000000, 0.00000000, 0.00000000, 0.00000000], + # [0.00000000, 0.00000000, 0.99998205, 0.00000000], + # [0.00000000, 0.00000000, 0.00000000, 0.00000000]]) + ## for rank1 output + #Tensor(shape=[4, 1], dtype=float64, place=CUDAPlace(1), stop_gradient=True, + # [[38.96608230], + # [81.28152394], + # [69.67229865], + # [31.74197251]]) + #Tensor(shape=[4, 8], dtype=float64, place=CUDAPlace(1), stop_gradient=True, + # [[0.33943993, 0.00000000, 0.66051859, 0.00000000, 0.00000000, 0.00004148, 0.00000000, 0.00000000], + # [0.00000000, 0.00000000, 0.00000000, 0.00000207, 0.99432097, 0.00000000, 0.00567696, 0.00000000], + # [0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00001795], + # 
[0.00000069, 0.33993085, 0.66006319, 0.00000000, 0.00000000, 0.00000528, 0.00000000, 0.00000000]]) + """ + + assert reduction in ['mean', 'sum', 'none', None] + if group is not None and not group.is_member(): + return + + ring_id = 0 if group is None else group.id + rank = 0 + nranks = 1 + if core.is_compiled_with_dist(): + parallel_env = paddle.distributed.ParallelEnv() + global_rank = parallel_env.rank + rank = global_rank if group is None else group.get_group_rank( + global_rank) + nranks = parallel_env.world_size if group is None else group.nranks + + input_dims = len(list(logits.shape)) + label_dims = len(list(label.shape)) + if input_dims - 1 != label_dims and input_dims != label_dims: + raise ValueError( + 'Expected nput_dims - 1 = label_dims or input_dims == label_dims\ + (got nput_dims{}, label_dims{})'.format(input_dims, label_dims)) + if input_dims - 1 == label_dims: + label = paddle.unsqueeze(label, axis=-1) + + if in_dygraph_mode(): + softmax, loss = core.ops.margin_cross_entropy( + logits, label, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks, + 'margin1', margin1, 'margin2', margin2, 'margin3', margin3, 'scale', + scale, 'return_softmax', return_softmax) + if reduction == 'mean': + loss = paddle.mean(loss) + elif reduction == 'sum': + loss = paddle.sum(loss) + if not return_softmax: + return loss + else: + return loss, softmax + + op_type = 'margin_cross_entropy' + helper = LayerHelper(op_type, **locals()) + softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) + loss = helper.create_variable_for_type_inference(dtype=logits.dtype) + + check_variable_and_dtype(logits, 'logits', + ['float16', 'float32', 'float64'], + 'margin_cross_entropy') + check_variable_and_dtype(label, 'label', ['int32', 'int64'], + 'margin_cross_entropy') + + helper.append_op( + type=op_type, + inputs={'Logits': logits, + 'Label': label}, + outputs={'Softmax': softmax, + 'Loss': loss}, + attrs={ + 'return_softmax': return_softmax, + 'ring_id': ring_id, + 'rank': rank, + 'nranks': nranks, + 'margin1': margin1, + 'margin2': margin2, + 'margin3': margin3, + 'scale': scale, + }) + + if reduction == 'mean': + loss = paddle.mean(loss) + elif reduction == 'sum': + loss = paddle.sum(loss) + + if not return_softmax: + return loss + else: + return loss, softmax + + @deprecated( since="2.0.0", update_to="paddle.nn.functional.cross_entropy", diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 616d5ae280ad1acdaa3e2812981d27bbac8f2ab0..d2f95c235b04c1b7573b9deda6400e5e24cdca93 100644 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -719,4 +719,5 @@ STATIC_MODE_TESTING_LIST = [ 'test_sgd_op_bf16', 'test_marker_op', 'test_c_embedding_op', + 'test_margin_cross_entropy_op', ]
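Not part of the patch: a minimal NumPy sketch (illustrative names, single process, no NCCL) of the class-interval bookkeeping that GetClassInterval and AddMarginToPositiveLogitsKernel implement. Each rank owns the global class columns [interval[rank], interval[rank + 1]) and applies the combined margin only to samples whose target class falls in its own shard.

import numpy as np

def class_interval(shard_dims):
    # shard_dims[r] = number of classes held by rank r; the op builds the same
    # table on the GPU with ncclAllReduce + cub::DeviceScan::InclusiveSum.
    return np.concatenate(([0], np.cumsum(shard_dims)))

def apply_margin_local(local_logits, labels, rank, interval,
                       m1=1.0, m2=0.5, m3=0.0, s=64.0):
    # Mirror of AddMarginToPositiveLogitsKernel + ScaleLogitKernel: the margin
    # cos(m1*theta + m2) - m3 touches only the positive column, and only on the
    # rank whose shard contains the label; all logits are then scaled by s.
    start, end = interval[rank], interval[rank + 1]
    out = local_logits.copy()
    for i, y in enumerate(labels):
        if start <= y < end:                      # this rank owns the target class
            theta = np.arccos(out[i, y - start])  # logits are cosine similarities
            out[i, y - start] = np.cos(m1 * theta + m2) - m3
    return s * out

interval = class_interval([4, 8])   # two ranks holding 4 and 8 classes -> [0, 4, 12]
logits = np.random.uniform(-0.99, 0.99, size=(2, 8))   # rank-1 shard of 8 columns
print(apply_margin_local(logits, labels=[5, 1], rank=1, interval=interval))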