diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cc b/paddle/fluid/operators/margin_cross_entropy_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..94c58fa6970d902689680ca5552ea3d126c424c6 --- /dev/null +++ b/paddle/fluid/operators/margin_cross_entropy_op.cc @@ -0,0 +1,203 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/margin_cross_entropy_op.h" + +namespace paddle { +namespace operators { + +class MarginCrossEntropyOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + OP_INOUT_CHECK(ctx->HasInput("Logits"), "Input", "Logits", + "MarginCrossEntropyOp"); + OP_INOUT_CHECK(ctx->HasInput("Label"), "Input", "Label", + "MarginCrossEntropyOp"); + + OP_INOUT_CHECK(ctx->HasOutput("Softmax"), "Output", "Softmax", + "MarginCrossEntropyOp"); + OP_INOUT_CHECK(ctx->HasOutput("Loss"), "Output", "Loss", + "MarginCrossEntropyOp"); + + auto logits_dims = ctx->GetInputDim("Logits"); + auto labels_dims = ctx->GetInputDim("Label"); + + auto logits_rank = logits_dims.size(); + auto axis = logits_rank - 1; + for (int i = 0; i < logits_rank; i++) { + if (i != axis) { + if (ctx->IsRuntime() || (logits_dims[i] > 0 && labels_dims[i] > 0)) { + PADDLE_ENFORCE_EQ(logits_dims[i], labels_dims[i], + platform::errors::InvalidArgument( + "Input(Logits) and Input(Label) should in " + "same shape in dimensions except axis.")); + } + } + } + + if (labels_dims.size() > 1) { + PADDLE_ENFORCE_EQ( + labels_dims[logits_rank - 1], 1UL, + platform::errors::InvalidArgument( + "the last dimension of Input(Label) should be 1." + "But received: the last dimension of Input(Label) is [%d]," + "the last dimension is [%d]", + labels_dims[logits_rank - 1], logits_rank - 1)); + } + + ctx->SetOutputDim("Softmax", logits_dims); + + logits_dims[axis] = 1; + ctx->SetOutputDim("Loss", logits_dims); + + ctx->ShareLoD("Logits", /*->*/ "Softmax"); + ctx->ShareLoD("Logits", /*->*/ "Loss"); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "Logits"), + ctx.device_context()); + } +}; + +class MarginCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() { + AddInput("Logits", + "(Tensor, default: Tensor), The input tensor of unscaled " + "log probabilities, whose dimension :attr:`axis` should be scaled " + "by softmax."); + AddInput( + "Label", + "(Tensor) The input tensor of groud truth label. Label is a " + "Tensor in same shape with Input(Logits) except the shape in " + "dimension :attr:`axis` as 1."); + AddOutput( + "Softmax", + "(Tensor, default: Tensor), A tensor in same shape with " + "Input(Logits). 
" + "The outputs value of softmax activation by given the input batch, " + "which will be used in backward calculation."); + AddOutput("Loss", + "(Tensor, default: Tensor), A tensor in same shape with " + "Input(Logits) " + "except the shape in dimension :attr:`axis` as 1. The cross " + "entropy loss."); + AddAttr("return_softmax", + "(bool default false) A flag to indicate " + "whether to return softmax.") + .SetDefault(false); + AddAttr("ring_id", "(int default 0) nccl communication ring id.") + .SetDefault(0); + AddAttr("rank", "(int default 0) rank id for MarginCrossEntropy.") + .SetDefault(0); + AddAttr("nranks", "(int default 1) nranks id for MarginCrossEntropy.") + .SetDefault(1); + AddAttr("margin1", "(float default 1.0) margin1 for MarginLoss.") + .SetDefault(1.0); + AddAttr("margin2", "(float default 0.5) margin2 for MarginLoss.") + .SetDefault(0.5); + AddAttr("margin3", "(float default 0.0) margin3 for MarginLoss.") + .SetDefault(0.0); + AddAttr("scale", "(float default 64.0) scale for MarginLoss.") + .SetDefault(64.0); + AddComment(R"DOC( +MarginCrossEntropy Operator +.. math:: + + L=-\frac{1}{N}\sum^N_{i=1}\log\frac{e^{s(cos(m_{1}\theta_{y_i}+m_{2})-m_{3})}}{e^{s(cos(m_{1}\theta_{y_i}+m_{2})-m_{3})}+\sum^n_{j=1,j\neq y_i} e^{scos\theta_{y_i}}} + +where the :math: `\theta_{y_i}` is the angle between the feature :math: `x` and +the representation of class :math: `i`. The details of ArcFace loss +could be referred to https://arxiv.org/abs/1801.07698. + +Note that the Op supports model parallel and single GPU. And Logits.shape[-1] can be different each rank. + +)DOC"); + } +}; + +class MarginCrossEntropyOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_EQ(ctx->HasInput(framework::GradVarName("Loss")), true, + platform::errors::InvalidArgument( + "Input(Loss@Grad) should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasInput("Softmax"), true, + platform::errors::InvalidArgument( + "Input(Softmax) should be not null.")); + PADDLE_ENFORCE_EQ( + ctx->HasInput("Label"), true, + platform::errors::InvalidArgument("Input(Label) should be not null.")); + + PADDLE_ENFORCE_EQ(ctx->HasOutput(framework::GradVarName("Logits")), true, + platform::errors::InvalidArgument( + "Output(Logits@Grad) should be not null.")); + + ctx->SetOutputDim(framework::GradVarName("Logits"), + ctx->GetInputDim("Softmax")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Loss")), + ctx.device_context()); + } +}; + +template +class MarginCrossEntropyOpGradMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("margin_cross_entropy_grad"); + + op->SetInput("Softmax", this->Output("Softmax")); + op->SetInput("Logits", this->Input("Logits")); + op->SetInput("Label", this->Input("Label")); + op->SetInput(framework::GradVarName("Loss"), this->OutputGrad("Loss")); + op->SetAttrMap(this->Attrs()); + op->SetOutput(framework::GradVarName("Logits"), this->InputGrad("Logits")); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OPERATOR( + margin_cross_entropy, 
ops::MarginCrossEntropyOp, + ops::MarginCrossEntropyOpMaker, + ops::MarginCrossEntropyOpGradMaker, + ops::MarginCrossEntropyOpGradMaker); + +REGISTER_OPERATOR(margin_cross_entropy_grad, ops::MarginCrossEntropyOpGrad); + +REGISTER_OP_CPU_KERNEL(margin_cross_entropy, + ops::MarginCrossEntropyOpCPUKernel, + ops::MarginCrossEntropyOpCPUKernel, + ops::MarginCrossEntropyOpCPUKernel); diff --git a/paddle/fluid/operators/margin_cross_entropy_op.cu b/paddle/fluid/operators/margin_cross_entropy_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..ccdba43b0542dc82975d4c9a19af56304dc0173a --- /dev/null +++ b/paddle/fluid/operators/margin_cross_entropy_op.cu @@ -0,0 +1,483 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PADDLE_WITH_HIP +#include +namespace cub = hipcub; +#else +#include +#endif + +#include +#include "paddle/fluid/operators/amp/fp16_type_traits.h" +#include "paddle/fluid/operators/margin_cross_entropy_op.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/math/softmax_impl.h" +#include "paddle/fluid/operators/reduce_ops/reduce_functor_op.h" +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" +#include "paddle/fluid/string/string_helper.h" + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) +#include "paddle/fluid/platform/collective_helper.h" +#include "paddle/fluid/platform/nccl_helper.h" +#endif + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +static constexpr int kNumCUDAThreads = 512; +static constexpr int kNumMaxinumNumBlocks = 4096; + +static inline int NumBlocks(const int N) { + return std::min((N + kNumCUDAThreads - 1) / kNumCUDAThreads, + kNumMaxinumNumBlocks); +} + +void GetClassInterval(const gpuStream_t& stream, const platform::Place& place, + const platform::DeviceContext& ctx, const int rid, + const int rank, const int nranks, const int D, + Tensor* class_interval) { + std::vector shard_dim_vec(nranks + 1, 0); + shard_dim_vec[rank + 1] = D; + if (nranks <= 1) { + framework::TensorFromVector(shard_dim_vec, ctx, class_interval); + return; + } + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + Tensor num_classes_per_device; + framework::TensorFromVector(shard_dim_vec, ctx, &num_classes_per_device); + int* num_classes_per_device_ptr = num_classes_per_device.data(); + + const auto& comm = platform::NCCLCommContext::Instance().Get(rid, place); + // use global calculate stream + const auto calcu_stream = + static_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + num_classes_per_device_ptr, num_classes_per_device_ptr, + num_classes_per_device.numel(), + platform::ToNCCLDataType(num_classes_per_device.type()), ncclSum, + comm->comm(), calcu_stream)); + + auto class_interval_ptr = + class_interval->mutable_data({nranks + 1}, place); + size_t cub_temp_storage_bytes = 0; + cub::DeviceScan::InclusiveSum( + 
nullptr, cub_temp_storage_bytes, nullptr, nullptr, nranks + 1, stream); + auto cub_temp_storage = memory::Alloc(place, cub_temp_storage_bytes); + cub::DeviceScan::InclusiveSum( + cub_temp_storage->ptr(), cub_temp_storage_bytes, + num_classes_per_device_ptr, class_interval_ptr, nranks + 1, stream); + return; +#endif +} + +template +__global__ void AddMarginToPositiveLogitsKernel( + T* logit, const IndexT* label, const float margin1, const float margin2, + const float margin3, const int rank, const int nranks, const int64_t N, + const int64_t D, const int* class_interval_ptr) { + using MPType = typename details::MPTypeTrait::Type; + int start_index = class_interval_ptr[rank]; + int end_index = class_interval_ptr[rank + 1]; + int num_classes = class_interval_ptr[nranks]; + CUDA_KERNEL_LOOP(i, N) { + auto real_label = label[i]; + PADDLE_ENFORCE((real_label < num_classes) && (real_label >= 0), + "The index is out of bounds, " + "please check whether the value of label and " + "input meet the number of class. It should " + "be less than [%d], but received [%d]", + num_classes, real_label); + + if (real_label >= start_index && real_label < end_index) { + int64_t offset = i * D + real_label - start_index; + if (fabs(margin1 - 1.0) > 1e-8 || fabs(margin2) > 1e-8) { + MPType x = static_cast(logit[offset]); + MPType theta = acos(x); + if (fabs(margin1 - 1.0) > 1e-8) { + theta *= static_cast(margin1); + } + if (fabs(margin2) > 1e-8) { + theta += static_cast(margin2); + } + logit[offset] = static_cast(cos(theta)); + } + if (fabs(margin3) > 1e-8) { + MPType y = static_cast(logit[offset]); + y -= static_cast(margin3); + logit[offset] = static_cast(y); + } + } + } +} + +static __device__ __forceinline__ platform::float16 exp_on_device( + platform::float16 x) { + return ::Eigen::numext::exp(x); +} +static __device__ __forceinline__ float exp_on_device(float x) { + return expf(x); +} +static __device__ __forceinline__ double exp_on_device(double x) { + return exp(x); +} +static __device__ __forceinline__ platform::float16 log_on_device( + platform::float16 x) { + return ::Eigen::numext::log(x); +} +static __device__ __forceinline__ float log_on_device(float x) { + return logf(x); +} +static __device__ __forceinline__ double log_on_device(double x) { + return log(x); +} + +template +struct ExpLogitTransformer { + HOSTDEVICE explicit inline ExpLogitTransformer(int n) {} + + HOSTDEVICE inline Ty operator()(const Tx& x) const { + return static_cast(exp_on_device(x)); + } +}; + +template +struct ExpAndSum { + using Transformer = ExpLogitTransformer; + + inline Ty initial() { return static_cast(0.0f); } + + __device__ __forceinline__ Ty operator()(const Ty& a, const Ty& b) const { + return b + a; + } +}; + +template +__global__ void ScaleLogitKernel(T* logits, const float scale, const int64_t N, + const int64_t D) { + CUDA_KERNEL_LOOP(i, N * D) { logits[i] *= static_cast(scale); } +} + +template +__global__ void LogitsMinusMaxKernel(T* logits, const T* logits_max_per_row, + const int64_t N, const int64_t D) { + CUDA_KERNEL_LOOP(i, N * D) { + auto row = i / D; + logits[i] -= logits_max_per_row[row]; + } +} + +template +__global__ void LogitsMinusLogSumKernel(T* logits, const T* logits_sum_per_row, + const int64_t N, const int64_t D) { + CUDA_KERNEL_LOOP(i, N * D) { + auto row = i / D; + logits[i] -= log_on_device(logits_sum_per_row[row]); + } +} + +template +__global__ void HardLabelSoftmaxWithCrossEntropyKernel( + T* loss, T* log_softmax, const IndexT* labels, const int rank, + const int64_t N, const int64_t D, 
const int* class_interval_ptr) { + int start_index = class_interval_ptr[rank]; + CUDA_KERNEL_LOOP(i, N * D) { + auto row = i / D; + auto col = i % D; + if ((col + start_index) == labels[row]) { + auto softmax = log_softmax[i]; + loss[row] = -softmax; + log_softmax[i] = exp_on_device(softmax); + } else { + log_softmax[i] = exp_on_device(log_softmax[i]); + } + } +} + +template +__global__ void CalculateGrad(T* logits_grad, const T* loss_grad, + const T* logits, const IndexT* labels, + const float margin1, const float margin2, + const float scale, const int rank, + const int64_t N, const int64_t D, + const int* class_interval_ptr) { + using MPType = typename details::MPTypeTrait::Type; + int start_index = class_interval_ptr[rank]; + CUDA_KERNEL_LOOP(i, N * D) { + auto row = i / D; + auto col = i % D; + if ((col + start_index) == labels[row]) { + logits_grad[i] = (logits_grad[i] - static_cast(1.0)) * loss_grad[row]; + if (fabs(margin1 - 1.0) > 1e-8 || fabs(margin2) > 1e-8) { + MPType dout = static_cast(logits_grad[i]); + MPType one = static_cast(1.0f); + MPType x = static_cast(logits[i]); + MPType m1 = static_cast(margin1); + MPType m2 = static_cast(margin2); + + MPType d = m1 * sin(m1 * acos(x) + m2) / sqrt(one - x * x); + logits_grad[i] = static_cast(dout * d); + } + } else { + logits_grad[i] *= loss_grad[row]; + } + if (fabs(scale - 1.0) > 1e-8) { + logits_grad[i] *= static_cast(scale); + } + } +} + +template +class MarginCrossEntropyOpCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const Tensor* logits = ctx.Input("Logits"); + const Tensor* labels = ctx.Input("Label"); + Tensor* softmax = ctx.Output("Softmax"); + Tensor* loss = ctx.Output("Loss"); + + const int rid = ctx.Attr("ring_id"); + const int nranks = ctx.Attr("nranks"); + const int rank = ctx.Attr("rank"); + + const float margin1 = ctx.Attr("margin1"); + const float margin2 = ctx.Attr("margin2"); + const float margin3 = ctx.Attr("margin3"); + const float scale = ctx.Attr("scale"); + + const auto& place = ctx.GetPlace(); + auto& dev_ctx = ctx.template device_context(); + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + platform::NCCLComm* comm; + gpuStream_t stream; + if (nranks > 1) { + comm = platform::NCCLCommContext::Instance().Get(rid, place); + + // use global calculate stream + stream = static_cast( + platform::DeviceContextPool::Instance().Get(place)) + ->stream(); + } +#endif + + // allocate memory on device. 
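+    // Softmax keeps the full (per-rank) probabilities that the backward pass reuses;
+    // Loss holds one scalar per sample. The logits are copied into Softmax below and
+    // transformed in place, so the original Logits tensor is never modified.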
+ T* softmax_ptr = softmax->mutable_data(place); + T* loss_ptr = loss->mutable_data(place); + + const auto& logits_dims = logits->dims(); + const auto& labels_dims = labels->dims(); + + const int axis = logits_dims.size() - 1; + const int N = SizeToAxis(axis, logits_dims); + const int D = SizeFromAxis(axis, logits_dims); + + int blocks = NumBlocks(N); + int threads = kNumCUDAThreads; + const auto& label_type = labels->type(); + + // copy logits to softmax variable since we can't modify logits, + // and it also be used when calculate grad + framework::TensorCopy(*logits, ctx.GetPlace(), ctx.device_context(), + softmax); + + Tensor softmax_2d; + softmax_2d.ShareDataWith(*softmax).Resize({N, D}); + T* logits_ptr = softmax_2d.data(); + + Tensor class_interval; + GetClassInterval(dev_ctx.stream(), place, ctx.cuda_device_context(), rid, + rank, nranks, D, &class_interval); + + // step 1, preprocess logits + // add margin for positive elements + // theta = acos(x_i) + // (cos(m1 * theta + m2) - m3) + // save match_logits, used for gradient computation. + if (label_type == framework::proto::VarType::INT32) { + typedef int32_t LabelT; + AddMarginToPositiveLogitsKernel< + T><<>>( + logits_ptr, labels->data(), margin1, margin2, margin3, rank, + nranks, N, D, class_interval.data()); + } else if (label_type == framework::proto::VarType::INT64) { + typedef int64_t LabelT; + AddMarginToPositiveLogitsKernel< + T><<>>( + logits_ptr, labels->data(), margin1, margin2, margin3, rank, + nranks, N, D, class_interval.data()); + } + + // scale by s + ScaleLogitKernel<<>>( + logits_ptr, scale, N, D); + + // step 2, obtain logit_max + Tensor logits_max; + logits_max = + ctx.AllocateTmpTensor({N, 1}, dev_ctx); + T* logits_max_buff = logits_max.mutable_data(place); + TensorReduceFunctorImpl(softmax_2d, &logits_max, {1}, + dev_ctx.stream()); + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (nranks > 1) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + logits_max_buff, logits_max_buff, logits_max.numel(), + platform::ToNCCLDataType(logits_max.type()), ncclMax, comm->comm(), + stream)); + } +#endif + + // step 3, logit - logit_max + LogitsMinusMaxKernel<<>>( + logits_ptr, logits_max_buff, N, D); + + // step 4, sum(exp(logit - logit_max)) + Tensor sum_exp_logits; + sum_exp_logits = + ctx.AllocateTmpTensor({N, 1}, dev_ctx); + T* sum_exp_logits_buff = sum_exp_logits.mutable_data(place); + TensorReduceFunctorImpl(softmax_2d, &sum_exp_logits, {1}, + dev_ctx.stream()); + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (nranks > 1) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + sum_exp_logits_buff, sum_exp_logits_buff, sum_exp_logits.numel(), + platform::ToNCCLDataType(sum_exp_logits.type()), ncclSum, + comm->comm(), stream)); + } +#endif + + // step 5, (logit - logit_max) - log(sum(exp(logit - logit_max))) + LogitsMinusLogSumKernel< + T><<>>( + logits_ptr, sum_exp_logits_buff, N, D); + + // step 6, prob = exp((logit - logit_max) - log(sum(exp(logit - + // logit_max)))) + // loss = -((logit_i - logit_max) - log(sum(exp(logit - logit_max)))) + math::SetConstant()(dev_ctx, loss, + static_cast(0.0)); + if (label_type == framework::proto::VarType::INT32) { + typedef int32_t LabelT; + HardLabelSoftmaxWithCrossEntropyKernel< + T, LabelT><<>>( + loss_ptr, logits_ptr, labels->data(), rank, N, D, + class_interval.data()); + } else if (label_type == framework::proto::VarType::INT64) { + typedef int64_t LabelT; + HardLabelSoftmaxWithCrossEntropyKernel< + T, 
LabelT><<>>( + loss_ptr, logits_ptr, labels->data(), rank, N, D, + class_interval.data()); + } + +#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL) + if (nranks > 1) { + PADDLE_ENFORCE_CUDA_SUCCESS(platform::dynload::ncclAllReduce( + loss_ptr, loss_ptr, loss->numel(), + platform::ToNCCLDataType(loss->type()), ncclSum, comm->comm(), + stream)); + } +#endif + } +}; + +template +class MarginCrossEntropyGradCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + const Tensor* labels = context.Input("Label"); + const Tensor* logits = context.Input("Logits"); + const Tensor* softmax = context.Input("Softmax"); + + const Tensor* loss_grad = + context.Input(framework::GradVarName("Loss")); + Tensor* logit_grad = + context.Output(framework::GradVarName("Logits")); + + const bool return_softmax = context.Attr("return_softmax"); + + const int rid = context.Attr("ring_id"); + const int nranks = context.Attr("nranks"); + const int rank = context.Attr("rank"); + + const float margin1 = context.Attr("margin1"); + const float margin2 = context.Attr("margin2"); + const float margin3 = context.Attr("margin3"); + const float scale = context.Attr("scale"); + + auto& dev_ctx = + context.template device_context(); + + const auto sofrmax_dims = softmax->dims(); + const int axis = sofrmax_dims.size() - 1; + const int N = SizeToAxis(axis, sofrmax_dims); + const int D = SizeFromAxis(axis, sofrmax_dims); + + if (return_softmax) { + framework::TensorCopy(*softmax, context.GetPlace(), + context.device_context(), logit_grad); + } else { + logit_grad->ShareDataWith(*softmax); + } + + int blocks = NumBlocks(N * D); + int threads = kNumCUDAThreads; + const auto& label_type = labels->type(); + + Tensor class_interval; + GetClassInterval(dev_ctx.stream(), context.GetPlace(), + context.cuda_device_context(), rid, rank, nranks, D, + &class_interval); + + if (label_type == framework::proto::VarType::INT32) { + typedef int32_t LabelT; + CalculateGrad<<>>( + logit_grad->data(), loss_grad->data(), logits->data(), + labels->data(), margin1, margin2, scale, rank, N, D, + class_interval.data()); + } else if (label_type == framework::proto::VarType::INT64) { + typedef int64_t LabelT; + CalculateGrad<<>>( + logit_grad->data(), loss_grad->data(), logits->data(), + labels->data(), margin1, margin2, scale, rank, N, D, + class_interval.data()); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL(margin_cross_entropy, + ops::MarginCrossEntropyOpCUDAKernel, + ops::MarginCrossEntropyOpCUDAKernel, + ops::MarginCrossEntropyOpCUDAKernel); + +REGISTER_OP_CUDA_KERNEL(margin_cross_entropy_grad, + ops::MarginCrossEntropyGradCUDAKernel, + ops::MarginCrossEntropyGradCUDAKernel, + ops::MarginCrossEntropyGradCUDAKernel); diff --git a/paddle/fluid/operators/margin_cross_entropy_op.h b/paddle/fluid/operators/margin_cross_entropy_op.h new file mode 100644 index 0000000000000000000000000000000000000000..fe0dab5d47d35a56e1806ecb2c47e9cfc8197cd0 --- /dev/null +++ b/paddle/fluid/operators/margin_cross_entropy_op.h @@ -0,0 +1,41 @@ +/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/softmax.h" +#include "paddle/fluid/operators/softmax_op.h" + +namespace paddle { +namespace operators { + +template +class MarginCrossEntropyOpCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + PADDLE_THROW(platform::errors::Unavailable( + "Do not support margin_cross_entropy for cpu kernel " + "now.")); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 007221ca4f9ca30b93cf3661889d9244a1c8ade4..9d8b5fb699e33aa0220e3bd9e3d0d0de482b5589 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -28,6 +28,7 @@ list(APPEND DIST_TEST_OPS test_parallel_dygraph_pipeline_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_tensor_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_sharding_parallel) list(APPEND DIST_TEST_OPS test_parallel_dygraph_mp_layers) +list(APPEND DIST_TEST_OPS test_parallel_margin_cross_entropy) set(MIXED_DIST_TEST_OPS ${DIST_TEST_OPS}) #remove distribute unittests. list(APPEND MIXED_DIST_TEST_OPS test_dgc_op) @@ -195,6 +196,7 @@ if ((NOT WITH_GPU) AND (NOT WITH_ROCM)) LIST(REMOVE_ITEM TEST_OPS test_mixed_precision) LIST(REMOVE_ITEM TEST_OPS test_fleet_base_single) LIST(REMOVE_ITEM TEST_OPS test_dygraph_recompute) + LIST(REMOVE_ITEM TEST_OPS test_parallel_margin_cross_entropy) elseif(WITH_GPU) if (${CUDNN_VERSION} VERSION_LESS 7100) LIST(REMOVE_ITEM TEST_OPS test_conv2d_fusion_op) @@ -906,6 +908,7 @@ if(WITH_DISTRIBUTE AND WITH_GPU AND WITH_NCCL) set_tests_properties(test_parallel_dygraph_tensor_parallel PROPERTIES TIMEOUT 200) set_tests_properties(test_parallel_dygraph_sharding_parallel PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_mp_layers PROPERTIES TIMEOUT 120) + set_tests_properties(test_parallel_margin_cross_entropy PROPERTIES TIMEOUT 120) if(${NCCL_VERSION} VERSION_GREATER_EQUAL 2212) set_tests_properties(test_parallel_dygraph_sparse_embedding PROPERTIES TIMEOUT 120) set_tests_properties(test_parallel_dygraph_transformer PROPERTIES TIMEOUT 120) diff --git a/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py b/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py new file mode 100644 index 0000000000000000000000000000000000000000..475a26ee94f37264dd9da0d7dbff9134d98d223b --- /dev/null +++ b/python/paddle/fluid/tests/unittests/parallel_margin_cross_entropy.py @@ -0,0 +1,188 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division +from __future__ import print_function + +import unittest + +import paddle +import numpy as np +import random +import paddle.distributed as dist +import paddle.fluid as fluid +import paddle.distributed.fleet as fleet +from paddle import framework + + +def set_random_seed(seed): + """Set random seed for reproducability.""" + random.seed(seed) + np.random.seed(seed) + paddle.seed(seed) + fleet.meta_parallel.model_parallel_random_seed(seed) + + +class TestParallelMarginSoftmaxCrossEntropyOp(unittest.TestCase): + def setUp(self): + strategy = fleet.DistributedStrategy() + fleet.init(is_collective=True, strategy=strategy) + + def test_parallel_margin_softmax_cross_entropy(self): + margin1s = [1.0, 1.0, 1.35] + margin2s = [0.5, 0.0, 0.0] + margin3s = [0.0, 0.35, 0.0] + scales = [64.0, 64.0, 64.0] + + rank_id = dist.get_rank() + num_trainer = dist.get_world_size() + batch_size = 2 + feature_length = 4 + seed = 1025 + set_random_seed(seed) + paddle.seed(rank_id * 10) + random.seed(seed) + np.random.seed(seed) + + check_group = dist.new_group(list(range(num_trainer))) + for dtype in ('float32', 'float64'): + + num_class_per_cards = [[4, 8], [2, 2], [4, 2], [3, 9]] + for num_class_per_card in num_class_per_cards: + + num_class = np.sum(num_class_per_card) + for margin1, margin2, margin3, scale in zip(margin1s, margin2s, + margin3s, scales): + + for _ in range(5): + np_label = np.random.randint(0, num_class, + (batch_size, )) + label = paddle.to_tensor(np_label, dtype="int64") + + input = paddle.randn( + shape=[batch_size, feature_length], dtype=dtype) + input.stop_gradient = False + input_l2 = paddle.sqrt( + paddle.sum( + paddle.square(input), axis=1, keepdim=True)) + norm_input = paddle.divide(input, input_l2) + + weight = paddle.randn( + shape=[ + feature_length, num_class_per_card[rank_id] + ], + dtype=dtype) + weight.stop_gradient = False + weight_l2 = paddle.sqrt( + paddle.sum( + paddle.square(weight), axis=0, keepdim=True)) + norm_weight = paddle.divide(weight, weight_l2) + + data = paddle.matmul(norm_input, norm_weight) + data.stop_gradient = False + + sta = np.sum( + num_class_per_card[:rank_id]) if rank_id > 0 else 0 + end = np.sum(num_class_per_card[:rank_id + 1]) + + integral_data = np.zeros( + (batch_size, num_class), dtype=dtype) + integral_data[:, sta:end] = data.clone().detach().numpy( + ) + integral_data = paddle.to_tensor( + integral_data, dtype=dtype) + + paddle.distributed.all_reduce( + integral_data, + op=paddle.distributed.ReduceOp.SUM, + group=check_group) + integral_data = integral_data.detach().clone() + integral_data.stop_gradient = False + + # add arcface margin to logit + theta = paddle.acos(integral_data) + one_hot_label = paddle.nn.functional.one_hot( + label, num_classes=num_class) + one_hot_label.stop_gradient = False + + if margin1 != 1.0: + theta = margin1 * theta + if margin2 != 0.0: + theta = theta + margin2 + margin_cos = paddle.cos(theta) + if margin3 != 0.0: + margin_cos = margin_cos - margin3 + diff = one_hot_label * (margin_cos - integral_data) + arc_data = (integral_data + diff) * scale + + loss_a, softmax_a = 
paddle.nn.functional.margin_cross_entropy( + data, + label, + margin1=margin1, + margin2=margin2, + margin3=margin3, + scale=scale, + group=check_group, + return_softmax=True, + reduction=None) + loss_b, softmax_b = paddle.nn.functional.softmax_with_cross_entropy( + logits=arc_data, + label=paddle.reshape(label, (-1, 1)), + return_softmax=True) + + np.testing.assert_allclose( + loss_a.numpy(), loss_b.numpy(), rtol=1e-5) + + integral_prob = np.zeros( + (batch_size, num_class), dtype=dtype) + integral_prob[:, sta:end] = softmax_a.clone().detach( + ).numpy() + integral_prob = paddle.to_tensor( + integral_prob, dtype=dtype) + paddle.distributed.all_reduce( + integral_prob, + op=paddle.distributed.ReduceOp.SUM, + group=check_group) + integral_prob = integral_prob.detach().clone() + integral_prob.stop_gradient = False + + np.testing.assert_allclose( + integral_prob.numpy(), + softmax_b.numpy(), + rtol=1e-5, + atol=1e-6) + + loss_a = loss_a.sum() / batch_size + loss_b = loss_b.sum() / batch_size + loss_a.backward() + loss_b.backward() + + integral_grad = np.zeros( + (batch_size, num_class), dtype=dtype) + integral_grad[:, sta:end] = data.grad.clone().detach() + integral_grad = paddle.to_tensor( + integral_grad, dtype=dtype) + paddle.distributed.all_reduce( + integral_grad, + op=paddle.distributed.ReduceOp.SUM, + group=check_group) + + np.testing.assert_allclose( + integral_data.grad.numpy(), + integral_grad.numpy(), + rtol=1e-5) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_margin_cross_entropy_op.py b/python/paddle/fluid/tests/unittests/test_margin_cross_entropy_op.py new file mode 100644 index 0000000000000000000000000000000000000000..85d74f379814cdea463e325cc942b98c629ba635 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_margin_cross_entropy_op.py @@ -0,0 +1,385 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
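+
+# The tests below compare the margin_cross_entropy op against a NumPy reference that
+# applies the combined margin (cos(m1 * theta + m2) - m3, scaled by s) to the target
+# logit and then evaluates a numerically stable log-softmax cross entropy.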
+ +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest +import math +import random +import paddle +import paddle.fluid as fluid +from paddle.fluid import core +from paddle.fluid import Program, program_guard + + +def stable_softmax_comm(x): + shiftx = (x - np.max(x)) + deno = np.log(np.sum(np.exp(shiftx))) + comm = shiftx - deno + return comm + + +def margin_cross_entropy(logits, + label, + axis, + margin1, + margin2, + margin3, + scale, + reduction=None): + one_hot_label = np.zeros_like(logits, dtype=logits.dtype) + for i, lb in enumerate(label): + one_hot_label[i, lb] = 1.0 + + # add arcface margin to logit + theta = np.arccos(logits) + if margin1 != 1.0: + theta = margin1 * theta + if margin2 != 0.0: + theta = theta + margin2 + margin_cos = np.cos(theta) + if margin3 != 0.0: + margin_cos = margin_cos - margin3 + diff = one_hot_label * (margin_cos - logits) + arc_logits = (logits + diff) * scale + + comm = np.apply_along_axis(stable_softmax_comm, axis, arc_logits) + loss = (-one_hot_label * comm).sum(axis=axis, keepdims=True) + softmax = np.exp(comm) + if reduction == 'mean': + loss = np.mean(loss) + elif reduction == 'sum': + loss = np.sum(loss) + return loss, softmax + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOp(OpTest): + def initParams(self): + self.op_type = "margin_cross_entropy" + self.axis = -1 + self.batch_dim = 5 + self.feat_dim = 41 + self.num_class = 37 + + def init_loss_params(self): + self.margin1 = 1.0 + self.margin2 = 0.5 + self.margin3 = 0.0 + self.scale = 2.0 + + def init_dtype(self): + self.dtype = np.float64 + + def setUp(self): + self.initParams() + self.init_loss_params() + self.init_dtype() + + datas = np.random.uniform( + -0.99, 0.99, [self.batch_dim, self.feat_dim]).astype(self.dtype) + datas = datas / np.sqrt(np.sum(np.square(datas), axis=1, keepdims=True)) + weights = np.random.uniform( + -0.99, 0.99, [self.feat_dim, self.num_class]).astype(self.dtype) + weights = weights / np.sqrt( + np.sum(np.square(weights), axis=0, keepdims=True)) + logits = np.matmul(datas, weights) + + labels = np.random.randint( + 0, self.num_class, (self.batch_dim, ), dtype="int64") + + loss, softmax = margin_cross_entropy(logits, labels, self.axis, + self.margin1, self.margin2, + self.margin3, self.scale) + + self.inputs = {"Logits": logits, "Label": labels} + self.outputs = { + "Softmax": softmax.astype(self.dtype), + "Loss": loss.astype(self.dtype) + } + self.attrs = { + 'margin1': self.margin1, + 'margin2': self.margin2, + 'margin3': self.margin3, + 'scale': self.scale, + } + + def test_check_output(self): + self.check_output_with_place(core.CUDAPlace(0), atol=1e-5) + + def test_check_grad(self): + self.check_grad_with_place(core.CUDAPlace(0), ["Logits"], "Loss") + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpFP32(TestMarginCrossEntropyOp): + def init_dtype(self): + self.dtype = np.float32 + + def test_check_grad(self): + self.check_grad_with_place( + core.CUDAPlace(0), ["Logits"], + "Loss", + numeric_grad_delta=5e-2, + max_relative_error=5e-2) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpFP16(TestMarginCrossEntropyOp): + def init_dtype(self): + self.dtype = np.float16 + + def test_check_output(self): + self.check_output_with_place(core.CUDAPlace(0), atol=5e-2) + + def test_check_grad(self): + 
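+        # FP16 numeric gradients are noisy, so both the finite-difference delta and
+        # the relative-error bound are relaxed for this dtype.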
self.check_grad_with_place( + core.CUDAPlace(0), ["Logits"], + "Loss", + numeric_grad_delta=6e-1, + max_relative_error=6e-1) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpCosFace(TestMarginCrossEntropyOp): + def init_loss_params(self): + self.margin1 = 1.0 + self.margin2 = 0.0 + self.margin3 = 0.35 + self.scale = 2.0 + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpSphereFace(TestMarginCrossEntropyOp): + def init_loss_params(self): + self.margin1 = 1.35 + self.margin2 = 0.0 + self.margin3 = 0.0 + self.scale = 2.0 + + +class TestMarginCrossEntropyOpCPU(TestMarginCrossEntropyOp): + def test_check_output(self): + try: + self.check_output_with_place(core.CPUPlace(), atol=1e-5) + except RuntimeError: + pass + + def test_check_grad(self): + try: + self.check_grad_with_place(core.CPUPlace(), ["Logits"], "Loss") + except RuntimeError: + pass + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpV2(unittest.TestCase): + def setUp(self): + self.initParams() + np.random.seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) + self.places = [] + if core.is_compiled_with_cuda(): + self.places.append(paddle.fluid.CUDAPlace(0)) + + def initParams(self): + self.seed = 2021 + self.axis = -1 + self.batch_dim = 5 + self.feat_dim = 41 + self.num_class = 37 + self.init_loss_params() + self.init_dtype() + self.init_reduction() + + def init_loss_params(self): + self.margin1 = 1.0 + self.margin2 = 0.5 + self.margin3 = 0.0 + self.scale = 2.0 + + def init_dtype(self): + self.dtype = np.float64 + + def init_reduction(self): + self.reduction = None + + def test_static(self): + for place in self.places: + self.check_static_result(place=place) + + def check_static_result(self, place): + with program_guard(Program(), Program()): + datas = np.random.uniform( + -0.99, 0.99, [self.batch_dim, self.feat_dim]).astype(self.dtype) + datas = datas / np.sqrt( + np.sum(np.square(datas), axis=1, keepdims=True)) + weights = np.random.uniform( + -0.99, 0.99, [self.feat_dim, self.num_class]).astype(self.dtype) + weights = weights / np.sqrt( + np.sum(np.square(weights), axis=0, keepdims=True)) + + logits_np = np.matmul(datas, weights) + labels_np = np.random.randint( + 0, self.num_class, (self.batch_dim, ), dtype="int64") + + loss_np, softmax_np = margin_cross_entropy( + logits_np, labels_np, self.axis, self.margin1, self.margin2, + self.margin3, self.scale, self.reduction) + + logits = paddle.static.data( + name='logits', + shape=[self.batch_dim, self.num_class], + dtype=self.dtype) + label = paddle.static.data( + name='label', shape=[self.batch_dim], dtype="int64") + loss, softmax = paddle.nn.functional.margin_cross_entropy( + logits, + label, + margin1=self.margin1, + margin2=self.margin2, + margin3=self.margin3, + scale=self.scale, + return_softmax=True, + reduction=self.reduction) + + exe = paddle.fluid.Executor(place) + [loss_res, softmax_res] = exe.run( + paddle.fluid.default_main_program(), + feed={'logits': logits_np, + 'label': labels_np}, + fetch_list=[loss, softmax]) + np.testing.assert_allclose(loss_res, loss_np) + np.testing.assert_allclose(softmax_res, softmax_np) + + def test_dynamic(self): + for place in self.places: + self.check_dynamic_result(place=place) + + def check_dynamic_result(self, place): + with paddle.fluid.dygraph.guard(place): + datas = np.random.uniform( + -0.99, 0.99, 
[self.batch_dim, self.feat_dim]).astype(self.dtype) + datas = datas / np.sqrt( + np.sum(np.square(datas), axis=1, keepdims=True)) + weights = np.random.uniform( + -0.99, 0.99, [self.feat_dim, self.num_class]).astype(self.dtype) + weights = weights / np.sqrt( + np.sum(np.square(weights), axis=0, keepdims=True)) + + logits_np = np.matmul(datas, weights) + labels_np = np.random.randint( + 0, self.num_class, (self.batch_dim, ), dtype="int64") + + loss_np, softmax_np = margin_cross_entropy( + logits_np, labels_np, self.axis, self.margin1, self.margin2, + self.margin3, self.scale, self.reduction) + + logits = paddle.to_tensor(logits_np, dtype=self.dtype) + labels = paddle.to_tensor(labels_np, dtype="int64") + + loss, softmax = paddle.nn.functional.margin_cross_entropy( + logits, + labels, + margin1=self.margin1, + margin2=self.margin2, + margin3=self.margin3, + scale=self.scale, + return_softmax=True, + reduction=self.reduction) + + loss_res = loss.numpy() + softmax_res = softmax.numpy() + np.testing.assert_allclose(loss_res, loss_np) + np.testing.assert_allclose(softmax_res, softmax_np) + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpV3(TestMarginCrossEntropyOpV2): + def init_reduction(self): + self.reduction = 'mean' + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpV4(TestMarginCrossEntropyOpV2): + def init_reduction(self): + self.reduction = 'sum' + + +@unittest.skipIf(not core.is_compiled_with_cuda(), + "core is not compiled with CUDA") +class TestMarginCrossEntropyOpAPIError(unittest.TestCase): + def setUp(self): + self.initParams() + np.random.seed(self.seed) + paddle.framework.random._manual_program_seed(self.seed) + self.places = [] + if core.is_compiled_with_cuda(): + self.places.append(paddle.fluid.CUDAPlace(0)) + + def initParams(self): + self.seed = 2021 + self.axis = -1 + self.batch_dim = 10 + self.feat_dim = 41 + self.num_class = 37 + self.init_loss_params() + self.init_dtype() + + def init_loss_params(self): + self.margin1 = 1.0 + self.margin2 = 0.5 + self.margin3 = 0.0 + self.scale = 2.0 + + def init_dtype(self): + self.dtype = np.float64 + + def test_dynamic_errors(self): + def test_dim(): + for place in self.places: + with paddle.fluid.dygraph.guard(place): + labels_np = np.random.randint( + 0, self.num_class, (self.batch_dim, 2), dtype="int64") + logits_np = np.random.uniform( + -0.99, 0.99, + [self.batch_dim, self.num_class]).astype(self.dtype) + labels = paddle.to_tensor(labels_np) + logits = paddle.to_tensor(logits_np) + + loss, softmax = paddle.nn.functional.margin_cross_entropy( + logits, + labels, + margin1=self.margin1, + margin2=self.margin2, + margin3=self.margin3, + scale=self.scale, + return_softmax=True, + reduction=None) + + self.assertRaises(ValueError, test_dim) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_margin_cross_entropy.py b/python/paddle/fluid/tests/unittests/test_parallel_margin_cross_entropy.py new file mode 100644 index 0000000000000000000000000000000000000000..1b24889830ad873997fb803fb9baf574c657e2a9 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_parallel_margin_cross_entropy.py @@ -0,0 +1,29 @@ +# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid as fluid + +from test_parallel_dygraph_dataparallel import TestMultipleGpus + + +class TestParallelMarginSoftmaxWithCrossEntropy(TestMultipleGpus): + def test_parallel_margin_cross_entropy(self): + self.run_mnist_2gpu('parallel_margin_cross_entropy.py') + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/nn/functional/__init__.py b/python/paddle/nn/functional/__init__.py index f3d9f9dde11a4e48bbabbdb03336021ebc3863ad..04e0b7c140d7fa2fa583ec871ca0b7f7ae19329d 100644 --- a/python/paddle/nn/functional/__init__.py +++ b/python/paddle/nn/functional/__init__.py @@ -79,6 +79,7 @@ from .loss import npair_loss # noqa: F401 from .loss import sigmoid_focal_loss # noqa: F401 from .loss import smooth_l1_loss # noqa: F401 from .loss import softmax_with_cross_entropy # noqa: F401 +from .loss import margin_cross_entropy # noqa: F401 from .loss import square_error_cost # noqa: F401 from .loss import ctc_loss # noqa: F401 from .norm import batch_norm # noqa: F401 @@ -185,6 +186,7 @@ __all__ = [ #noqa 'sigmoid_focal_loss', 'smooth_l1_loss', 'softmax_with_cross_entropy', + 'margin_cross_entropy', 'square_error_cost', 'ctc_loss', 'affine_grid', diff --git a/python/paddle/nn/functional/loss.py b/python/paddle/nn/functional/loss.py index ef2bfb3b8e0d3aeabf296cc8379e49d92c6dd6e3..d7b781c84767f216e86861f0363f77b5da2be1a3 100755 --- a/python/paddle/nn/functional/loss.py +++ b/python/paddle/nn/functional/loss.py @@ -1092,6 +1092,268 @@ def ctc_loss(log_probs, return loss_out +def margin_cross_entropy(logits, + label, + margin1=1.0, + margin2=0.5, + margin3=0.0, + scale=64.0, + group=None, + return_softmax=False, + reduction='mean'): + """ + .. math:: + + L=-\\frac{1}{N}\sum^N_{i=1}\log\\frac{e^{s(cos(m_{1}\\theta_{y_i}+m_{2})-m_{3})}}{e^{s(cos(m_{1}\\theta_{y_i}+m_{2})-m_{3})}+\sum^n_{j=1,j\\neq y_i} e^{scos\\theta_{y_i}}} + + where the :math:`\\theta_{y_i}` is the angle between the feature :math:`x` and + the representation of class :math:`i`. The details of ArcFace loss + could be referred to https://arxiv.org/abs/1801.07698. + + .. hint:: + The API supports model parallel and single GPU. And logits.shape[-1] can be different at each rank. + + Args: + logits (Tensor): shape[N, local_num_classes], the output of the normalized X multiply the normalized W. + The logits is shard_logits when using model parallel. + label (Tensor): shape[N] or shape[N, 1], the groud truth label. + margin1 (float, optional): m1 of margin loss, default value is `1.0`. + margin2 (float, optional): m2 of margin loss, default value is `0.5`. + margin3 (float, optional): m3 of margin loss, default value is `0.0`. + scale (float, optional): s of margin loss, default value is `64.0`. + group (Group, optional): The abstract representation of group, see paddle.distributed.collective.Group. + Default `None`. + return_softmax (bool, optional): Whether return softmax probability. Default value is `False`. + reduction (str, optional): The candicates are ``'none'`` | ``'mean'`` | ``'sum'``. 
+ If :attr:`reduction` is ``'mean'``, return the average of loss; + If :attr:`reduction` is ``'sum'``, return the sum of loss; + If :attr:`reduction` is ``'none'``, no reduction will be applied. + Default value is `'mean'`. + + Returns: + ``Tensor`` or Tuple of two ``Tensor`` : Return the cross entropy loss if \ + `return_softmax` is False, otherwise the tuple \ + (loss, softmax), softmax is shard_softmax when \ + using model parallel, otherwise softmax is in \ + the same shape with input logits. If ``reduction == None``, \ + the shape of loss is ``[N, 1]``, otherwise the shape is ``[1]``. + + Examples: + + .. code-block:: python + + # required: gpu + # Single GPU + import paddle + m1 = 1.0 + m2 = 0.5 + m3 = 0.0 + s = 64.0 + batch_size = 2 + feature_length = 4 + num_classes = 4 + + label = paddle.randint(low=0, high=num_classes, shape=[batch_size], dtype='int64') + + X = paddle.randn( + shape=[batch_size, feature_length], + dtype='float64') + X_l2 = paddle.sqrt(paddle.sum(paddle.square(X), axis=1, keepdim=True)) + X = paddle.divide(X, X_l2) + + W = paddle.randn( + shape=[feature_length, num_classes], + dtype='float64') + W_l2 = paddle.sqrt(paddle.sum(paddle.square(W), axis=0, keepdim=True)) + W = paddle.divide(W, W_l2) + + logits = paddle.matmul(X, W) + loss, softmax = paddle.nn.functional.margin_cross_entropy( + logits, label, margin1=m1, margin2=m2, margin3=m3, scale=s, return_softmax=True, reduction=None) + + print(logits) + print(label) + print(loss) + print(softmax) + + #Tensor(shape=[2, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[ 0.85204151, -0.55557678, 0.04994566, 0.71986042], + # [-0.20198586, -0.35270476, -0.55182702, 0.09749021]]) + #Tensor(shape=[2], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [2, 3]) + #Tensor(shape=[2, 1], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[82.37059586], + # [12.13448420]]) + #Tensor(shape=[2, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[0.99978819, 0.00000000, 0.00000000, 0.00021181], + # [0.99992995, 0.00006468, 0.00000000, 0.00000537]]) + + .. 
code-block:: python + + # required: distributed + # Multi GPU, test_margin_cross_entropy.py + import paddle + import paddle.distributed as dist + strategy = dist.fleet.DistributedStrategy() + dist.fleet.init(is_collective=True, strategy=strategy) + rank_id = dist.get_rank() + m1 = 1.0 + m2 = 0.5 + m3 = 0.0 + s = 64.0 + batch_size = 2 + feature_length = 4 + num_class_per_card = [4, 8] + num_classes = paddle.sum(paddle.to_tensor(num_class_per_card)) + + label = paddle.randint(low=0, high=num_classes.item(), shape=[batch_size], dtype='int64') + label_list = [] + dist.all_gather(label_list, label) + label = paddle.concat(label_list, axis=0) + + X = paddle.randn( + shape=[batch_size, feature_length], + dtype='float64') + X_list = [] + dist.all_gather(X_list, X) + X = paddle.concat(X_list, axis=0) + X_l2 = paddle.sqrt(paddle.sum(paddle.square(X), axis=1, keepdim=True)) + X = paddle.divide(X, X_l2) + + W = paddle.randn( + shape=[feature_length, num_class_per_card[rank_id]], + dtype='float64') + W_l2 = paddle.sqrt(paddle.sum(paddle.square(W), axis=0, keepdim=True)) + W = paddle.divide(W, W_l2) + + logits = paddle.matmul(X, W) + loss, softmax = paddle.nn.functional.margin_cross_entropy( + logits, label, margin1=m1, margin2=m2, margin3=m3, scale=s, return_softmax=True, reduction=None) + + print(logits) + print(label) + print(loss) + print(softmax) + + # python -m paddle.distributed.launch --gpus=0,1 test_margin_cross_entropy.py + ## for rank0 input + #Tensor(shape=[4, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[ 0.32888934, 0.02408748, -0.02763289, 0.18173063], + # [-0.52893978, -0.10623845, -0.21596515, -0.06432517], + # [-0.00536345, -0.03924667, 0.66735314, -0.28640926], + # [-0.09907366, -0.48534973, -0.10365338, -0.39472322]]) + #Tensor(shape=[4], dtype=int64, place=CUDAPlace(0), stop_gradient=True, + # [11, 1 , 10, 11]) + + ## for rank1 input + #Tensor(shape=[4, 8], dtype=float64, place=CUDAPlace(1), stop_gradient=True, + # [[ 0.68654754, 0.28137170, 0.69694954, -0.60923933, -0.57077653, 0.54576703, -0.38709028, 0.56028204], + # [-0.80360371, -0.03042448, -0.45107338, 0.49559349, 0.69998950, -0.45411693, 0.61927630, -0.82808600], + # [ 0.11457570, -0.34785879, -0.68819499, -0.26189226, -0.48241491, -0.67685711, 0.06510185, 0.49660849], + # [ 0.31604851, 0.52087884, 0.53124749, -0.86176582, -0.43426329, 0.34786144, -0.10850784, 0.51566383]]) + #Tensor(shape=[4], dtype=int64, place=CUDAPlace(1), stop_gradient=True, + # [11, 1 , 10, 11]) + + ## for rank0 output + #Tensor(shape=[4, 1], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[38.96608230], + # [81.28152394], + # [69.67229865], + # [31.74197251]]) + #Tensor(shape=[4, 4], dtype=float64, place=CUDAPlace(0), stop_gradient=True, + # [[0.00000000, 0.00000000, 0.00000000, 0.00000000], + # [0.00000000, 0.00000000, 0.00000000, 0.00000000], + # [0.00000000, 0.00000000, 0.99998205, 0.00000000], + # [0.00000000, 0.00000000, 0.00000000, 0.00000000]]) + ## for rank1 output + #Tensor(shape=[4, 1], dtype=float64, place=CUDAPlace(1), stop_gradient=True, + # [[38.96608230], + # [81.28152394], + # [69.67229865], + # [31.74197251]]) + #Tensor(shape=[4, 8], dtype=float64, place=CUDAPlace(1), stop_gradient=True, + # [[0.33943993, 0.00000000, 0.66051859, 0.00000000, 0.00000000, 0.00004148, 0.00000000, 0.00000000], + # [0.00000000, 0.00000000, 0.00000000, 0.00000207, 0.99432097, 0.00000000, 0.00567696, 0.00000000], + # [0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00000000, 0.00001795], + # 
[0.00000069, 0.33993085, 0.66006319, 0.00000000, 0.00000000, 0.00000528, 0.00000000, 0.00000000]]) + """ + + assert reduction in ['mean', 'sum', 'none', None] + if group is not None and not group.is_member(): + return + + ring_id = 0 if group is None else group.id + rank = 0 + nranks = 1 + if core.is_compiled_with_dist(): + parallel_env = paddle.distributed.ParallelEnv() + global_rank = parallel_env.rank + rank = global_rank if group is None else group.get_group_rank( + global_rank) + nranks = parallel_env.world_size if group is None else group.nranks + + input_dims = len(list(logits.shape)) + label_dims = len(list(label.shape)) + if input_dims - 1 != label_dims and input_dims != label_dims: + raise ValueError( + 'Expected nput_dims - 1 = label_dims or input_dims == label_dims\ + (got nput_dims{}, label_dims{})'.format(input_dims, label_dims)) + if input_dims - 1 == label_dims: + label = paddle.unsqueeze(label, axis=-1) + + if in_dygraph_mode(): + softmax, loss = core.ops.margin_cross_entropy( + logits, label, 'ring_id', ring_id, 'rank', rank, 'nranks', nranks, + 'margin1', margin1, 'margin2', margin2, 'margin3', margin3, 'scale', + scale, 'return_softmax', return_softmax) + if reduction == 'mean': + loss = paddle.mean(loss) + elif reduction == 'sum': + loss = paddle.sum(loss) + if not return_softmax: + return loss + else: + return loss, softmax + + op_type = 'margin_cross_entropy' + helper = LayerHelper(op_type, **locals()) + softmax = helper.create_variable_for_type_inference(dtype=logits.dtype) + loss = helper.create_variable_for_type_inference(dtype=logits.dtype) + + check_variable_and_dtype(logits, 'logits', + ['float16', 'float32', 'float64'], + 'margin_cross_entropy') + check_variable_and_dtype(label, 'label', ['int32', 'int64'], + 'margin_cross_entropy') + + helper.append_op( + type=op_type, + inputs={'Logits': logits, + 'Label': label}, + outputs={'Softmax': softmax, + 'Loss': loss}, + attrs={ + 'return_softmax': return_softmax, + 'ring_id': ring_id, + 'rank': rank, + 'nranks': nranks, + 'margin1': margin1, + 'margin2': margin2, + 'margin3': margin3, + 'scale': scale, + }) + + if reduction == 'mean': + loss = paddle.mean(loss) + elif reduction == 'sum': + loss = paddle.sum(loss) + + if not return_softmax: + return loss + else: + return loss, softmax + + @deprecated( since="2.0.0", update_to="paddle.nn.functional.cross_entropy", diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 616d5ae280ad1acdaa3e2812981d27bbac8f2ab0..d2f95c235b04c1b7573b9deda6400e5e24cdca93 100644 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -719,4 +719,5 @@ STATIC_MODE_TESTING_LIST = [ 'test_sgd_op_bf16', 'test_marker_op', 'test_c_embedding_op', + 'test_margin_cross_entropy_op', ]
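Not part of the patch: a minimal NumPy sketch (illustrative names, single process, no NCCL) of the class-interval bookkeeping that GetClassInterval and AddMarginToPositiveLogitsKernel implement. Each rank owns the global class columns [interval[rank], interval[rank + 1]) and applies the combined margin only to samples whose target class falls in its own shard.

import numpy as np

def class_interval(shard_dims):
    # shard_dims[r] = number of classes held by rank r; the op builds the same
    # table on the GPU with ncclAllReduce + cub::DeviceScan::InclusiveSum.
    return np.concatenate(([0], np.cumsum(shard_dims)))

def apply_margin_local(local_logits, labels, rank, interval,
                       m1=1.0, m2=0.5, m3=0.0, s=64.0):
    # Mirror of AddMarginToPositiveLogitsKernel + ScaleLogitKernel: the margin
    # cos(m1*theta + m2) - m3 touches only the positive column, and only on the
    # rank whose shard contains the label; all logits are then scaled by s.
    start, end = interval[rank], interval[rank + 1]
    out = local_logits.copy()
    for i, y in enumerate(labels):
        if start <= y < end:                      # this rank owns the target class
            theta = np.arccos(out[i, y - start])  # logits are cosine similarities
            out[i, y - start] = np.cos(m1 * theta + m2) - m3
    return s * out

interval = class_interval([4, 8])   # two ranks holding 4 and 8 classes -> [0, 4, 12]
logits = np.random.uniform(-0.99, 0.99, size=(2, 8))   # rank-1 shard of 8 columns
print(apply_margin_local(logits, labels=[5, 1], rank=1, interval=interval))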