Unverified commit a71d8fdb authored by Kaipeng Deng and committed by GitHub

Softmax_cross_entropy op add axis (#16806)

* add attr axis infershape. test=develop

* add CUDA kernel. test=develop

* fix unittest. test=develop

* fix unittest for soft_label. test=develop

* fix fp16 unittest. test=develop

* remove comment code. test=develop

* refine test for axis. test=develop

* add python api. test=develop

* fix doc. test=develop

* fix fp16 unittest. test=develop

* fix ngraph test. test=develop

* fix ENFORCE for test_imperative_transformer. test=develop

* fit for ngraph test. test=develop

* fix after rebase develop. test=develop

* fix doc. test=develop

* fix API.spec. test=develop

* fix test_layers. test=develop

* fix format. test=develop
Parent c2e20e2a
......@@ -133,7 +133,7 @@ paddle.fluid.layers.multiplex (ArgSpec(args=['inputs', 'index'], varargs=None, k
paddle.fluid.layers.layer_norm (ArgSpec(args=['input', 'scale', 'shift', 'begin_norm_axis', 'epsilon', 'param_attr', 'bias_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(True, True, 1, 1e-05, None, None, None, None)), ('document', 'de6a906950bae9f3c245cb744d22b94e'))
paddle.fluid.layers.group_norm (ArgSpec(args=['input', 'groups', 'epsilon', 'param_attr', 'bias_attr', 'act', 'data_layout', 'name'], varargs=None, keywords=None, defaults=(1e-05, None, None, None, 'NCHW', None)), ('document', '419c3a24a83cc89219a029cf4092788b'))
paddle.fluid.layers.spectral_norm (ArgSpec(args=['weight', 'dim', 'power_iters', 'eps', 'name'], varargs=None, keywords=None, defaults=(0, 1, 1e-12, None)), ('document', '3f536aafba30d793287b52d231baff1b'))
paddle.fluid.layers.softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax'], varargs=None, keywords=None, defaults=(False, -100, True, False)), ('document', 'bce1b75e3d95b75cacd1099655cbb3c3'))
paddle.fluid.layers.softmax_with_cross_entropy (ArgSpec(args=['logits', 'label', 'soft_label', 'ignore_index', 'numeric_stable_mode', 'return_softmax', 'axis'], varargs=None, keywords=None, defaults=(False, -100, True, False, -1)), ('document', '8b074f9c56b4233a2b65d03254eb309e'))
paddle.fluid.layers.smooth_l1 (ArgSpec(args=['x', 'y', 'inside_weight', 'outside_weight', 'sigma'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'c6b175d253c55baf4b9c0eca9b1dda88'))
paddle.fluid.layers.one_hot (ArgSpec(args=['input', 'depth'], varargs=None, keywords=None, defaults=None), ('document', '960fc799549c202da1e85d626cb2c962'))
paddle.fluid.layers.autoincreased_step_counter (ArgSpec(args=['counter_name', 'begin', 'step'], varargs=None, keywords=None, defaults=(None, 1, 1)), ('document', '67afefa80b6cc38801bd5b631fed8a4a'))
......
......@@ -39,9 +39,10 @@ class CrossEntropyOpKernel : public framework::OpKernel<T> {
Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1);
Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1);
int axis_dim = x->dims()[rank - 1];
math::CrossEntropyFunctor<DeviceContext, T>()(
ctx.template device_context<DeviceContext>(), &y_2d, &x_2d, &labels_2d,
ctx.Attr<bool>("soft_label"), ctx.Attr<int>("ignore_index"));
ctx.Attr<bool>("soft_label"), ctx.Attr<int>("ignore_index"), axis_dim);
}
};
......
......@@ -29,8 +29,13 @@ class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
void operator()(const platform::CPUDeviceContext& ctx, framework::Tensor* out,
const framework::Tensor* prob,
const framework::Tensor* labels, const bool softLabel,
const int ignore_index) {
const int ignore_index, const int axis_dim) {
const int batch_size = prob->dims()[0];
const int num_classes = prob->dims()[1];
const int num_remain = num_classes / axis_dim;
Eigen::DSizes<int, 3> batch_axis_remain(batch_size, axis_dim, num_remain);
if (softLabel) {
auto in = EigenMatrix<T>::From(*prob);
auto lbl = EigenMatrix<T>::From(*labels);
......@@ -38,24 +43,24 @@ class CrossEntropyFunctor<platform::CPUDeviceContext, T> {
loss.device(*ctx.eigen_device()) =
-((lbl * in.log().unaryExpr(math::TolerableValue<T>()))
.sum(Eigen::DSizes<int, 1>(1))
.reshape(Eigen::DSizes<int, 2>(batch_size, 1)));
.reshape(batch_axis_remain)
.sum(Eigen::DSizes<int, 1>(1)));
} else {
const int class_num = prob->dims()[1];
const T* prob_data = prob->data<T>();
T* loss_data = out->data<T>();
const int64_t* label_data = labels->data<int64_t>();
for (int i = 0; i < batch_size; ++i) {
int lbl = label_data[i];
PADDLE_ENFORCE_GE(lbl, 0);
PADDLE_ENFORCE_LT(lbl, class_num);
PADDLE_ENFORCE((lbl >= 0 && lbl < class_num) || lbl == ignore_index);
int index = i * class_num + lbl;
loss_data[i] =
lbl == ignore_index
? 0
: -math::TolerableValue<T>()(std::log(prob_data[index]));
for (int j = 0; j < num_remain; j++) {
int lbl = label_data[i * num_remain + j];
PADDLE_ENFORCE((lbl >= 0 && lbl < axis_dim) || lbl == ignore_index);
int index = i * num_classes + lbl * num_remain + j;
int loss_idx = i * num_remain + j;
loss_data[loss_idx] =
lbl == ignore_index
? 0
: -math::TolerableValue<T>()(std::log(prob_data[index]));
}
}
}
}
......
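For readers, the hard-label branch above indexes `prob` as a flattened `[batch_size, axis_dim, remain]` view. A minimal numpy sketch of the same loop (the helper name is ours, not part of the patch):

```python
import numpy as np

def hard_label_cross_entropy(prob, label, axis_dim, ignore_index=-100):
    # prob: [batch_size, num_classes] with num_classes = axis_dim * remain,
    # viewed as [batch_size, axis_dim, remain]; label: flat [batch_size * remain]
    batch_size, num_classes = prob.shape
    remain = num_classes // axis_dim
    prob3 = prob.reshape(batch_size, axis_dim, remain)
    loss = np.zeros((batch_size, remain), dtype=prob.dtype)
    for i in range(batch_size):
        for j in range(remain):
            lbl = label[i * remain + j]
            if lbl != ignore_index:
                # prob3[i, lbl, j] is prob_data[i * num_classes + lbl * remain + j]
                loss[i, j] = -np.log(prob3[i, lbl, j])
    return loss
```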
......@@ -57,8 +57,8 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
public:
void operator()(const platform::CUDADeviceContext& ctx,
framework::Tensor* out, const framework::Tensor* prob,
const framework::Tensor* labels, bool softLabel,
const int ignore_index) {
const framework::Tensor* labels, const bool softLabel,
const int ignore_index, const int axis_dim) {
const T* prob_data = prob->data<T>();
T* loss_data = out->mutable_data<T>(ctx.GetPlace());
......
......@@ -60,7 +60,7 @@ class CrossEntropyFunctor {
void operator()(const DeviceContext& context, framework::Tensor* out,
const framework::Tensor* prob,
const framework::Tensor* labels, const bool softLabel,
const int ignore_index);
const int ignore_index, const int axis_dim);
};
} // namespace math
} // namespace operators
......
......@@ -26,23 +26,28 @@ class SoftmaxWithCrossEntropyOpMaker
public:
void Make() override {
AddInput("Logits",
"(Tensor, default: Tensor<float>), The unscaled log probabilities "
"which is a 2-D tensor with shape [N x K]. N is the batch_size, "
"and K is the class number.");
AddInput("Label",
"(Tensor) The ground truth which is a 2-D tensor. If soft_label "
"is set to false, Label is a Tensor<int64> with shape [N x 1]. If "
"soft_label is set to true, Label is a Tensor<float/double> with "
"shape [N x K].");
"(Tensor, default: Tensor<float>), The input tensor of unscaled "
"log probabilities, whose dimension :attr:`axis` should be scaled "
"by softmax.");
AddInput(
"Label",
"(Tensor) The input tesnor of groud truth label. If :attr:`soft_label` "
"is set to false, Label is a Tensor<int64> in same shape with "
"Input(Logits) except the shape in dimension :attr:`axis` as 1. If "
"soft_label is set to true, Label is a Tensor<float/double> in same "
"shape with Input(Logits).");
AddOutput(
"Softmax",
"(Tensor, default: Tensor<float>), A 2-D tensor with shape [N x K]. "
"(Tensor, default: Tensor<float>), A tensor in same shape with "
"Input(Logits). "
"The outputs value of softmax activation by given the input batch, "
"which will be used in backward calculation.")
.AsIntermediate();
AddOutput("Loss",
"(Tensor, default: Tensor<float>), A 2-D tensor. The cross "
"entropy loss with shape [N x 1].");
"(Tensor, default: Tensor<float>), A tensor in same shape with "
"Input(Logits) "
"except the shape in dimension :attr:`axis` as 1. The cross "
"entropy loss.");
AddAttr<bool>(
"soft_label",
"(bool, default: false), A flag to indicate whether to interpretate "
......@@ -60,6 +65,10 @@ class SoftmaxWithCrossEntropyOpMaker
"does not contribute to the input gradient. Only valid if soft_label"
"is set to False")
.SetDefault(-100);
AddAttr<int>("axis",
"The dimension index of Input(Logits) to perform softmax,"
"default -1 for last dimension")
.SetDefault(-1);
AddComment(R"DOC(
Softmax With Cross Entropy Operator.
......@@ -107,38 +116,53 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
"Output(Softmax) should be not null.");
PADDLE_ENFORCE(ctx->HasOutput("Loss"), "Output(Loss) should be not null.");
auto axis = ctx->Attrs().Get<int>("axis");
auto logits_dims = ctx->GetInputDim("Logits");
auto labels_dims = ctx->GetInputDim("Label");
auto logits_rank = logits_dims.size();
PADDLE_ENFORCE(axis >= -logits_rank && axis < logits_rank,
"Attr(axis) value should be in range [-R, R-1], "
"R is the rank of Input(Logits).");
axis = CanonicalAxis(axis, logits_rank);
for (int i = 0; i < logits_rank; i++) {
if (i != axis) {
if (ctx->IsRuntime() || (logits_dims[i] > 0 && labels_dims[i] > 0)) {
PADDLE_ENFORCE_EQ(
logits_dims[i], labels_dims[i],
"Input(Logits) and Input(Label) should in same shape in "
"dimensions except axis.");
}
}
}
int rank = logits_dims.size();
PADDLE_ENFORCE_EQ(
rank, labels_dims.size(),
"Input(logits) and Input(Label) shall have the same rank.");
bool check = ctx->IsRuntime() || (framework::product(logits_dims) > 0 &&
framework::product(labels_dims) > 0);
if (check) {
PADDLE_ENFORCE_EQ(framework::slice_ddim(logits_dims, 0, rank - 1),
framework::slice_ddim(labels_dims, 0, rank - 1),
"Input(X) and Input(Label) shall have the same shape "
"except the last dimension.");
auto numeric_stable_mode = ctx->Attrs().Get<bool>("numeric_stable_mode");
if (axis != logits_rank - 1) {
PADDLE_ENFORCE(
numeric_stable_mode,
"Attr(axis) can only be -1 when not in numeric_stable_mode.");
}
if (ctx->Attrs().Get<bool>("soft_label")) {
if (check) {
PADDLE_ENFORCE_EQ(logits_dims[rank - 1], labels_dims[rank - 1],
"If Attr(soft_label) == true, the last dimension of "
bool soft_label = ctx->Attrs().Get<bool>("soft_label");
if (soft_label) {
if (ctx->IsRuntime() ||
(logits_dims[axis] > 0 && labels_dims[axis] > 0)) {
PADDLE_ENFORCE_EQ(logits_dims[axis], labels_dims[axis],
"If Attr(soft_label) == true, the axis dimension of "
"Input(X) and Input(Label) should be equal.");
}
} else {
PADDLE_ENFORCE_EQ(labels_dims[rank - 1], 1UL,
"If Attr(softLabel) == false, the last dimension of "
"Input(Label) should be 1.");
if (ctx->IsRuntime() || labels_dims[axis] > 0) {
PADDLE_ENFORCE_EQ(labels_dims[axis], 1UL,
"If Attr(soft_label) == false, the axis dimension of "
"Input(Label) should be 1.");
}
}
ctx->SetOutputDim("Softmax", logits_dims);
auto loss_dims = logits_dims;
loss_dims[rank - 1] = 1;
ctx->SetOutputDim("Loss", loss_dims);
logits_dims[axis] = 1;
ctx->SetOutputDim("Loss", logits_dims);
ctx->ShareLoD("Logits", /*->*/ "Softmax");
ctx->ShareLoD("Logits", /*->*/ "Loss");
......@@ -165,36 +189,40 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Logits")),
"Output(Logits@Grad) should be not null.");
auto axis = ctx->Attrs().Get<int>("axis");
auto softmax_dims = ctx->GetInputDim("Softmax");
auto labels_dims = ctx->GetInputDim("Label");
int rank = softmax_dims.size();
PADDLE_ENFORCE_EQ(
rank, labels_dims.size(),
"Input(logits) and Input(Label) shall have the same rank.");
bool check = true;
if ((!ctx->IsRuntime()) && (framework::product(softmax_dims) <= 0 ||
framework::product(labels_dims) <= 0)) {
check = false;
}
if (check) {
PADDLE_ENFORCE_EQ(
framework::slice_ddim(softmax_dims, 0, rank - 1),
framework::slice_ddim(labels_dims, 0, rank - 1),
"Input(Softmax) and Input(Label) shall have the same shape "
"except the last dimension.");
auto softmax_rank = softmax_dims.size();
PADDLE_ENFORCE(axis >= -softmax_rank && axis < softmax_rank,
"Attr(axis) value should be in range [-R, R-1], "
"R is the rank of Input(Logits).");
axis = CanonicalAxis(axis, softmax_rank);
for (int i = 0; i < softmax_rank; i++) {
if (i != axis) {
if (ctx->IsRuntime() || (softmax_dims[i] > 0 && labels_dims[i] > 0)) {
PADDLE_ENFORCE_EQ(
softmax_dims[i], labels_dims[i],
"Input(Logits) and Input(Label) should in same shape in "
"dimensions except axis.");
}
}
}
if (ctx->Attrs().Get<bool>("soft_label")) {
if (check) {
PADDLE_ENFORCE_EQ(softmax_dims[rank - 1], labels_dims[rank - 1],
"If Attr(soft_label) == true, the last dimension of "
"Input( Softmax) and Input(Label) should be equal.");
bool soft_label = ctx->Attrs().Get<bool>("soft_label");
if (soft_label) {
if (ctx->IsRuntime() ||
(softmax_dims[axis] > 0 && labels_dims[axis] > 0)) {
PADDLE_ENFORCE_EQ(softmax_dims[axis], labels_dims[axis],
"If Attr(soft_label) == true, the axis dimension of "
"Input(X) and Input(Label) should be equal.");
}
} else {
PADDLE_ENFORCE_EQ(labels_dims[rank - 1], 1UL,
"If Attr(softLabel) == false, the last dimension of "
"Input(Label) should be 1.");
if (ctx->IsRuntime() || labels_dims[axis] > 0) {
PADDLE_ENFORCE_EQ(labels_dims[axis], 1UL,
"If Attr(soft_label) == false, the axis dimension of "
"Input(Label) should be 1.");
}
}
ctx->SetOutputDim(framework::GradVarName("Logits"),
......
......@@ -21,11 +21,13 @@ using Tensor = framework::Tensor;
namespace {
template <typename T>
__global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels,
const int batch_size, const int class_num,
const int n, const int d, const int remain,
const int ignore_index) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < batch_size;
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n * remain;
i += blockDim.x * gridDim.x) {
int idx = i * class_num + labels[i];
int idx_n = i / remain;
int idx_remain = i % remain;
int idx = idx_n * d + labels[i] * remain + idx_remain;
logit_grad[idx] -=
ignore_index == labels[i] ? static_cast<T>(0.) : static_cast<T>(1.);
}
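The flat index computed in `CrossEntropyGrad` is just 3-D indexing into a `[n, axis_dim, remain]` view of `logit_grad`; a quick numpy check of that equivalence (illustrative values):

```python
import numpy as np

n, axis_dim, remain = 2, 3, 4
d = axis_dim * remain
logit_grad = np.random.rand(n * d)                  # flat, as the kernel sees it
labels = np.random.randint(0, axis_dim, n * remain)
view = logit_grad.reshape(n, axis_dim, remain)      # shares memory with logit_grad

for i in range(n * remain):                         # one thread per (n, remain) pair
    idx_n, idx_remain = i // remain, i % remain
    idx = idx_n * d + labels[i] * remain + idx_remain  # the kernel's index
    assert logit_grad[idx] == view[idx_n, labels[i], idx_remain]
```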
......@@ -33,23 +35,26 @@ __global__ void CrossEntropyGrad(T* logit_grad, const int64_t* labels,
template <typename T>
__global__ void Scale(T* logit_grad, const T* loss_grad, const int num,
const int class_num) {
const int d, const int remain) {
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < num;
i += blockDim.x * gridDim.x) {
logit_grad[i] *= loss_grad[i / class_num];
int idx_n = i / d;
int idx_remain = i % remain;
logit_grad[i] *= loss_grad[idx_n * remain + idx_remain];
}
}
template <typename T>
__global__ void SoftCrossEntropyGradientKernel(T* logit_grad,
const T* loss_grad,
const T* labels,
const int batch_size,
const int class_num) {
const T* labels, const int n,
const int d, const int remain) {
int ids = blockIdx.x * blockDim.x + threadIdx.x;
if (ids < batch_size * class_num) {
int row_ids = ids / class_num;
logit_grad[ids] = loss_grad[row_ids] * (logit_grad[ids] - labels[ids]);
if (ids < n * d) {
int idx_n = ids / d;
int idx_remain = ids % remain;
int idx_loss = idx_n * remain + idx_remain;
logit_grad[ids] = loss_grad[idx_loss] * (logit_grad[ids] - labels[ids]);
}
}
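In numpy terms, `SoftCrossEntropyGradientKernel` computes `loss_grad`, broadcast over the softmax axis, times `(softmax - labels)`; a sketch assuming row-major `[n, d]` inputs (helper name is ours):

```python
import numpy as np

def soft_label_grad(softmax, labels, loss_grad, axis_dim):
    # softmax, labels: [n, d]; loss_grad: [n, remain]; d = axis_dim * remain
    n, d = softmax.shape
    remain = d // axis_dim
    lg = loss_grad.reshape(n, 1, remain)   # broadcast over the axis_dim dimension
    diff = (softmax - labels).reshape(n, axis_dim, remain)
    return (lg * diff).reshape(n, d)
```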
......@@ -116,23 +121,30 @@ using BlockReduce =
template <typename T, int BlockDim>
using BlockReduceTempStorage = typename BlockReduce<T, BlockDim>::TempStorage;
// Make sure that BlockDim <= feature_size
// Make sure that BlockDim <= axis_dim
// This kernel is used to calculate the max element of each row
template <typename T, int BlockDim>
static __global__ void RowReductionForMax(const T* logits_data, T* max_data,
int feature_size) {
int d, int axis_dim) {
__shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
auto end_idx = feature_size * (blockIdx.x + 1);
// logits_data viewed as [n, axis_dim, remain]
// max_data viewed as [n, 1, remain]
// gridDim.x = n * remain, split blockIdx.x into idx_n and idx_remain
int remain = d / axis_dim;
int idx_n = blockIdx.x / remain;
int idx_remain = blockIdx.x % remain;
int beg_idx = idx_n * d + threadIdx.x * remain + idx_remain;
int end_idx = (idx_n + 1) * d;
int step = BlockDim * remain;
T cur_max = logits_data[beg_idx];
beg_idx += BlockDim;
beg_idx += step;
while (beg_idx < end_idx) {
if (cur_max < logits_data[beg_idx]) {
cur_max = logits_data[beg_idx];
}
beg_idx += BlockDim;
beg_idx += step;
}
cur_max = BlockReduce<T, BlockDim>(temp_storage).Reduce(cur_max, cub::Max());
......@@ -143,25 +155,32 @@ static __global__ void RowReductionForMax(const T* logits_data, T* max_data,
}
}
// Make sure that BlockDim <= feature_size
// Make sure that BlockDim <= axis_dim
template <typename T, int BlockDim, bool CalculateLogSoftmax = false>
static __global__ void RowReductionForDiffMaxSum(const T* logits_data,
T* max_data, T* softmax,
int feature_size) {
T* max_data, T* softmax, int d,
int axis_dim) {
__shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
auto end_idx = feature_size * (blockIdx.x + 1);
// logits, softmax data viewed as [n, axis_dim, remain]
// max_data viewed as [n, 1, remain]
// gridDim.x = n * remain, split blockIdx.x into idx_n and idx_remain
int remain = d / axis_dim;
int idx_n = blockIdx.x / remain;
int idx_remain = blockIdx.x % remain;
int beg_idx = idx_n * d + threadIdx.x * remain + idx_remain;
int end_idx = (idx_n + 1) * d;
auto block_max = max_data[blockIdx.x];
int step = BlockDim * remain;
softmax[beg_idx] = logits_data[beg_idx] - block_max;
T diff_max_sum = exp_on_device(softmax[beg_idx]);
auto idx = beg_idx + BlockDim;
auto idx = beg_idx + step;
while (idx < end_idx) {
softmax[idx] = logits_data[idx] - block_max;
diff_max_sum += exp_on_device(softmax[idx]);
idx += BlockDim;
idx += step;
}
diff_max_sum =
......@@ -172,34 +191,42 @@ static __global__ void RowReductionForDiffMaxSum(const T* logits_data,
__syncthreads();
diff_max_sum = max_data[blockIdx.x];
softmax[beg_idx] -= diff_max_sum;
beg_idx += BlockDim;
beg_idx += step;
while (beg_idx < end_idx) {
softmax[beg_idx] -= diff_max_sum;
beg_idx += BlockDim;
beg_idx += step;
}
if (threadIdx.x == 0) max_data[blockIdx.x] = 0;
}
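Each block of these row-reduction kernels now reduces one strided column of the `[n, axis_dim, remain]` view, stepping by `remain`; the access pattern in numpy terms (illustrative block id):

```python
import numpy as np

n, axis_dim, remain = 2, 5, 3
d = axis_dim * remain
logits = np.random.rand(n * d)

blk = 4                                    # a block id in [0, n * remain)
idx_n, idx_remain = blk // remain, blk % remain
# thread t reads idx_n * d + t * remain + idx_remain, stepping by `remain`
col = logits[idx_n * d + idx_remain:(idx_n + 1) * d:remain]
assert np.allclose(col, logits.reshape(n, axis_dim, remain)[idx_n, :, idx_remain])
```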
// Make sure that BlockDim <= feature_size
// Make sure that BlockDim <= axis_dim
template <typename T, int BlockDim>
static __global__ void RowReductionForSoftmaxAndCrossEntropy(
const T* labels_data, T* loss_data, T* softmax, int feature_size) {
const T* logits_data, const T* labels_data, T* loss_data, T* softmax, int d,
int axis_dim) {
__shared__ BlockReduceTempStorage<T, BlockDim> temp_storage;
auto beg_idx = feature_size * blockIdx.x + threadIdx.x;
auto end_idx = feature_size * (blockIdx.x + 1);
// logits, softmax, labels data viewed as [n, axis_dim, remain]
// loss_data viewed as [n, 1, remain]
// gridDim.x = n * remain, split blockIdx.x into idx_n and idx_remain
int remain = d / axis_dim;
int idx_n = blockIdx.x / remain;
int idx_remain = blockIdx.x % remain;
int beg_idx = idx_n * d + threadIdx.x * remain + idx_remain;
int end_idx = (idx_n + 1) * d;
// log_diff_max_sum shares memory with loss
auto block_log_diff_max_sum = loss_data[blockIdx.x];
auto tmp = softmax[beg_idx] - block_log_diff_max_sum;
softmax[beg_idx] = exp_on_device(tmp);
auto loss = -labels_data[beg_idx] * tmp;
beg_idx += BlockDim;
int step = BlockDim * remain;
beg_idx += step;
while (beg_idx < end_idx) {
tmp = softmax[beg_idx] - block_log_diff_max_sum;
softmax[beg_idx] = exp_on_device(tmp);
loss -= (labels_data[beg_idx] * tmp);
beg_idx += BlockDim;
beg_idx += step;
}
loss = BlockReduce<T, BlockDim>(temp_storage).Reduce(loss, cub::Sum());
......@@ -210,21 +237,27 @@ template <typename T>
struct HardLabelSoftmaxWithCrossEntropyFunctor {
public:
HardLabelSoftmaxWithCrossEntropyFunctor(const int64_t* labels, T* loss,
T* log_softmax, int feature_size)
T* log_softmax, int d, int axis_dim)
: labels_(labels),
loss_(loss),
log_softmax_(log_softmax),
feature_size_(feature_size) {}
d_(d),
axis_dim_(axis_dim) {}
__device__ void operator()(int idx) const {
auto row_idx = idx / feature_size_;
auto col_idx = idx % feature_size_;
if (col_idx != labels_[row_idx]) {
// logits view as [n, axis_dim, remain], where d = axis_dim * remain
int remain = d_ / axis_dim_;
int idx_n = idx / d_;
int idx_axis = (idx % d_) / remain;
int idx_remain = idx % remain;
// labels, loss view as [n, remain]
int idx_lbl = idx_n * remain + idx_remain;
if (idx_axis != labels_[idx_lbl]) {
log_softmax_[idx] = exp_on_device(log_softmax_[idx]);
} else {
auto softmax = log_softmax_[idx];
log_softmax_[idx] = exp_on_device(softmax);
loss_[row_idx] = -softmax;
loss_[idx_lbl] = -softmax;
}
}
......@@ -232,7 +265,8 @@ struct HardLabelSoftmaxWithCrossEntropyFunctor {
const int64_t* labels_;
T* loss_;
T* log_softmax_;
int feature_size_;
int d_;
int axis_dim_;
};
template <typename T>
......@@ -240,23 +274,29 @@ struct HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx {
public:
HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx(const int64_t* labels,
T* loss, T* log_softmax,
int feature_size,
int d, int axis_dim,
int ignore_idx)
: labels_(labels),
loss_(loss),
log_softmax_(log_softmax),
feature_size_(feature_size),
d_(d),
axis_dim_(axis_dim),
ignore_idx_(ignore_idx) {}
__device__ void operator()(int idx) const {
auto row_idx = idx / feature_size_;
auto col_idx = idx % feature_size_;
if (col_idx != labels_[row_idx] || col_idx == ignore_idx_) {
// logits view as [n, axis_dim, remain], where d = axis_dim * remain
int remain = d_ / axis_dim_;
int idx_n = idx / d_;
int idx_axis = (idx % d_) / remain;
int idx_remain = idx % remain;
// labels, loss view as [n, remain]
int idx_lbl = idx_n * remain + idx_remain;
if (idx_axis != labels_[idx_lbl] || idx_axis == ignore_idx_) {
log_softmax_[idx] = exp_on_device(log_softmax_[idx]);
} else {
auto softmax = log_softmax_[idx];
log_softmax_[idx] = exp_on_device(softmax);
loss_[row_idx] = -softmax;
loss_[idx_lbl] = -softmax;
}
}
......@@ -264,44 +304,44 @@ struct HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx {
const int64_t* labels_;
T* loss_;
T* log_softmax_;
int feature_size_;
int d_;
int axis_dim_;
int ignore_idx_;
};
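Both hard-label functors decompose the flat element index into `(idx_n, idx_axis, idx_remain)` and write the loss at `(idx_n, idx_remain)`. A numpy sketch of the same post-processing (helper name is ours):

```python
import numpy as np

def hard_label_from_log_softmax(log_softmax, labels, axis_dim, ignore_idx=-1):
    # log_softmax: [n, d] with d = axis_dim * remain; labels: [n, remain] int64
    n, d = log_softmax.shape
    remain = d // axis_dim
    ls3 = log_softmax.reshape(n, axis_dim, remain)
    loss = np.zeros((n, remain), dtype=log_softmax.dtype)
    for i in range(n):
        for j in range(remain):
            lbl = labels[i, j]
            if lbl != ignore_idx:
                loss[i, j] = -ls3[i, lbl, j]   # pick out -log_softmax at the label
    return np.exp(log_softmax), loss           # every entry becomes softmax
```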
template <typename T>
static __global__ void SetSoftmaxToOneWhenFeatureSizeIsOne(T* out,
int batch_size) {
static __global__ void SetSoftmaxToOneWhenFeatureSizeIsOne(T* out, int n) {
auto idx = threadIdx.x + blockIdx.x * blockDim.x;
if (idx < batch_size) out[idx] = static_cast<T>(1);
if (idx < n) out[idx] = static_cast<T>(1);
}
template <typename T>
static void HardLabelSoftmaxWithCrossEntropy(
const platform::CUDADeviceContext& ctx, const T* logits_data,
const int64_t* labels_data, T* loss_data, T* softmax_data, int batch_size,
int feature_size, int ignore_idx) {
const int64_t* labels_data, T* loss_data, T* softmax_data, int n, int d,
int axis_dim, int ignore_idx) {
constexpr int kMaxBlockDim = 512;
int block_dim = feature_size >= kMaxBlockDim
int block_dim = axis_dim >= kMaxBlockDim
? kMaxBlockDim
: (1 << static_cast<int>(std::log2(feature_size)));
: (1 << static_cast<int>(std::log2(axis_dim)));
int grid_dim = n * d / axis_dim;
auto stream = ctx.stream();
#define CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \
case BlockDim: { \
RowReductionForMax<T, BlockDim><<<batch_size, BlockDim, 0, stream>>>( \
logits_data, loss_data, feature_size); \
RowReductionForDiffMaxSum<T, BlockDim, \
true><<<batch_size, BlockDim, 0, stream>>>( \
logits_data, loss_data, softmax_data, feature_size); \
platform::ForRange<platform::CUDADeviceContext> for_range( \
ctx, batch_size* feature_size); \
if (ignore_idx >= 0 && ignore_idx < feature_size) { \
for_range(HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx<T>( \
labels_data, loss_data, softmax_data, feature_size, ignore_idx)); \
} else { \
for_range(HardLabelSoftmaxWithCrossEntropyFunctor<T>( \
labels_data, loss_data, softmax_data, feature_size)); \
} \
#define CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \
case BlockDim: { \
RowReductionForMax<T, BlockDim><<<grid_dim, BlockDim, 0, stream>>>( \
logits_data, loss_data, d, axis_dim); \
RowReductionForDiffMaxSum<T, BlockDim, \
true><<<grid_dim, BlockDim, 0, stream>>>( \
logits_data, loss_data, softmax_data, d, axis_dim); \
platform::ForRange<platform::CUDADeviceContext> for_range(ctx, n* d); \
if (ignore_idx >= 0 && ignore_idx < axis_dim) { \
for_range(HardLabelSoftmaxWithCrossEntropyFunctorWithIgnoreIdx<T>( \
labels_data, loss_data, softmax_data, d, axis_dim, ignore_idx)); \
} else { \
for_range(HardLabelSoftmaxWithCrossEntropyFunctor<T>( \
labels_data, loss_data, softmax_data, d, axis_dim)); \
} \
} break
switch (block_dim) {
......@@ -315,11 +355,11 @@ static void HardLabelSoftmaxWithCrossEntropy(
CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4);
CALL_HARD_LABEL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2);
case 1:
SetSoftmaxToOneWhenFeatureSizeIsOne<<<(batch_size + kMaxBlockDim - 1) /
SetSoftmaxToOneWhenFeatureSizeIsOne<<<(grid_dim + kMaxBlockDim - 1) /
kMaxBlockDim,
kMaxBlockDim, 0, stream>>>(
softmax_data, batch_size);
cudaMemsetAsync(loss_data, 0, batch_size * sizeof(T), stream);
softmax_data, grid_dim);
cudaMemsetAsync(loss_data, 0, grid_dim * sizeof(T), stream);
break;
default:
PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op");
......@@ -332,23 +372,23 @@ template <typename T>
static void SoftmaxWithCrossEntropyFusedKernel(const T* logits_data,
const T* labels_data,
T* softmax_data, T* loss_data,
int batch_size, int feature_size,
int n, int d, int axis_dim,
cudaStream_t stream) {
constexpr int kMaxBlockDim = 512;
int block_dim = feature_size >= kMaxBlockDim
int block_dim = axis_dim >= kMaxBlockDim
? kMaxBlockDim
: (1 << static_cast<int>(std::log2(feature_size)));
#define CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \
case BlockDim: \
RowReductionForMax<T, BlockDim><<<batch_size, BlockDim, 0, stream>>>( \
logits_data, loss_data, feature_size); \
RowReductionForDiffMaxSum<T, \
BlockDim><<<batch_size, BlockDim, 0, stream>>>( \
logits_data, loss_data, softmax_data, feature_size); \
RowReductionForSoftmaxAndCrossEntropy< \
T, BlockDim><<<batch_size, BlockDim, 0, stream>>>( \
labels_data, loss_data, softmax_data, feature_size); \
: (1 << static_cast<int>(std::log2(axis_dim)));
int grid_dim = n * d / axis_dim;
#define CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(BlockDim) \
case BlockDim: \
RowReductionForMax<T, BlockDim><<<grid_dim, BlockDim, 0, stream>>>( \
logits_data, loss_data, d, axis_dim); \
RowReductionForDiffMaxSum<T, BlockDim><<<grid_dim, BlockDim, 0, stream>>>( \
logits_data, loss_data, softmax_data, d, axis_dim); \
RowReductionForSoftmaxAndCrossEntropy< \
T, BlockDim><<<grid_dim, BlockDim, 0, stream>>>( \
logits_data, labels_data, loss_data, softmax_data, d, axis_dim); \
break
switch (block_dim) {
......@@ -362,11 +402,11 @@ static void SoftmaxWithCrossEntropyFusedKernel(const T* logits_data,
CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(4);
CALL_SOFTMAX_WITH_CROSS_ENTROPY_FUSED_KERNEL(2);
case 1:
SetSoftmaxToOneWhenFeatureSizeIsOne<<<(batch_size + kMaxBlockDim - 1) /
SetSoftmaxToOneWhenFeatureSizeIsOne<<<(grid_dim + kMaxBlockDim - 1) /
kMaxBlockDim,
kMaxBlockDim, 0, stream>>>(
softmax_data, batch_size);
cudaMemsetAsync(loss_data, 0, batch_size * sizeof(T), stream);
softmax_data, grid_dim);
cudaMemsetAsync(loss_data, 0, grid_dim * sizeof(T), stream);
break;
default:
PADDLE_THROW("BlockDim must be 2^n in softmax_with_cross_entropy_op");
......@@ -385,51 +425,46 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
const Tensor* logits = context.Input<Tensor>("Logits");
const Tensor* labels = context.Input<Tensor>("Label");
Tensor* softmax = context.Output<Tensor>("Softmax");
Tensor* loss = context.Output<Tensor>("Loss");
const int rank = logits->dims().size();
const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
int axis_dim = logits->dims()[axis];
const int n = SizeToAxis(axis, logits->dims());
const int d = SizeFromAxis(axis, logits->dims());
auto* softmax_data = softmax->mutable_data<T>(context.GetPlace());
auto* loss_data = loss->mutable_data<T>(context.GetPlace());
auto soft_label = context.Attr<bool>("soft_label");
auto ignore_index = context.Attr<int>("ignore_index");
int rank = logits->dims().size();
if (soft_label) {
int batch_size = 1;
for (int i = 0; i < rank - 1; ++i) {
batch_size *= logits->dims()[i];
}
int feature_size = logits->dims()[rank - 1];
auto* logits_data = logits->data<T>();
auto* labels_data = labels->data<T>();
SoftmaxWithCrossEntropyFusedKernel(
logits_data, labels_data, softmax_data, loss_data, batch_size,
feature_size, context.cuda_device_context().stream());
logits_data, labels_data, softmax_data, loss_data, n, d, axis_dim,
context.cuda_device_context().stream());
} else {
if (!context.Attr<bool>("numeric_stable_mode")) {
// reshape to 2d
Tensor logits_2d = framework::ReshapeToMatrix(*logits, rank - 1);
Tensor softmax_2d = framework::ReshapeToMatrix(*softmax, rank - 1);
Tensor loss_2d = framework::ReshapeToMatrix(*loss, rank - 1);
Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1);
// the cuDNN kernel only supports 2-D tensors and performs softmax on the last dim
Tensor logits_2d, softmax_2d, labels_2d, loss_2d;
logits_2d.ShareDataWith(*logits).Resize({n, d});
softmax_2d.ShareDataWith(*softmax).Resize({n, d});
labels_2d.ShareDataWith(*labels).Resize({n, labels->numel() / n});
loss_2d.ShareDataWith(*loss).Resize({n, 1});
math::SoftmaxCUDNNFunctor<T>()(context.cuda_device_context(),
&logits_2d, &softmax_2d);
math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
context.cuda_device_context(), &loss_2d, &softmax_2d, &labels_2d,
false, ignore_index);
false, ignore_index, axis_dim);
} else {
int batch_size = 1;
for (int i = 0; i < rank - 1; ++i) {
batch_size *= logits->dims()[i];
}
int feature_size = logits->dims()[rank - 1];
auto* logits_data = logits->data<T>();
auto* labels_data = labels->data<int64_t>();
HardLabelSoftmaxWithCrossEntropy<T>(
context.cuda_device_context(), logits_data, labels_data, loss_data,
softmax_data, batch_size, feature_size, ignore_index);
softmax_data, n, d, axis_dim, ignore_index);
}
}
}
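`SizeToAxis` and `SizeFromAxis` (pulled in via softmax_op.h) collapse the dimensions before and from `axis`, giving the `n` and `d` used throughout. Roughly, in Python:

```python
import numpy as np

def size_to_axis(axis, dims):    # n: product of the dims before axis
    return int(np.prod(dims[:axis]))

def size_from_axis(axis, dims):  # d: product of the dims from axis onward
    return int(np.prod(dims[axis:]))

dims = [3, 5, 7, 11]
assert size_to_axis(1, dims) == 3
assert size_from_axis(1, dims) == 5 * 7 * 11
```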
......@@ -453,30 +488,31 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
}
T* logit_grad_data = logit_grad->data<T>();
int rank = logit_grad->dims().size();
int batch_size = 1;
for (int i = 0; i < rank - 1; ++i) {
batch_size *= logit_grad->dims()[i];
}
const int rank = logit_grad->dims().size();
const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
int axis_dim = logit_grad->dims()[axis];
const int n = SizeToAxis(axis, logit_grad->dims());
const int d = SizeFromAxis(axis, logit_grad->dims());
const int remain = d / axis_dim;
const int class_num = logit_grad->dims()[rank - 1];
int block = 512;
auto stream = context.cuda_device_context().stream();
auto ignore_index = context.Attr<int>("ignore_index");
if (context.Attr<bool>("soft_label")) {
int grid = (batch_size * class_num + block - 1) / block;
int grid = (n * d + block - 1) / block;
const T* label_data = labels->data<T>();
SoftCrossEntropyGradientKernel<T><<<grid, block, 0, stream>>>(
logit_grad_data, loss_grad_data, label_data, batch_size, class_num);
logit_grad_data, loss_grad_data, label_data, n, d, remain);
} else {
int grid = (batch_size + block - 1) / block;
int grid = (n * remain + block - 1) / block;
const int64_t* label_data = labels->data<int64_t>();
CrossEntropyGrad<T><<<grid, block, 0, stream>>>(
logit_grad_data, label_data, batch_size, class_num, ignore_index);
int num = batch_size * class_num;
logit_grad_data, label_data, n, d, remain, ignore_index);
int num = n * d;
grid = (num + block - 1) / block;
Scale<T><<<grid, block, 0, stream>>>(logit_grad_data, loss_grad_data, num,
class_num);
d, remain);
}
}
};
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/operators/math/softmax.h"
#include "paddle/fluid/operators/softmax_op.h"
namespace paddle {
namespace operators {
......@@ -36,26 +37,30 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {
const Tensor* labels = context.Input<Tensor>("Label");
Tensor* softmax = context.Output<Tensor>("Softmax");
Tensor* loss = context.Output<Tensor>("Loss");
const bool soft_label = context.Attr<bool>("soft_label");
const int rank = logits->dims().size();
const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
int axis_dim = logits->dims()[axis];
softmax->mutable_data<T>(context.GetPlace());
loss->mutable_data<T>(context.GetPlace());
// reshape to 2D tensor
int rank = logits->dims().size();
Tensor logits_2d = framework::ReshapeToMatrix(*logits, rank - 1);
Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1);
Tensor loss_2d = framework::ReshapeToMatrix(*loss, rank - 1);
Tensor softmax_2d = framework::ReshapeToMatrix(*softmax, rank - 1);
int axis_dim = logits->dims()[rank - 1];
const int n = SizeToAxis(axis, logits->dims());
const int d = SizeFromAxis(axis, logits->dims());
Tensor logits_2d, softmax_2d, labels_2d, loss_2d;
logits_2d.ShareDataWith(*logits).Resize({n, d});
softmax_2d.ShareDataWith(*softmax).Resize({n, d});
labels_2d.ShareDataWith(*labels).Resize({n, labels->numel() / n});
loss_2d.ShareDataWith(*loss).Resize({n, d / axis_dim});
auto& dev_ctx =
context.template device_context<platform::CPUDeviceContext>();
math::SoftmaxFunctor<platform::CPUDeviceContext, T, false>()(
dev_ctx, axis_dim, &logits_2d, &softmax_2d);
math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()(
dev_ctx, &loss_2d, &softmax_2d, &labels_2d,
context.Attr<bool>("soft_label"), context.Attr<int>("ignore_index"));
dev_ctx, &loss_2d, &softmax_2d, &labels_2d, soft_label,
context.Attr<int>("ignore_index"), axis_dim);
}
};
......@@ -75,34 +80,43 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
context.device_context(), logit_grad);
}
int rank = logit_grad->dims().size();
const int class_num = logit_grad->dims()[rank - 1];
// reshape to 2d
Tensor logit_grad_2d = framework::ReshapeToMatrix(*logit_grad, rank - 1);
Tensor out_grad_2d = framework::ReshapeToMatrix(*out_grad, rank - 1);
const bool soft_label = context.Attr<bool>("soft_label");
const int rank = logit_grad->dims().size();
const int axis = CanonicalAxis(context.Attr<int>("axis"), rank);
int axis_dim = logit_grad->dims()[axis];
const int n = SizeToAxis(axis, logit_grad->dims());
const int d = SizeFromAxis(axis, logit_grad->dims());
Tensor logit_grad_2d, labels_2d, out_grad_2d;
logit_grad_2d.ShareDataWith(*logit_grad).Resize({n, d});
labels_2d.ShareDataWith(*labels).Resize({n, labels->numel() / n});
out_grad_2d.ShareDataWith(*out_grad).Resize({n, d / axis_dim});
auto out_grad_mat = EigenMatrix<T>::From(out_grad_2d);
auto logit_grad_mat = EigenMatrix<T>::From(logit_grad_2d);
auto& place = *context.template device_context<platform::CPUDeviceContext>()
.eigen_device();
if (context.Attr<bool>("soft_label")) {
Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1);
if (soft_label) {
auto lbl_mat = EigenMatrix<T>::From(labels_2d);
logit_grad_mat.device(place) =
out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) *
out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, axis_dim)) *
(logit_grad_mat - lbl_mat);
} else {
logit_grad_mat.device(place) =
logit_grad_mat *
out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num));
const int batch_size = logit_grad_2d.dims()[0];
out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, axis_dim));
const int64_t* label_data = labels->data<int64_t>();
T* logit_grad_data = logit_grad->data<T>();
const T* out_grad_data = out_grad->data<T>();
for (int i = 0; i < batch_size; ++i) {
logit_grad_data[i * class_num + label_data[i]] -= out_grad_data[i];
const int remain = d / axis_dim;
for (int i = 0; i < n; ++i) {
for (int j = 0; j < remain; j++) {
int idx = i * remain + j;
logit_grad_data[i * d + label_data[idx] * remain + j] -=
out_grad_data[idx];
}
}
}
}
......
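The CPU hard-label gradient above scales the softmax by the broadcast loss gradient, then subtracts the loss gradient at each label position; a numpy sketch (helper name is ours):

```python
import numpy as np

def hard_label_grad(softmax, labels, out_grad, axis_dim):
    # softmax: [n, d]; labels: [n, remain] int64; out_grad: [n, remain]
    n, d = softmax.shape
    remain = d // axis_dim
    grad = softmax.reshape(n, axis_dim, remain) * out_grad.reshape(n, 1, remain)
    for i in range(n):
        for j in range(remain):
            grad[i, labels[i, j], j] -= out_grad[i, j]
    return grad.reshape(n, d)
```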
......@@ -6099,22 +6099,24 @@ def softmax_with_cross_entropy(logits,
soft_label=False,
ignore_index=kIgnoreIndex,
numeric_stable_mode=True,
return_softmax=False):
return_softmax=False,
axis=-1):
"""
**Softmax With Cross Entropy Operator.**
Cross entropy loss with softmax is used as the output layer extensively. This
operator computes the softmax normalized values for each row of the input
tensor, after which cross-entropy loss is computed. This provides a more
numerically stable gradient.
operator computes the softmax normalized values for dimension :attr:`axis` of
the input tensor, after which cross-entropy loss is computed. This provides
a more numerically stable gradient.
Because this operator performs a softmax on logits internally, it expects
unscaled logits. This operator should not be used with the output of
softmax operator since that would produce incorrect results.
When the attribute soft_label is set false, this operators expects mutually
exclusive hard labels, each sample in a batch is in exactly one class with a
probability of 1.0. Each sample in the batch will have a single label.
When the attribute :attr:`soft_label` is set to :attr:`False`, this operator
expects mutually exclusive hard labels: each sample in a batch is in exactly
one class with a probability of 1.0. Each sample in the batch will have a
single label.
The equation is as follows:
......@@ -6133,7 +6135,8 @@ def softmax_with_cross_entropy(logits,
\\left(\\text{logit}_i - \\log\\left(\\sum_{i=0}^{K}
\\exp(\\text{logit}_i)\\right)\\right), j = 1,...,K
3) If numeric_stable_mode is True, softmax is calculated first by:
3) If :attr:`numeric_stable_mode` is :attr:`True`, softmax is calculated
first by:
.. math::
......@@ -6146,32 +6149,39 @@ def softmax_with_cross_entropy(logits,
and then cross entropy loss is calculated by softmax and label.
Args:
logits (Variable): The unscaled log probabilities, which is a 2-D tensor
with shape [N x K]. N is the batch_size, and K is the class number.
label (Variable): The ground truth which is a 2-D tensor. If soft_label
is set to false, Label is a Tensor<int64> with shape [N x 1]. If
soft_label is set to true, Label is a Tensor<float/double> with
logits (Variable): The input tensor of unscaled log probabilities.
label (Variable): The ground truth tensor. If :attr:`soft_label`
is set to :attr:`True`, Label is a Tensor<float/double> in the
same shape as :attr:`logits`. If :attr:`soft_label` is set to
:attr:`False`, Label is a Tensor<int64> in the same shape as
:attr:`logits` except that the shape in dimension :attr:`axis` is 1.
soft_label (bool): A flag to indicate whether to interpret the given
labels as soft labels. By default, `soft_label` is set to False.
labels as soft labels. Default False.
ignore_index (int): Specifies a target value that is ignored and does
not contribute to the input gradient. Only valid
if soft_label is set to False. Default: kIgnoreIndex
if :attr:`soft_label` is set to :attr:`False`.
Default: kIgnoreIndex
numeric_stable_mode (bool): A flag to indicate whether to use a more
numerically stable algorithm. Only valid
when soft_label is False and GPU is used.
When soft_label is True or CPU is used,
the algorithm is always numerically stable.
when :attr:`soft_label` is :attr:`False`
and GPU is used. When :attr:`soft_label`
is :attr:`True` or CPU is used, the
algorithm is always numerically stable.
Note that the speed may be slower when using
the stable algorithm. Default: True
return_softmax (bool): A flag indicating whether to return the softmax
along with the cross entropy loss. Default: False
axis (int): The index of the dimension to perform softmax calculations. It
should be in range :math:`[-rank, rank - 1]`, where :math:`rank`
is the rank of input :attr:`logits`. Default: -1.
Returns:
Variable or Tuple of two Variables: Return the cross entropy loss if \
`return_softmax` is False, otherwise the tuple \
(loss, softmax), where the cross entropy loss is \
a 2-D tensor with shape [N x 1], and softmax is a \
2-D tensor with shape [N x K].
(loss, softmax), where softmax is in the same \
shape as the input logits and the cross entropy \
loss is in the same shape as the input logits \
except that the shape in dimension :attr:`axis` is 1.
Examples:
.. code-block:: python
......@@ -6194,7 +6204,8 @@ def softmax_with_cross_entropy(logits,
attrs={
'soft_label': soft_label,
'ignore_index': ignore_index,
'numeric_stable_mode': numeric_stable_mode
'numeric_stable_mode': numeric_stable_mode,
'axis': axis
})
if return_softmax:
......
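A hedged usage sketch of the new `axis` argument, mirroring the shapes exercised in test_layers.py below (note that `fluid.layers.data` prepends a batch dimension, so `axis=1` selects the 16-way dimension):

```python
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[16, 32, 64], dtype='float32')
# hard label: same shape as x except the softmax dimension, which is 1
label = fluid.layers.data(name='label', shape=[1, 32, 64], dtype='int64')
loss = fluid.layers.softmax_with_cross_entropy(x, label, axis=1)
```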
......@@ -1221,10 +1221,25 @@ class TestBook(LayerTest):
y = self._get_data(name='label', shape=[1], dtype='int64')
loss, softmax = layers.softmax_with_cross_entropy(
x, y, return_softmax=True)
return (loss)
return (softmax)
self.assertIsNotNone(loss)
self.assertIsNotNone(softmax)
loss = layers.softmax_with_cross_entropy(x, y)
return (loss)
self.assertIsNotNone(loss)
x1 = self._get_data(name='x1', shape=[16, 32, 64], dtype='float32')
y1 = self._get_data(name='label1', shape=[1, 32, 64], dtype='int64')
y2 = self._get_data(name='label2', shape=[16, 1, 64], dtype='int64')
y3 = self._get_data(name='label3', shape=[16, 32, 1], dtype='int64')
loss1 = layers.softmax_with_cross_entropy(x1, y1, axis=1)
loss2 = layers.softmax_with_cross_entropy(x1, y2, axis=2)
loss3 = layers.softmax_with_cross_entropy(x1, y3, axis=3)
loss4 = layers.softmax_with_cross_entropy(x1, y3, axis=-1)
self.assertIsNotNone(loss1)
self.assertIsNotNone(loss2)
self.assertIsNotNone(loss3)
self.assertIsNotNone(loss4)
return (loss4)
def make_smooth_l1(self):
with program_guard(fluid.default_main_program(),
......
......@@ -21,37 +21,70 @@ from op_test import OpTest
from test_softmax_op import stable_softmax
def cross_entropy(softmax, label, soft_label, axis, ignore_index=-1):
if soft_label:
return (-label * np.log(softmax)).sum(axis=axis, keepdims=True)
shape = softmax.shape
axis %= len(shape)
n = int(np.prod(shape[:axis]))
axis_dim = shape[axis]
remain = int(np.prod(shape[axis + 1:]))
softmax_reshape = softmax.reshape((n, axis_dim, remain))
label_reshape = label.reshape((n, 1, remain))
result = np.zeros_like(label_reshape, dtype=softmax.dtype)
for i in range(n):
for j in range(remain):
lbl = label_reshape[i, 0, j]
if lbl != ignore_index:
result[i, 0, j] -= np.log(softmax_reshape[i, lbl, j])
return result.reshape(label.shape)
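As a quick sanity check of this reference implementation (a uniform softmax makes every hard-label loss equal to -log(0.2)):

```python
import numpy as np

softmax = np.full((3, 5, 7), 0.2)            # uniform over the 5-way axis
label = np.random.randint(0, 5, (3, 1, 7))
loss = cross_entropy(softmax, label, soft_label=False, axis=1)
assert loss.shape == (3, 1, 7)
assert np.allclose(loss, -np.log(0.2))
```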
class TestSoftmaxWithCrossEntropyOp(OpTest):
"""
Test softmax with cross entropy operator with discrete one-hot labels.
"""
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = False
self.soft_label = False
self.dtype = np.float64
self.axis = -1
self.ignore_index = -1
self.shape = [41, 37]
def setUp(self):
self.initParams()
self.op_type = "softmax_with_cross_entropy"
batch_size = 41
class_num = 37
logits = np.random.uniform(0.1, 1.0,
[batch_size, class_num]).astype(self.dtype)
softmax = np.apply_along_axis(stable_softmax, 1, logits)
labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64")
logits = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)
softmax = np.apply_along_axis(stable_softmax, self.axis, logits)
if self.soft_label:
labels = np.random.uniform(0.1, 1.0, self.shape).astype(self.dtype)
labels /= np.sum(labels, axis=self.axis, keepdims=True)
else:
axis_dim = self.shape[self.axis]
self.shape[self.axis] = 1
labels = np.random.randint(0, axis_dim, self.shape, dtype="int64")
cross_entropy = np.asmatrix(
[[-np.log(softmax[i][labels[i][0]])]
for i in range(softmax.shape[0])],
dtype=self.dtype)
loss = cross_entropy(softmax, labels, self.soft_label, self.axis,
self.ignore_index)
self.inputs = {"Logits": logits, "Label": labels}
self.outputs = {
"Softmax": softmax.astype(self.dtype),
"Loss": cross_entropy.astype(self.dtype)
"Loss": loss.astype(self.dtype)
}
self.attrs = {"numeric_stable_mode": self.numeric_stable_mode}
self.attrs = {
"numeric_stable_mode": self.numeric_stable_mode,
"soft_label": self.soft_label,
}
if self.ignore_index >= 0:
self.attrs['ignore_index'] = self.ignore_index
if self.axis != -1:
self.attrs['axis'] = self.axis
def test_check_output(self):
self.check_output()
......@@ -62,30 +95,38 @@ class TestSoftmaxWithCrossEntropyOp(OpTest):
class TestSoftmaxWithCrossEntropyOpNoCudnn(TestSoftmaxWithCrossEntropyOp):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = False
self.shape = [3, 5, 7, 11]
self.axis = -1
self.ignore_index = -1
self.dtype = np.float64
class TestSoftmaxWithCrossEntropyOpFp16(TestSoftmaxWithCrossEntropyOp):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = False
self.soft_label = False
self.shape = [3, 5, 7, 11]
self.axis = -1
self.ignore_index = -1
self.dtype = np.float16
def setUp(self):
self.initParams()
self.op_type = "softmax_with_cross_entropy"
batch_size = 41
class_num = 37
# NOTE: numpy float16 has very low accuracy, use float32 for the numpy check.
logits = np.random.uniform(0.1, 1.0,
[batch_size, class_num]).astype(np.float32)
softmax = np.apply_along_axis(stable_softmax, 1, logits)
labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64")
logits = np.random.uniform(0.1, 1.0, self.shape).astype(np.float32)
softmax = np.apply_along_axis(stable_softmax, self.axis, logits)
cross_entropy = np.asmatrix(
[[-np.log(softmax[i][labels[i][0]])]
for i in range(softmax.shape[0])],
dtype=np.float32)
axis_dim = self.shape[self.axis]
self.shape[self.axis] = 1
labels = np.random.randint(0, axis_dim, self.shape, dtype="int64")
loss = cross_entropy(softmax, labels, self.soft_label, self.axis)
self.inputs = {
"Logits": logits.astype(self.dtype).view(np.uint16),
......@@ -93,9 +134,14 @@ class TestSoftmaxWithCrossEntropyOpFp16(TestSoftmaxWithCrossEntropyOp):
}
self.outputs = {
"Softmax": softmax.astype(self.dtype),
"Loss": cross_entropy.astype(self.dtype)
"Loss": loss.astype(self.dtype)
}
self.attrs = {
"numeric_stable_mode": self.numeric_stable_mode,
"soft_label": self.soft_label,
}
self.attrs = {"numeric_stable_mode": self.numeric_stable_mode}
if self.axis != -1:
self.attrs['axis'] = self.axis
def test_check_output(self):
self.check_output(atol=1e-2)
......@@ -107,39 +153,31 @@ class TestSoftmaxWithCrossEntropyOpFp16(TestSoftmaxWithCrossEntropyOp):
class TestSoftmaxWithCrossEntropyOpNoCudnnFp16(
TestSoftmaxWithCrossEntropyOpFp16):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = False
self.shape = [3, 5, 7, 11]
self.axis = -1
self.ignore_index = -1
self.dtype = np.float16
def test_check_grad(self):
self.check_grad(["Logits"], "Loss", max_relative_error=0.1)
class TestSoftmaxWithCrossEntropyOp2(OpTest):
class TestSoftmaxWithCrossEntropyOp2(TestSoftmaxWithCrossEntropyOp):
"""
Test softmax with cross entropy operator with soft labels.
"""
def setUp(self):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
batch_size = 41
class_num = 37
logits = np.random.uniform(0.1, 1.0,
[batch_size, class_num]).astype("float64")
softmax = np.apply_along_axis(stable_softmax, 1, logits)
labels = np.random.uniform(0.1, 1.0,
[batch_size, class_num]).astype("float64")
labels /= np.sum(labels, axis=1, keepdims=True)
cross_entropy = (-labels * np.log(softmax)).sum(
axis=1, keepdims=True).astype("float64")
self.inputs = {"Logits": logits, "Label": labels}
self.outputs = {
"Softmax": softmax.astype("float64"),
"Loss": cross_entropy.astype("float64")
}
self.attrs = {"soft_label": True}
self.numeric_stable_mode = True
self.soft_label = True
self.dtype = np.float64
self.axis = -1
self.ignore_index = -1
self.shape = [41, 37]
def test_check_output(self):
self.check_output()
......@@ -148,190 +186,226 @@ class TestSoftmaxWithCrossEntropyOp2(OpTest):
self.check_grad(["Logits"], "Loss")
class TestSoftmaxWithCrossEntropyOp3(OpTest):
class TestSoftmaxWithCrossEntropyOp3(TestSoftmaxWithCrossEntropyOp):
"""
Test softmax with cross entropy operator with ignore_index.
"""
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = False
self.soft_label = False
self.shape = [41, 37]
self.ignore_index = 5
self.axis = -1
self.dtype = np.float64
def setUp(self):
self.initParams()
class TestSoftmaxWithCrossEntropyOp3NoCudnn(TestSoftmaxWithCrossEntropyOp3):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
batch_size = 41
class_num = 37
logits = np.random.uniform(0.1, 1.0,
[batch_size, class_num]).astype("float64")
softmax = np.apply_along_axis(stable_softmax, 1, logits)
labels = np.random.randint(0, class_num, [batch_size, 1], dtype="int64")
ignore_index = 7
cross_entropy = np.asmatrix(
[[-np.log(softmax[i][labels[i][0]])]
if labels[i] != ignore_index else [0]
for i in range(softmax.shape[0])],
dtype="float64")
self.numeric_stable_mode = True
self.soft_label = False
self.shape = [3, 5, 7, 11]
self.ignore_index = 4
self.axis = -1
self.dtype = np.float64
self.inputs = {"Logits": logits, "Label": labels}
self.outputs = {
"Softmax": softmax.astype("float64"),
"Loss": cross_entropy.astype("float64")
}
self.attrs = {
"ignore_index": ignore_index,
"numeric_stable_mode": self.numeric_stable_mode
}
def test_check_output(self):
self.check_output()
class TestSoftmaxWithCrossEntropyOpAxis1(TestSoftmaxWithCrossEntropyOp):
"""
Test softmax with cross entropy operator with discrete one-hot labels.
Given axis != -1
"""
def test_check_grad(self):
self.check_grad(["Logits"], "Loss")
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = False
self.dtype = np.float64
self.axis = 0
self.ignore_index = -1
self.shape = [3, 5, 7, 11]
class TestSoftmaxWithCrossEntropyOp3NoCudnn(TestSoftmaxWithCrossEntropyOp3):
class TestSoftmaxWithCrossEntropyOpAxis2(TestSoftmaxWithCrossEntropyOp):
"""
Test softmax with cross entropy operator with discrete one-hot labels.
Given axis != -1
"""
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = False
self.dtype = np.float64
self.axis = 1
self.ignore_index = -1
self.shape = [3, 5, 7, 11]
class TestSoftmaxWithCrossEntropyOp5(OpTest):
class TestSoftmaxWithCrossEntropyOpAxis3(TestSoftmaxWithCrossEntropyOp):
"""
Test softmax with cross entropy operator with ignore_index.
Test softmax with cross entropy operator with discrete one-hot labels.
Given axis != -1
"""
def initParams(self):
self.numeric_stable_mode = False
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = False
self.dtype = np.float64
self.axis = 2
self.ignore_index = -1
self.shape = [3, 5, 7, 11]
def setUp(self):
self.initParams()
class TestSoftmaxWithCrossEntropyOpAxis4(TestSoftmaxWithCrossEntropyOp):
"""
Test softmax with cross entropy operator with discrete one-hot labels.
Given axis != -1
"""
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
batch_size = [6, 10]
class_num = 47
logits = np.random.uniform(
0.1, 1.0, tuple(batch_size + [class_num])).astype("float64")
softmax = np.apply_along_axis(stable_softmax, 2, logits)
labels = np.random.randint(
0, class_num, tuple(batch_size + [1]), dtype="int64")
ignore_index = 7
softmax_2d = np.reshape(softmax, [-1, class_num])
labels_2d = np.reshape(labels, [-1, 1])
cross_entropy = np.asmatrix(
[[-np.log(softmax_2d[i][labels_2d[i][0]])]
if labels_2d[i] != ignore_index else [0]
for i in range(softmax_2d.shape[0])],
dtype="float64")
cross_entropy = np.reshape(cross_entropy, batch_size)
output_shape = tuple(batch_size + [1])
output_res = cross_entropy.astype("float64")
output_res = np.expand_dims(output_res, axis=2)
self.inputs = {"Logits": logits, "Label": labels}
self.outputs = {
"Softmax": softmax.astype("float64"),
"Loss": output_res,
}
self.attrs = {
"ignore_index": ignore_index,
"numeric_stable_mode": self.numeric_stable_mode
}
self.numeric_stable_mode = True
self.soft_label = False
self.dtype = np.float64
self.axis = 3
self.ignore_index = -1
self.shape = [3, 5, 7, 11]
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["Logits"], "Loss")
class TestSoftmaxWithCrossEntropyOpNoCudnnFp16Axis1(
TestSoftmaxWithCrossEntropyOpNoCudnnFp16):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = False
self.shape = [3, 5, 7, 11]
self.axis = 0
self.ignore_index = -1
self.dtype = np.float16
class TestSoftmaxWithCrossEntropyOp5NoCudnn(TestSoftmaxWithCrossEntropyOp5):
class TestSoftmaxWithCrossEntropyOpNoCudnnFp16Axis2(
TestSoftmaxWithCrossEntropyOpNoCudnnFp16):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = False
self.shape = [3, 5, 7, 11]
self.axis = 1
self.ignore_index = -1
self.dtype = np.float16
class TestSoftmaxWithCrossEntropyOp6(OpTest):
"""
Test softmax with cross entropy operator with soft labels.
"""
class TestSoftmaxWithCrossEntropyOpNoCudnnFp16Axis3(
TestSoftmaxWithCrossEntropyOpNoCudnnFp16):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = False
self.shape = [3, 5, 7, 11]
self.axis = 2
self.ignore_index = -1
self.dtype = np.float16
def setUp(self):
class TestSoftmaxWithCrossEntropyOpSoftLabelAxis1(
TestSoftmaxWithCrossEntropyOp2):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
batch_size = [6, 10]
class_num = 37
self.numeric_stable_mode = True
self.soft_label = True
self.shape = [3, 5, 7, 11]
self.axis = 0
self.ignore_index = -1
self.dtype = np.float64
logits = np.random.uniform(
0.1, 1.0, tuple(batch_size + [class_num])).astype("float64")
softmax = np.apply_along_axis(stable_softmax, 2, logits)
labels = np.random.uniform(
0.1, 1.0, tuple(batch_size + [class_num])).astype("float64")
labels /= np.sum(labels, axis=2, keepdims=True)
cross_entropy = (-labels * np.log(softmax)).sum(
axis=2, keepdims=True).astype("float64")
class TestSoftmaxWithCrossEntropyOpSoftLabelAxis2(
TestSoftmaxWithCrossEntropyOp2):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = True
self.shape = [3, 5, 7, 11]
self.axis = 1
self.ignore_index = -1
self.dtype = np.float64
self.inputs = {"Logits": logits, "Label": labels}
self.outputs = {
"Softmax": softmax.astype("float64"),
"Loss": cross_entropy.astype("float64")
}
self.attrs = {"soft_label": True}
def test_check_output(self):
self.check_output()
class TestSoftmaxWithCrossEntropyOpSoftLabelAxis3(
TestSoftmaxWithCrossEntropyOp2):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = True
self.shape = [3, 5, 7, 11]
self.axis = 2
self.ignore_index = -1
self.dtype = np.float64
def test_check_grad(self):
self.check_grad(["Logits"], "Loss")
class TestSoftmaxWithCrossEntropyOpSoftLabelAxis4(
TestSoftmaxWithCrossEntropyOp2):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = True
self.shape = [3, 5, 7, 11]
self.axis = 3
self.ignore_index = -1
self.dtype = np.float64
class TestSoftmaxWithCrossEntropyOpFp16_2(TestSoftmaxWithCrossEntropyOp):
class TestSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis1(
TestSoftmaxWithCrossEntropyOp3):
def initParams(self):
self.numeric_stable_mode = False
self.dtype = np.float16
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = False
self.shape = [3, 5, 7, 11]
self.ignore_index = 1
self.axis = 0
self.dtype = np.float64
def setUp(self):
self.initParams()
class TestSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis2(
TestSoftmaxWithCrossEntropyOp3):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
batch_size = [64, 10]
class_num = 37
self.numeric_stable_mode = True
self.soft_label = False
self.shape = [3, 5, 7, 11]
self.ignore_index = 0
self.axis = 1
self.dtype = np.float64
# NOTE: numpy float16 has very low accuracy, use float32 for the numpy check.
logits = np.random.uniform(
0.1, 1.0, tuple(batch_size + [class_num])).astype(np.float32)
softmax = np.apply_along_axis(stable_softmax, 2, logits)
labels = np.random.randint(
0, class_num, tuple(batch_size + [1]), dtype="int64")
softmax_2d = np.reshape(softmax, [-1, class_num])
labels_2d = np.reshape(labels, [-1, 1])
cross_entropy = np.asmatrix(
[[-np.log(softmax_2d[i][labels_2d[i][0]])]
for i in range(softmax_2d.shape[0])],
dtype=np.float32)
cross_entropy = np.reshape(cross_entropy, batch_size)
output_shape = tuple(batch_size + [1])
output_res = cross_entropy.astype(self.dtype)
output_res = np.expand_dims(output_res, axis=2)
self.inputs = {"Logits": logits, "Label": labels}
self.inputs = {
"Logits": logits.astype(self.dtype).view(np.uint16),
"Label": labels
}
self.outputs = {
"Softmax": softmax.astype(self.dtype),
"Loss": output_res,
}
self.attrs = {"numeric_stable_mode": self.numeric_stable_mode}
class TestSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis3(
TestSoftmaxWithCrossEntropyOp3):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = False
self.shape = [3, 5, 7, 11]
self.ignore_index = 3
self.axis = 2
self.dtype = np.float64
def test_check_output(self):
self.check_output(atol=1e-2)
def test_check_grad(self):
self.check_grad(["Logits"], "Loss", max_relative_error=0.1)
class TestSoftmaxWithCrossEntropyOpIgnoreIndexNoCudnnAxis4(
TestSoftmaxWithCrossEntropyOp3):
def initParams(self):
self.op_type = "softmax_with_cross_entropy"
self.numeric_stable_mode = True
self.soft_label = False
self.shape = [3, 5, 7, 11]
self.ignore_index = 3
self.axis = 3
self.dtype = np.float64
if __name__ == "__main__":
......