Commit a0d362ef authored by lujun

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix-dygraph-checkpoint

@@ -337,13 +337,13 @@ paddle.fluid.layers.reciprocal (ArgSpec(args=['x', 'name'], varargs=None, keywor
 paddle.fluid.layers.square (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '48dfb45d773dbc30126c3a7f777de5ee'))
 paddle.fluid.layers.softplus (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '459c5781e9d1dd88283b7c5769d7872a'))
 paddle.fluid.layers.softsign (ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '80846bcd4bd457207457a6d5411f4148'))
-paddle.fluid.layers.uniform_random (ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', -1.0, 1.0, 0)), ('document', '308b619af849caa82bbc31e897f5e641'))
+paddle.fluid.layers.uniform_random (ArgSpec(args=['shape', 'dtype', 'min', 'max', 'seed'], varargs=None, keywords=None, defaults=('float32', -1.0, 1.0, 0)), ('document', 'a8c4e972b7d6742c838a37abf407ed9a'))
 paddle.fluid.layers.hard_shrink (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', 'c142f5884f3255e0d6075c286bbd531e'))
 paddle.fluid.layers.cumsum (ArgSpec(args=['x', 'axis', 'exclusive', 'reverse'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '944d7c03057f5fc88bc78acd4d82f926'))
 paddle.fluid.layers.thresholded_relu (ArgSpec(args=['x', 'threshold'], varargs=None, keywords=None, defaults=(None,)), ('document', '90566ea449ea4c681435546e2f70610a'))
 paddle.fluid.layers.prior_box (ArgSpec(args=['input', 'image', 'min_sizes', 'max_sizes', 'aspect_ratios', 'variance', 'flip', 'clip', 'steps', 'offset', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, [1.0], [0.1, 0.1, 0.2, 0.2], False, False, [0.0, 0.0], 0.5, None, False)), ('document', '14cac0ee643fa6e026ad82aeeee75bd8'))
 paddle.fluid.layers.density_prior_box (ArgSpec(args=['input', 'image', 'densities', 'fixed_sizes', 'fixed_ratios', 'variance', 'clip', 'steps', 'offset', 'flatten_to_2d', 'name'], varargs=None, keywords=None, defaults=(None, None, None, [0.1, 0.1, 0.2, 0.2], False, [0.0, 0.0], 0.5, False, None)), ('document', 'a0d762bb08de9ce93bc780aa57cd5cd9'))
-paddle.fluid.layers.multi_box_head (ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)), ('document', 'a6ab47a2fe681e52fabb7057ddf0efdd'))
+paddle.fluid.layers.multi_box_head (ArgSpec(args=['inputs', 'image', 'base_size', 'num_classes', 'aspect_ratios', 'min_ratio', 'max_ratio', 'min_sizes', 'max_sizes', 'steps', 'step_w', 'step_h', 'offset', 'variance', 'flip', 'clip', 'kernel_size', 'pad', 'stride', 'name', 'min_max_aspect_ratios_order'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None, None, 0.5, [0.1, 0.1, 0.2, 0.2], True, False, 1, 0, 1, None, False)), ('document', 'fe9afaee481dd09f28866df22756466f'))
 paddle.fluid.layers.bipartite_match (ArgSpec(args=['dist_matrix', 'match_type', 'dist_threshold', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', '3ddb9b966f193900193a95a3df77c3c1'))
 paddle.fluid.layers.target_assign (ArgSpec(args=['input', 'matched_indices', 'negative_indices', 'mismatch_value', 'name'], varargs=None, keywords=None, defaults=(None, None, None)), ('document', 'c0b334f917828f95056f6ebe10907b1c'))
 paddle.fluid.layers.detection_output (ArgSpec(args=['loc', 'scores', 'prior_box', 'prior_box_var', 'background_label', 'nms_threshold', 'nms_top_k', 'keep_top_k', 'score_threshold', 'nms_eta'], varargs=None, keywords=None, defaults=(0, 0.3, 400, 200, 0.01, 1.0)), ('document', 'c33093a82a46e3091e789e5572588db1'))
@@ -358,7 +358,7 @@ paddle.fluid.layers.generate_mask_labels (ArgSpec(args=['im_info', 'gt_classes',
 paddle.fluid.layers.iou_similarity (ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '587845f60c5d97ffdf2dfd21da52eca1'))
 paddle.fluid.layers.box_coder (ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name', 'axis'], varargs=None, keywords=None, defaults=('encode_center_size', True, None, 0)), ('document', '032d0f4b7d8f6235ee5d91e473344f0e'))
 paddle.fluid.layers.polygon_box_transform (ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '0e5ac2507723a0b5adec473f9556799b'))
-paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gtscore', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '57fa96922e42db8f064c3fb77f2255e8'))
+paddle.fluid.layers.yolov3_loss (ArgSpec(args=['x', 'gt_box', 'gt_label', 'anchors', 'anchor_mask', 'class_num', 'ignore_thresh', 'downsample_ratio', 'gt_score', 'use_label_smooth', 'name'], varargs=None, keywords=None, defaults=(None, True, None)), ('document', '4d170807a13d33925d1049d2892832bf'))
 paddle.fluid.layers.yolo_box (ArgSpec(args=['x', 'img_size', 'anchors', 'class_num', 'conf_thresh', 'downsample_ratio', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '5566169a5ab993d177792c023c7fb340'))
 paddle.fluid.layers.box_clip (ArgSpec(args=['input', 'im_info', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '397e9e02b451d99c56e20f268fa03f2e'))
 paddle.fluid.layers.multiclass_nms (ArgSpec(args=['bboxes', 'scores', 'score_threshold', 'nms_top_k', 'keep_top_k', 'nms_threshold', 'normalized', 'nms_eta', 'background_label', 'name'], varargs=None, keywords=None, defaults=(0.3, True, 1.0, 0, None)), ('document', 'ca7d1107b6c5d2d6d8221039a220fde0'))
......
@@ -64,12 +64,19 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
   auto c_dims = ctx->GetInputDim("C0");
   PADDLE_ENFORCE_EQ(c_dims.size(), 2, "Input(C0)'s rank must be 2.");
-  PADDLE_ENFORCE_EQ(c_dims[1], D, "C0 dims should be N x %d.", D);
+  if (ctx->IsRuntime()) {
+    PADDLE_ENFORCE_EQ(c_dims[1], D, "C0 dims should be N x %d.", D);
+  }
   if (ctx->HasInput("H0")) {
     auto h_dims = ctx->GetInputDim("H0");
-    PADDLE_ENFORCE(h_dims == c_dims,
-                   "The dimension of Input(H0) and Input(C0) "
-                   "should be the same.");
+    PADDLE_ENFORCE_EQ(h_dims.size(), 2UL, "Input(H0)'s rank must be 2.");
+    if (ctx->IsRuntime() ||
+        (framework::product(c_dims) > 0 && framework::product(h_dims) > 0)) {
+      PADDLE_ENFORCE(h_dims == c_dims,
+                     "The dimension of Input(H0) and Input(C0) "
+                     "should be the same.");
+    }
   }
   auto atten_w_dims = ctx->GetInputDim("AttentionWeight");
@@ -79,6 +86,7 @@ void AttentionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
                     "AttentionWeight shapes must be (%d + %d) * 1.", M, D);
   PADDLE_ENFORCE_EQ(atten_w_dims[1], 1,
                     "AttentionWeight shapes must be (%d + %d) * 1.", M, D);
+
   if (ctx->HasInput("AttentionBias")) {
     auto atten_b_dims = ctx->GetInputDim("AttentionBias");
     PADDLE_ENFORCE_EQ(atten_b_dims.size(), 2,
......
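The hunk above is the pattern this merge applies to many operators: while the graph is being built a dimension may still be unspecified and is stored as -1, so exact-shape checks are deferred to runtime, or run early only when framework::product() of every involved shape is positive (fully known). The following is a standalone sketch of that idiom written outside Paddle; Dims, Product and CheckSameShape are made-up names for illustration only.

#include <cassert>
#include <cstdint>
#include <numeric>
#include <vector>

using Dims = std::vector<int64_t>;

// Product of all dimensions; any unknown (-1) entry makes it non-positive.
int64_t Product(const Dims& d) {
  return std::accumulate(d.begin(), d.end(), int64_t{1},
                         [](int64_t a, int64_t b) { return a * b; });
}

// Enforce x == y only when running for real or when both shapes are known.
void CheckSameShape(const Dims& x, const Dims& y, bool is_runtime) {
  if (is_runtime || (Product(x) > 0 && Product(y) > 0)) {
    assert(x == y && "the two inputs should have the same shape");
  }
}

int main() {
  CheckSameShape({-1, 3}, {8, 3}, false);  // compile time, unknown batch: skipped
  CheckSameShape({8, 3}, {8, 3}, true);    // runtime: enforced, passes
  return 0;
}

This mirrors the `ctx->IsRuntime() || (framework::product(...) > 0 && ...)` guards added in the hunks throughout this commit.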
@@ -32,10 +32,14 @@ class BprLossOp : public framework::OperatorWithKernel {
     int rank = x_dims.size();
     PADDLE_ENFORCE_EQ(rank, label_dims.size(),
                       "Input(X) and Input(Label) shall have the same rank.");
-    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
-                      framework::slice_ddim(label_dims, 0, rank - 1),
-                      "Input(X) and Input(Label) shall have the same shape "
-                      "except the last dimension.");
+
+    if (ctx->IsRuntime() || (framework::product(x_dims) > 0 &&
+                             framework::product(label_dims) > 0)) {
+      PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
+                        framework::slice_ddim(label_dims, 0, rank - 1),
+                        "Input(X) and Input(Label) shall have the same shape "
+                        "except the last dimension.");
+    }
     auto y_dims = x_dims;
     y_dims[rank - 1] = 1;
......
@@ -81,8 +81,10 @@ class WriteToArrayInferShape : public framework::InferShapeBase {
  public:
   void operator()(framework::InferShapeContext *context) const override {
     PADDLE_ENFORCE(context->HasInput("I"), "Must set the subscript index");
-    PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1,
-                      "The number of element of subscript index must be 1");
+    if (context->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(framework::product(context->GetInputDim("I")), 1,
+                        "The number of element of subscript index must be 1");
+    }
     if (!context->HasInput("X")) {
       return;
     }
......
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/data_norm_op.h"
+#include <memory>
 #include <string>
 #include "paddle/fluid/framework/data_layout.h"
 #ifdef PADDLE_WITH_MKLDNN
@@ -65,9 +66,11 @@ class DataNormOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize").size(), 1UL);
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum").size(), 1UL);
     PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum").size(), 1UL);
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize")[0], C);
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum")[0], C);
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum")[0], C);
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSize")[0], C);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSum")[0], C);
+      PADDLE_ENFORCE_EQ(ctx->GetInputDim("BatchSquareSum")[0], C);
+    }
     ctx->SetOutputDim("Y", x_dims);
     ctx->SetOutputDim("Means", {C});
......
@@ -171,8 +171,8 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
          The output of previous network is in shape [N, C, H, W], while H and W
          should be the same, H and W specify the grid size, each grid point predict
-         given number boxes, this given number, which following will be represented as S,
-         is specified by the number of anchors, In the second dimension(the channel
+         given number bounding boxes, this given number, which following will be represented as S,
+         is specified by the number of anchor clusters in each scale. In the second dimension(the channel
          dimension), C should be equal to S * (class_num + 5), class_num is the object
          category number of source dataset(such as 80 in coco dataset), so in the
          second(channel) dimension, apart from 4 box location coordinates x, y, w, h,
@@ -203,7 +203,7 @@ class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker {
          thresh, the confidence score loss of this anchor box will be ignored.
          Therefore, the yolov3 loss consist of three major parts, box location loss,
-         confidence score loss, and classification loss. The L2 loss is used for
+         confidence score loss, and classification loss. The L1 loss is used for
          box coordinates (w, h), and sigmoid cross entropy loss is used for box
          coordinates (x, y), confidence score loss and classification loss.
......
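As a concrete check of the channel formula C = S * (class_num + 5) in the yolov3_loss description above (the numbers below are illustrative defaults, not values taken from this diff): each predicted box carries 4 coordinates, 1 confidence score and class_num class scores.

#include <cstdio>

int main() {
  const int S = 3;                    // boxes predicted per grid point
  const int class_num = 80;           // e.g. COCO categories
  const int C = S * (class_num + 5);  // 5 = x, y, w, h + confidence
  std::printf("C = %d\n", C);         // prints "C = 255"
  return 0;
}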
@@ -31,13 +31,18 @@ class HuberLossOp : public framework::OperatorWithKernel {
     auto x_dims = ctx->GetInputDim("X");
     auto y_dims = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_EQ(x_dims, y_dims);
     PADDLE_ENFORCE_EQ(x_dims.size(), 2,
                       "The rank of Input(X) must be 2 and the shape is "
                       "[batch_size, 1].");
-    PADDLE_ENFORCE_EQ(x_dims[1], 1,
-                      "Each row of Input(X) contains a real value, "
-                      "so the 2nd dimension of Input(X) must be 1.");
+    if (ctx->IsRuntime() ||
+        (framework::product(x_dims) > 0 && framework::product(y_dims) > 0)) {
+      PADDLE_ENFORCE_EQ(x_dims, y_dims, "Shape of X and Y should be same");
+    }
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(x_dims[1], 1,
+                        "Each row of Input(X) contains a real value, "
+                        "so the 2nd dimension of Input(X) must be 1.");
+    }
     ctx->SetOutputDim("Residual", x_dims);
     ctx->SetOutputDim("Out", {x_dims[0], 1});
......
@@ -46,11 +46,18 @@ class LayerNormOp : public framework::OperatorWithKernel {
     int right = static_cast<int>(matrix_dim[1]);
     if (ctx->HasInput("Scale")) {
       PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale").size(), 1);
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right);
+
+      if (ctx->IsRuntime()) {
+        PADDLE_ENFORCE_EQ(ctx->GetInputDim("Scale")[0], right,
+                          "scale should with right");
+      }
     }
     if (ctx->HasInput("Bias")) {
       PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias").size(), 1);
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right);
+      if (ctx->IsRuntime()) {
+        PADDLE_ENFORCE_EQ(ctx->GetInputDim("Bias")[0], right,
+                          "bias should with right");
+      }
     }
     ctx->SetOutputDim("Y", ctx->GetInputDim("X"));
......
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/linear_chain_crf_op.h"
+#include <memory>
 namespace paddle {
@@ -152,12 +153,19 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
     auto transition_dims = ctx->GetInputDim("Transition");
     PADDLE_ENFORCE_EQ(transition_dims.size(), 2,
                       "The Input(Transition) should be a 2-D tensor.");
-    PADDLE_ENFORCE_EQ(
-        transition_dims[0] - 2, transition_dims[1],
-        "An invalid dimension for the Input(Transition), which should "
-        "be a 2-D tensor with shape [(D + 2) x D].");
-    PADDLE_ENFORCE_EQ(
-        emission_dims[1], transition_dims[1],
+    bool check = true;
+    if ((!ctx->IsRuntime()) &&
+        (transition_dims[0] <= 0 || transition_dims[1] <= 0)) {
+      check = false;
+    }
+    if (check) {
+      PADDLE_ENFORCE_EQ(
+          transition_dims[0] - 2, transition_dims[1],
+          "An invalid dimension for the Input(Transition), which should "
+          "be a 2-D tensor with shape [(D + 2) x D].");
+    }
+    PADDLE_INFERSHAPE_ENFORCE_EQ(
+        ctx, emission_dims[1], transition_dims[1],
         "The 2nd dimension of the Input(Emission) and the Input(Transition) "
         "should be equal to the tag number.");
@@ -165,8 +173,8 @@ class LinearChainCRFOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL,
                    "The Input(Label) should be a 2-D tensor with the 2nd "
                    "dimensions fixed to 1.");
-    PADDLE_ENFORCE_EQ(
-        emission_dims[0], label_dims[0],
+    PADDLE_INFERSHAPE_ENFORCE_EQ(
+        ctx, emission_dims[0], label_dims[0],
         "The height of Input(Emission) and the height of Input(Label) "
         "should be the same.");
@@ -211,12 +219,19 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
     auto transition_exps_dims = ctx->GetInputDim("TransitionExps");
     PADDLE_ENFORCE_EQ(transition_exps_dims.size(), 2,
                       "The Input(TransitionExps) should be a 2-D tensor.");
-    PADDLE_ENFORCE_EQ(
-        transition_exps_dims[0] - 2, transition_exps_dims[1],
-        "An invalid dimension for the Input(TransitionExps), which should "
-        "be a 2-D tensor with shape [(D + 2) x D].");
-    PADDLE_ENFORCE_EQ(
-        emission_exps_dims[1], transition_exps_dims[1],
+    bool check = true;
+    if ((!ctx->IsRuntime()) &&
+        (transition_exps_dims[0] <= 0 || transition_exps_dims[1] <= 0)) {
+      check = false;
+    }
+    if (check) {
+      PADDLE_ENFORCE_EQ(
+          transition_exps_dims[0] - 2, transition_exps_dims[1],
+          "An invalid dimension for the Input(TransitionExps), which should "
+          "be a 2-D tensor with shape [(D + 2) x D].");
+    }
+    PADDLE_INFERSHAPE_ENFORCE_EQ(
+        ctx, emission_exps_dims[1], transition_exps_dims[1],
        "The 2nd dimension of the Input(EmissionExps) and the "
        "Input(TransitionExps) should be equal to the tag number.");
@@ -224,8 +239,8 @@ class LinearChainCRFGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(label_dims.size() == 2UL && label_dims[1] == 1UL,
                    "The Input(Label) should be a 2-D tensor with the 2nd "
                    "dimensions fixed to 1.");
-    PADDLE_ENFORCE_EQ(
-        emission_exps_dims[0], label_dims[0],
+    PADDLE_INFERSHAPE_ENFORCE_EQ(
+        ctx, emission_exps_dims[0], label_dims[0],
         "The height of Input(EmissionExps) and the height of Input(Label) "
         "should be the same.");
......
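Several of the checks above move to PADDLE_INFERSHAPE_ENFORCE_EQ. The macro's definition is not part of this diff; the sketch below only models the behaviour its call sites suggest (skip the comparison while a dimension is still unknown at compile time) and InferShapeEnforceEq is an invented name, not Paddle API.

#include <cassert>
#include <cstdint>

// Assumed behaviour only: compare two dimension values when the op is running
// for real, or when both values are already known (non-negative).
void InferShapeEnforceEq(bool is_runtime, int64_t a, int64_t b,
                         const char* msg) {
  if (!is_runtime && (a < 0 || b < 0)) return;  // still unknown: defer
  (void)msg;
  assert(a == b);
}

int main() {
  InferShapeEnforceEq(false, -1, 7, "heights should match");  // deferred
  InferShapeEnforceEq(true, 7, 7, "heights should match");    // enforced
  return 0;
}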
@@ -41,10 +41,11 @@ class AccuracyOp : public framework::OperatorWithKernel {
     // it's the output of topk.
     PADDLE_ENFORCE_EQ(label_dim.size(), 2, "label's rank must be 2.");
-    PADDLE_ENFORCE_EQ(label_dim[1], 1, "label's second dimension must be 1");
-    PADDLE_ENFORCE_EQ(inference_dim[0], label_dim[0],
-                      "the inference tensor's num_rows must be"
-                      " the same as label.");
+    PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, label_dim[1], 1,
+                                 "label's second dimension must be 1");
+    PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, inference_dim[0], label_dim[0],
+                                 "the inference tensor's num_rows must be"
+                                 " the same as label.");
     ctx->SetOutputDim("Accuracy", {1});
     ctx->SetOutputDim("Correct", {1});
......
@@ -28,12 +28,13 @@ class AucOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput("Label"),
                    "Input of Label should not be null.");
     auto predict_width = ctx->GetInputDim("Predict")[1];
-    PADDLE_ENFORCE_EQ(predict_width, 2, "Only support binary classification");
+    PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, predict_width, 2,
+                                 "Only support binary classification");
     auto predict_height = ctx->GetInputDim("Predict")[0];
     auto label_height = ctx->GetInputDim("Label")[0];
-    PADDLE_ENFORCE_EQ(predict_height, label_height,
-                      "Out and Label should have same height.");
+    PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, predict_height, label_height,
+                                 "Out and Label should have same height.");
     int num_pred_buckets = ctx->Attrs().Get<int>("num_thresholds") + 1;
     int slide_steps = ctx->Attrs().Get<int>("slide_steps");
......
@@ -40,30 +40,40 @@ class PrecisionRecallOp : public framework::OperatorWithKernel {
     auto max_probs_dims = ctx->GetInputDim("MaxProbs");
     auto labels_dims = ctx->GetInputDim("Labels");
-    PADDLE_ENFORCE_EQ(max_probs_dims[1], 1,
-                      "Each instance contains one max probability, so the "
-                      "shape of Input(MaxProbs) should be [batch_size, 1].");
-    PADDLE_ENFORCE_EQ(ctx->GetInputDim("Indices"), max_probs_dims,
-                      "The shape of Input(Indices) should be [batch_size, 1].");
-    PADDLE_ENFORCE_EQ(max_probs_dims[0], labels_dims[0],
-                      "The 1st dimension of Input(MaxProbs) and "
-                      "Input(Labels) both are batch_size and the shape should "
-                      "be the same.");
-    PADDLE_ENFORCE_EQ(labels_dims[1], 1,
-                      "The 2nd dimension of Input(Labels) contains instance "
-                      "label and the shape should be equal to 1.");
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(max_probs_dims[1], 1,
+                        "Each instance contains one max probability, so the "
+                        "shape of Input(MaxProbs) should be [batch_size, 1].");
+      PADDLE_ENFORCE_EQ(
+          ctx->GetInputDim("Indices"), max_probs_dims,
+          "The shape of Input(Indices) should bes same with max_probs_dims");
+      PADDLE_ENFORCE_EQ(
+          max_probs_dims[0], labels_dims[0],
+          "The 1st dimension of Input(MaxProbs) and "
+          "Input(Labels) both are batch_size and the shape should "
+          "be the same.");
+      PADDLE_ENFORCE_EQ(labels_dims[1], 1,
+                        "The 2nd dimension of Input(Labels) contains instance "
+                        "label and the shape should be equal to 1.");
+    }
     if (ctx->HasInput("Weights")) {
       auto weights_dims = ctx->GetInputDim("Weights");
-      PADDLE_ENFORCE_EQ(weights_dims,
-                        framework::make_ddim({max_probs_dims[0], 1}),
-                        "The shape of Input(Weights) should be "
-                        "[batch_size, 1].");
+
+      if (ctx->IsRuntime()) {
+        PADDLE_ENFORCE_EQ(weights_dims,
+                          framework::make_ddim({max_probs_dims[0], 1}),
+                          "The shape of Input(Weights) should be "
+                          "[batch_size, 1].");
+      }
     }
     if (ctx->HasInput("StatesInfo")) {
       auto states_dims = ctx->GetInputDim("StatesInfo");
-      PADDLE_ENFORCE_EQ(states_dims, framework::make_ddim({cls_num, 4}),
-                        "The shape of Input(StatesInfo) should be "
-                        "[class_number, 4].");
+
+      if (ctx->IsRuntime()) {
+        PADDLE_ENFORCE_EQ(states_dims, framework::make_ddim({cls_num, 4}),
+                          "The shape of Input(StatesInfo) should be "
+                          "[class_number, 4].");
+      }
     }
     // Layouts of BatchMetrics and AccumMetrics both are:
......
@@ -14,6 +14,7 @@ limitations under the License. */
 #include "paddle/fluid/operators/minus_op.h"
+#include <memory>
 #include <string>
 #include <vector>
@@ -38,9 +39,12 @@ class MinusOp : public framework::OperatorWithKernel {
     auto x_dims = ctx->GetInputDim("X");
     auto y_dims = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_EQ(
-        x_dims, y_dims,
-        "Minus operator must take two tensor with same num of elements");
+    if (ctx->IsRuntime() ||
+        (framework::product(x_dims) > 0 && framework::product(y_dims) > 0)) {
+      PADDLE_ENFORCE_EQ(
+          x_dims, y_dims,
+          "Minus operator must take two tensor with same num of elements");
+    }
     ctx->SetOutputDim("Out", x_dims);
     ctx->ShareLoD("X", /*->*/ "Out");
   }
......
@@ -28,9 +28,16 @@ class ModifiedHuberLossOp : public framework::OperatorWithKernel {
     auto x_dims = ctx->GetInputDim("X");
     auto y_dims = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_EQ(x_dims, y_dims, "The shape of X and Y must be the same.");
     PADDLE_ENFORCE_EQ(x_dims.size(), 2, "The tensor rank of X must be 2.");
-    PADDLE_ENFORCE_EQ(x_dims[1], 1, "The 2nd dimension of X must be 1.");
+    if (ctx->IsRuntime() ||
+        (framework::product(x_dims) > 0 && framework::product(y_dims) > 0)) {
+      PADDLE_ENFORCE_EQ(x_dims, y_dims,
+                        "The shape of X and Y must be the same.");
+    }
+
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(x_dims[1], 1, "The 2nd dimension of X must be 1.");
+    }
     ctx->SetOutputDim("IntermediateVal", x_dims);
     ctx->SetOutputDim("Out", {x_dims[0], 1});
@@ -90,11 +97,13 @@ class ModifiedHuberLossGradOp : public framework::OperatorWithKernel {
     auto intermediate_dims = ctx->GetInputDim("IntermediateVal");
     auto out_grad_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    PADDLE_ENFORCE_EQ(
-        intermediate_dims, x_dims,
-        "The shape of X and intermediate value must be the same.");
-    PADDLE_ENFORCE_EQ(out_grad_dims, x_dims,
-                      "The shape of Input(Out@Grad) and X must be the same.");
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE_EQ(
+          intermediate_dims, x_dims,
+          "The shape of X and intermediate value must be the same.");
+      PADDLE_ENFORCE_EQ(out_grad_dims, x_dims,
+                        "The shape of Input(Out@Grad) and X must be the same.");
+    }
     if (ctx->HasOutput(framework::GradVarName("X"))) {
       ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
......
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/pad_constant_like_op.h"
+#include <memory>
 namespace paddle {
 namespace operators {
@@ -38,8 +39,16 @@ class PadConstantLikeOp : public framework::OperatorWithKernel {
                       "The dimention of X and Y should be the same.");
     for (int i = 0; i < x_dim.size(); ++i) {
-      PADDLE_ENFORCE_GE(x_dim[i], y_dim[i]);
+      if ((!ctx->IsRuntime()) && ((x_dim[i] == -1) || (y_dim[i] == -1))) {
+        continue;
+      } else {
+        PADDLE_ENFORCE_GE(
+            x_dim[i], y_dim[i],
+            "expected X_dim[i] >= Y_dim[i], but received %d < %d for dim %d",
+            x_dim[i], y_dim[i], i);
+      }
     }
+
     ctx->SetOutputDim("Out", x_dim);
     ctx->ShareLoD("X", /*->*/ "Out");
   }
@@ -162,7 +171,14 @@ class PadConstantLikeOpGrad : public framework::OperatorWithKernel {
       ctx->ShareLoD("Y", /*->*/ y_grad_name);
       for (int i = 0; i < y_dim.size(); ++i) {
-        PADDLE_ENFORCE_GE(dout_dim[i], y_dim[i]);
+        if ((!ctx->IsRuntime()) && ((dout_dim[i] == -1) || (y_dim[i] == -1))) {
+          continue;
+        } else {
+          PADDLE_ENFORCE_GE(dout_dim[i], y_dim[i],
+                            "expected Out_dim[i] >= Y_dim[i], but received %d "
+                            "< %d for dim %d",
+                            dout_dim[i], y_dim[i], i);
+        }
       }
     }
   }
......
@@ -34,9 +34,16 @@ class PadOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_EQ(x_dim.size() * 2, int64_t(paddings.size()),
                       "Size of paddings should be equal to 2 * dimension size "
                       "of input tensor.");
+    for (size_t i = 0; i < paddings.size(); ++i) {
+      PADDLE_ENFORCE_GE(paddings[i], 0, "paddings should >= 0.");
+    }
     std::vector<int64_t> out_dims(x_dim.size());
     for (int i = 0; i < x_dim.size(); ++i) {
-      out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1];
+      if ((!ctx->IsRuntime()) && (x_dim[i] == -1)) {
+        out_dims[i] = -1;
+      } else {
+        out_dims[i] = x_dim[i] + paddings[i * 2] + paddings[i * 2 + 1];
+      }
     }
     ctx->SetOutputDim("Out", framework::make_ddim(out_dims));
     if (out_dims[0] == x_dim[0]) {
@@ -100,18 +107,14 @@ class PadOpGrad : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
   void InferShape(framework::InferShapeContext* ctx) const override {
-    auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-    auto& paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-    for (int i = 0; i < dout_dims.size(); ++i) {
-      dout_dims[i] -= (paddings[i * 2] + paddings[i * 2 + 1]);
-    }
     auto x_grad_name = framework::GradVarName("X");
     if (ctx->HasOutput(x_grad_name)) {
       auto dout_dims = ctx->GetInputDim(framework::GradVarName("Out"));
       auto& paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
       for (int i = 0; i < dout_dims.size(); ++i) {
-        dout_dims[i] -= (paddings[i * 2] + paddings[i * 2 + 1]);
+        if (ctx->IsRuntime() || (dout_dims[i] != -1)) {
+          dout_dims[i] -= (paddings[i * 2] + paddings[i * 2 + 1]);
+        }
       }
       ctx->SetOutputDim(x_grad_name, dout_dims);
     }
......
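Replaying the output-shape rule from the PadOp hunk above on a concrete case (a standalone model with illustrative values): an axis whose size is still unknown (-1) stays unknown, and every other axis grows by its two paddings.

#include <cstdio>
#include <vector>

int main() {
  const std::vector<long> x_dim = {-1, 16};        // unknown batch, width 16
  const std::vector<int> paddings = {0, 0, 2, 3};  // (before, after) per axis
  std::vector<long> out_dims(x_dim.size());
  for (size_t i = 0; i < x_dim.size(); ++i) {
    out_dims[i] = (x_dim[i] == -1)
                      ? -1
                      : x_dim[i] + paddings[2 * i] + paddings[2 * i + 1];
  }
  std::printf("out_dims = [%ld, %ld]\n", out_dims[0], out_dims[1]);  // [-1, 21]
  return 0;
}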
@@ -61,23 +61,31 @@ class PositiveNegativePairOp : public framework::OperatorWithKernel {
     auto query_dim = ctx->GetInputDim("QueryID");
     PADDLE_ENFORCE_EQ(score_dim.size(), 2, "Score should be a 2-D tensor.");
     PADDLE_ENFORCE_EQ(label_dim.size(), 2, "Label should be a 2-D tensor.");
-    PADDLE_ENFORCE_EQ(
-        label_dim[0], score_dim[0],
-        "Tensor Score and Label should have the same height (batch size).");
-    PADDLE_ENFORCE_EQ(label_dim[1], 1,
-                      "The width of Label should be 1, i.e. each item should "
-                      "have a scalar label.");
-    PADDLE_ENFORCE(query_dim == label_dim,
-                   "QueryID should have the same shape as Label.");
-    if (ctx->HasInput("Weight")) {
-      PADDLE_ENFORCE(ctx->GetInputDim("Weight") == label_dim,
-                     "Weight should have the same shape as Label.");
-    }
-    int column = ctx->Attrs().Get<int>("column");
-    auto depth = score_dim[1];
-    PADDLE_ENFORCE(column < depth && column >= -depth,
-                   "Attribute column should be in the range of [-%l, %l)",
-                   depth, depth);
+
+    if (ctx->IsRuntime() ||
+        (score_dim[0] > 0 && label_dim[0] > 0 && query_dim[0] > 0)) {
+      PADDLE_ENFORCE_EQ(
+          label_dim[0], score_dim[0],
+          "Tensor Score and Label should have the same height (batch size).");
+
+      PADDLE_ENFORCE_EQ(label_dim[1], 1,
+                        "The width of Label should be 1, i.e. each item should "
+                        "have a scalar label.");
+
+      PADDLE_ENFORCE(query_dim == label_dim,
+                     "QueryID should have the same shape as Label.");
+
+      if (ctx->HasInput("Weight")) {
+        PADDLE_ENFORCE(ctx->GetInputDim("Weight") == label_dim,
+                       "Weight should have the same shape as Label.");
+      }
+
+      int column = ctx->Attrs().Get<int>("column");
+      auto depth = score_dim[1];
+      PADDLE_ENFORCE(column < depth && column >= -depth,
+                     "Attribute column should be in the range of [-%l, %l)",
+                     depth, depth);
+    }
     ctx->SetOutputDim("PositivePair", scalar_dim);
     ctx->SetOutputDim("NegativePair", scalar_dim);
......
@@ -37,9 +37,11 @@ class ROIAlignOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(rois_dims.size() == 2,
                    "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
                    "given as [[x1, y1, x2, y2], ...].");
-    PADDLE_ENFORCE(rois_dims[1] == 4,
-                   "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
-                   "given as [[x1, y1, x2, y2], ...].");
+    if (ctx->IsRuntime()) {
+      PADDLE_ENFORCE(rois_dims[1] == 4,
+                     "ROIs should be a 2-D LoDTensor of shape (num_rois, 4)"
+                     "given as [[x1, y1, x2, y2], ...].");
+    }
     int pooled_height = ctx->Attrs().Get<int>("pooled_height");
     int pooled_width = ctx->Attrs().Get<int>("pooled_width");
     float spatial_scale = ctx->Attrs().Get<float>("spatial_scale");
......
@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-
 #include "paddle/fluid/operators/sample_logits_op.h"
 #include <memory>
 #include "paddle/fluid/operators/math/sample_prob.h"
@@ -141,7 +140,10 @@ class SampleLogitsOp : public framework::OperatorWithKernel {
                    "The labels should be a 2-D tensor.");
     const int num_samples = ctx->Attrs().Get<int>("num_samples");
-    const int num_sampled_classes = labels_dims[1] + num_samples;
+    int num_sampled_classes = labels_dims[1] + num_samples;
+    if ((!ctx->IsRuntime()) && labels_dims[1] <= 0) {
+      num_sampled_classes = -1;
+    }
     ctx->SetOutputDim("Samples", {logits_dims[0], num_sampled_classes});
     ctx->SetOutputDim("Probabilities", {logits_dims[0], num_sampled_classes});
     ctx->SetOutputDim("SampledLogits", {logits_dims[0], num_sampled_classes});
......
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/smooth_l1_loss_op.h"
+#include <memory>
 namespace paddle {
 namespace operators {
@@ -27,15 +28,39 @@ class SmoothL1LossOp : public framework::OperatorWithKernel {
     auto x_dims = ctx->GetInputDim("X");
     auto y_dims = ctx->GetInputDim("Y");
-    PADDLE_ENFORCE_EQ(x_dims, y_dims);
+    bool check = true;
+    if ((!ctx->IsRuntime()) &&
+        (framework::product(x_dims) <= 0 || framework::product(y_dims) <= 0)) {
+      check = false;
+    }
+    if (check) {
+      PADDLE_ENFORCE_EQ(x_dims, y_dims);
+    }
     PADDLE_ENFORCE_GE(x_dims.size(), 2,
                       "The tensor rank of Input(X) should not be less than 2.");
     if (ctx->HasInput("InsideWeight")) {
       PADDLE_ENFORCE(ctx->HasInput("OutsideWeight"),
                      "If weights are provided, must specify both "
                      "inside and outside weights.");
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("InsideWeight"), x_dims);
-      PADDLE_ENFORCE_EQ(ctx->GetInputDim("OutsideWeight"), x_dims);
+      auto dims = ctx->GetInputDim("InsideWeight");
+      bool check = true;
+      if ((!ctx->IsRuntime()) &&
+          (framework::product(dims) <= 0 || framework::product(x_dims) <= 0)) {
+        check = false;
+      }
+      if (check) {
+        PADDLE_ENFORCE_EQ(dims, x_dims);
+      }
+
+      dims = ctx->GetInputDim("OutsideWeight");
+      check = true;
+      if ((!ctx->IsRuntime()) &&
+          (framework::product(dims) <= 0 || framework::product(x_dims) <= 0)) {
+        check = false;
+      }
+      if (check) {
+        PADDLE_ENFORCE_EQ(dims, x_dims);
+      }
     }
     ctx->SetOutputDim("Diff", x_dims);
@@ -110,11 +135,11 @@ class SmoothL1LossGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_GE(out_dims.size(), 2,
                       "The tensor rank of Input(Out@Grad) should be 2.");
-    PADDLE_ENFORCE_EQ(out_dims[0], in_dims[0],
-                      "The 1st dimension of Input(Out@Grad) must be "
-                      "same as input.");
-    PADDLE_ENFORCE_EQ(out_dims[1], 1,
-                      "The 2nd dimension of Input(Out@Grad) must be 1.");
+    PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, out_dims[0], in_dims[0],
+                                 "The 1st dimension of Input(Out@Grad) must be "
+                                 "same as input.");
+    PADDLE_INFERSHAPE_ENFORCE_EQ(
+        ctx, out_dims[1], 1, "The 2nd dimension of Input(Out@Grad) must be 1.");
     auto x_grad_name = framework::GradVarName("X");
     auto y_grad_name = framework::GradVarName("Y");
......
@@ -106,24 +106,36 @@ class SoftmaxWithCrossEntropyOp : public framework::OperatorWithKernel {
     auto logits_dims = ctx->GetInputDim("Logits");
     auto labels_dims = ctx->GetInputDim("Label");
+    int rank = logits_dims.size();
     PADDLE_ENFORCE_EQ(
-        logits_dims.size(), 2UL,
-        "The input of softmax_with_cross_entropy should be a 2-D tensor.");
-    PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL,
-                      "The labels should be a 2-D tensor.");
+        rank, labels_dims.size(),
+        "Input(logits) and Input(Label) shall have the same rank.");
+    bool check = ctx->IsRuntime() || (framework::product(logits_dims) > 0 &&
+                                      framework::product(labels_dims) > 0);
+    if (check) {
+      PADDLE_ENFORCE_EQ(framework::slice_ddim(logits_dims, 0, rank - 1),
+                        framework::slice_ddim(labels_dims, 0, rank - 1),
+                        "Input(X) and Input(Label) shall have the same shape "
+                        "except the last dimension.");
+    }
     if (ctx->Attrs().Get<bool>("soft_label")) {
-      PADDLE_ENFORCE_EQ(logits_dims[1], labels_dims[1],
-                        "If Attr(soft_label) == true, the 2nd dimension of "
-                        "Input(X) and Input(Label) should be equal.");
+      if (check) {
+        PADDLE_ENFORCE_EQ(logits_dims[rank - 1], labels_dims[rank - 1],
+                          "If Attr(soft_label) == true, the last dimension of "
+                          "Input(X) and Input(Label) should be equal.");
+      }
     } else {
-      PADDLE_ENFORCE_EQ(labels_dims[1], 1UL,
-                        "If Attr(soft_label) == false, the 2nd dimension of "
-                        "Input(Label) should be 1.");
+      PADDLE_ENFORCE_EQ(labels_dims[rank - 1], 1UL,
+                        "If Attr(softLabel) == false, the last dimension of "
+                        "Input(Label) should be 1.");
     }
     ctx->SetOutputDim("Softmax", logits_dims);
-    ctx->SetOutputDim("Loss", {logits_dims[0], 1});
+    auto loss_dims = logits_dims;
+    loss_dims[rank - 1] = 1;
+    ctx->SetOutputDim("Loss", loss_dims);
     ctx->ShareLoD("Logits", /*->*/ "Softmax");
     ctx->ShareLoD("Logits", /*->*/ "Loss");
@@ -152,16 +164,33 @@ class SoftmaxWithCrossEntropyOpGrad : public framework::OperatorWithKernel {
     auto softmax_dims = ctx->GetInputDim("Softmax");
     auto labels_dims = ctx->GetInputDim("Label");
-    PADDLE_ENFORCE_EQ(labels_dims.size(), 2UL,
-                      "The labels should be a 2-D tensor.");
+    int rank = softmax_dims.size();
+    PADDLE_ENFORCE_EQ(
+        rank, labels_dims.size(),
+        "Input(logits) and Input(Label) shall have the same rank.");
+    bool check = true;
+    if ((!ctx->IsRuntime()) && (framework::product(softmax_dims) <= 0 ||
+                                framework::product(labels_dims) <= 0)) {
+      check = false;
+    }
+    if (check) {
+      PADDLE_ENFORCE_EQ(
+          framework::slice_ddim(softmax_dims, 0, rank - 1),
+          framework::slice_ddim(labels_dims, 0, rank - 1),
+          "Input(Softmax) and Input(Label) shall have the same shape "
+          "except the last dimension.");
+    }
     if (ctx->Attrs().Get<bool>("soft_label")) {
-      PADDLE_ENFORCE_EQ(softmax_dims[1], labels_dims[1],
-                        "When Attr(soft_label) == true, the 2nd dimension of "
-                        "Input(X) and Input(Label) should be equal.");
+      if (check) {
+        PADDLE_ENFORCE_EQ(softmax_dims[rank - 1], labels_dims[rank - 1],
+                          "If Attr(soft_label) == true, the last dimension of "
+                          "Input( Softmax) and Input(Label) should be equal.");
+      }
     } else {
-      PADDLE_ENFORCE_EQ(labels_dims[1], 1UL,
-                        "When Attr(soft_label) == false, the 2nd dimension of "
+      PADDLE_ENFORCE_EQ(labels_dims[rank - 1], 1UL,
+                        "If Attr(softLabel) == false, the last dimension of "
                         "Input(Label) should be 1.");
     }
......
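The InferShape change above lets logits and label carry any rank as long as all leading dimensions match; the loss then keeps those leading dimensions and collapses the class dimension to 1. A small standalone replay of that bookkeeping (the shapes are illustrative):

#include <cstdio>
#include <vector>

int main() {
  // e.g. [batch, time, classes] logits with hard labels of shape [batch, time, 1]
  const std::vector<long> logits_dims = {32, 50, 1000};
  const std::vector<long> labels_dims = {32, 50, 1};

  // All dimensions except the last one must match.
  bool same_leading = true;
  for (size_t i = 0; i + 1 < logits_dims.size(); ++i) {
    same_leading = same_leading && (logits_dims[i] == labels_dims[i]);
  }

  // Loss keeps the leading dims and collapses the class dim to 1.
  std::vector<long> loss_dims = logits_dims;
  loss_dims.back() = 1;  // -> {32, 50, 1}

  std::printf("leading dims match: %d, loss_dims = [%ld, %ld, %ld]\n",
              same_leading, loss_dims[0], loss_dims[1], loss_dims[2]);
  return 0;
}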
@@ -400,9 +400,15 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
     auto soft_label = context.Attr<bool>("soft_label");
     auto ignore_index = context.Attr<int>("ignore_index");
+
+    int rank = logits->dims().size();
     if (soft_label) {
-      int batch_size = logits->dims()[0];
-      int feature_size = logits->dims()[1];
+      int batch_size = 1;
+      for (int i = 0; i < rank - 1; ++i) {
+        batch_size *= logits->dims()[i];
+      }
+
+      int feature_size = logits->dims()[rank - 1];
       auto* logits_data = logits->data<T>();
       auto* labels_data = labels->data<T>();
       SoftmaxWithCrossEntropyFusedKernel(
@@ -410,14 +416,23 @@ class SoftmaxWithCrossEntropyCUDAKernel : public framework::OpKernel<T> {
           feature_size, context.cuda_device_context().stream());
     } else {
       if (!context.Attr<bool>("numeric_stable_mode")) {
-        math::SoftmaxCUDNNFunctor<T>()(context.cuda_device_context(), logits,
-                                       softmax);
+        // reshape to 2d
+        Tensor logits_2d = framework::ReshapeToMatrix(*logits, rank - 1);
+        Tensor softmax_2d = framework::ReshapeToMatrix(*softmax, rank - 1);
+        Tensor loss_2d = framework::ReshapeToMatrix(*loss, rank - 1);
+        Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1);
+
+        math::SoftmaxCUDNNFunctor<T>()(context.cuda_device_context(),
+                                       &logits_2d, &softmax_2d);
         math::CrossEntropyFunctor<platform::CUDADeviceContext, T>()(
-            context.cuda_device_context(), loss, softmax, labels, false,
-            ignore_index);
+            context.cuda_device_context(), &loss_2d, &softmax_2d, &labels_2d,
+            false, ignore_index);
       } else {
-        int batch_size = logits->dims()[0];
-        int feature_size = logits->dims()[1];
+        int batch_size = 1;
+        for (int i = 0; i < rank - 1; ++i) {
+          batch_size *= logits->dims()[i];
+        }
+        int feature_size = logits->dims()[rank - 1];
         auto* logits_data = logits->data<T>();
         auto* labels_data = labels->data<int64_t>();
         HardLabelSoftmaxWithCrossEntropy<T>(
@@ -443,8 +458,13 @@ class SoftmaxWithCrossEntropyGradCUDAKernel : public framework::OpKernel<T> {
         context.device_context(), logit_grad);
     T* logit_grad_data = logit_grad->data<T>();
-    const int batch_size = logit_grad->dims()[0];
-    const int class_num = logit_grad->dims()[1];
+
+    int rank = logit_grad->dims().size();
+    int batch_size = 1;
+    for (int i = 0; i < rank - 1; ++i) {
+      batch_size *= logit_grad->dims()[i];
+    }
+    const int class_num = logit_grad->dims()[rank - 1];
     int block = 512;
     auto stream = context.cuda_device_context().stream();
     auto ignore_index = context.Attr<int>("ignore_index");
......
@@ -40,15 +40,22 @@ class SoftmaxWithCrossEntropyKernel : public framework::OpKernel<T> {
     softmax->mutable_data<T>(context.GetPlace());
     loss->mutable_data<T>(context.GetPlace());
-    int axis_dim = logits->dims()[logits->dims().size() - 1];
+    // reshape to 2D tensor
+    int rank = logits->dims().size();
+    Tensor logits_2d = framework::ReshapeToMatrix(*logits, rank - 1);
+    Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1);
+    Tensor loss_2d = framework::ReshapeToMatrix(*loss, rank - 1);
+    Tensor softmax_2d = framework::ReshapeToMatrix(*softmax, rank - 1);
+
+    int axis_dim = logits->dims()[rank - 1];
     auto& dev_ctx =
         context.template device_context<platform::CPUDeviceContext>();
     math::SoftmaxFunctor<platform::CPUDeviceContext, T, false>()(
-        dev_ctx, axis_dim, logits, softmax);
+        dev_ctx, axis_dim, &logits_2d, &softmax_2d);
     math::CrossEntropyFunctor<platform::CPUDeviceContext, T>()(
-        dev_ctx, loss, softmax, labels, context.Attr<bool>("soft_label"),
-        context.Attr<int>("ignore_index"));
+        dev_ctx, &loss_2d, &softmax_2d, &labels_2d,
+        context.Attr<bool>("soft_label"), context.Attr<int>("ignore_index"));
   }
 };
@@ -63,13 +70,19 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
         context.Output<Tensor>(framework::GradVarName("Logits"));
     logit_grad->ShareDataWith(*context.Input<Tensor>("Softmax"));
-    const int class_num = logit_grad->dims()[1];
-    auto out_grad_mat = EigenMatrix<T>::From(*out_grad);
-    auto logit_grad_mat = EigenMatrix<T>::From(*logit_grad);
+
+    int rank = logit_grad->dims().size();
+    const int class_num = logit_grad->dims()[rank - 1];
+    // reshape to 2d
+    Tensor logit_grad_2d = framework::ReshapeToMatrix(*logit_grad, rank - 1);
+    Tensor out_grad_2d = framework::ReshapeToMatrix(*out_grad, rank - 1);
+    auto out_grad_mat = EigenMatrix<T>::From(out_grad_2d);
+    auto logit_grad_mat = EigenMatrix<T>::From(logit_grad_2d);
     auto& place = *context.template device_context<platform::CPUDeviceContext>()
                        .eigen_device();
     if (context.Attr<bool>("soft_label")) {
-      auto lbl_mat = EigenMatrix<T>::From(*labels);
+      Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1);
+      auto lbl_mat = EigenMatrix<T>::From(labels_2d);
       logit_grad_mat.device(place) =
           out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num)) *
           (logit_grad_mat - lbl_mat);
@@ -78,7 +91,8 @@ class SoftmaxWithCrossEntropyGradKernel : public framework::OpKernel<T> {
           logit_grad_mat *
           out_grad_mat.broadcast(Eigen::DSizes<int, 2>(1, class_num));
-    const int batch_size = logit_grad->dims()[0];
+
+    const int batch_size = logit_grad_2d.dims()[0];
     const int64_t* label_data = labels->data<int64_t>();
     T* logit_grad_data = logit_grad->data<T>();
     const T* out_grad_data = out_grad->data<T>();
......
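Both kernels above lean on framework::ReshapeToMatrix(tensor, rank - 1) to fold every leading dimension into rows, so the existing 2-D softmax and cross-entropy functors keep working for inputs of any rank. The helper below re-derives only the shape arithmetic as a standalone illustration; FlattenToMatrix is an invented name, not Paddle code.

#include <cstdio>
#include <utility>
#include <vector>

// Fold dims [0, col_start) into rows and [col_start, rank) into columns.
std::pair<long, long> FlattenToMatrix(const std::vector<long>& dims,
                                      int col_start) {
  long rows = 1, cols = 1;
  for (int i = 0; i < col_start; ++i) rows *= dims[i];
  for (int i = col_start; i < static_cast<int>(dims.size()); ++i)
    cols *= dims[i];
  return {rows, cols};
}

int main() {
  // A rank-3 logits tensor [batch, time, classes] viewed as a matrix with
  // col_start = rank - 1, mirroring ReshapeToMatrix(*logits, rank - 1) above.
  const auto shape = FlattenToMatrix({32, 50, 1000}, 2);
  std::printf("%ld x %ld\n", shape.first, shape.second);  // 1600 x 1000
  return 0;
}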
...@@ -40,19 +40,44 @@ class SpaceToDepthOp : public framework::OperatorWithKernel { ...@@ -40,19 +40,44 @@ class SpaceToDepthOp : public framework::OperatorWithKernel {
auto blocksize = ctx->Attrs().Get<int64_t>("blocksize"); auto blocksize = ctx->Attrs().Get<int64_t>("blocksize");
PADDLE_ENFORCE_GT(blocksize, 1, "The blocksize should be Greater than 1"); PADDLE_ENFORCE_GT(blocksize, 1, "The blocksize should be Greater than 1");
PADDLE_ENFORCE_GT(x_dims[1], 0, "input channel should be Greater than 0"); if (ctx->IsRuntime()) {
PADDLE_ENFORCE_GT(x_dims[2], 0, "input Height should be Greater than 0"); PADDLE_ENFORCE_GT(x_dims[1], 0, "input channel should be Greater than 0");
PADDLE_ENFORCE_GT(x_dims[3], 0, "input Width should be Greater than 0"); PADDLE_ENFORCE_GT(x_dims[2], 0, "input Height should be Greater than 0");
PADDLE_ENFORCE_GT(x_dims[3], 0, "input Width should be Greater than 0");
PADDLE_ENFORCE_EQ(x_dims[1] % (blocksize * blocksize), 0,
"input channel should be divisible of the square of " PADDLE_ENFORCE_EQ(x_dims[1] % (blocksize * blocksize), 0,
"SpaceToDepthOp blocksize"); "input channel should be divisible of the square of "
PADDLE_ENFORCE_EQ(x_dims[2] % (blocksize), 0, "SpaceToDepthOp blocksize");
"input Height should be divisible of the square of " PADDLE_ENFORCE_EQ(x_dims[2] % (blocksize), 0,
"SpaceToDepthOp blocksize"); "input Height should be divisible of the square of "
PADDLE_ENFORCE_EQ(x_dims[3] % (blocksize), 0, "SpaceToDepthOp blocksize");
"input Width should be divisible of the square of " PADDLE_ENFORCE_EQ(x_dims[3] % (blocksize), 0,
"SpaceToDepthOp blocksize"); "input Width should be divisible of the square of "
"SpaceToDepthOp blocksize");
} else {
if (x_dims[1] != -1) {
PADDLE_ENFORCE_GT(x_dims[1], 0,
"input channel should be Greater than 0");
PADDLE_ENFORCE_EQ(x_dims[1] % (blocksize * blocksize), 0,
"input channel should be divisible of the square of "
"SpaceToDepthOp blocksize");
}
if (x_dims[2] != -1) {
PADDLE_ENFORCE_GT(x_dims[2], 0,
"input Height should be Greater than 0");
PADDLE_ENFORCE_EQ(x_dims[2] % (blocksize), 0,
"input Height should be divisible of the square of "
"SpaceToDepthOp blocksize");
}
if (x_dims[3] != -1) {
PADDLE_ENFORCE_GT(x_dims[3], 0, "input Width should be Greater than 0");
PADDLE_ENFORCE_EQ(x_dims[3] % (blocksize), 0,
"input Width should be divisible of the square of "
"SpaceToDepthOp blocksize");
}
}
VLOG(3) << "SpaceToDepthOp operator x.shape=" << x_dims VLOG(3) << "SpaceToDepthOp operator x.shape=" << x_dims
<< "Attribute blocksize" << blocksize << std::endl; << "Attribute blocksize" << blocksize << std::endl;
......
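The InferShape change above only enforces the SpaceToDepth divisibility checks at graph-construction time when a dimension is already known; a dimension recorded as -1 is deferred to runtime. A rough Python analogue of that compile-time logic, purely illustrative:

def check_space_to_depth_dims(x_dims, blocksize, is_runtime):
    # x_dims is NCHW; -1 marks a dimension that is unknown until runtime.
    checks = [(1, blocksize * blocksize, "channel"),
              (2, blocksize, "height"),
              (3, blocksize, "width")]
    for axis, divisor, name in checks:
        dim = x_dims[axis]
        if not is_runtime and dim == -1:
            continue  # defer the check until the real shape is known
        assert dim > 0, "input %s should be greater than 0" % name
        assert dim % divisor == 0, \
            "input %s should be divisible by %d" % (name, divisor)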
...@@ -157,7 +157,9 @@ class SplitLoDTensorInferShape : public framework::InferShapeBase { ...@@ -157,7 +157,9 @@ class SplitLoDTensorInferShape : public framework::InferShapeBase {
auto mask_dim = context->GetInputDim("Mask"); auto mask_dim = context->GetInputDim("Mask");
PADDLE_ENFORCE_EQ(mask_dim.size(), 2); PADDLE_ENFORCE_EQ(mask_dim.size(), 2);
PADDLE_ENFORCE_EQ(mask_dim[1], 1); if (context->IsRuntime()) {
PADDLE_ENFORCE_EQ(mask_dim[1], 1);
}
context->SetOutputDim("OutTrue", context->GetInputDim("X")); context->SetOutputDim("OutTrue", context->GetInputDim("X"));
context->SetOutputDim("OutFalse", context->GetInputDim("X")); context->SetOutputDim("OutFalse", context->GetInputDim("X"));
......
...@@ -45,13 +45,26 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel { ...@@ -45,13 +45,26 @@ class SquaredL2DistanceOp : public framework::OperatorWithKernel {
int rank = framework::arity(x_dims); int rank = framework::arity(x_dims);
PADDLE_ENFORCE_GE(rank, 2, "Tensor rank should be at least equal to 2."); PADDLE_ENFORCE_GE(rank, 2, "Tensor rank should be at least equal to 2.");
PADDLE_ENFORCE_EQ(product(x_dims) / x_dims[0], product(y_dims) / y_dims[0], bool check = true;
"Product of dimensions expcet the first dimension of " if ((!ctx->IsRuntime()) &&
"input and target must be equal."); (framework::product(x_dims) <= 0 || framework::product(y_dims) <= 0)) {
PADDLE_ENFORCE(y_dims[0] == 1 || y_dims[0] == x_dims[0], check = false;
"First dimension of target must be equal to input " }
"or to 1."); if (check) {
PADDLE_ENFORCE_EQ(product(x_dims) / x_dims[0],
product(y_dims) / y_dims[0],
"Product of dimensions expcet the first dimension of "
"input and target must be equal.");
}
check = true;
if ((!ctx->IsRuntime()) && (y_dims[0] <= 0 || x_dims[0] <= 0)) {
check = false;
}
if (check) {
PADDLE_ENFORCE(y_dims[0] == 1 || y_dims[0] == x_dims[0],
"First dimension of target must be equal to input "
"or to 1.");
}
ctx->SetOutputDim("sub_result", {x_dims[0], product(x_dims) / x_dims[0]}); ctx->SetOutputDim("sub_result", {x_dims[0], product(x_dims) / x_dims[0]});
ctx->SetOutputDim("Out", {x_dims[0], 1}); ctx->SetOutputDim("Out", {x_dims[0], 1});
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
...@@ -124,12 +137,12 @@ class SquaredL2DistanceGradOp : public framework::OperatorWithKernel { ...@@ -124,12 +137,12 @@ class SquaredL2DistanceGradOp : public framework::OperatorWithKernel {
auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
auto x_dims = ctx->GetInputDim("X"); auto x_dims = ctx->GetInputDim("X");
auto y_dims = ctx->GetInputDim("Y"); auto y_dims = ctx->GetInputDim("Y");
PADDLE_ENFORCE_EQ(out_dims[0], x_dims[0], PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, out_dims[0], x_dims[0],
"First dimension of output gradient and " "First dimension of output gradient and "
"input value must be equal."); "input value must be equal.");
PADDLE_ENFORCE_EQ(out_dims[1], 1, PADDLE_INFERSHAPE_ENFORCE_EQ(ctx, out_dims[1], 1,
"Second dimension of output gradient " "Second dimension of output gradient "
"must be 1."); "must be 1.");
auto x_grad_name = framework::GradVarName("X"); auto x_grad_name = framework::GradVarName("X");
auto y_grad_name = framework::GradVarName("Y"); auto y_grad_name = framework::GradVarName("Y");
if (ctx->HasOutput(x_grad_name)) ctx->SetOutputDim(x_grad_name, x_dims); if (ctx->HasOutput(x_grad_name)) ctx->SetOutputDim(x_grad_name, x_dims);
......
...@@ -37,12 +37,14 @@ class TeacherStudentSigmoidLossOp : public framework::OperatorWithKernel { ...@@ -37,12 +37,14 @@ class TeacherStudentSigmoidLossOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2."); PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2.");
PADDLE_ENFORCE_EQ(label_dims.size(), 2UL, PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
"Input(Label)'s rank should be 2."); "Input(Label)'s rank should be 2.");
PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0], if (ctx->IsRuntime()) {
"The 1st dimension of Input(X) and Input(Label) should " PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
"be equal."); "The 1st dimension of Input(X) and Input(Label) should "
PADDLE_ENFORCE_EQ(label_dims[1], 1UL, "be equal.");
"The 2nd dimension of " PADDLE_ENFORCE_EQ(label_dims[1], 1UL,
"Input(Label) should be 1."); "The 2nd dimension of "
"Input(Label) should be 1.");
}
ctx->SetOutputDim("Y", {x_dims[0], 1}); ctx->SetOutputDim("Y", {x_dims[0], 1});
ctx->ShareLoD("X", /*->*/ "Y"); ctx->ShareLoD("X", /*->*/ "Y");
} }
...@@ -99,17 +101,20 @@ class TeacherStudentSigmoidLossGradientOp ...@@ -99,17 +101,20 @@ class TeacherStudentSigmoidLossGradientOp
PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2."); PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2."); PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2.");
PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2."); PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2.");
PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0], if (ctx->IsRuntime()) {
"The 1st dimension of Input(X) and Input(Label) should " PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
"be equal."); "The 1st dimension of Input(X) and Input(Label) should "
PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0], "be equal.");
"The 1st dimension of Input(X) and Input(Y@Grad) should " PADDLE_ENFORCE_EQ(
"be equal."); x_dims[0], dy_dims[0],
PADDLE_ENFORCE_EQ(dy_dims[1], 1, "The 1st dimension of Input(X) and Input(Y@Grad) should "
"The 2nd dimension of Input(Y@Grad) should be 1."); "be equal.");
PADDLE_ENFORCE_EQ(label_dims[1], 1, PADDLE_ENFORCE_EQ(dy_dims[1], 1,
"When Attr(soft_label) == false, the 2nd dimension of " "The 2nd dimension of Input(Y@Grad) should be 1.");
"Input(Label) should be 1."); PADDLE_ENFORCE_EQ(label_dims[1], 1,
"When Attr(soft_label) == false, the 2nd dimension of "
"Input(Label) should be 1.");
}
ctx->SetOutputDim(framework::GradVarName("X"), x_dims); ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("X", framework::GradVarName("X")); ctx->ShareLoD("X", framework::GradVarName("X"));
} }
......
...@@ -64,17 +64,38 @@ class TreeConvOp : public framework::OperatorWithKernel { ...@@ -64,17 +64,38 @@ class TreeConvOp : public framework::OperatorWithKernel {
auto edge_dims = ctx->GetInputDim("EdgeSet"); auto edge_dims = ctx->GetInputDim("EdgeSet");
auto vector_dims = ctx->GetInputDim("NodesVector"); auto vector_dims = ctx->GetInputDim("NodesVector");
auto filter_dims = ctx->GetInputDim("Filter"); auto filter_dims = ctx->GetInputDim("Filter");
PADDLE_ENFORCE_EQ(edge_dims[2], 2, "Input(EdgeSet) dim[2] should be 2");
if (ctx->IsRuntime()) {
PADDLE_ENFORCE_EQ(edge_dims[2], 2, "Input(EdgeSet) dim[2] should be 2");
} else {
if (edge_dims[2] != -1) {
PADDLE_ENFORCE_EQ(edge_dims[2], 2, "Input(EdgeSet) dim[2] should be 2");
}
}
PADDLE_ENFORCE_EQ(edge_dims.size(), 3, PADDLE_ENFORCE_EQ(edge_dims.size(), 3,
"The dimension of EdgeSet Tensor should be 3"); "The dimension of EdgeSet Tensor should be 3");
PADDLE_ENFORCE_EQ(vector_dims.size(), 3, PADDLE_ENFORCE_EQ(vector_dims.size(), 3,
"The dimension of NodesVector Tensor should be 3"); "The dimension of NodesVector Tensor should be 3");
PADDLE_ENFORCE_EQ(filter_dims.size(), 4, PADDLE_ENFORCE_EQ(filter_dims.size(), 4,
"The dimension of Filter Tensor should be 4"); "The dimension of Filter Tensor should be 4");
PADDLE_ENFORCE_EQ(filter_dims[1], 3, "Input(Filter) dim[1] should be 3");
PADDLE_ENFORCE_EQ( if (ctx->IsRuntime()) {
filter_dims[0], vector_dims[2], PADDLE_ENFORCE_EQ(filter_dims[1], 3, "Input(Filter) dim[1] should be 3");
"Input(Filter) dim[0] must equal to Input(NodesVector) dim[2]"); PADDLE_ENFORCE_EQ(
filter_dims[0], vector_dims[2],
"Input(Filter) dim[0] must equal to Input(NodesVector) dim[2]");
} else {
if (filter_dims[1] != -1) {
PADDLE_ENFORCE_EQ(filter_dims[1], 3,
"Input(Filter) dim[1] should be 3");
}
if (filter_dims[0] != -1 && vector_dims[2] != -1) {
PADDLE_ENFORCE_EQ(
filter_dims[0], vector_dims[2],
"Input(Filter) dim[0] must equal to Input(NodesVector) dim[2]");
}
}
auto output_dims = framework::make_ddim( auto output_dims = framework::make_ddim(
{vector_dims[0], vector_dims[1], filter_dims[2], filter_dims[3]}); {vector_dims[0], vector_dims[1], filter_dims[2], filter_dims[3]});
ctx->SetOutputDim("Out", output_dims); ctx->SetOutputDim("Out", output_dims);
......
...@@ -356,5 +356,46 @@ using CommonType2 = typename std::add_lvalue_reference< ...@@ -356,5 +356,46 @@ using CommonType2 = typename std::add_lvalue_reference<
#define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \ #define PADDLE_ENFORCE_LE(__VAL0, __VAL1, ...) \
__PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__) __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, <=, >, __VA_ARGS__)
#define __PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL1, __VAL2, __CMP, \
__INV_CMP, ...) \
do { \
auto __val1 = (__VAL1); \
auto __val2 = (__VAL2); \
if (!__CTX->IsRuntime()) { \
if (__val1 == -1 || __val2 == -1) { \
break; \
} \
} \
using __TYPE1__ = decltype(__val1); \
using __TYPE2__ = decltype(__val2); \
using __COMMON_TYPE1__ = \
::paddle::platform::details::CommonType1<__TYPE1__, __TYPE2__>; \
using __COMMON_TYPE2__ = \
::paddle::platform::details::CommonType2<__TYPE1__, __TYPE2__>; \
bool __is_not_error = (static_cast<__COMMON_TYPE1__>(__val1))__CMP( \
static_cast<__COMMON_TYPE2__>(__val2)); \
if (UNLIKELY(!__is_not_error)) { \
PADDLE_THROW("Enforce failed. Expected %s " #__CMP \
" %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s", \
#__VAL1, #__VAL2, #__VAL1, \
::paddle::string::to_string(__val1), #__VAL2, \
::paddle::string::to_string(__val2), \
::paddle::string::Sprintf(__VA_ARGS__)); \
} \
} while (0)
#define PADDLE_INFERSHAPE_ENFORCE_EQ(__CTX, __VAL0, __VAL1, ...) \
__PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, ==, !=, __VA_ARGS__)
#define PADDLE_INFERSHAPE_ENFORCE_NE(__CTX, __VAL0, __VAL1, ...) \
__PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, !=, ==, __VA_ARGS__)
#define PADDLE_INFERSHAPE_ENFORCE_GT(__CTX, __VAL0, __VAL1, ...) \
__PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, >, <=, __VA_ARGS__)
#define PADDLE_INFERSHAPE_ENFORCE_GE(__CTX, __VAL0, __VAL1, ...) \
__PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, >=, <, __VA_ARGS__)
#define PADDLE_INFERSHAPE_ENFORCE_LT(__CTX, __VAL0, __VAL1, ...) \
__PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, <, >=, __VA_ARGS__)
#define PADDLE_INFERSHAPE_ENFORCE_LE(__CTX, __VAL0, __VAL1, ...) \
__PADDLE_INFERSHAPE_BINARY_COMPARE(__CTX, __VAL0, __VAL1, <=, >, __VA_ARGS__)
} // namespace platform } // namespace platform
} // namespace paddle } // namespace paddle
...@@ -219,6 +219,7 @@ class InMemoryDataset(DatasetBase): ...@@ -219,6 +219,7 @@ class InMemoryDataset(DatasetBase):
>>> dataset = fluid.DatasetFactory.create_dataset("InMemoryDataset") >>> dataset = fluid.DatasetFactory.create_dataset("InMemoryDataset")
>>> filelist = ["a.txt", "b.txt"] >>> filelist = ["a.txt", "b.txt"]
>>> dataset.set_filelist(filelist) >>> dataset.set_filelist(filelist)
>>> dataset.load_into_memory()
>>> dataset.local_shuffle() >>> dataset.local_shuffle()
""" """
self.dataset.local_shuffle() self.dataset.local_shuffle()
...@@ -236,6 +237,7 @@ class InMemoryDataset(DatasetBase): ...@@ -236,6 +237,7 @@ class InMemoryDataset(DatasetBase):
>>> dataset = fluid.DatasetFactory.create_dataset("InMemoryDataset") >>> dataset = fluid.DatasetFactory.create_dataset("InMemoryDataset")
>>> filelist = ["a.txt", "b.txt"] >>> filelist = ["a.txt", "b.txt"]
>>> dataset.set_filelist(filelist) >>> dataset.set_filelist(filelist)
>>> dataset.load_into_memory()
>>> dataset.global_shuffle(fleet) >>> dataset.global_shuffle(fleet)
Args: Args:
...@@ -255,6 +257,25 @@ class InMemoryDataset(DatasetBase): ...@@ -255,6 +257,25 @@ class InMemoryDataset(DatasetBase):
if fleet is not None: if fleet is not None:
fleet.fleet_instance.role_maker_._barrier_worker() fleet.fleet_instance.role_maker_._barrier_worker()
def release_memory(self):
"""
Release InMemoryDataset memory data when the data will not be used again.
Example:
>>> import paddle.fluid as fluid
>>> import paddle.fluid.incubate.fleet.parameter_server as fleet
>>> dataset = fluid.DatasetFactory.create_dataset("InMemoryDataset")
>>> filelist = ["a.txt", "b.txt"]
>>> dataset.set_filelist(filelist)
>>> dataset.load_into_memory()
>>> dataset.global_shuffle(fleet)
>>> exe = fluid.Executor(fluid.CPUPlace())
>>> exe.run(fluid.default_startup_program())
>>> exe.train_from_dataset(fluid.default_main_program(), dataset)
>>> dataset.release_memory()
"""
self.dataset.release_memory()
class QueueDataset(DatasetBase): class QueueDataset(DatasetBase):
""" """
......
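Taken together, the updated docstrings describe the intended InMemoryDataset lifecycle: the file list must be loaded into memory before any shuffle, and the memory can be released once training has finished. A hedged end-to-end sketch assembled from those docstring examples; the file names and the fleet import are placeholders taken from them:

import paddle.fluid as fluid
import paddle.fluid.incubate.fleet.parameter_server as fleet

dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset")
dataset.set_filelist(["a.txt", "b.txt"])
dataset.load_into_memory()          # required before any shuffle
dataset.global_shuffle(fleet)       # or dataset.local_shuffle() on a single node

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
exe.train_from_dataset(fluid.default_main_program(), dataset)
dataset.release_memory()            # free the in-memory data once it is no longer needed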
...@@ -48,6 +48,12 @@ class Layer(core.Layer): ...@@ -48,6 +48,12 @@ class Layer(core.Layer):
self._helper = LayerObjectHelper(self._full_name) self._helper = LayerObjectHelper(self._full_name)
def train(self):
framework._dygraph_tracer()._train_mode()
def eval(self):
framework._dygraph_tracer()._eval_mode()
def full_name(self): def full_name(self):
"""Full name for this layers. """Full name for this layers.
...@@ -254,6 +260,12 @@ class PyLayer(core.PyLayer): ...@@ -254,6 +260,12 @@ class PyLayer(core.PyLayer):
def __init__(self): def __init__(self):
super(PyLayer, self).__init__() super(PyLayer, self).__init__()
def train(self):
framework._dygraph_tracer()._train_mode()
def eval(self):
framework._dygraph_tracer()._eval_mode()
@classmethod @classmethod
def _do_forward(cls, inputs): def _do_forward(cls, inputs):
return cls._to_tuple(cls.forward(inputs)) return cls._to_tuple(cls.forward(inputs))
......
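The new train() and eval() hooks on dygraph layers simply flip the tracer's mode. A short sketch of switching a model between the two modes; MNIST is the dygraph layer defined in the test file further below and is used here only for illustration:

import paddle.fluid as fluid

with fluid.dygraph.guard():
    model = MNIST("mnist")   # any fluid.dygraph.Layer subclass
    model.eval()             # tracer stops recording the inputs/outputs needed for backward
    # ... forward passes for inference ...
    model.train()            # restore training mode before computing gradients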
...@@ -24,7 +24,9 @@ __all__ = ['Tracer'] ...@@ -24,7 +24,9 @@ __all__ = ['Tracer']
def release_op(op): def release_op(op):
del framework._dygraph_tracer()._ops[op._trace_id] del framework._dygraph_tracer()._ops[op._trace_id].inputs
del framework._dygraph_tracer()._ops[op._trace_id].outputs
del framework._dygraph_tracer()._ops[op._trace_id].backward_refs
class Tracer(core.Tracer): class Tracer(core.Tracer):
...@@ -38,6 +40,7 @@ class Tracer(core.Tracer): ...@@ -38,6 +40,7 @@ class Tracer(core.Tracer):
self._ops = defaultdict() self._ops = defaultdict()
self._vars = defaultdict() self._vars = defaultdict()
self._trace_id = 0 self._trace_id = 0
self._train_mode = True
def trace_var(self, name, var): def trace_var(self, name, var):
self._vars[name] = var self._vars[name] = var
...@@ -46,15 +49,57 @@ class Tracer(core.Tracer): ...@@ -46,15 +49,57 @@ class Tracer(core.Tracer):
return list((item for name, item in six.iteritems(self._vars) return list((item for name, item in six.iteritems(self._vars)
if isinstance(item, framework.Parameter))) if isinstance(item, framework.Parameter)))
def trace_op(self, op, stop_gradient=False): def trace_op(self, op, inputs, outputs, stop_gradient=False):
# TODO(minqiyang): remove this line after we take apart all
# backward grads and forward variables
if self._train_mode:
op.inputs = inputs
inps = defaultdict(list)
for k, vars in six.iteritems(inputs):
if isinstance(vars, framework.Variable):
inps[k].append(vars._ivar)
elif isinstance(vars, list) or isinstance(vars, tuple):
for var in vars:
inps[k].append(var._ivar)
op.outputs = outputs
outs = defaultdict(list)
for k, vars in six.iteritems(outputs):
if isinstance(vars, framework.Variable):
outs[k].append(vars._ivar)
elif isinstance(vars, list) or isinstance(vars, tuple):
for var in vars:
outs[k].append(var._ivar)
else:
inps = defaultdict(list)
for k, vars in six.iteritems(inputs):
if isinstance(vars, framework.Variable):
op.previous_ops.append(vars.op)
inps[k].append(vars._ivar)
elif isinstance(vars, list) or isinstance(vars, tuple):
for var in vars:
op.previous_ops.append(var.op)
inps[k].append(var._ivar)
op.outputs = outputs
outs = defaultdict(list)
for k, vars in six.iteritems(outputs):
if isinstance(vars, framework.Variable):
vars.op = op
outs[k].append(vars._ivar)
elif isinstance(vars, list) or isinstance(vars, tuple):
for var in vars:
var.op = op
outs[k].append(var._ivar)
# record op's trace id # record op's trace id
op.iop._trace_id = self._trace_id op.iop._trace_id = self._trace_id
backward_refs = self.trace(op.iop, op.inputs, op.outputs, op.attrs, backward_refs = self.trace(op.iop, inps, outs, op.attrs,
framework._current_expected_place(), framework._current_expected_place(),
stop_gradient) stop_gradient)
if not stop_gradient: if not stop_gradient and self._train_mode:
self._trace_id += 1 self._trace_id += 1
self._ops[op.iop._trace_id] = op self._ops[op.iop._trace_id] = op
...@@ -65,10 +110,16 @@ class Tracer(core.Tracer): ...@@ -65,10 +110,16 @@ class Tracer(core.Tracer):
# TODO(minqiyang): remove all inputs and outputs after separate # TODO(minqiyang): remove all inputs and outputs after separate
# var and grad # var and grad
op.backward_refs = defaultdict(list) op.backward_refs = defaultdict(list)
for k, v in six.iteritems(op.inputs): for k, v in six.iteritems(inputs):
if k in backward_refs: if k in backward_refs:
op.backward_refs[k] = op.inputs[k] op.backward_refs[k] = inputs[k]
for k, v in six.iteritems(op.outputs): for k, v in six.iteritems(outputs):
if k in backward_refs: if k in backward_refs:
op.backward_refs[k] = op.outputs[k] op.backward_refs[k] = outputs[k]
def _train_mode(self):
self._train_mode = True
def _eval_mode(self):
self._train_mode = False
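With this refactor, trace_op (rather than Operator.__init__) flattens the inputs/outputs dicts into lists of underlying ivars; each dict value may be a single Variable or a list/tuple of Variables. A condensed, illustrative sketch of that flattening step:

from collections import defaultdict
import six

def flatten_to_ivars(var_dict, framework):
    flat = defaultdict(list)
    for name, value in six.iteritems(var_dict):
        if isinstance(value, framework.Variable):
            flat[name].append(value._ivar)
        elif isinstance(value, (list, tuple)):
            for var in value:
                flat[name].append(var._ivar)
    return flat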
...@@ -411,6 +411,7 @@ class Variable(object): ...@@ -411,6 +411,7 @@ class Variable(object):
if persistable else False) if persistable else False)
if persistable: if persistable:
_dygraph_tracer().trace_var(name, self) _dygraph_tracer().trace_var(name, self)
self.op = None
else: else:
self.error_clip = error_clip self.error_clip = error_clip
...@@ -939,24 +940,7 @@ class Operator(object): ...@@ -939,24 +940,7 @@ class Operator(object):
raise ValueError( raise ValueError(
"`type` to initialized an Operator can not be None.") "`type` to initialized an Operator can not be None.")
self.iop = core.OpBase(type) self.iop = core.OpBase(type)
self.previous_ops = []
# TODO(minqiyang): remove these lines after we take apart all
# backward grads and forward variables
self.inputs = defaultdict(list)
if inputs is not None:
for k, v in six.iteritems(inputs):
if isinstance(v, Variable):
self.inputs[k].append(v._ivar)
elif isinstance(v, list) or isinstance(v, tuple):
self.inputs[k].extend([var._ivar for var in v])
self.outputs = defaultdict(list)
if outputs is not None:
for k, v in six.iteritems(outputs):
if isinstance(v, Variable):
self.outputs[k].append(v._ivar)
elif isinstance(v, list) or isinstance(v, tuple):
self.outputs[k].extend([var._ivar for var in v])
self.attrs = attrs if attrs else {} self.attrs = attrs if attrs else {}
else: else:
...@@ -1647,15 +1631,18 @@ class Block(object): ...@@ -1647,15 +1631,18 @@ class Block(object):
block=self, block=self,
desc=None, desc=None,
type=kwargs.get("type", None), type=kwargs.get("type", None),
inputs=kwargs.get("inputs", None), inputs=None,
outputs=kwargs.get("outputs", None), outputs=None,
attrs=kwargs.get("attrs", None)) attrs=kwargs.get("attrs", {}))
# record ops in tracer rather than blocks # record ops in tracer rather than blocks
# #
# TODO(minqiyang): add op stop_gradient support in static mode too. # TODO(minqiyang): add op stop_gradient support in static mode too.
# currently, we only support stop_gradient in dygraph mode. # currently, we only support stop_gradient in dygraph mode.
_dygraph_tracer().trace_op(op, kwargs.get("stop_gradient", False)) _dygraph_tracer().trace_op(op,
kwargs.get("inputs", {}),
kwargs.get("outputs", {}),
kwargs.get("stop_gradient", False))
else: else:
op_desc = self.desc.append_op() op_desc = self.desc.append_op()
op = Operator( op = Operator(
...@@ -1719,10 +1706,14 @@ class Block(object): ...@@ -1719,10 +1706,14 @@ class Block(object):
self, self,
None, None,
type=kwargs.get("type", None), type=kwargs.get("type", None),
inputs=kwargs.get("inputs", None), inputs=None,
outputs=kwargs.get("outputs", None), outputs=None,
attrs=kwargs.get("attrs", None)) attrs=kwargs.get("attrs", {}))
_dygraph_tracer().trace_op(op, kwargs.get("stop_gradient", False))
_dygraph_tracer().trace_op(op,
kwargs.get("inputs", {}),
kwargs.get("outputs", {}),
kwargs.get("stop_gradient", False))
else: else:
op_desc = self.desc._prepend_op() op_desc = self.desc._prepend_op()
op = Operator( op = Operator(
......
...@@ -128,7 +128,7 @@ class MPIRoleMaker(RoleMakerBase): ...@@ -128,7 +128,7 @@ class MPIRoleMaker(RoleMakerBase):
""" """
finalize the current MPI instance. finalize the current MPI instance.
""" """
self._comm.finalize() pass
class MPISymetricRoleMaker(MPIRoleMaker): class MPISymetricRoleMaker(MPIRoleMaker):
......
...@@ -241,6 +241,40 @@ class Fleet(object): ...@@ -241,6 +241,40 @@ class Fleet(object):
""" """
self._fleet_ptr.save_model(save_path) self._fleet_ptr.save_model(save_path)
def split_filelist(self, filelist):
"""
split the filelist before distributed training.
For example, if filelist is [a, b, c, d, e] and trainer_num = 2,
then trainer 0 gets [a, b, c] and trainer 1 gets [d, e].
Example:
>>> all_filelist = ["a.txt", "b.txt", "c.txt"]
>>> my_filelist = fleet.split_filelist(all_filelist)
>>> dataset = fluid.DatasetFactory().create_dataset()
>>> dataset.set_filelist(my_filelist)
Args:
filelist(list): list of filename, can be local or hdfs/afs.
Returns:
list of filename which belongs to this trainer.
"""
file_num = len(filelist)
trainer_id = self.get_worker_index()
trainer_num = self.get_worker_num()
if trainer_num > file_num:
raise ValueError("trainer_num should be <= file_num : "
"%s > %s" % (trainer_num, file_num))
# get this trainer's half-open interval [start, end) of the filelist
start = 0
end = 0
for i in range(0, trainer_id + 1):
length = file_num / trainer_num + (i < (file_num % trainer_num))
start = end
end += length
my_filelist = filelist[start:end]
return my_filelist
def _set_opt_info(self, opt_info): def _set_opt_info(self, opt_info):
""" """
this function saves the result from DistributedOptimizer.minimize() this function saves the result from DistributedOptimizer.minimize()
...@@ -337,3 +371,4 @@ save_pserver_model = fleet_instance.save_pserver_model ...@@ -337,3 +371,4 @@ save_pserver_model = fleet_instance.save_pserver_model
worker_num = fleet_instance.get_worker_num worker_num = fleet_instance.get_worker_num
server_num = fleet_instance.get_server_num server_num = fleet_instance.get_server_num
worker_index = fleet_instance.get_worker_index worker_index = fleet_instance.get_worker_index
split_filelist = fleet_instance.split_filelist
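The interval arithmetic in split_filelist hands each of the first file_num % trainer_num trainers one extra file, so five files over two trainers split as [a, b, c] and [d, e], matching the docstring. A standalone sketch with explicit integer division; the trainer ids and file names below are made up for illustration:

def split_filelist(filelist, trainer_id, trainer_num):
    file_num = len(filelist)
    start = end = 0
    for i in range(trainer_id + 1):
        # earlier trainers absorb the remainder, one extra file each
        length = file_num // trainer_num + (1 if i < file_num % trainer_num else 0)
        start = end
        end += length
    return filelist[start:end]

print(split_filelist(["a", "b", "c", "d", "e"], 0, 2))  # ['a', 'b', 'c']
print(split_filelist(["a", "b", "c", "d", "e"], 1, 2))  # ['d', 'e']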
...@@ -509,14 +509,14 @@ def polygon_box_transform(input, name=None): ...@@ -509,14 +509,14 @@ def polygon_box_transform(input, name=None):
@templatedoc(op_type="yolov3_loss") @templatedoc(op_type="yolov3_loss")
def yolov3_loss(x, def yolov3_loss(x,
gtbox, gt_box,
gtlabel, gt_label,
anchors, anchors,
anchor_mask, anchor_mask,
class_num, class_num,
ignore_thresh, ignore_thresh,
downsample_ratio, downsample_ratio,
gtscore=None, gt_score=None,
use_label_smooth=True, use_label_smooth=True,
name=None): name=None):
""" """
...@@ -524,12 +524,12 @@ def yolov3_loss(x, ...@@ -524,12 +524,12 @@ def yolov3_loss(x,
Args: Args:
x (Variable): ${x_comment} x (Variable): ${x_comment}
gtbox (Variable): ground truth boxes, should be in shape of [N, B, 4], gt_box (Variable): ground truth boxes, should be in shape of [N, B, 4],
in the third dimension, x, y, w, h should be stored in the third dimension, x, y, w, h should be stored
and x, y, w, h should be relative values of the input image. and x, y, w, h should be relative values of the input image.
N is the batch number and B is the max box number in N is the batch number and B is the max box number in
an image. an image.
gtlabel (Variable): class id of ground truth boxes, should be in shape gt_label (Variable): class id of ground truth boxes, should be in shape
of [N, B]. of [N, B].
anchors (list|tuple): ${anchors_comment} anchors (list|tuple): ${anchors_comment}
anchor_mask (list|tuple): ${anchor_mask_comment} anchor_mask (list|tuple): ${anchor_mask_comment}
...@@ -537,7 +537,7 @@ def yolov3_loss(x, ...@@ -537,7 +537,7 @@ def yolov3_loss(x,
ignore_thresh (float): ${ignore_thresh_comment} ignore_thresh (float): ${ignore_thresh_comment}
downsample_ratio (int): ${downsample_ratio_comment} downsample_ratio (int): ${downsample_ratio_comment}
name (string): the name of yolov3 loss. Default None. name (string): the name of yolov3 loss. Default None.
gtscore (Variable): mixup score of ground truth boxes, should be in shape gt_score (Variable): mixup score of ground truth boxes, should be in shape
of [N, B]. Default None. of [N, B]. Default None.
use_label_smooth (bool): ${use_label_smooth_comment} use_label_smooth (bool): ${use_label_smooth_comment}
...@@ -558,13 +558,13 @@ def yolov3_loss(x, ...@@ -558,13 +558,13 @@ def yolov3_loss(x,
.. code-block:: python .. code-block:: python
x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32')
gtbox = fluid.layers.data(name='gtbox', shape=[6, 4], dtype='float32') gt_box = fluid.layers.data(name='gt_box', shape=[6, 4], dtype='float32')
gtlabel = fluid.layers.data(name='gtlabel', shape=[6], dtype='int32') gt_label = fluid.layers.data(name='gt_label', shape=[6], dtype='int32')
gtscore = fluid.layers.data(name='gtscore', shape=[6], dtype='float32') gt_score = fluid.layers.data(name='gt_score', shape=[6], dtype='float32')
anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326] anchors = [10, 13, 16, 30, 33, 23, 30, 61, 62, 45, 59, 119, 116, 90, 156, 198, 373, 326]
anchor_mask = [0, 1, 2] anchor_mask = [0, 1, 2]
loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, gtlabel=gtlabel, loss = fluid.layers.yolov3_loss(x=x, gt_box=gt_box, gt_label=gt_label,
gtscore=gtscore, anchors=anchors, gt_score=gt_score, anchors=anchors,
anchor_mask=anchor_mask, class_num=80, anchor_mask=anchor_mask, class_num=80,
ignore_thresh=0.7, downsample_ratio=32) ignore_thresh=0.7, downsample_ratio=32)
""" """
...@@ -572,11 +572,11 @@ def yolov3_loss(x, ...@@ -572,11 +572,11 @@ def yolov3_loss(x,
if not isinstance(x, Variable): if not isinstance(x, Variable):
raise TypeError("Input x of yolov3_loss must be Variable") raise TypeError("Input x of yolov3_loss must be Variable")
if not isinstance(gtbox, Variable): if not isinstance(gt_box, Variable):
raise TypeError("Input gtbox of yolov3_loss must be Variable") raise TypeError("Input gtbox of yolov3_loss must be Variable")
if not isinstance(gtlabel, Variable): if not isinstance(gt_label, Variable):
raise TypeError("Input gtlabel of yolov3_loss must be Variable") raise TypeError("Input gtlabel of yolov3_loss must be Variable")
if gtscore is not None and not isinstance(gtscore, Variable): if gt_score is not None and not isinstance(gt_score, Variable):
raise TypeError("Input gtscore of yolov3_loss must be Variable") raise TypeError("Input gtscore of yolov3_loss must be Variable")
if not isinstance(anchors, list) and not isinstance(anchors, tuple): if not isinstance(anchors, list) and not isinstance(anchors, tuple):
raise TypeError("Attr anchors of yolov3_loss must be list or tuple") raise TypeError("Attr anchors of yolov3_loss must be list or tuple")
...@@ -602,11 +602,11 @@ def yolov3_loss(x, ...@@ -602,11 +602,11 @@ def yolov3_loss(x,
inputs = { inputs = {
"X": x, "X": x,
"GTBox": gtbox, "GTBox": gt_box,
"GTLabel": gtlabel, "GTLabel": gt_label,
} }
if gtscore: if gt_score:
inputs["GTScore"] = gtscore inputs["GTScore"] = gt_score
attrs = { attrs = {
"anchors": anchors, "anchors": anchors,
...@@ -1542,7 +1542,7 @@ def multi_box_head(inputs, ...@@ -1542,7 +1542,7 @@ def multi_box_head(inputs,
.. code-block:: python .. code-block:: python
mbox_locs, mbox_confs, box, var = fluid.layers.multi_box_head( mbox_locs, mbox_confs, box, var = fluid.layers.multi_box_head(
inputs=[conv1, conv2, conv3, conv4, conv5, conv5], inputs=[conv1, conv2, conv3, conv4, conv5, conv6],
image=images, image=images,
num_classes=21, num_classes=21,
min_ratio=20, min_ratio=20,
......
...@@ -82,8 +82,8 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0): ...@@ -82,8 +82,8 @@ def uniform_random(shape, dtype='float32', min=-1.0, max=1.0, seed=0):
Examples: Examples:
.. code-block:: python .. code-block:: python
result = fluid.layers.uniform_random(shape=[32, 784]) result = fluid.layers.uniform_random(shape=[32, 784])
""" """
locals_var = locals().keys() locals_var = locals().keys()
......
...@@ -474,17 +474,17 @@ class TestYoloDetection(unittest.TestCase): ...@@ -474,17 +474,17 @@ class TestYoloDetection(unittest.TestCase):
program = Program() program = Program()
with program_guard(program): with program_guard(program):
x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') x = layers.data(name='x', shape=[30, 7, 7], dtype='float32')
gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32') gt_box = layers.data(name='gt_box', shape=[10, 4], dtype='float32')
gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32') gt_label = layers.data(name='gt_label', shape=[10], dtype='int32')
gtscore = layers.data(name='gtscore', shape=[10], dtype='float32') gt_score = layers.data(name='gt_score', shape=[10], dtype='float32')
loss = layers.yolov3_loss( loss = layers.yolov3_loss(
x, x,
gtbox, gt_box,
gtlabel, [10, 13, 30, 13], [0, 1], gt_label, [10, 13, 30, 13], [0, 1],
10, 10,
0.7, 0.7,
32, 32,
gtscore=gtscore, gt_score=gt_score,
use_label_smooth=False) use_label_smooth=False)
self.assertIsNotNone(loss) self.assertIsNotNone(loss)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import contextlib
import unittest
import numpy as np
import six
import paddle
import paddle.fluid as fluid
from paddle.fluid import core
from paddle.fluid.optimizer import SGDOptimizer
from paddle.fluid.dygraph.nn import Conv2D, Pool2D, FC
from paddle.fluid.dygraph.base import to_variable
from test_imperative_base import new_program_scope
class SimpleImgConvPool(fluid.dygraph.Layer):
def __init__(self,
name_scope,
num_channels,
num_filters,
filter_size,
pool_size,
pool_stride,
pool_padding=0,
pool_type='max',
global_pooling=False,
conv_stride=1,
conv_padding=0,
conv_dilation=1,
conv_groups=1,
act=None,
use_cudnn=False,
param_attr=None,
bias_attr=None):
super(SimpleImgConvPool, self).__init__(name_scope)
self._conv2d = Conv2D(
self.full_name(),
num_channels=num_channels,
num_filters=num_filters,
filter_size=filter_size,
stride=conv_stride,
padding=conv_padding,
dilation=conv_dilation,
groups=conv_groups,
param_attr=None,
bias_attr=None,
use_cudnn=use_cudnn)
self._pool2d = Pool2D(
self.full_name(),
pool_size=pool_size,
pool_type=pool_type,
pool_stride=pool_stride,
pool_padding=pool_padding,
global_pooling=global_pooling,
use_cudnn=use_cudnn)
def forward(self, inputs):
x = self._conv2d(inputs)
x = self._pool2d(x)
return x
class MNIST(fluid.dygraph.Layer):
def __init__(self, name_scope):
super(MNIST, self).__init__(name_scope)
self._simple_img_conv_pool_1 = SimpleImgConvPool(
self.full_name(), 1, 20, 5, 2, 2, act="relu")
self._simple_img_conv_pool_2 = SimpleImgConvPool(
self.full_name(), 20, 50, 5, 2, 2, act="relu")
pool_2_shape = 50 * 4 * 4
SIZE = 10
scale = (2.0 / (pool_2_shape**2 * SIZE))**0.5
self._fc = FC(self.full_name(),
10,
param_attr=fluid.param_attr.ParamAttr(
initializer=fluid.initializer.NormalInitializer(
loc=0.0, scale=scale)),
act="softmax")
def forward(self, inputs):
x = self._simple_img_conv_pool_1(inputs)
x = self._simple_img_conv_pool_2(x)
x = self._fc(x)
return x
class TestDygraphMultiForward(unittest.TestCase):
def test_mnist_forward_float32(self):
seed = 90
epoch_num = 1
with fluid.dygraph.guard():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
mnist = MNIST("mnist")
sgd = SGDOptimizer(learning_rate=1e-3)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
dy_param_init_value = {}
mnist.eval()
for epoch in range(epoch_num):
for batch_id, data in enumerate(train_reader()):
dy_x_data = np.array(
[x[0].reshape(1, 28, 28)
for x in data]).astype('float32')
y_data = np.array(
[x[1] for x in data]).astype('int64').reshape(128, 1)
img = to_variable(dy_x_data)
label = to_variable(y_data)
label.stop_gradient = True
cost = mnist(img)
loss = fluid.layers.cross_entropy(cost, label)
avg_loss = fluid.layers.mean(loss)
dy_out = avg_loss.numpy()
if epoch == 0 and batch_id == 0:
for param in mnist.parameters():
dy_param_init_value[param.name] = param.numpy()
with new_program_scope():
fluid.default_startup_program().random_seed = seed
fluid.default_main_program().random_seed = seed
exe = fluid.Executor(fluid.CPUPlace(
) if not core.is_compiled_with_cuda() else fluid.CUDAPlace(0))
mnist = MNIST("mnist")
sgd = SGDOptimizer(learning_rate=1e-3)
train_reader = paddle.batch(
paddle.dataset.mnist.train(), batch_size=128, drop_last=True)
img = fluid.layers.data(
name='pixel', shape=[1, 28, 28], dtype='float32')
label = fluid.layers.data(name='label', shape=[1], dtype='int64')
cost = mnist(img)
loss = fluid.layers.cross_entropy(cost, label)
avg_loss = fluid.layers.mean(loss)
# initialize params and fetch them
static_param_init_value = {}
static_param_name_list = []
for param in mnist.parameters():
static_param_name_list.append(param.name)
out = exe.run(fluid.default_startup_program(),
fetch_list=static_param_name_list)
for i in range(len(static_param_name_list)):
static_param_init_value[static_param_name_list[i]] = out[i]
for epoch in range(epoch_num):
for batch_id, data in enumerate(train_reader()):
static_x_data = np.array(
[x[0].reshape(1, 28, 28)
for x in data]).astype('float32')
y_data = np.array(
[x[1] for x in data]).astype('int64').reshape([128, 1])
fetch_list = [avg_loss.name]
out = exe.run(
fluid.default_main_program(),
feed={"pixel": static_x_data,
"label": y_data},
fetch_list=fetch_list)
static_out = out[0]
self.assertTrue(np.allclose(dy_x_data.all(), static_x_data.all()))
for key, value in six.iteritems(static_param_init_value):
self.assertTrue(np.allclose(value, dy_param_init_value[key]))
self.assertTrue(np.allclose(static_out, dy_out))
if __name__ == '__main__':
unittest.main()
...@@ -195,5 +195,144 @@ class TestSoftmaxWithCrossEntropyOp3NoCudnn(TestSoftmaxWithCrossEntropyOp3): ...@@ -195,5 +195,144 @@ class TestSoftmaxWithCrossEntropyOp3NoCudnn(TestSoftmaxWithCrossEntropyOp3):
self.numeric_stable_mode = True self.numeric_stable_mode = True
class TestSoftmaxWithCrossEntropyOp5(OpTest):
"""
Test softmax with cross entropy operator with ignore_index.
"""
def initParams(self):
self.numeric_stable_mode = False
def setUp(self):
self.initParams()
self.op_type = "softmax_with_cross_entropy"
batch_size = [6, 10]
class_num = 47
logits = np.random.uniform(
0.1, 1.0, tuple(batch_size + [class_num])).astype("float64")
softmax = np.apply_along_axis(stable_softmax, 2, logits)
labels = np.random.randint(
0, class_num, tuple(batch_size + [1]), dtype="int64")
ignore_index = 7
softmax_2d = np.reshape(softmax, [-1, class_num])
labels_2d = np.reshape(labels, [-1, 1])
cross_entropy = np.asmatrix(
[[-np.log(softmax_2d[i][labels_2d[i][0]])]
if labels_2d[i] != ignore_index else [0]
for i in range(softmax_2d.shape[0])],
dtype="float64")
cross_entropy = np.reshape(cross_entropy, batch_size)
output_shape = tuple(batch_size + [1])
output_res = cross_entropy.astype("float64")
output_res = np.expand_dims(output_res, axis=2)
self.inputs = {"Logits": logits, "Label": labels}
self.outputs = {
"Softmax": softmax.astype("float64"),
"Loss": output_res,
}
self.attrs = {
"ignore_index": ignore_index,
"numeric_stable_mode": self.numeric_stable_mode
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["Logits"], "Loss")
class TestSoftmaxWithCrossEntropyOp5NoCudnn(TestSoftmaxWithCrossEntropyOp5):
def initParams(self):
self.numeric_stable_mode = True
class TestSoftmaxWithCrossEntropyOp6(OpTest):
"""
Test softmax with cross entropy operator with soft labels.
"""
def setUp(self):
self.op_type = "softmax_with_cross_entropy"
batch_size = [6, 10]
class_num = 37
logits = np.random.uniform(
0.1, 1.0, tuple(batch_size + [class_num])).astype("float64")
softmax = np.apply_along_axis(stable_softmax, 2, logits)
labels = np.random.uniform(
0.1, 1.0, tuple(batch_size + [class_num])).astype("float64")
labels /= np.sum(labels, axis=2, keepdims=True)
cross_entropy = (-labels * np.log(softmax)).sum(
axis=2, keepdims=True).astype("float64")
self.inputs = {"Logits": logits, "Label": labels}
self.outputs = {
"Softmax": softmax.astype("float64"),
"Loss": cross_entropy.astype("float64")
}
self.attrs = {"soft_label": True}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(["Logits"], "Loss")
class TestSoftmaxWithCrossEntropyOpFp16_2(TestSoftmaxWithCrossEntropyOp):
def initParams(self):
self.numeric_stable_mode = False
self.dtype = np.float16
def setUp(self):
self.initParams()
self.op_type = "softmax_with_cross_entropy"
batch_size = [64, 10]
class_num = 37
# NOTE: numpy float16 has very low accuracy, use float32 for the numpy check.
logits = np.random.uniform(
0.1, 1.0, tuple(batch_size + [class_num])).astype(np.float32)
softmax = np.apply_along_axis(stable_softmax, 2, logits)
labels = np.random.randint(
0, class_num, tuple(batch_size + [1]), dtype="int64")
softmax_2d = np.reshape(softmax, [-1, class_num])
labels_2d = np.reshape(labels, [-1, 1])
cross_entropy = np.asmatrix(
[[-np.log(softmax_2d[i][labels_2d[i][0]])]
for i in range(softmax_2d.shape[0])],
dtype=np.float32)
cross_entropy = np.reshape(cross_entropy, batch_size)
output_shape = tuple(batch_size + [1])
output_res = cross_entropy.astype(self.dtype)
output_res = np.expand_dims(output_res, axis=2)
self.inputs = {"Logits": logits, "Label": labels}
self.inputs = {
"Logits": logits.astype(self.dtype).view(np.uint16),
"Label": labels
}
self.outputs = {
"Softmax": softmax.astype(self.dtype),
"Loss": output_res,
}
self.attrs = {"numeric_stable_mode": self.numeric_stable_mode}
def test_check_output(self):
self.check_output(atol=1e-2)
def test_check_grad(self):
self.check_grad(["Logits"], "Loss", max_relative_error=0.1)
if __name__ == "__main__": if __name__ == "__main__":
unittest.main() unittest.main()