Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into port_pybind11

0d7047ca · minqiyang · 4d4491ef · bd87f67f · 0d7047ca · 0d7047ca
20 changed file
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -55,7 +55,7 @@ paddle.fluid.Inferencer.__init__ ArgSpec(args=['self', 'infer_func', 'param_path
 paddle.fluid.Inferencer.infer ArgSpec(args=['self', 'inputs', 'return_numpy'], varargs=None, keywords=None, defaults=(True,))
 paddle.fluid.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
 paddle.fluid.InferenceTranspiler.__init__ 
@@ -159,6 +159,7 @@ paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaul
 paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))
 paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None))
@@ -327,7 +328,7 @@ paddle.fluid.contrib.BeamSearchDecoder.update_array ArgSpec(args=['self', 'array
 paddle.fluid.contrib.memory_usage ArgSpec(args=['program', 'batch_size'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.__init__ ArgSpec(args=['self', 'config'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.get_pserver_program ArgSpec(args=['self', 'endpoint'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.transpiler.DistributeTranspiler.get_startup_program ArgSpec(args=['self', 'endpoint', 'pserver_program', 'startup_program'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.transpiler.DistributeTranspiler.get_trainer_program ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.transpiler.DistributeTranspiler.transpile ArgSpec(args=['self', 'trainer_id', 'program', 'pservers', 'trainers', 'sync_mode'], varargs=None, keywords=None, defaults=(None, '127.0.0.1:6174', 1, True))
 paddle.fluid.transpiler.InferenceTranspiler.__init__ 

--- a/paddle/fluid/framework/details/exception_holder.h
+++ b/paddle/fluid/framework/details/exception_holder.h
@@ -14,6 +14,7 @@
 #pragma once
+#include "glog/logging.h"
 #include "paddle/fluid/platform/enforce.h"
 namespace paddle {
@@ -22,27 +23,24 @@ namespace details {
 class ExceptionHolder {
 public:
-  void Catch(const platform::EnforceNotMet& exp) {
+  void Catch(std::exception_ptr eptr) {
-    std::lock_guard<std::mutex> lock(mu_);
+    try {
-    exception_.reset(new platform::EnforceNotMet(exp));
+      std::rethrow_exception(eptr);
-    type_ = kEnforceNotMet;
+    } catch (platform::EOFException exp) {
-  }
+      Catch(exp);
+    } catch (platform::EnforceNotMet exp) {
-  void Catch(const platform::EOFException& exp) {
+      Catch(exp);
-    std::lock_guard<std::mutex> lock(mu_);
+    } catch (...) {
-    // EOFException will not cover up existing EnforceNotMet.
+      LOG(FATAL) << "Unknown exception caught";
-    if (exception_.get() == nullptr) {
-      exception_.reset(new platform::EOFException(exp));
-      type_ = kEOF;
    }
  }
-  bool ExceptionCatched() const {
+  bool IsCaught() const {
    std::lock_guard<std::mutex> lock(mu_);
    return exception_.get() != nullptr;
  }
-  void Throw() {
+  void ReThrow() {
    std::lock_guard<std::mutex> lock(mu_);
    switch (type_) {
      case kNone:
@@ -50,27 +48,41 @@ class ExceptionHolder {
      case kEnforceNotMet: {
        auto e = *static_cast<platform::EnforceNotMet*>(exception_.get());
        throw e;
-        break;
      }
      case kEOF: {
        auto e = *static_cast<platform::EOFException*>(exception_.get());
        throw e;
-        break;
      }
-      default:
-        LOG(FATAL) << "Unknown exception.";
    }
-    exception_.reset();
+    ClearImpl();
-    type_ = kNone;
  }
  void Clear() {
    std::lock_guard<std::mutex> lock(mu_);
+    ClearImpl();
+  }
+ private:
+  void ClearImpl() {
    exception_.reset();
    type_ = kNone;
  }
- private:
+  void Catch(const platform::EnforceNotMet& exp) {
+    std::lock_guard<std::mutex> lock(mu_);
+    exception_.reset(new platform::EnforceNotMet(exp));
+    type_ = kEnforceNotMet;
+  }
+  void Catch(const platform::EOFException& exp) {
+    std::lock_guard<std::mutex> lock(mu_);
+    // EOFException will not cover up existing EnforceNotMet.
+    if (exception_.get() == nullptr) {
+      exception_.reset(new platform::EOFException(exp));
+      type_ = kEOF;
+    }
+  }
  enum ExceptionType { kNone, kEnforceNotMet, kEOF };
  ExceptionType type_{kNone};

--- a/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
+++ b/paddle/fluid/framework/details/threaded_ssa_graph_executor.cc
@@ -107,11 +107,11 @@ FeedFetchList ThreadedSSAGraphExecutor::Run(
    auto cur_ready_vars = ready_vars.PopAll(1, &timeout);
    if (timeout) {
-      if (exception_holder_.ExceptionCatched()) {
+      if (exception_holder_.IsCaught()) {
        for (auto &run_op_future : run_op_futures_) {
          run_op_future.wait();
        }
-        exception_holder_.Throw();
+        exception_holder_.ReThrow();
      } else {
        continue;
      }
@@ -220,12 +220,8 @@ void ThreadedSSAGraphExecutor::RunOp(
      running_ops_--;
      ready_var_q->Extend(op->Outputs());
      VLOG(10) << op << " " << op->Name() << "Signal posted";
-    } catch (platform::EOFException ex) {
-      exception_holder_.Catch(ex);
-    } catch (platform::EnforceNotMet ex) {
-      exception_holder_.Catch(ex);
    } catch (...) {
-      LOG(FATAL) << "Unknown exception catched";
+      exception_holder_.Catch(std::current_exception());
    }
  };
  if (pool_) {

--- a/paddle/fluid/framework/ir/graph.h
+++ b/paddle/fluid/framework/ir/graph.h
@@ -28,6 +28,38 @@ namespace paddle {
 namespace framework {
 namespace ir {
+/*
+ * The graph is a Directed Acyclic Single Static Assignment Graph.
+ *
+ * In more detail, the following properties must hold:
+ *
+ *   The graph shouldn't contain cycle. Each node is a black-box to the graph
+ *   so the node itself could be a loop operator.
+ *
+ *   Each Variable-type node has only one input (thus single static assignment).
+ *
+ *   The output/input of operator is variable and the output/input of variable
+ *   is operator.
+ *
+ * The following data harzards in Program are addressed in the Graph:
+ *
+ *   Write-After-Read
+ *     a = op1(x)
+ *     x = op2(b)
+ *     A control-dependency connection is created bettwen op1 and op2 such that
+ *     op1->op2, so as to ensure correct order.
+ *
+ *   Write-After-Write
+ *     x = op1(a)
+ *     x = op2(b)
+ *     A control-dependency connection is created between op1 and op2 such that
+ *     op1->op2, so as to ensure correct order.
+ *
+ * Other properties currently hold, but is not enforced yet:
+ *
+ *   Variable-type node (not control dep) with the same variable name share
+ *   the same underlying VarDesc.
+ */
 class Graph {
 public:
  explicit Graph(const ProgramDesc &program);

--- a/paddle/fluid/framework/ir/graph_test.cc
+++ b/paddle/fluid/framework/ir/graph_test.cc
@@ -36,7 +36,7 @@ class SumOpMaker : public OpProtoAndCheckerMaker {
 public:
  void Make() {
    AddInput("X", "").AsDuplicable();
-    AddOutput("Out", "");
+    AddOutput("Out", "").AsDuplicable();
    AddComment("");
  }
 };
@@ -59,11 +59,27 @@ class SumOpVarTypeInference : public VarTypeInference {
    block->Var(out_var_name)->SetType(default_var_type);
  }
 };
+class DummyOpMaker : public OpProtoAndCheckerMaker {
+ public:
+  void Make() {
+    AddInput("X", "").AsDuplicable();
+    AddOutput("Out", "").AsDuplicable();
+    AddComment("");
+  }
+};
+class DummyOpVarTypeInference : public VarTypeInference {
+ public:
+  void operator()(const OpDesc &op_desc, BlockDesc *block) const override {}
+};
 }  // namespace framework
 }  // namespace paddle
 REGISTER_OPERATOR(sum, paddle::framework::NOP, paddle::framework::SumOpMaker,
                  paddle::framework::SumOpVarTypeInference);
+REGISTER_OPERATOR(dummy, paddle::framework::NOP, paddle::framework::SumOpMaker,
+                  paddle::framework::SumOpVarTypeInference);
 REGISTER_OPERATOR(sum_without_infer_var_type, paddle::framework::NOP,
                  paddle::framework::SumOpMaker);
@@ -110,5 +126,83 @@ TEST(GraphTest, Basic) {
  }
  ASSERT_EQ(nodes.size(), 5);
 }
+TEST(GraphTest, WriteAfterRead) {
+  // void Test() {
+  ProgramDesc prog;
+  auto *op = prog.MutableBlock(0)->AppendOp();
+  op->SetType("sum");
+  op->SetInput("X", {"a"});
+  op->SetOutput("Out", {"b"});
+  op->SetAttr("op_role", 1);
+  op = prog.MutableBlock(0)->AppendOp();
+  op->SetType("dummy");
+  op->SetInput("X", {"c"});
+  op->SetOutput("Out", {"a"});
+  op->SetAttr("op_role", 1);
+  prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
+  std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
+  ir::Node *control_dep1 = nullptr;
+  ir::Node *control_dep2 = nullptr;
+  for (ir::Node *n : g->Nodes()) {
+    if (n->Name() == "sum") {
+      ASSERT_EQ(n->outputs[0]->Name(), "b");
+      ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1]));
+      control_dep1 = n->outputs[1];
+      ASSERT_EQ(n->outputs.size(), 2);
+    }
+    if (n->Name() == "dummy") {
+      ASSERT_EQ(n->inputs[0]->Name(), "c");
+      ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1]));
+      control_dep2 = n->inputs[1];
+      ASSERT_EQ(n->inputs.size(), 2);
+    }
+  }
+  ASSERT_EQ(control_dep1, control_dep2);
+}
+TEST(GraphTest, WriteAfterWrite) {
+  // void Test() {
+  ProgramDesc prog;
+  auto *op = prog.MutableBlock(0)->AppendOp();
+  op->SetType("sum");
+  op->SetInput("X", {"a"});
+  op->SetOutput("Out", {"b"});
+  op->SetAttr("op_role", 1);
+  op = prog.MutableBlock(0)->AppendOp();
+  op->SetType("dummy");
+  op->SetInput("X", {"c"});
+  op->SetOutput("Out", {"b"});
+  op->SetAttr("op_role", 1);
+  prog.MutableBlock(0)->Var("a")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("b")->SetType(proto::VarType::LOD_TENSOR);
+  prog.MutableBlock(0)->Var("c")->SetType(proto::VarType::LOD_TENSOR);
+  std::unique_ptr<ir::Graph> g(new ir::Graph(prog));
+  ir::Node *control_dep1 = nullptr;
+  ir::Node *control_dep2 = nullptr;
+  for (ir::Node *n : g->Nodes()) {
+    if (n->Name() == "sum") {
+      ASSERT_EQ(n->outputs[0]->Name(), "b");
+      ASSERT_TRUE(ir::IsControlDepVar(*n->outputs[1]));
+      ASSERT_EQ(n->outputs.size(), 2);
+      control_dep1 = n->outputs[1];
+    }
+    if (n->Name() == "dummy") {
+      ASSERT_EQ(n->inputs[0]->Name(), "c");
+      ASSERT_TRUE(ir::IsControlDepVar(*n->inputs[1]));
+      control_dep2 = n->inputs[1];
+      ASSERT_EQ(n->inputs.size(), 2);
+      ASSERT_EQ(control_dep1, control_dep2);
+    }
+  }
+}
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/tensor.cc
+++ b/paddle/fluid/framework/tensor.cc
@@ -112,5 +112,6 @@ Tensor& Tensor::Resize(const DDim& dims) {
 const DDim& Tensor::dims() const { return dims_; }
 int64_t Tensor::numel() const { return product(dims_); }
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/tensor_impl.h
+++ b/paddle/fluid/framework/tensor_impl.h
@@ -59,6 +59,14 @@ inline T* Tensor::mutable_data(platform::Place place) {
 }
 inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
+  int rank = src.dims().size();
+  PADDLE_ENFORCE_GE(
+      rank, 2,
+      "'ReshapeToMatrix()' is only used for flatten high rank "
+      "tensors to matrixs. Can not be used in reshaping vectors.");
+  if (rank == 2) {
+    return src;
+  }
  Tensor res;
  res.ShareDataWith(src);
  res.Resize(flatten_to_2d(src.dims(), num_col_dims));

--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -22,6 +22,9 @@ limitations under the License. */
 #include <vector>
 #include "paddle/fluid/inference/api/api_impl.h"
+#include "paddle/fluid/platform/profiler.h"
+DEFINE_bool(profile, false, "Turn on profiler for fluid");
 namespace paddle {
 namespace {
@@ -58,6 +61,15 @@ bool NativePaddlePredictor::Init(
    std::shared_ptr<framework::Scope> parent_scope) {
  VLOG(3) << "Predictor::init()";
+  if (FLAGS_profile) {
+    LOG(WARNING) << "Profiler is actived, might affect the performance";
+    LOG(INFO) << "You can turn off by set gflags '-profile false'";
+    auto tracking_device = config_.use_gpu ? platform::ProfilerState::kAll
+                                           : platform::ProfilerState::kCPU;
+    platform::EnableProfiler(tracking_device);
+  }
  if (config_.use_gpu) {
    place_ = paddle::platform::CUDAPlace(config_.device);
  } else {
@@ -102,6 +114,10 @@ bool NativePaddlePredictor::Init(
 }
 NativePaddlePredictor::~NativePaddlePredictor() {
+  if (FLAGS_profile) {
+    platform::DisableProfiler(platform::EventSortingKey::kTotal,
+                              "./profile.log");
+  }
  if (sub_scope_) {
    scope_->DeleteScope(sub_scope_);
  }

--- a/paddle/fluid/operators/.flatten_op.cc.swp
+++ b/paddle/fluid/operators/.flatten_op.cc.swp
--- a/paddle/fluid/operators/cross_entropy_op.cc
+++ b/paddle/fluid/operators/cross_entropy_op.cc
@@ -28,23 +28,26 @@ class CrossEntropyOp : public framework::OperatorWithKernel {
    auto x_dims = ctx->GetInputDim("X");
    auto label_dims = ctx->GetInputDim("Label");
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2UL, "Input(X)'s rank should be 2.");
+    int rank = x_dims.size();
-    PADDLE_ENFORCE_EQ(label_dims.size(), 2UL,
+    PADDLE_ENFORCE_EQ(rank, label_dims.size(),
-                      "Input(Label)'s rank should be 2.");
+                      "Input(X) and Input(Label) shall have the same rank.");
-    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
-                      "The 1st dimension of Input(X) and Input(Label) should "
+                      framework::slice_ddim(label_dims, 0, rank - 1),
-                      "be equal.");
+                      "Input(X) and Input(Label) shall have the same shape "
+                      "except the last dimension.");
    if (ctx->Attrs().Get<bool>("soft_label")) {
-      PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1],
+      PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1],
-                        "If Attr(soft_label) == true, the 2nd dimension of "
+                        "If Attr(soft_label) == true, the last dimension of "
                        "Input(X) and Input(Label) should be equal.");
    } else {
-      PADDLE_ENFORCE_EQ(label_dims[1], 1UL,
+      PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1UL,
-                        "If Attr(softLabel) == false, the 2nd dimension of "
+                        "If Attr(softLabel) == false, the last dimension of "
                        "Input(Label) should be 1.");
    }
-    ctx->SetOutputDim("Y", {x_dims[0], 1});
+    auto y_dims = x_dims;
+    y_dims[rank - 1] = 1;
+    ctx->SetOutputDim("Y", y_dims);
    ctx->ShareLoD("X", /*->*/ "Y");
  }
@@ -74,24 +77,28 @@ class CrossEntropyGradientOp : public framework::OperatorWithKernel {
    auto x_dims = ctx->GetInputDim("X");
    auto label_dims = ctx->GetInputDim("Label");
    auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y"));
-    PADDLE_ENFORCE_EQ(x_dims.size(), 2, "Input(X)'s rank should be 2.");
+    int rank = x_dims.size();
-    PADDLE_ENFORCE_EQ(dy_dims.size(), 2, "Input(Y@Grad)'s rank should be 2.");
+    PADDLE_ENFORCE_EQ(dy_dims.size(), rank,
-    PADDLE_ENFORCE_EQ(label_dims.size(), 2, "Input(Label)'s rank should be 2.");
+                      "Input(Y@Grad) and Input(X) should have the same rank.");
-    PADDLE_ENFORCE_EQ(x_dims[0], label_dims[0],
+    PADDLE_ENFORCE_EQ(label_dims.size(), rank,
-                      "The 1st dimension of Input(X) and Input(Label) should "
+                      "Input(Label) and Input(X) should have the same rank.");
-                      "be equal.");
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
-    PADDLE_ENFORCE_EQ(x_dims[0], dy_dims[0],
+                      framework::slice_ddim(label_dims, 0, rank - 1),
-                      "The 1st dimension of Input(X) and Input(Y@Grad) should "
+                      "The Input(X) and Input(Label) should have the same "
-                      "be equal.");
+                      "shape except the last dimension.");
-    PADDLE_ENFORCE_EQ(dy_dims[1], 1,
+    PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1),
-                      "The 2nd dimension of Input(Y@Grad) should be 1.");
+                      framework::slice_ddim(dy_dims, 0, rank - 1),
+                      "The Input(X) and Input(Y@Grad) should have the same "
+                      "shape except the last dimension.");
+    PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1,
+                      "The last dimension of Input(Y@Grad) should be 1.");
    if (ctx->Attrs().Get<bool>("soft_label")) {
-      PADDLE_ENFORCE_EQ(x_dims[1], label_dims[1],
+      PADDLE_ENFORCE_EQ(x_dims[rank - 1], label_dims[rank - 1],
-                        "When Attr(soft_label) == true, the 2nd dimension of "
+                        "When Attr(soft_label) == true, the last dimension of "
                        "Input(X) and Input(Label) should be equal.");
    } else {
-      PADDLE_ENFORCE_EQ(label_dims[1], 1,
+      PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1,
-                        "When Attr(soft_label) == false, the 2nd dimension of "
+                        "When Attr(soft_label) == false, the last dimension of "
                        "Input(Label) should be 1.");
    }
    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
@@ -113,18 +120,20 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
  void Make() override {
    AddInput("X",
-             "(Tensor, default Tensor<float>), a 2-D tensor with shape [N x D],"
+             "(Tensor, default Tensor<float>), a tensor whose last dimension "
-             " where N is the batch size and D is the number of classes. "
+             "size is equal to the number of classes. This input is a "
-             "This input is a probability computed by the previous operator, "
+             "probability computed by the previous operator, which is almost "
-             "which is almost always the result of a softmax operator.");
+             "always the result of a softmax operator.");
-    AddInput("Label",
+    AddInput(
-             "(Tensor), the ground truth which is a 2-D tensor. When "
+        "Label",
-             "soft_label is set to false, Label is a Tensor<int64> with shape "
+        "(Tensor), the tensor which represents the ground truth. It has the "
-             "[N x 1]. When soft_label is set to true, Label is a "
+        "same shape with 'X' except the last dimension. When soft_label is set "
-             "Tensor<float/double> with shape [N x D].");
+        "to false, the last dimension size is 1; when soft_label is set to "
+        "true, the last dimension size is equal to the number of classes.");
    AddOutput("Y",
-              "(Tensor, default Tensor<float>), a 2-D tensor with shape "
+              "(Tensor, default Tensor<float>), a tensor whose shape is same "
-              "[N x 1]. The cross entropy loss.");
+              "with 'X' except that the last dimension size is 1. It "
+              "represents the cross entropy loss.");
    AddAttr<bool>("soft_label",
                  "(bool, default false), a flag indicating whether to "
                  "interpretate the given labels as soft labels.")
@@ -132,6 +141,12 @@ class CrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
    AddComment(R"DOC(
 CrossEntropy Operator.
+The input 'X' and 'Label' will first be logically flattened to 2-D matrixs. 
+The matrix's second dimension(row length) is as same as the original last 
+dimension, and the first dimension(column length) is the product of all other 
+original dimensions. Then the softmax computation will take palce on each raw 
+of flattened matrixs.
 It supports both standard cross-entropy and soft-label cross-entropy loss
 computation.
 1) One-hot cross-entropy:

--- a/paddle/fluid/operators/cross_entropy_op.h
+++ b/paddle/fluid/operators/cross_entropy_op.h
@@ -33,8 +33,13 @@ class CrossEntropyOpKernel : public framework::OpKernel<T> {
    auto* y = ctx.Output<Tensor>("Y");
    y->mutable_data<T>(ctx.GetPlace());
+    int rank = x->dims().size();
+    Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1);
+    Tensor labels_2d = framework::ReshapeToMatrix(*labels, rank - 1);
+    Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1);
    math::CrossEntropyFunctor<DeviceContext, T>()(
-        ctx.template device_context<DeviceContext>(), y, x, labels,
+        ctx.template device_context<DeviceContext>(), &y_2d, &x_2d, &labels_2d,
        ctx.Attr<bool>("soft_label"));
  }
 };
@@ -98,9 +103,12 @@ class CrossEntropyGradientOpKernel : public framework::OpKernel<T> {
    auto* dy = ctx.Input<Tensor>(framework::GradVarName("Y"));
    auto* label = ctx.Input<Tensor>("Label");
    auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
-    auto* dx_data = dx->mutable_data<T>(ctx.GetPlace());
+    T* dx_data = dx->mutable_data<T>(ctx.GetPlace());
-    int64_t class_num = x->dims()[1];
+    // Following computation only depends on the last dimension size. So it's
+    // unnecessary to convert tensors to 2-D views.
+    int rank = x->dims().size();
+    int64_t class_num = x->dims()[rank - 1];
    if (ctx.Attr<bool>("soft_label")) {
      XeSoftlabelGradFunctor<T> functor(dx_data, dy->data<T>(), x->data<T>(),
                                        label->data<T>(),

--- a/paddle/fluid/operators/shape_op.cc
+++ b/paddle/fluid/operators/shape_op.cc
@@ -38,7 +38,7 @@ class ShapeOpMaker : public framework::OpProtoAndCheckerMaker {
    AddInput("Input", "(Tensor), The input tensor.");
    AddOutput("Out",
              "(Tensor), The shape of input tensor, the data type of the shape"
-              " is int64_t, will be on the same device with the input Tensor.");
+              " is int32_t, will be on the same device with the input Tensor.");
    AddComment(R"DOC(
 Shape Operator
@@ -53,5 +53,5 @@ Get the shape of input tensor. Only support CPU input Tensor now.
 namespace ops = paddle::operators;
 REGISTER_OPERATOR(shape, ops::ShapeOp, ops::ShapeOpMaker,
                  paddle::framework::EmptyGradOpMaker);
-REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<int>, ops::ShapeKernel<int64_t>,
+REGISTER_OP_CPU_KERNEL(shape, ops::ShapeKernel<int>, ops::ShapeKernel<int32_t>,
                       ops::ShapeKernel<float>, ops::ShapeKernel<double>);
--- a/paddle/fluid/operators/shape_op.cu
+++ b/paddle/fluid/operators/shape_op.cu
@@ -15,6 +15,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/shape_op.h"
 REGISTER_OP_CUDA_KERNEL(shape, paddle::operators::ShapeKernel<int>,
-                        paddle::operators::ShapeKernel<int64_t>,
+                        paddle::operators::ShapeKernel<int32_t>,
                        paddle::operators::ShapeKernel<float>,
                        paddle::operators::ShapeKernel<double>);
--- a/paddle/fluid/operators/shape_op.h
+++ b/paddle/fluid/operators/shape_op.h
@@ -27,7 +27,7 @@ class ShapeKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const override {
    auto* in_t = ctx.Input<Tensor>("Input");
    auto* out_t = ctx.Output<Tensor>("Out");
-    auto out_data = out_t->mutable_data<int64_t>(platform::CPUPlace());
+    auto out_data = out_t->mutable_data<int32_t>(platform::CPUPlace());
    auto in_dims = in_t->dims();
    for (int i = 0; i < in_dims.size(); ++i) {
      out_data[i] = in_dims[i];

--- a/paddle/fluid/operators/softmax_op.h
+++ b/paddle/fluid/operators/softmax_op.h
@@ -31,16 +31,12 @@ class SoftmaxKernel : public framework::OpKernel<T> {
    // allocate memory on device.
    Out->mutable_data<T>(context.GetPlace());
-    auto dims = X->dims();
+    int rank = X->dims().size();
-    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
+    Tensor X_2d = framework::ReshapeToMatrix(*X, rank - 1);
-    framework::LoDTensor flattened_x;
+    Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1);
-    framework::LoDTensor flattened_out;
-    flattened_x.ShareDataWith(*X).Resize(flattened_dims);
-    flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
    math::SoftmaxFunctor<DeviceContext, T>()(
-        context.template device_context<DeviceContext>(), &flattened_x,
+        context.template device_context<DeviceContext>(), &X_2d, &Out_2d);
-        &flattened_out);
  }
 };
@@ -55,18 +51,14 @@ class SoftmaxGradKernel : public framework::OpKernel<T> {
    // allocate memory on device.
    dX->mutable_data<T>(context.GetPlace());
-    auto dims = Out->dims();
+    int rank = Out->dims().size();
-    auto flattened_dims = framework::flatten_to_2d(dims, dims.size() - 1);
+    Tensor Out_2d = framework::ReshapeToMatrix(*Out, rank - 1);
-    framework::LoDTensor flattened_out;
+    Tensor dOut_2d = framework::ReshapeToMatrix(*dOut, rank - 1);
-    framework::LoDTensor flattened_d_out;
+    Tensor dX_2d = framework::ReshapeToMatrix(*dX, rank - 1);
-    framework::LoDTensor flattened_d_x;
-    flattened_out.ShareDataWith(*Out).Resize(flattened_dims);
-    flattened_d_out.ShareDataWith(*dOut).Resize(flattened_dims);
-    flattened_d_x.ShareDataWith(*dX).Resize(flattened_dims);
    math::SoftmaxGradFunctor<DeviceContext, T>()(
-        context.template device_context<DeviceContext>(), &flattened_out,
+        context.template device_context<DeviceContext>(), &Out_2d, &dOut_2d,
-        &flattened_d_out, &flattened_d_x);
+        &dX_2d);
  }
 };

--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -20,9 +20,11 @@ from .layer_function_generator import autodoc, templatedoc
 from ..layer_helper import LayerHelper
 from . import tensor
 from . import nn
+from . import ops
 from ... import compat as cpt
 import math
 import six
+import numpy
 from functools import reduce
 __all__ = [
@@ -266,10 +268,11 @@ def detection_output(loc,
        prior_box_var=prior_box_var,
        target_box=loc,
        code_type='decode_center_size')
-    old_shape = scores.shape
+    compile_shape = scores.shape
-    scores = nn.reshape(x=scores, shape=(-1, old_shape[-1]))
+    run_shape = ops.shape(scores)
+    scores = nn.flatten(x=scores, axis=2)
    scores = nn.softmax(input=scores)
-    scores = nn.reshape(x=scores, shape=old_shape)
+    scores = nn.reshape(x=scores, shape=compile_shape, actual_shape=run_shape)
    scores = nn.transpose(scores, perm=[0, 2, 1])
    scores.stop_gradient = True
    nmsed_outs = helper.create_tmp_variable(dtype=decoded_box.dtype)
@@ -679,9 +682,10 @@ def ssd_loss(location,
        raise ValueError("Only support mining_type == max_negative now.")
    num, num_prior, num_class = confidence.shape
+    conf_shape = ops.shape(confidence)
    def __reshape_to_2d(var):
-        return nn.reshape(x=var, shape=[-1, var.shape[-1]])
+        return nn.flatten(x=var, axis=2)
    # 1. Find matched boundding box by prior box.
    #   1.1 Compute IOU similarity between ground-truth boxes and prior boxes.
@@ -692,7 +696,8 @@ def ssd_loss(location,
    # 2. Compute confidence for mining hard examples
    # 2.1. Get the target label based on matched indices
-    gt_label = nn.reshape(x=gt_label, shape=gt_label.shape + (1, ))
+    gt_label = nn.reshape(
+        x=gt_label, shape=(len(gt_label.shape) - 1) * (0, ) + (-1, 1))
    gt_label.stop_gradient = True
    target_label, _ = target_assign(
        gt_label, matched_indices, mismatch_value=background_label)
@@ -703,9 +708,12 @@ def ssd_loss(location,
    target_label = __reshape_to_2d(target_label)
    target_label.stop_gradient = True
    conf_loss = nn.softmax_with_cross_entropy(confidence, target_label)
    # 3. Mining hard examples
-    conf_loss = nn.reshape(x=conf_loss, shape=(num, num_prior))
+    conf_loss = nn.reshape(
+        x=conf_loss,
+        shape=(num, num_prior),
+        actual_shape=ops.slice(
+            conf_shape, axes=[0], starts=[0], ends=[2]))
    conf_loss.stop_gradient = True
    neg_indices = helper.create_tmp_variable(dtype='int32')
    dtype = matched_indices.dtype
@@ -774,7 +782,11 @@ def ssd_loss(location,
    # 5.3 Compute overall weighted loss.
    loss = conf_loss_weight * conf_loss + loc_loss_weight * loc_loss
    # reshape to [N, Np], N is the batch size and Np is the prior box number.
-    loss = nn.reshape(x=loss, shape=[-1, num_prior])
+    loss = nn.reshape(
+        x=loss,
+        shape=(num, num_prior),
+        actual_shape=ops.slice(
+            conf_shape, axes=[0], starts=[0], ends=[2]))
    loss = nn.reduce_sum(loss, dim=1, keep_dim=True)
    if normalize:
        normalizer = nn.reduce_sum(target_loc_weight)
@@ -1007,13 +1019,7 @@ def multi_box_head(inputs,
    """
    def _reshape_with_axis_(input, axis=1):
-        if not (axis > 0 and axis < len(input.shape)):
+        out = nn.flatten(x=input, axis=axis)
-            raise ValueError("The axis should be smaller than "
-                             "the arity of input and bigger than 0.")
-        new_shape = [
-            -1, reduce(lambda x, y: x * y, input.shape[axis:len(input.shape)])
-        ]
-        out = nn.reshape(x=input, shape=new_shape)
        return out
    def _is_list_or_tuple_(data):
@@ -1103,11 +1109,13 @@ def multi_box_head(inputs,
            stride=stride)
        mbox_loc = nn.transpose(mbox_loc, perm=[0, 2, 3, 1])
-        new_shape = [
+        compile_shape = [
-            mbox_loc.shape[0], mbox_loc.shape[1] * mbox_loc.shape[2] *
+            mbox_loc.shape[0], cpt.floor_division(
-            cpt.floor_division(mbox_loc.shape[3], 4), 4
+                box_loc.shape[1] * mbox_loc.shape[2] * mbox_loc.shape[3], 4), 4
        ]
-        mbox_loc_flatten = nn.reshape(mbox_loc, shape=new_shape)
+        run_shape = tensor.assign(numpy.array([0, -1, 4]).astype("int32"))
+        mbox_loc_flatten = nn.reshape(
+            mbox_loc, shape=compile_shape, actual_shape=run_shape)
        mbox_locs.append(mbox_loc_flatten)
        # get conf
@@ -1119,11 +1127,16 @@ def multi_box_head(inputs,
            padding=pad,
            stride=stride)
        conf_loc = nn.transpose(conf_loc, perm=[0, 2, 3, 1])
-        new_shape = [
+        new_shape = [0, -1, num_classes]
-            conf_loc.shape[0], conf_loc.shape[1] * conf_loc.shape[2] *
+        compile_shape = [
-            cpt.floor_division(conf_loc.shape[3], num_classes), num_classes
+            conf_loc.shape[0],
+            cpt.floor_division(conf_loc.shape[1] * conf_loc.shape[2] *
+                               conf_loc.shape[3], num_classes), num_classes
        ]
-        conf_loc_flatten = nn.reshape(conf_loc, shape=new_shape)
+        run_shape = tensor.assign(
+            numpy.array([0, -1, num_classes]).astype("int32"))
+        conf_loc_flatten = nn.reshape(
+            conf_loc, shape=compile_shape, actual_shape=run_shape)
        mbox_confs.append(conf_loc_flatten)
    if len(box_results) == 1:

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -112,6 +112,7 @@ __all__ = [
    'log',
    'crop',
    'rank_loss',
+    'flatten',
 ]
@@ -5361,3 +5362,70 @@ def rank_loss(label, left, right, name=None):
                "Right": right},
        outputs={'Out': out})
    return out
+def flatten(x, axis=1, name=None):
+    """
+    **Flatten layer**
+    Flattens the input tensor into a 2D matrix.
+    Examples:
+    Case 1:
+      Given
+        X.shape = (3, 100, 100, 4)
+      and
+        axis = 2
+      We get:
+        Out.shape = (3 * 100, 4 * 100)
+    Case 2:
+      Given
+        X.shape = (3, 100, 100, 4)
+      and
+        axis = 0
+      We get:
+        Out.shape = (1, 3 * 100 * 100 * 4)
+    Args:
+        x (Variable): A tensor of rank >= axis.
+        axis (int): Indicate up to which input dimensions (exclusive) should 
+                    be flattened to the outer dimension of the output. 
+                    The value for axis must be in the range [0, R], where R
+                    is the rank of the input tensor. When axis = 0, the shape
+                    of the output tensor is (1, (d_0 X d_1 ... d_n), where the
+                    shape of the input tensor is (d_0, d_1, ... d_n).
+        name(str|None): A name for this layer(optional). If set None, the layer
+                        will be named automatically.
+    Returns:
+        Variable: A 2D tensor with the contents of the input tensor, with input
+                  dimensions up to axis flattened to the outer dimension of
+                  the output and remaining input dimensions flattened into the
+                  inner dimension of the output.
+    Raises:
+        ValueError: If x is not a variable.
+        ValueError: If axis is not in range [0, rank(x)]. 
+    Examples:
+        .. code-block:: python
+            x = fluid.layers.data(name="x", shape=[4, 4, 3], dtype="float32")
+            out = fluid.layers.flatten(x=x, axis=2)
+    """
+    helper = LayerHelper('flatten', **locals())
+    if not (isinstance(x, Variable)):
+        raise ValueError("The input x should be a Variable")
+    if not (isinstance(axis, int)) or axis > len(x.shape) or axis < 0:
+        raise ValueError("The axis should be a int, and in range [0, rank(x)]")
+    out = helper.create_tmp_variable(x.dtype)
+    helper.append_op(
+        type='flatten',
+        inputs={"X": x},
+        outputs={'Out': out},
+        attrs={"axis": axis})
+    return out
--- a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
@@ -105,5 +105,107 @@ class TestCrossEntropyOp3(OpTest):
            ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
+class TestCrossEntropyOp4(OpTest):
+    """Test high rank tensor cross-entropy with discrete one-hot labels.
+    """
+    def setUp(self):
+        self.op_type = "cross_entropy"
+        shape = [10, 2, 4]
+        ins_num = np.prod(np.array(shape))
+        class_num = 10
+        X_2d = randomize_probability(ins_num, class_num, dtype='float64')
+        label_2d = np.random.randint(0, class_num, (ins_num, 1), dtype="int64")
+        cross_entropy_2d = np.asmatrix(
+            [[-np.log(X_2d[i][label_2d[i][0]])] for i in range(X_2d.shape[0])],
+            dtype="float64")
+        X = X_2d.reshape(shape + [class_num])
+        label = label_2d.reshape(shape + [1])
+        cross_entropy = np.array(cross_entropy_2d).reshape(shape + [1])
+        self.inputs = {"X": X, "Label": label}
+        self.outputs = {"Y": cross_entropy}
+        self.attrs = {"soft_label": False}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(["X"], "Y", numeric_grad_delta=0.001)
+class TestCrossEntropyOp5(OpTest):
+    """Test high rank tensor cross-entropy with vectorized soft labels.
+    """
+    def setUp(self):
+        self.op_type = "cross_entropy"
+        shape = [4, 3]
+        ins_num = np.prod(np.array(shape))
+        class_num = 37
+        X_2d = randomize_probability(ins_num, class_num)
+        label_2d = np.random.uniform(0.1, 1.0,
+                                     [ins_num, class_num]).astype("float32")
+        label_2d /= label_2d.sum(axis=1, keepdims=True)
+        cross_entropy_2d = (-label_2d * np.log(X_2d)).sum(
+            axis=1, keepdims=True).astype("float32")
+        X = X_2d.reshape(shape + [class_num])
+        label = label_2d.reshape(shape + [class_num])
+        cross_entropy = np.array(cross_entropy_2d).reshape(shape + [1])
+        self.inputs = {"X": X, "Label": label}
+        self.outputs = {"Y": cross_entropy}
+        self.attrs = {"soft_label": True}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(
+            ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
+class TestCrossEntropyOp6(OpTest):
+    """Test high rank tensor cross-entropy with vectorized one-hot representation of labels.
+    """
+    def setUp(self):
+        self.op_type = "cross_entropy"
+        shape = [4, 3, 2]
+        ins_num = np.prod(np.array(shape))
+        class_num = 17
+        X_2d = randomize_probability(ins_num, class_num)
+        label_index_2d = np.random.randint(
+            0, class_num, (ins_num), dtype="int32")
+        label_2d = np.zeros(X_2d.shape)
+        label_2d[np.arange(ins_num), label_index_2d] = 1
+        cross_entropy_2d = np.asmatrix(
+            [[-np.log(X_2d[i][label_index_2d[i]])]
+             for i in range(X_2d.shape[0])],
+            dtype="float32")
+        X = X_2d.reshape(shape + [class_num])
+        label = label_2d.reshape(shape + [class_num])
+        cross_entropy = np.array(cross_entropy_2d).reshape(shape + [1])
+        self.inputs = {"X": X, "Label": label.astype(np.float32)}
+        self.outputs = {"Y": cross_entropy}
+        self.attrs = {"soft_label": True}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(
+            ["X"], "Y", max_relative_error=0.05, numeric_grad_delta=0.001)
 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -465,6 +465,17 @@ class TestBook(unittest.TestCase):
            self.assertIsNotNone(out)
        print(str(program))
+    def test_flatten(self):
+        program = Program()
+        with program_guard(program):
+            x = layers.data(
+                name='x',
+                append_batch_size=False,
+                shape=[4, 4, 3],
+                dtype="float32")
+            out = layers.flatten(x, axis=1, name="flatten")
+            self.assertIsNotNone(out)
    def test_shape(self):
        program = Program()
        with program_guard(program):

--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
@@ -532,7 +532,10 @@ class DistributeTranspiler(object):
        pserver_program._sync_with_cpp()
        return pserver_program
-    def get_startup_program(self, endpoint, pserver_program):
+    def get_startup_program(self,
+                            endpoint,
+                            pserver_program,
+                            startup_program=None):
        """
        Get startup program for current parameter server.
        Modify operator input variables if there are variables that
@@ -542,12 +545,17 @@ class DistributeTranspiler(object):
            endpoint (str): current pserver endpoint.
            pserver_program (Program): call get_pserver_program first and
                pass the result here.
+            startup_program (Program): if pass None, will use
+                default_startup_program
        Returns:
            Program: parameter server side startup program.
        """
        s_prog = Program()
+        if not startup_program:
            orig_s_prog = default_startup_program()
+        else:
+            orig_s_prog = startup_program
        s_prog.random_seed = orig_s_prog.random_seed
        params = self.param_grad_ep_mapping[endpoint]["params"]