Merge branch 'develop' into unsqueeze_op

938319bb · chenweihang · GitHub · b8ea7a08 · 092d6201 · 938319bb
28 changed file
--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -90,6 +90,20 @@ std::string DataFlowGraph::DotString() const {
  return dot.Build();
 }
+// Builds a plain-text summary of the graph: the repr() of every value node
+// under "Values:" and of every function node under "Functions:". Each flag
+// controls whether the matching section gets populated; both section headers
+// are always emitted.
+std::string DataFlowGraph::HumanReadableInfo(bool show_values,
+                                             bool show_functions) const {
+  std::stringstream value_lines;
+  std::stringstream function_lines;
+  for (auto &node : nodes.nodes()) {
+    if (show_values && node->IsValue()) {
+      value_lines << node->repr() << "\n";
+    }
+    if (show_functions && node->IsFunction()) {
+      function_lines << node->repr() << "\n";
+    }
+  }
+  std::string report = "Values:\n";
+  report += value_lines.str();
+  report += "\n\n";
+  report += "Functions:\n";
+  report += function_lines.str();
+  return report;
+}
 //
 // NodesBFSIterator
 //
@@ -146,7 +160,7 @@ bool GraphTraits<DataFlowGraph>::NodesBFSIterator::operator==(
  if ((!queue_.empty()) && (!other.queue_.empty())) {
    return queue_.front() == other.queue_.front() &&
           visited_.size() == other.visited_.size();  // here need to check the
-                                                      // equality of queue and
+    // equality of queue and
    // visited. Just a light but weak implementation.
  }
  return false;
@@ -208,6 +222,76 @@ Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() {
  return stack_.top();
 }
+// Computes a topological order of the nodes reachable from `source` and stores
+// it in sorted_. A node is emitted only after all of its inlinks have been
+// emitted; its outlinks then become candidates for later passes.
+// Fails with an enforce error when `source` is empty or when no valid order
+// exists for the pending nodes.
+GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
+    const std::vector<Node *> &source) {
+  PADDLE_ENFORCE(!source.empty(),
+                 "Start points of topological sorting should not be empty!");
+  std::unordered_set<Node *> visited;
+  std::unordered_set<Node *> to_visit{source.begin(), source.end()};
+  while (!to_visit.empty()) {
+    // Iterate over a snapshot because to_visit is modified inside the loop.
+    std::vector<Node *> queue(to_visit.begin(), to_visit.end());
+    bool progressed = false;
+    for (auto *p : queue) {
+      const bool inlinks_ready =
+          std::all_of(p->inlinks.begin(), p->inlinks.end(),
+                      [&](Node *x) { return visited.count(x) != 0; });
+      if (!inlinks_ready) continue;
+      sorted_.push_back(p);
+      for (auto *out : p->outlinks) {
+        if (!visited.count(out)) {
+          to_visit.insert(out);
+        }
+      }
+      to_visit.erase(p);
+      visited.insert(p);
+      progressed = true;
+    }
+    // A pass that emits nothing leaves the state unchanged, so the loop would
+    // spin forever: this happens on a cycle, or when a pending node has an
+    // inlink that can not be reached from the start points.
+    PADDLE_ENFORCE(progressed,
+                   "Topological sorting can not make progress, the graph has "
+                   "a cycle or a node whose input is unreachable from the "
+                   "start points.");
+  }
+}
+// Copy constructor: duplicates both the sorted sequence and the position.
+GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
+    const NodesTSIterator &other)
+    : sorted_(other.sorted_), cursor_(other.cursor_) {}
+// Returns the node at the current position. Enforces that the cursor is still
+// inside the sorted sequence.
+Node &GraphTraits<DataFlowGraph>::NodesTSIterator::operator*() {
+  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
+  Node *current = sorted_[cursor_];
+  return *current;
+}
+// Advances to the next node. Once the sequence is exhausted the iterator is
+// reset to an empty sequence with cursor 0, i.e. the same state as a
+// default-constructed iterator.
+auto GraphTraits<DataFlowGraph>::NodesTSIterator::operator++()
+    -> NodesTSIterator & {
+  ++cursor_;
+  if (cursor_ >= sorted_.size()) {
+    sorted_.clear();
+    cursor_ = 0;
+  }
+  return *this;
+}
+// Copy assignment: takes over the sorted sequence and the position of `other`.
+auto GraphTraits<DataFlowGraph>::NodesTSIterator::operator=(
+    const NodesTSIterator &other) -> NodesTSIterator & {
+  sorted_ = other.sorted_;
+  cursor_ = other.cursor_;
+  return *this;
+}
+// Two iterators are equal when they are at the same position of an identical
+// sorted sequence.
+bool GraphTraits<DataFlowGraph>::NodesTSIterator::operator==(
+    const NodesTSIterator &other) {
+  const bool same_position = cursor_ == other.cursor_;
+  return same_position && sorted_ == other.sorted_;
+}
+// Gives pointer access to the node at the current position. Enforces that the
+// cursor is still inside the sorted sequence.
+Node *GraphTraits<DataFlowGraph>::NodesTSIterator::operator->() {
+  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
+  Node *const current = sorted_[cursor_];
+  return current;
+}
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -48,6 +48,9 @@ struct DataFlowGraph {
  // Output a DOT graph file for debug.
  std::string DotString() const;
+  std::string HumanReadableInfo(bool show_values = true,
+                                bool show_functions = true) const;
 private:
  // Remove duplicate edges and so on.
  void Clean();
@@ -107,6 +110,32 @@ struct GraphTraits<DataFlowGraph> {
    std::unordered_set<Node *> visited_;
  };
+  // Topological sorting iterator on nodes.
+  // The complete order is computed by the constructor that takes the start
+  // points and is kept in sorted_; cursor_ is the current position in it.
+  // A default-constructed iterator (empty sequence, cursor 0) is the state an
+  // iterator reaches after its last element, so it acts as the end sentinel.
+  struct NodesTSIterator
+      : public std::iterator<std::forward_iterator_tag, Node *> {
+    NodesTSIterator() = default;
+    explicit NodesTSIterator(const std::vector<Node *> &source);
+    // Steals the sorted sequence of `other` and resets its cursor.
+    // noexcept lets containers and algorithms move instead of copy.
+    NodesTSIterator(NodesTSIterator &&other) noexcept
+        : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
+      other.cursor_ = 0;
+    }
+    NodesTSIterator(const NodesTSIterator &other);
+    Node &operator*();
+    NodesTSIterator &operator++();
+    NodesTSIterator &operator=(const NodesTSIterator &other);
+    // Equality compares the whole sorted sequence and the cursor position.
+    bool operator==(const NodesTSIterator &other);
+    bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
+    Node *operator->();
+   private:
+    std::vector<Node *> sorted_;
+    int cursor_{0};
+  };
  explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}
  // default use BFS to visit the nodes.
@@ -119,17 +148,24 @@ struct GraphTraits<DataFlowGraph> {
  iterator_range<NodesDFSIterator> nodes_in_DFS() {
    return iterator_range<NodesDFSIterator>(nodes_dfs_begin(), nodes_dfs_end());
  }
+  // Range over the nodes in topological order, starting from graph_->inputs.
+  iterator_range<NodesTSIterator> nodes_in_TS() {
+    return iterator_range<NodesTSIterator>(nodes_ts_begin(), nodes_ts_end());
+  }
 private:
  NodesBFSIterator nodes_bfs_begin() {
    return NodesBFSIterator(graph_->inputs);
  }
  NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); }
  NodesDFSIterator nodes_dfs_begin() {
    return NodesDFSIterator(graph_->inputs);
  }
  NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); }
+  // Begin of the topological traversal, built from graph_->inputs.
+  NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_->inputs); }
+  // End of the topological traversal: a default-constructed iterator.
+  NodesTSIterator nodes_ts_end() { return NodesTSIterator(); }
 private:
  DataFlowGraph *graph_;
 };

--- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
@@ -24,11 +24,11 @@ TEST(DataFlowGraph, BFS) {
  auto dfg = ProgramDescToDFG(desc);
  dfg.Build();
-  for (auto* in : dfg.inputs) {
+  for (auto *in : dfg.inputs) {
    LOG(INFO) << "inputs: " << in->name() << " "
              << static_cast<int>(in->type());
  }
-  for (auto* out : dfg.outputs) {
+  for (auto *out : dfg.outputs) {
    LOG(INFO) << "outputs: " << out->name() << " "
              << static_cast<int>(out->type());
  }
@@ -57,6 +57,71 @@ TEST(DataFlowGraph, DFS) {
  ASSERT_EQ(count, dfg.nodes.size());
 }
+// Topological sorting.
+/*
+ * Graph topology
+ * inputs: 0, 1, 2
+ * 0 -> 4
+ * 0 -> 5
+ * 1 -> 6
+ * 2 -> 7
+ * 4 -> 5
+ * 4 -> 7
+ * 4 -> 3
+ * 7 -> 3
+ */
+// Builds the 8-node graph described above and checks that the topological
+// iterator emits every node, and emits the source of each edge before its
+// target.
+TEST(DataFlowGraph, TS) {
+  DataFlowGraph graph;
+  for (int i = 0; i < 8; i++) {
+    auto *node = graph.nodes.Create(Node::Type::kValue);
+    node->SetName("node-" + std::to_string(i));
+  }
+  auto add_link = [&](int i, int j) {
+    Node *source = graph.nodes.GetMutable(i);
+    Node *target = graph.nodes.GetMutable(j);
+    target->inlinks.push_back(source);
+    source->outlinks.push_back(target);
+  };
+  graph.inputs.push_back(graph.nodes.GetMutable(0));
+  graph.inputs.push_back(graph.nodes.GetMutable(1));
+  graph.inputs.push_back(graph.nodes.GetMutable(2));
+  add_link(0, 4);
+  add_link(0, 5);
+  add_link(1, 6);
+  add_link(2, 7);
+  add_link(4, 5);
+  add_link(4, 7);
+  add_link(4, 3);
+  add_link(7, 3);
+  auto its = GraphTraits<DataFlowGraph>(&graph).nodes_in_TS();
+  std::vector<int> sorted_ids;
+  for (auto it = its.begin(); it != its.end(); ++it) {
+    LOG(INFO) << it->name();
+    sorted_ids.push_back(it->id());
+  }
+  // All 8 nodes are reachable from the inputs, so none may be dropped.
+  ASSERT_EQ(sorted_ids.size(), 8UL);
+  // Assert a occurs prior to b in the sorted_ids.
+  auto assert_positive_sequence_pair = [&](int a, int b) {
+    auto a_offset = std::find(sorted_ids.begin(), sorted_ids.end(), a);
+    auto b_offset = std::find(sorted_ids.begin(), sorted_ids.end(), b);
+    // Both ids have to be present: a missing b would otherwise compare as
+    // end() and make the ordering check pass vacuously.
+    ASSERT_NE(a_offset, sorted_ids.end());
+    ASSERT_NE(b_offset, sorted_ids.end());
+    ASSERT_LT(a_offset, b_offset);
+  };
+  assert_positive_sequence_pair(2, 7);
+  assert_positive_sequence_pair(7, 3);
+  assert_positive_sequence_pair(4, 3);
+  assert_positive_sequence_pair(0, 4);
+  assert_positive_sequence_pair(0, 5);
+  assert_positive_sequence_pair(1, 6);
+  assert_positive_sequence_pair(4, 5);
+  assert_positive_sequence_pair(4, 7);
+}
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -266,6 +266,7 @@ op_library(warpctc_op DEPS dynload_warpctc sequence_padding sequence_scale)
 op_library(cos_sim_op DEPS cos_sim_functor)
 op_library(parallel_do_op DEPS executor)
 op_library(unsqueeze_op DEPS reshape_op)
+op_library(squeeze_op DEPS reshape_op)
 if (WITH_GPU)
    op_library(conv_op DEPS vol2col depthwise_conv im2col)

--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -86,8 +86,9 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
                         std::minstd_rand engine,
                         std::vector<int>* inds) const {
    std::uniform_real_distribution<float> uniform(0, 1);
-    if (inds->size() > num) {
+    const int64_t size = static_cast<int64_t>(inds->size());
-      for (int i = num; i < inds->size(); ++i) {
+    if (size > num) {
+      for (int64_t i = num; i < size; ++i) {
        int rng_ind = std::floor(uniform(engine) * i);
        if (rng_ind < num)
          std::iter_swap(inds->begin() + rng_ind + offset,

--- a/paddle/fluid/operators/im2sequence_op.cc
+++ b/paddle/fluid/operators/im2sequence_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #include "paddle/fluid/operators/im2sequence_op.h"
+#include <string>
 #include <vector>
 namespace paddle {
@@ -28,20 +29,19 @@ class Im2SequenceOp : public framework::OperatorWithKernel {
                   "Input(X) of Im2SequenceOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of Im2SequenceOp op should not be null.");
    auto in_dim = ctx->GetInputDim("X");
    PADDLE_ENFORCE_EQ(in_dim.size(), 4,
                      "Input(X) format must be 4D tensor, eg., NCHW.");
-    auto kernels = ctx->Attrs().Get<std::vector<int>>("kernels");
-    auto strides = ctx->Attrs().Get<std::vector<int>>("strides");
-    auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
    int batch_size = in_dim[0];
    int img_channels = in_dim[1];
    int img_height = in_dim[2];
    int img_width = in_dim[3];
+    auto kernels = ctx->Attrs().Get<std::vector<int>>("kernels");
+    auto strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
    int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
                                         paddings[2], strides[0]);
    int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
@@ -61,6 +61,10 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker {
             "C: channels"
             "H: height"
             "W: width");
+    AddInput("Y",
+             "(Tensor) The input tensor of image real size(H, W)."
+             "2-D with shape [batchsize, 2]")
+        .AsDispensable();
    AddOutput("Out", "(LodTensor) The output data of im2sequence op,");
    AddAttr<std::vector<int>>("kernels",
                              "(vector<int>), the "
@@ -73,6 +77,13 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker {
                              "(vector<int> default:{0, 0, 0, 0}), the "
                              "paddings(up_pad, left_pad, down_pad, right_pad)")
        .SetDefault({0, 0, 0, 0});
+    AddAttr<std::vector<int>>("out_stride",
+                              "the attribute is valid only when input(Y)"
+                              "is not NULL.this attribute represents the"
+                              "scaling of the pic through the CNN"
+                              "(vector<int> dedault:{1,1}),the out_stride"
+                              " (out_stride_height, out_stride_width)")
+        .SetDefault({1, 1});
    AddComment(R"DOC(
 This op uses kernels to scan images and converts these images to sequences.
 After expanding, The number of time steps are output_height * output_width
@@ -123,7 +134,7 @@ output.data = [[ 6.  2.  8.  3.  2.  4.  6.  3.]
               [ 7.  1.  7.  9.  2.  1.  3.  5.]
               [ 5.  7.  2.  4.  1.  3.  9.  0.]
               [ 7.  9.  4.  8.  3.  5.  0.  8.]]
-output.dims = {8, 9}
+output.dims = {8, 8}
 output.lod = [[0, 4, 8]]
 )DOC");

--- a/paddle/fluid/operators/im2sequence_op.h
+++ b/paddle/fluid/operators/im2sequence_op.h
@@ -13,6 +13,7 @@
   limitations under the License. */
 #pragma once
+#include <string>
 #include <vector>
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/eigen.h"
@@ -39,50 +40,106 @@ class Im2SequenceKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const override {
    const Tensor* in = ctx.Input<Tensor>("X");
    LoDTensor* out = ctx.Output<LoDTensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-    // TODO(wanghaoshuang): Add layout checker after 'set_layout'
-    // being available for python API
-    // PADDLE_ENFORCE_EQ(in->layout(), framework::DataLayout::kNCHW,
-    //                  "Input(X) layout must be NCHW");
    auto in_dim = in->dims();
    int batch_size = in_dim[0];
    int img_channels = in_dim[1];
    int img_height = in_dim[2];
    int img_width = in_dim[3];
    auto kernels = ctx.Attr<std::vector<int>>("kernels");
    auto strides = ctx.Attr<std::vector<int>>("strides");
    auto paddings = ctx.Attr<std::vector<int>>("paddings");
-    int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
+    if (ctx.HasInput("Y") && batch_size > 1) {
-                                         paddings[2], strides[0]);
+      const Tensor* imgrealsize = ctx.Input<Tensor>("Y");
-    int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
+      auto out_stride = ctx.Attr<std::vector<int>>("out_stride");
-                                        paddings[3], strides[1]);
+      Tensor cpu_shape_tensor;
+      TensorCopySync(*imgrealsize, platform::CPUPlace(), &cpu_shape_tensor);
-    const std::vector<int> dilations({1, 1});
+      std::vector<int> imgreal_h;
+      std::vector<int> imgreal_w;
-    auto out_dims = out->dims();
+      std::vector<int> output_height;
-    out->Resize({batch_size, out->numel() / batch_size});
+      std::vector<int> output_width;
-    for (int i = 0; i < batch_size; i++) {
+      int result = 0;
-      const Tensor src =
+      for (int i = 0; i < batch_size; i++) {
-          in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
+        int tmp_real_h = static_cast<int>((cpu_shape_tensor.data<T>())[2 * i]);
-      Tensor dst = out->Slice(i, i + 1).Resize(
+        int tmp_real_w =
-          {output_height, output_width, img_channels, kernels[0], kernels[1]});
+            static_cast<int>((cpu_shape_tensor.data<T>())[2 * i + 1]);
+        if (tmp_real_h % out_stride[0] == 0) {
-      math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
+          tmp_real_h = tmp_real_h / out_stride[0];
-      auto& dev_ctx = ctx.template device_context<DeviceContext>();
+        } else {
-      f(dev_ctx, src, dilations, strides, paddings, &dst);
+          tmp_real_h = tmp_real_h / out_stride[0] + 1;
-    }
+        }
-    out->Resize(out_dims);
+        if (tmp_real_w % out_stride[1] == 0) {
+          tmp_real_w = tmp_real_w / out_stride[1];
-    // set lod information
+        } else {
-    // TODO(wanghaoshuang): Move this to InferShape
+          tmp_real_w = tmp_real_w / out_stride[1] + 1;
-    framework::LoD lod(1);
+        }
-    lod[0].reserve(batch_size + 1);
+        imgreal_h.push_back(tmp_real_h);
-    for (int i = 0, offset = 0; i < batch_size + 1; ++i) {
+        imgreal_w.push_back(tmp_real_w);
+        output_height.push_back(Im2SeqOutputSize(
+            imgreal_h[i], kernels[0], paddings[0], paddings[2], strides[0]));
+        output_width.push_back(Im2SeqOutputSize(
+            imgreal_w[i], kernels[1], paddings[1], paddings[3], strides[1]));
+        result += output_height[i] * output_width[i];
+      }
+      out->mutable_data<T>({result, img_channels * kernels[0] * kernels[1]},
+                           ctx.GetPlace());
+      const std::vector<int> dilations({1, 1});
+      int offset_out = 0;
+      for (int i = 0; i < batch_size; i++) {
+        const Tensor src =
+            in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
+        Tensor dst = out->Slice(offset_out,
+                                offset_out + output_height[i] * output_width[i])
+                         .Resize({output_height[i], output_width[i],
+                                  img_channels, kernels[0], kernels[1]});
+        offset_out += output_height[i] * output_width[i];
+        math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
+        auto& dev_ctx = ctx.template device_context<DeviceContext>();
+        f(dev_ctx, src, dilations, strides, paddings, &dst);
+      }
+      framework::LoD lod(1);
+      lod[0].reserve(batch_size + 1);
+      int offset = 0;
+      lod[0].push_back(offset);
+      for (int i = 0; i < batch_size; ++i) {
+        offset += output_height[i] * output_width[i];
+        lod[0].push_back(offset);
+      }
+      out->set_lod(lod);
+    } else {
+      out->mutable_data<T>(ctx.GetPlace());
+      int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
+                                           paddings[2], strides[0]);
+      int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
+                                          paddings[3], strides[1]);
+      const std::vector<int> dilations({1, 1});
+      auto out_dims = out->dims();
+      out->Resize({batch_size, out->numel() / batch_size});
+      for (int i = 0; i < batch_size; i++) {
+        const Tensor src =
+            in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
+        Tensor dst =
+            out->Slice(i, i + 1).Resize({output_height, output_width,
+                                         img_channels, kernels[0], kernels[1]});
+        math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
+        auto& dev_ctx = ctx.template device_context<DeviceContext>();
+        f(dev_ctx, src, dilations, strides, paddings, &dst);
+      }
+      out->Resize(out_dims);
+      framework::LoD lod(1);
+      lod[0].reserve(batch_size + 1);
+      int offset = 0;
      lod[0].push_back(offset);
-      offset += output_height * output_width;
+      for (int i = 0; i < batch_size; ++i) {
+        offset += output_height * output_width;
+        lod[0].push_back(offset);
+      }
+      out->set_lod(lod);
    }
-    out->set_lod(lod);
  }
 };

--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
@@ -43,21 +43,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
    int col_height = col->dims()[3];
    int col_width = col->dims()[4];
-    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
-                       ((dilation[0] * (filter_height - 1) + 1))) /
-                              stride[0] +
-                          1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-                       ((dilation[1] * (filter_width - 1) + 1))) /
-                              stride[1] +
-                          1,
-                      col_width,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
    int channels_col = im_channels * filter_height * filter_width;
    const T* im_data = im.data<T>();
@@ -178,17 +163,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
    int col_height = col->dims()[0];
    int col_width = col->dims()[1];
-    PADDLE_ENFORCE_EQ(
-        (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
-        col_height,
-        "Output_height and padding(padding_up, padding_down) are "
-        "inconsistent.");
-    PADDLE_ENFORCE_EQ(
-        (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
-        col_width,
-        "col_width and padding(padding_left, padding_right) are "
-        "inconsistent.");
    const T* im_data = im.data<T>();
    T* col_data = col->data<T>();

--- a/paddle/fluid/operators/math/im2col.cu
+++ b/paddle/fluid/operators/math/im2col.cu
@@ -77,21 +77,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
    int col_height = col->dims()[3];
    int col_width = col->dims()[4];
-    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
-                       (dilation[0] * (filter_height - 1) + 1)) /
-                              stride[0] +
-                          1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-                       (dilation[1] * (filter_width - 1) + 1)) /
-                              stride[1] +
-                          1,
-                      col_width,
-                      "col_width and padding(padding_left, padding_right) are "
-                      "inconsistent.");
    int num_outputs = im_channels * col_height * col_width;
    int blocks = (num_outputs + 1024 - 1) / 1024;
    int block_x = 512;
@@ -274,21 +259,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
    int col_height = col->dims()[0];
    int col_width = col->dims()[1];
-    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
-                       (dilation[0] * (filter_height - 1) + 1)) /
-                              stride[0] +
-                          1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-                       (dilation[1] * (filter_width - 1) + 1)) /
-                              stride[1] +
-                          1,
-                      col_width,
-                      "col_width and padding(padding_left, padding_right) are "
-                      "inconsistent.");
    int block_dim_x = 0;
    int block_dim_y = 0;
    if (filter_height <= 4 && filter_width <= 4) {

--- a/paddle/fluid/operators/squeeze_op.cc
+++ b/paddle/fluid/operators/squeeze_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+// Shape inference of the squeeze operator: validates Input(X) and the "axes"
+// attribute, then sets the shape of Output(Out) to the shape of X with the
+// selected size-1 dimensions removed.
+class SqueezeOpInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SqueezeOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SqueezeOp should not be null.");
+    const auto &x_dims = ctx->GetInputDim("X");
+    // Check input tensor dims (<=6) Eigen limit.
+    PADDLE_ENFORCE(x_dims.size() <= 6,
+                   "Invalid dimnesions, the rank of Input(X) "
+                   "should be in the range of [1, 6] (Eigen limit).");
+    const auto &axes = ctx->Attrs().Get<std::vector<int>>("axes");
+    for (int a : axes) {
+      PADDLE_ENFORCE_LT(a, x_dims.size(),
+                        "The squeeze axis should be less than input "
+                        "tensor's rank.");
+    }
+    auto out_dims = GetOutputShape(axes, x_dims);
+    ctx->SetOutputDim("Out", out_dims);
+    // Only pass LoD when the first dimension of output and Input(X)
+    // are the same. The output shape is empty when every dimension of X was
+    // squeezed, so the leading dimensions are compared only if both exist.
+    if (x_dims.size() > 0 && out_dims.size() > 0 &&
+        x_dims[0] == out_dims[0]) {
+      ctx->ShareLoD("X", "Out");
+    }
+  }
+  // Returns in_dims without the squeezed dimensions.
+  //   squeeze_dims: axes to remove; a negative axis counts from the back. When
+  //                 empty, every dimension of size 1 is removed.
+  //   in_dims:      shape of the input tensor.
+  // Enforces that each axis is inside [-rank, rank) and refers to a dimension
+  // of size 1. Duplicate axes are counted once.
+  static framework::DDim GetOutputShape(const std::vector<int> &squeeze_dims,
+                                        const framework::DDim &in_dims) {
+    const int rank = in_dims.size();
+    int cnt_squeezed_dims = 0;
+    // Sized by the actual rank, so this function is safe on its own and does
+    // not depend on a rank check done by the caller.
+    std::vector<bool> should_squeeze(static_cast<size_t>(rank), false);
+    // Mark and count the dimensions need to be squeezed
+    if (squeeze_dims.empty()) {
+      for (int idx = 0; idx < rank; ++idx) {
+        if (in_dims[idx] == 1) {
+          should_squeeze[idx] = true;
+          ++cnt_squeezed_dims;
+        }
+      }
+    } else {
+      for (int axis : squeeze_dims) {
+        const int current = axis < 0 ? axis + rank : axis;
+        PADDLE_ENFORCE(current >= 0,
+                       "Invalid axis, the negative axis is out of range.");
+        // Checked here as well: this function is also called directly,
+        // outside of operator() above.
+        PADDLE_ENFORCE(current < rank,
+                       "The squeeze axis should be less than input "
+                       "tensor's rank.");
+        PADDLE_ENFORCE(in_dims[current] == 1,
+                       "Invalid axis index, the axis that will be squeezed "
+                       "should be equal to 1.");
+        if (!should_squeeze[current]) {
+          ++cnt_squeezed_dims;
+        }
+        should_squeeze[current] = true;
+      }
+    }
+    // Make output dimensions
+    std::vector<int64_t> output_shape(rank - cnt_squeezed_dims, 0);
+    for (int in_idx = 0, out_idx = 0; in_idx < rank; ++in_idx) {
+      if (!should_squeeze[in_idx]) {
+        output_shape[out_idx++] = in_dims[in_idx];
+      }
+    }
+    return framework::make_ddim(output_shape);
+  }
+};
+// Squeeze is executed by delegation: the squeezed shape is computed from
+// Input(X) and the "axes" attribute, then a "reshape" operator is created and
+// run with that shape.
+class SqueezeOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto &squeeze_axes = Attr<std::vector<int>>("axes");
+    auto input_dims =
+        scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    auto squeezed_dims =
+        SqueezeOpInferShape::GetOutputShape(squeeze_axes, input_dims);
+    // Attributes forwarded to the reshape operator.
+    framework::AttributeMap reshape_attrs;
+    reshape_attrs["shape"] = framework::vectorize2int(squeezed_dims);
+    reshape_attrs["inplace"] = Attr<bool>("inplace");
+    // Invoke Reshape Op
+    framework::OpRegistry::CreateOp(
+        "reshape", {{"X", {Input("X")}}, {"Shape", {}}},
+        {{"Out", {Output("Out")}}}, reshape_attrs)
+        ->Run(scope, place);
+  }
+};
+// Declares the interface of the squeeze operator: one input "X", one output
+// "Out", and the attributes "axes" and "inplace", plus the user-facing
+// documentation text.
+class SqueezeOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor). The input tensor of squeeze operator.");
+    AddOutput("Out", "(Tensor). The output tensor of squeeze operator.");
+    // Defaults to an empty list.
+    AddAttr<std::vector<int>>("axes",
+                              "(std::vector<int>). List of integers,"
+                              " indicating the dimensions to squeeze.")
+        .SetDefault({});
+    // Defaults to false.
+    AddAttr<bool>("inplace",
+                  "(default: false) Squeeze the source tensor's shape without "
+                  "memory copy. When Attr(inplace) is set true, the output "
+                  "tensor shares memory with Input(X), otherwise, a new output "
+                  "tensor is created, and its data are copied from Input(x).")
+        .SetDefault(false);
+    AddComment(R"DOC(
+        Squeeze Operator.
+        Remove single-dimensional entries from the shape of a tensor. 
+        Takes a parameter axes with a list of axes to squeeze. 
+        If axes is not provided, all the single dimensions will be removed from the shape. 
+        If an axis is selected with shape entry not equal to one, an error is raised.
+        Examples:
+        Case 1:
+          Given 
+            X.shape = (1, 3, 1, 5)
+          and
+            axes = [0]
+          we get:
+            Out.shape = (3, 1, 5)
+        Case 2:
+          Given
+            X.shape = (1, 3, 1, 5)
+          and 
+            axes = []
+          we get:
+            Out.shape = (3, 5)
+    )DOC");
+  }
+};
+// Shape inference of squeeze_grad: the gradient of X gets the shape and the
+// LoD of X.
+class SqueezeGradInferShape : public framework::InferShapeBase {
+ public:
+  void operator()(framework::InferShapeContext *context) const override {
+    const auto x_grad_name = framework::GradVarName("X");
+    const auto x_dims = context->GetInputDim("X");
+    context->SetOutputDim(x_grad_name, x_dims);
+    context->ShareLoD("X", x_grad_name);
+  }
+};
+// Backward of squeeze, also done by delegation: the gradient of Out is
+// reshaped to the shape of X by a "reshape" operator and written to the
+// gradient of X.
+class SqueezeGradOp : public framework::OperatorBase {
+ public:
+  using OperatorBase::OperatorBase;
+ private:
+  void RunImpl(const framework::Scope &scope,
+               const platform::Place &place) const override {
+    auto x_grad_var = Output(framework::GradVarName("X"));
+    auto out_grad_var = Input(framework::GradVarName("Out"));
+    auto input_dims =
+        scope.FindVar(Input("X"))->Get<framework::LoDTensor>().dims();
+    // Attributes forwarded to the reshape operator.
+    framework::AttributeMap reshape_attrs;
+    reshape_attrs["shape"] = framework::vectorize2int(input_dims);
+    reshape_attrs["inplace"] = Attr<bool>("inplace");
+    framework::OpRegistry::CreateOp(
+        "reshape", {{"X", {out_grad_var}}, {"Shape", {}}},
+        {{"Out", {x_grad_var}}}, reshape_attrs)
+        ->Run(scope, place);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+// Tell linker to use reshape op
+// (both SqueezeOp and SqueezeGradOp create a "reshape" operator at run time).
+USE_OP(reshape);
+namespace ops = paddle::operators;
+// Forward operator: implementation, proto maker and shape inference, plus a
+// default grad-op description maker.
+REGISTER_OPERATOR(squeeze, ops::SqueezeOp, ops::SqueezeOpMaker,
+                  ops::SqueezeOpInferShape,
+                  paddle::framework::DefaultGradOpDescMaker<true>);
+// Backward operator: implementation and shape inference.
+REGISTER_OPERATOR(squeeze_grad, ops::SqueezeGradOp, ops::SqueezeGradInferShape);
--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -46,7 +46,7 @@ ENDIF()
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
 cc_library(device_context SRCS device_context.cc init.cc DEPS malloc
-    place eigen3 stringpiece cpu_helper ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
+    place eigen3 stringpiece cpu_helper framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)
 cc_test(init_test SRCS init_test.cc DEPS device_context)

--- a/python/paddle/fluid/annotations.py
+++ b/python/paddle/fluid/annotations.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import functools
+import sys
+__all__ = ['deprecated']
+def deprecated(since, instead, extra_message=""):
+    """
+    Decorator factory that marks an API as deprecated.
+
+    Every call of the decorated function writes a deprecation notice to
+    stderr and then forwards to the original function. The notice is also
+    appended to the docstring of the decorated function.
+
+    Args:
+        since: the version since which the API is deprecated.
+        instead: the name of the API to use instead.
+        extra_message: optional text appended to the notice on a new line.
+
+    Returns:
+        A decorator to apply to the deprecated function.
+    """
+    def decorator(func):
+        err_msg = "API {0} is deprecated since {1}. Please use {2} instead.".format(
+            func.__name__, since, instead)
+        if len(extra_message) != 0:
+            err_msg += "\n"
+            err_msg += extra_message
+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            # sys.stderr.write behaves the same on Python 2 and Python 3,
+            # unlike the Python-2-only `print >> sys.stderr` statement.
+            sys.stderr.write(err_msg + "\n")
+            return func(*args, **kwargs)
+        # func may have no docstring, in which case __doc__ is None and can
+        # not be extended with `+=`.
+        wrapper.__doc__ = (wrapper.__doc__ or "") + "\n    " + err_msg
+        return wrapper
+    return decorator
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -18,10 +18,7 @@ import collections
 import copy
 import unique_name
-__all__ = [
+__all__ = ['append_backward']
-    'append_backward',
-    'calc_gradient',
-]
 def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
@@ -123,7 +120,8 @@ def _append_grad_suffix_(name):
 def _addup_repetitive_outputs_(op_descs):
    """
    In backward part, an variable may be the output of more than one ops.
-    In this case, the variable should be the accumulation of all the outputs.
+    And one op may yield its multiple outputs to the same variable.
+    In these cases, the variable should be the accumulation of all the outputs.
    `sum_op`s are added to implement the accumulate.
    """
    pending_sum_ops = []
@@ -136,29 +134,46 @@ def _addup_repetitive_outputs_(op_descs):
                    "sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]},
                    {"use_mkldnn": False}), idx))
                renamed_vars[var_name] = [var_name]
-        for var_name in op_desc.output_arg_names():
+        for param_idx, param_name in enumerate(op_desc.output_names()):
-            if var_name == core.empty_var_name(
+            arg_names = op_desc.output(param_name)
-            ) or var_name in op_desc.input_arg_names():
+            for arg_idx, var_name in enumerate(arg_names):
-                # empty variable or inplace op
+                if var_name == core.empty_var_name(
-                continue
+                ) or var_name in op_desc.input_arg_names():
-            if len(renamed_vars[var_name]) == 0:
+                    # empty variable or inplace op
-                # it's the first time we get the variable
+                    continue
-                renamed_vars[var_name] = [var_name]
+                if len(renamed_vars[var_name]) == 0:
-            else:
+                    # it's the first time we get the variable
-                if len(renamed_vars[var_name]) == 1:
+                    renamed_vars[var_name] = [var_name]
+                else:
+                    if len(renamed_vars[var_name]) == 1:
+                        new_name = var_name + "@RENAME@" + \
+                            str(var_rename_count[var_name])
+                        var_rename_count[var_name] += 1
+                        # rename original var_name
+                        renamed_vars[var_name][0] = new_name
+                        _rename_arg_(op_descs, var_name, new_name, 0, idx)
+                        _rename_arg_(pending_sum_ops, var_name, new_name)
+                        for p in op_desc.output_names()[:param_idx]:
+                            p_arg_names = op_desc.output(p)
+                            if var_name in p_arg_names:
+                                op_desc.set_output(p, [
+                                    new_name if x == var_name else x
+                                    for x in p_arg_names
+                                ])
+                        arg_names = [
+                            new_name if x == var_name else x
+                            for x in arg_names[:arg_idx]
+                        ] + arg_names[arg_idx:]
                    new_name = var_name + "@RENAME@" + \
                        str(var_rename_count[var_name])
                    var_rename_count[var_name] += 1
-                    # rename original var_name
+                    arg_names[arg_idx] = new_name
-                    renamed_vars[var_name][0] = new_name
+                    op_desc.set_output(param_name, arg_names)
-                    _rename_arg_(op_descs, var_name, new_name, 0, idx)
+                    renamed_vars[var_name].append(new_name)
-                    _rename_arg_(pending_sum_ops, var_name, new_name)
-                new_name = var_name + "@RENAME@" + \
-                    str(var_rename_count[var_name])
-                var_rename_count[var_name] += 1
-                op_desc.rename_output(var_name, new_name)
-                renamed_vars[var_name].append(new_name)
    for var_name, inputs in renamed_vars.iteritems():
        if len(inputs) > 1:
            pending_sum_ops.append(

--- a/python/paddle/fluid/layers/device.py
+++ b/python/paddle/fluid/layers/device.py
@@ -18,10 +18,12 @@ All util layers.
 from layer_function_generator import autodoc
 from ..framework import unique_name
 from ..layer_helper import LayerHelper
+from ..annotations import deprecated
-__all__ = ['get_places']
+__all__ = []
+@deprecated(since='0.15.0', instead="ParallelExecutor")
 @autodoc()
 def get_places(device_count=None, device_type=None):
    helper = LayerHelper('get_places', **locals())

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
-#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#   Copyright (c ) 2018 PaddlePaddle Authors. All Rights Reserved.
 #
 # Licensed under the Apache License, Version 2.0 (the "License");
 # you may not use this file except in compliance with the License.
@@ -3900,7 +3914,13 @@ def transpose(x, perm, name=None):
    return out
-def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
+def im2sequence(input,
+                filter_size=1,
+                stride=1,
+                padding=0,
+                input_image_size=None,
+                out_stride=1,
+                name=None):
    """
    Extracts image patches from the input tensor to form a tensor of shape
    {input.batch_size * output_height * output_width, filter_size_H *
@@ -3937,6 +3957,15 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
            padding_up = padding_down = padding_left = padding_right = padding
            Default: padding = 0.
+        input_image_size(Variable): The real size of each input image. Its
+            shape is [batchsize, 2]. It is optional and is only used for
+            batch inference.
+        out_stride(int|tuple): The scaling of image through CNN. It is
+            optional and only takes effect when input_image_size is not
+            None. If out_stride is a tuple, it must contain two integers,
+            (out_stride_H, out_stride_W). Otherwise,
+            out_stride_H = out_stride_W = out_stride.
        name (int): The name of this layer. It is optional.
    Returns:
@@ -3987,7 +4016,7 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
                           [ 5.  7.  2.  4.  1.  3.  9.  0.]
                           [ 7.  9.  4.  8.  3.  5.  0.  8.]]
-            output.dims = {8, 9}
+            output.dims = {8, 8}
            output.lod = [[4, 4]]
@@ -4009,18 +4038,17 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
    if len(padding) == 2:
        padding.append(padding[0])
        padding.append(padding[1])
+    inputs = {"X": input}
+    # The attribute key is "paddings" (plural): that is the key the removed
+    # append_op call used and the key the im2sequence unit tests pass.
+    attrs = {"kernels": filter_size, "strides": stride, "paddings": padding}
+    if input_image_size:
+        # The real-size input "Y" and the "out_stride" attribute are only
+        # attached when the caller supplies input_image_size.
+        if isinstance(out_stride, int):
+            out_stride = [out_stride, out_stride]
+        inputs["Y"] = input_image_size
+        attrs["out_stride"] = out_stride
    helper = LayerHelper('im2sequence', **locals())
    out = helper.create_tmp_variable(dtype=helper.input_dtype())
    helper.append_op(
-        type='im2sequence',
+        type='im2sequence', inputs=inputs, outputs={'Out': out}, attrs=attrs)
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={
-            'kernels': filter_size,
-            'strides': stride,
-            'paddings': padding,
-        })
    return out

--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -29,7 +29,7 @@ __all__ = [
    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
    'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
    'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
-    'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'Optimizer', 'RMSPropOptimizer'
+    'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'RMSPropOptimizer'
 ]
@@ -67,7 +67,7 @@ class Optimizer(object):
        self._LARS_weight_decay = LARS_weight_decay
    def _create_global_learning_rate(self):
-        lr = self.global_learning_rate()
+        lr = self._global_learning_rate()
        if isinstance(lr, framework.Variable):
            return
@@ -86,7 +86,7 @@ class Optimizer(object):
            dtype='float32' if self._dtype == None else self._dtype,
            persistable=True)
-    def global_learning_rate(self, program=None):
+    def _global_learning_rate(self, program=None):
        """
        get global decayed learning rate
        :return:
@@ -110,9 +110,9 @@ class Optimizer(object):
            return param_lr
        else:
            if param_lr == 1.0:
-                return self.global_learning_rate()
+                return self._global_learning_rate()
            else:
-                return self.global_learning_rate() * param_lr
+                return self._global_learning_rate() * param_lr
    def _create_accumulators(self, block, parameters):
        """Create all accumulators needed by the parameters
@@ -185,10 +185,10 @@ class Optimizer(object):
                            format(name, param.name))
        return self._accumulators[name][param.name]
-    def create_optimization_pass(self,
+    def _create_optimization_pass(self,
-                                 parameters_and_grads,
+                                  parameters_and_grads,
-                                 loss,
+                                  loss,
-                                 startup_program=None):
+                                  startup_program=None):
        """Add optimization operators to update gradients to variables.
        Args:
@@ -221,7 +221,7 @@ class Optimizer(object):
            self._create_global_learning_rate()
            if self._LARS_weight_decay > 0.0:
                layers.append_LARS(parameters_and_grads,
-                                   self.global_learning_rate(),
+                                   self._global_learning_rate(),
                                   self._LARS_weight_decay)
            optimize_ops = []
@@ -262,8 +262,8 @@ class Optimizer(object):
        params_grads = append_regularization_ops(params_grads,
                                                 self.regularization)
-        optimize_ops = self.create_optimization_pass(params_grads, loss,
+        optimize_ops = self._create_optimization_pass(params_grads, loss,
-                                                     startup_program)
+                                                      startup_program)
        return optimize_ops, params_grads

--- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py
+++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import print_function
+from paddle.fluid.layers.device import get_places
 import unittest
 import paddle.fluid as fluid
 import paddle
@@ -144,7 +144,7 @@ def train(word_dict,
        cost, acc_out, prediction = net_method(
            data, label, input_dim=dict_dim, class_dim=class_dim)
    else:
-        places = fluid.layers.get_places()
+        places = get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            cost, acc, _ = net_method(

--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -12,15 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import print_function
-import argparse
-import paddle.fluid as fluid
-import paddle
-import sys
-import numpy
-import unittest
 import math
-import sys
 import os
+import sys
+import unittest
+import numpy
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
 BATCH_SIZE = 64
@@ -76,7 +78,7 @@ def train(nn_type,
        net_conf = conv_net
    if parallel:
-        places = fluid.layers.get_places()
+        places = get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            img_ = pd.read_input(img)

--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -14,6 +14,7 @@
 import paddle
 import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
 import unittest
 import os
 import numpy as np
@@ -80,7 +81,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
        avg_cost, predict_word = __network__(
            [first_word, second_word, third_word, forth_word, next_word])
    else:
-        places = fluid.layers.get_places()
+        places = get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            avg_cost, predict_word = __network__(

--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
-import numpy as np
-import paddle
-import paddle.fluid as fluid
 import math
 import sys
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
 # need to fix random seed and training data to compare the loss
 # value accurately calculated by the default and the memory optimization
 # version.
@@ -34,7 +35,7 @@ if fluid.core.is_compiled_with_cuda():
    use_nccl = False
    place = fluid.CUDAPlace(0)
-places = fluid.layers.get_places(device_count=0, device_type=device_type)
+places = get_places(device_count=0, device_type=device_type)
 pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
 with pd.do():
    x_ = pd.read_input(x)

--- a/python/paddle/fluid/tests/unittests/test_calc_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_calc_gradient.py
@@ -16,8 +16,6 @@ import unittest
 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
-import paddle.fluid.framework as framework
-import paddle.fluid.optimizer as optimizer
 from paddle.fluid.backward import calc_gradient

--- a/python/paddle/fluid/tests/unittests/test_get_places_op.py
+++ b/python/paddle/fluid/tests/unittests/test_get_places_op.py
@@ -13,6 +13,7 @@
 # limitations under the License.
 import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
 import decorators
 import unittest
@@ -20,7 +21,7 @@ import unittest
 class TestGetPlaces(unittest.TestCase):
    @decorators.prog_scope()
    def test_get_places(self):
-        places = fluid.layers.get_places()
+        places = get_places()
        cpu = fluid.CPUPlace()
        exe = fluid.Executor(cpu)
        exe.run(fluid.default_main_program())

--- a/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
+++ b/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
@@ -16,23 +16,48 @@ import numpy as np
 from op_test import OpTest
-def get_output_shape(attrs, in_shape):
+def get_output_shape(attrs, in_shape, img_real_size):
+    # Reference computation of the per-image output height and width.
+    # Returns two int32 arrays of shape (1, batchsize). When img_real_size
+    # is non-empty, each image's size is first scaled down by out_stride
+    # (rounding up); otherwise the size comes from in_shape.
+    batchsize = in_shape[0]
    img_height = in_shape[2]
    img_width = in_shape[3]
+    paddings = np.array(attrs['paddings']).astype("int32")
+    kernels = np.array(attrs['kernels']).astype("int32")
+    strides = np.array(attrs['strides']).astype("int32")
+    output_height = np.zeros((1, batchsize)).astype("int32")
+    output_width = np.zeros((1, batchsize)).astype("int32")
+    if len(img_real_size):
+        out_stride = np.array(attrs['out_stride']).astype("int32")
+        imgreal_h = 0
+        imgreal_w = 0
+        for index in range(batchsize):
+            if img_real_size[index, 0] % out_stride[0] == 0:
+                imgreal_h = img_real_size[index, 0] / out_stride[0]
+            else:
+                imgreal_h = img_real_size[index, 0] / out_stride[0] + 1
+            # The width is read from column 1 of img_real_size; column 0 is
+            # the height handled just above.
+            if img_real_size[index, 1] % out_stride[1] == 0:
+                imgreal_w = img_real_size[index, 1] / out_stride[1]
+            else:
+                imgreal_w = img_real_size[index, 1] / out_stride[1] + 1
+            output_height[0,index] = \
+              1 +  \
+              (imgreal_h + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
+                  strides[0]
-    paddings = attrs['paddings']
+            output_width[0,index] = \
-    kernels = attrs['kernels']
+              1 + \
-    strides = attrs['strides']
+              (imgreal_w + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
+                  strides[1]
+    else:
+        for index in range(batchsize):
+            output_height[0,index] = \
+              1 +  \
+              (img_height + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
+                  strides[0]
-    output_height = \
+            output_width[0,index] = \
-      1 +  \
+              1 + \
-      (img_height + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
+              (img_width + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
-          strides[0]
+                  strides[1]
-    output_width = \
-      1 + \
-      (img_width + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
-          strides[1]
    return output_height, output_width
@@ -75,22 +100,25 @@ def im2col(attrs, im, col):
                                    im_row_offset][im_col_offset]
-def Im2Sequence(inputs, attrs):
+def Im2Sequence(inputs, img_real_size, attrs):
-    output_height, output_width = get_output_shape(attrs, inputs.shape)
+    output_height, output_width = get_output_shape(attrs, inputs.shape,
+                                                   img_real_size)
    img_channels = inputs.shape[1]
    batch_size = inputs.shape[0]
-    out = np.zeros([
+    out = []
-        batch_size, output_height, output_width, img_channels,
+    for index in range(batch_size):
-        attrs['kernels'][0], attrs['kernels'][1]
+        tmp = np.zeros([
-    ]).astype("float32")
+            output_height[0, index], output_width[0, index], img_channels,
+            attrs['kernels'][0], attrs['kernels'][1]
-    for i in range(len(inputs)):
+        ]).astype("float32")
-        im2col(attrs, inputs[i], out[i])
+        out.append(tmp)
+    for index in range(len(inputs)):
-    out = out.reshape([
+        im2col(attrs, inputs[index], out[index])
-        batch_size * output_height * output_width,
+        out[index] = out[index].reshape([
-        img_channels * attrs['kernels'][0] * attrs['kernels'][1]
+            output_height[0, index] * output_width[0, index],
-    ])
+            img_channels * attrs['kernels'][0] * attrs['kernels'][1]
+        ])
+    out = np.concatenate(out, axis=0)
    return out
@@ -103,7 +131,7 @@ class TestBlockExpandOp(OpTest):
        self.attrs = {
            'kernels': [2, 2],
            'strides': [1, 1],
-            'paddings': [1, 1, 1, 1]
+            'paddings': [1, 1, 1, 1],
        }
    def setUp(self):
@@ -113,7 +141,8 @@ class TestBlockExpandOp(OpTest):
            self.batch_size, self.img_channels, self.img_height, self.img_width
        ]).astype("float32")
-        out = Im2Sequence(x, self.attrs)
+        real_size = np.array([]).astype("float32")
+        out = Im2Sequence(x, real_size, self.attrs)
        self.inputs = {'X': x}
        self.outputs = {'Out': out}
@@ -133,20 +162,20 @@ class TestBlockExpandOpCase2(TestBlockExpandOp):
        self.attrs = {
            'kernels': [2, 1],
            'strides': [2, 1],
-            'paddings': [2, 1, 2, 1]
+            'paddings': [2, 1, 2, 1],
        }
 class TestBlockExpandOpCase3(TestBlockExpandOp):
    def config(self):
-        self.batch_size = 3
+        self.batch_size = 2
        self.img_channels = 1
        self.img_height = 4
        self.img_width = 5
        self.attrs = {
            'kernels': [2, 1],
            'strides': [2, 1],
-            'paddings': [2, 0, 2, 0]
+            'paddings': [2, 0, 2, 0],
        }
@@ -159,9 +188,94 @@ class TestBlockExpandOpCase4(TestBlockExpandOp):
        self.attrs = {
            'kernels': [2, 2],
            'strides': [1, 1],
-            'paddings': [0, 0, 0, 0]
+            'paddings': [0, 0, 0, 0],
+        }
+class TestBlockExpandOpCase5(OpTest):
+    def config(self):
+        self.batch_size = 1
+        self.img_channels = 3
+        self.img_height = 4
+        self.img_width = 5
+        self.attrs = {
+            'kernels': [2, 1],
+            'strides': [2, 1],
+            'paddings': [2, 1, 2, 1],
+            'out_stride': [2, 2],
+        }
+    def setUp(self):
+        self.config()
+        self.op_type = "im2sequence"
+        x = np.random.uniform(0.1, 1, [
+            self.batch_size, self.img_channels, self.img_height, self.img_width
+        ]).astype("float32")
+        real_size = np.array([[8, 10], [5, 8]]).astype("float32")
+        out = np.array(Im2Sequence(x, real_size, self.attrs))
+        self.inputs = {'X': x, 'Y': real_size}  #l ??
+        self.outputs = {'Out': out}
+    def test_check_output(self):
+        self.check_output()
+class TestBlockExpandOpCase6(OpTest):
+    def config(self):
+        self.batch_size = 3
+        self.img_channels = 1
+        self.img_height = 4
+        self.img_width = 5
+        self.attrs = {
+            'kernels': [2, 1],
+            'strides': [1, 1],
+            'paddings': [0, 0, 0, 0],
+            'out_stride': [1, 1],
+        }
+    def setUp(self):
+        self.config()
+        self.op_type = "im2sequence"
+        x = np.random.uniform(0.1, 1, [
+            self.batch_size, self.img_channels, self.img_height, self.img_width
+        ]).astype("float32")
+        real_size = np.array([[8, 10], [5, 8], [5, 8]]).astype("float32")
+        out = np.array(Im2Sequence(x, real_size, self.attrs))
+        self.inputs = {'X': x, 'Y': real_size}  #l ??
+        self.outputs = {'Out': out}
+    def test_check_output(self):
+        self.check_output()
+class TestBlockExpandOpCase7(OpTest):
+    def config(self):
+        self.batch_size = 2
+        self.img_channels = 2
+        self.img_height = 3
+        self.img_width = 3
+        self.attrs = {
+            'kernels': [2, 2],
+            'strides': [1, 1],
+            'paddings': [1, 0, 1, 0],
+            'out_stride': [2, 2],
        }
+    def setUp(self):
+        self.config()
+        self.op_type = "im2sequence"
+        x = np.random.uniform(0.1, 1, [
+            self.batch_size, self.img_channels, self.img_height, self.img_width
+        ]).astype("float32")
+        real_size = np.array([[6, 6], [4, 4]]).astype("float32")
+        out = np.array(Im2Sequence(x, real_size, self.attrs))
+        self.inputs = {'X': x, 'Y': real_size}
+        self.outputs = {'Out': out}
+    def test_check_output(self):
+        self.check_output()
 if __name__ == '__main__':
    unittest.main()
+#set shiftwidth=4 set expandtab set tabstop=4
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 import unittest
 import paddle.fluid.layers as layers
+from paddle.fluid.layers.device import get_places
 import paddle.fluid.nets as nets
 from paddle.fluid.framework import Program, program_guard, default_main_program
 from paddle.fluid.param_attr import ParamAttr
@@ -238,7 +239,7 @@ class TestBook(unittest.TestCase):
    def test_get_places(self):
        program = Program()
        with program_guard(program):
-            x = layers.get_places(device_count=4)
+            x = get_places(device_count=4)
            self.assertIsNotNone(x)
        print(str(program))
@@ -251,12 +252,16 @@ class TestBook(unittest.TestCase):
        print(str(program))
    def test_im2sequence(self):
-        print("test_im2sequence")
        program = Program()
        with program_guard(program):
            x = layers.data(name='x', shape=[3, 128, 128], dtype='float32')
+            y = layers.data(name='y', shape=[], dtype='float32')
            output = layers.im2sequence(
-                input=x, stride=[1, 1], filter_size=[2, 2])
+                input=x,
+                input_image_size=y,
+                stride=[1, 1],
+                filter_size=[2, 2],
+                out_stride=[1, 1])
            self.assertIsNotNone(output)
        print(str(program))

--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
@@ -97,7 +97,7 @@ class TestMomentumOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        opts = momentum_optimizer.create_optimization_pass(
+        opts = momentum_optimizer._create_optimization_pass(
            params_grads, mul_out, init_program)
        self.assertEqual(len(opts), 3)
        sgd_op = opts[-1]
@@ -151,7 +151,7 @@ class TestMomentumOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        opts = momentum_optimizer.create_optimization_pass(
+        opts = momentum_optimizer._create_optimization_pass(
            params_grads, mul_out, init_program)
        self.assertEqual(len(opts), 3)
        sgd_op = opts[-1]
@@ -214,8 +214,8 @@ class TestAdagradOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
-        opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out,
+        opts = adagrad_optimizer._create_optimization_pass(
-                                                          init_program)
+            params_grads, mul_out, init_program)
        self.assertEqual(len(opts), 3)
        self.assertEqual([op.type for op in opts],
                         ["fill_constant", "elementwise_mul", "adagrad"])
@@ -278,8 +278,8 @@ class TestAdamOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
-        opts = adam_optimizer.create_optimization_pass(params_grads, mul_out,
+        opts = adam_optimizer._create_optimization_pass(params_grads, mul_out,
-                                                       init_program)
+                                                        init_program)
        self.assertEqual(len(opts), 5)
        self.assertEqual(
            [op.type for op in opts],
@@ -345,8 +345,8 @@ class TestAdamaxOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
-        opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out,
+        opts = adamax_optimizer._create_optimization_pass(params_grads, mul_out,
-                                                         init_program)
+                                                          init_program)
        self.assertEqual(len(opts), 4)
        self.assertEqual(
            [op.type for op in opts],
@@ -409,7 +409,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
-        opts = decayed_adagrad_optimizer.create_optimization_pass(
+        opts = decayed_adagrad_optimizer._create_optimization_pass(
            params_grads, mul_out, init_program)
        self.assertEqual(len(opts), 3)
        self.assertEqual(
@@ -475,8 +475,8 @@ class TestFtrlOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0)
-        opts = ftrl_optimizer.create_optimization_pass(params_grads, mul_out,
+        opts = ftrl_optimizer._create_optimization_pass(params_grads, mul_out,
-                                                       init_program)
+                                                        init_program)
        self.assertEqual(len(opts), 3)
        self.assertEqual([op.type for op in opts],
                         ["fill_constant", "elementwise_mul", "ftrl"])

--- a/python/paddle/fluid/tests/unittests/test_parallel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py
@@ -15,6 +15,7 @@
 import unittest
 import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
 import paddle.fluid.profiler as profiler
 import numpy
@@ -115,7 +116,7 @@ class BaseParallelForTest(unittest.TestCase):
            if use_parallel:
                thread_num = fluid.core.get_cuda_device_count(
                ) if use_gpu else 8
-                places = fluid.layers.get_places(thread_num)
+                places = get_places(thread_num)
                pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
                data = next(generator)

--- a/python/paddle/fluid/tests/unittests/test_squeeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import unittest
+import numpy as np
+from op_test import OpTest
+# Correct: General.
+class TestSqueezeOp(OpTest):
+    # Base case for the squeeze op tests. Subclasses override
+    # init_test_case() and/or init_attrs() to vary the shapes, the axes and
+    # the `inplace` attribute.
+    def setUp(self):
+        self.op_type = "squeeze"
+        self.init_test_case()
+        self.inputs = {"X": np.random.random(self.ori_shape).astype("float32")}
+        self.init_attrs()
+        # The expected output is the input data reshaped to new_shape.
+        self.outputs = {"Out": self.inputs["X"].reshape(self.new_shape)}
+    def test_check_output(self):
+        self.check_output()
+    def test_check_grad(self):
+        self.check_grad(["X"], "Out")
+    # Sets the input shape, the axes to squeeze and the expected output shape.
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 5)
+        self.axes = (0, 2)
+        self.new_shape = (3, 5)
+    # Builds the op attributes from the values chosen in init_test_case().
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": False}
+# Correct: There is a negative axis.
+class TestSqueezeOp1(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 5)
+        self.axes = (0, -2)
+        self.new_shape = (3, 5)
+# Correct: No axes input.
+class TestSqueezeOp2(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 5)
+        self.axes = ()
+        self.new_shape = (3, 5)
+# Correct: Only part of the size-1 axes are squeezed.
+class TestSqueezeOp3(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (3, 1, 5, 1, 4, 1)
+        self.axes = (1, -1)
+        self.new_shape = (3, 5, 1, 4)
+# Correct: Inplace.
+class TestSqueezeOpInplace1(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 5)
+        self.axes = (0, 2)
+        self.new_shape = (3, 5)
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": True}
+# Correct: Inplace. There is a negative axis.
+class TestSqueezeOpInplace2(TestSqueezeOp):
+    # The method name must be exactly `init_test_case` to override the base
+    # class hook called from setUp(); the former spelling `inti_test_case`
+    # was never called, so this case silently ran with the base class axes.
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 5)
+        self.axes = (0, -2)
+        self.new_shape = (3, 5)
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": True}
+# Correct: Inplace. No axes input.
+class TestSqueezeOpInplace3(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (1, 3, 1, 5)
+        self.axes = ()
+        self.new_shape = (3, 5)
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": True}
+# Correct: Inplace. Only part of the size-1 axes are squeezed.
+class TestSqueezeOpInplace4(TestSqueezeOp):
+    def init_test_case(self):
+        self.ori_shape = (3, 1, 5, 1, 4, 1)
+        self.axes = (1, -1)
+        self.new_shape = (3, 5, 1, 4)
+    def init_attrs(self):
+        self.attrs = {"axes": self.axes, "inplace": True}
+if __name__ == "__main__":
+    unittest.main()
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -181,6 +181,14 @@ else:
    command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
 if os.system(command) != 0:
    raise Exception("patch core.so failed, command: %s" % command)
+if '${WITH_FLUID_ONLY}'== 'OFF':
+    # change rpath of _swig_paddle.so.
+    if "@APPLE@" == "1":
+        command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so"
+    else:
+        command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so"
+    if os.system(command) != 0:
+        raise Exception("patch _swig_paddle.so failed, command: %s" % command)
 setup(name='${PACKAGE_NAME}',
      version='${PADDLE_VERSION}',