Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/hide_api_cont

818e0708 · yuyang18 · b83ffda4 · 1c9c8e8d · 818e0708 · 818e0708
25 changed file
--- a/paddle/fluid/inference/analysis/data_flow_graph.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph.cc
@@ -90,6 +90,20 @@ std::string DataFlowGraph::DotString() const {
  return dot.Build();
 }

+// Builds a plain-text listing of the graph's nodes for debugging: the repr()
+// of every value node (when show_values is set) and the repr() of every
+// function node (when show_functions is set), one node per line.
+// The "Values:" and "Functions:" headings are always emitted, even when the
+// corresponding flag is false and the section is therefore empty.
+std::string DataFlowGraph::HumanReadableInfo(bool show_values,
+                                             bool show_functions) const {
+  std::stringstream values, functions;
+  for (auto &n : nodes.nodes()) {
+    if (show_values && n->IsValue()) {
+      values << n->repr() << "\n";
+    }
+    if (show_functions && n->IsFunction()) {
+      functions << n->repr() << "\n";
+    }
+  }
+  return "Values:\n" + values.str() + "\n\n" + "Functions:\n" + functions.str();
+}
+
 //
 // NodesBFSIterator
 //
@@ -208,6 +222,76 @@ Node *GraphTraits<DataFlowGraph>::NodesDFSIterator::operator->() {
  return stack_.top();
 }

+// Computes a topological order of the nodes reachable from `source` and stores
+// it in sorted_. A node is emitted only after all of its inlinks have been
+// emitted; its not-yet-emitted outlinks then become candidates for the
+// following rounds.
+// Fails through PADDLE_ENFORCE instead of looping forever when a whole round
+// emits nothing (the graph has a cycle, or a candidate depends on a node that
+// is unreachable from `source`).
+GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
+    const std::vector<Node *> &source) {
+  PADDLE_ENFORCE(!source.empty(),
+                 "Start points of topological sorting should not be empty!");
+  std::unordered_set<Node *> visited;
+  std::unordered_set<Node *> to_visit{source.begin(), source.end()};
+
+  while (!to_visit.empty()) {
+    // Iterate over a snapshot because to_visit is modified inside the loop.
+    std::vector<Node *> queue(to_visit.begin(), to_visit.end());
+    bool progressed = false;
+    for (auto *p : queue) {
+      const bool ready =
+          std::all_of(p->inlinks.begin(), p->inlinks.end(),
+                      [&](Node *x) { return visited.count(x) != 0; });
+      if (!ready) continue;
+
+      sorted_.push_back(p);
+      for (auto *out : p->outlinks) {
+        if (!visited.count(out)) {
+          to_visit.insert(out);
+        }
+      }
+
+      to_visit.erase(p);
+      visited.insert(p);
+      progressed = true;
+    }
+    // Without progress the sets are unchanged, so the next round would be
+    // identical to this one: stop here rather than spin.
+    PADDLE_ENFORCE(progressed,
+                   "Topological sorting cannot make progress: the graph has a "
+                   "cycle or a node depends on nodes unreachable from the "
+                   "start points.");
+  }
+}
+
+// Copy constructor: duplicates both the sorted node list and the cursor.
+GraphTraits<DataFlowGraph>::NodesTSIterator::NodesTSIterator(
+    const paddle::inference::analysis::GraphTraits<
+        DataFlowGraph>::NodesTSIterator &other)
+    : sorted_(other.sorted_), cursor_(other.cursor_) {}
+
+// Returns a reference to the node at the cursor; enforces that the cursor is
+// within sorted_ first.
+Node &GraphTraits<DataFlowGraph>::NodesTSIterator::operator*() {
+  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
+  return *sorted_[cursor_];
+}
+
+// Pre-increment: advances the cursor to the next node in the sorted order.
+// When the cursor moves past the last node, sorted_ is cleared and the cursor
+// is reset to 0. That is the state of a default-constructed iterator, so the
+// exhausted iterator compares equal to one through operator==.
+paddle::inference::analysis::GraphTraits<DataFlowGraph>::NodesTSIterator
+    &GraphTraits<DataFlowGraph>::NodesTSIterator::operator++() {
+  if (++cursor_ >= sorted_.size()) {
+    sorted_.clear();
+    cursor_ = 0;
+  }
+  return *this;
+}
+// Copy assignment: takes over the cursor and a copy of the sorted node list.
+paddle::inference::analysis::GraphTraits<DataFlowGraph>::NodesTSIterator &
+GraphTraits<DataFlowGraph>::NodesTSIterator::operator=(
+    const paddle::inference::analysis::GraphTraits<
+        DataFlowGraph>::NodesTSIterator &other) {
+  cursor_ = other.cursor_;
+  sorted_ = other.sorted_;
+  return *this;
+}
+
+// Two iterators are equal when they hold the same sequence of node pointers
+// and point at the same position in it.
+bool GraphTraits<DataFlowGraph>::NodesTSIterator::operator==(
+    const paddle::inference::analysis::GraphTraits<
+        DataFlowGraph>::NodesTSIterator &other) {
+  return sorted_ == other.sorted_ && cursor_ == other.cursor_;
+}
+
+// Returns a pointer to the node at the cursor; enforces that the cursor is
+// within sorted_ first.
+Node *GraphTraits<DataFlowGraph>::NodesTSIterator::operator->() {
+  PADDLE_ENFORCE_LT(cursor_, sorted_.size());
+  return sorted_[cursor_];
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/inference/analysis/data_flow_graph.h
+++ b/paddle/fluid/inference/analysis/data_flow_graph.h
@@ -48,6 +48,9 @@ struct DataFlowGraph {
  // Output a DOT graph file for debug.
  std::string DotString() const;

+  std::string HumanReadableInfo(bool show_values = true,
+                                bool show_functions = true) const;
+
 private:
  // Remove duplicate edges and so on.
  void Clean();
@@ -107,6 +110,32 @@ struct GraphTraits<DataFlowGraph> {
    std::unordered_set<Node *> visited_;
  };

+  // Topological sorting iterator on nodes.
+  // The constructor taking the start nodes fills sorted_ with the complete
+  // order; cursor_ is the index of the current node in sorted_.
+  // A default-constructed iterator (empty sorted_, cursor_ == 0) is what
+  // nodes_ts_end() returns as the end of the range.
+  struct NodesTSIterator
+      : public std::iterator<std::forward_iterator_tag, Node *> {
+    NodesTSIterator() = default;
+    explicit NodesTSIterator(const std::vector<Node *> &source);
+    // Moves the sorted list out of `other` and resets other's cursor to 0.
+    NodesTSIterator(NodesTSIterator &&other)
+        : sorted_(std::move(other.sorted_)), cursor_(other.cursor_) {
+      other.cursor_ = 0;
+    }
+    NodesTSIterator(const NodesTSIterator &other);

+    Node &operator*();
+    NodesTSIterator &operator++();
+    NodesTSIterator &operator=(const NodesTSIterator &other);
+    // Equality compares the whole sorted node list and the cursor.
+    // NOTE(review): operator== and operator!= are not const-qualified, so
+    // they cannot be called on a const iterator.
+    bool operator==(const NodesTSIterator &other);
+    bool operator!=(const NodesTSIterator &other) { return !(*this == other); }
+    Node *operator->();

+   private:
+    // Nodes in topological order.
+    std::vector<Node *> sorted_;
+    // Index of the current node in sorted_.
+    int cursor_{0};
+  };
+
  explicit GraphTraits(DataFlowGraph *graph) : graph_(graph) {}

  // default use BFS to visit the nodes.
@@ -119,17 +148,24 @@ struct GraphTraits<DataFlowGraph> {
  iterator_range<NodesDFSIterator> nodes_in_DFS() {
    return iterator_range<NodesDFSIterator>(nodes_dfs_begin(), nodes_dfs_end());
  }
+  // Returns the [begin, end) range that visits the nodes in topological order.
+  iterator_range<NodesTSIterator> nodes_in_TS() {
+    return iterator_range<NodesTSIterator>(nodes_ts_begin(), nodes_ts_end());
+  }

 private:
  NodesBFSIterator nodes_bfs_begin() {
    return NodesBFSIterator(graph_->inputs);
  }
  NodesBFSIterator nodes_bfs_end() { return NodesBFSIterator(); }
+
  NodesDFSIterator nodes_dfs_begin() {
    return NodesDFSIterator(graph_->inputs);
  }
  NodesDFSIterator nodes_dfs_end() { return NodesDFSIterator(); }

+  // Topological-order iterator that starts from the graph's input nodes.
+  NodesTSIterator nodes_ts_begin() { return NodesTSIterator(graph_->inputs); }
+  // End of the topological range: a default-constructed iterator.
+  NodesTSIterator nodes_ts_end() { return NodesTSIterator(); }
+
 private:
  DataFlowGraph *graph_;
 };

--- a/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
+++ b/paddle/fluid/inference/analysis/data_flow_graph_tester.cc
@@ -24,11 +24,11 @@ TEST(DataFlowGraph, BFS) {
  auto dfg = ProgramDescToDFG(desc);
  dfg.Build();

-  for (auto* in : dfg.inputs) {
+  for (auto *in : dfg.inputs) {
    LOG(INFO) << "inputs: " << in->name() << " "
              << static_cast<int>(in->type());
  }
-  for (auto* out : dfg.outputs) {
+  for (auto *out : dfg.outputs) {
    LOG(INFO) << "outputs: " << out->name() << " "
              << static_cast<int>(out->type());
  }
@@ -57,6 +57,71 @@ TEST(DataFlowGraph, DFS) {
  ASSERT_EQ(count, dfg.nodes.size());
 }

+// Topological sorting.
+/*
+ * Graph topology
+ * inputs: 0, 1, 2
+ * 0 -> 4
+ * 0 -> 5
+ * 1 -> 6
+ * 2 -> 7
+ * 4 -> 5
+ * 4 -> 7
+ * 4 -> 3
+ * 7 -> 3
+ */
+// Builds the 8-node graph described above by hand, walks it with the
+// topological iterator and checks that every edge's source is emitted before
+// its target.
+TEST(DataFlowGraph, TS) {
+  DataFlowGraph graph;

+  for (int i = 0; i < 8; i++) {
+    auto *node = graph.nodes.Create(Node::Type::kValue);
+    node->SetName("node-" + std::to_string(i));
+  }

+  // Adds the edge i -> j to both endpoints.
+  auto add_link = [&](int i, int j) {
+    Node *source = graph.nodes.GetMutable(i);
+    Node *target = graph.nodes.GetMutable(j);
+    target->inlinks.push_back(source);
+    source->outlinks.push_back(target);
+  };

+  graph.inputs.push_back(graph.nodes.GetMutable(0));
+  graph.inputs.push_back(graph.nodes.GetMutable(1));
+  graph.inputs.push_back(graph.nodes.GetMutable(2));

+  add_link(0, 4);
+  add_link(0, 5);
+  add_link(1, 6);
+  add_link(2, 7);
+  add_link(4, 5);
+  add_link(4, 7);
+  add_link(4, 3);
+  add_link(7, 3);

+  auto its = GraphTraits<DataFlowGraph>(&graph).nodes_in_TS();
+  std::vector<int> sorted_ids;
+  for (auto it = its.begin(); it != its.end(); ++it) {
+    LOG(INFO) << it->name();
+    sorted_ids.push_back(it->id());
+  }

+  // Every node is reachable from the inputs, so all 8 must be emitted.
+  ASSERT_EQ(sorted_ids.size(), 8UL);

+  // Assert a occurs prior to b in the sorted_ids. Both ids must be present:
+  // a missing b would otherwise be end() and the comparison would pass
+  // vacuously.
+  auto assert_positive_sequence_pair = [&](int a, int b) {
+    auto a_offset = std::find(sorted_ids.begin(), sorted_ids.end(), a);
+    auto b_offset = std::find(sorted_ids.begin(), sorted_ids.end(), b);
+    ASSERT_TRUE(a_offset != sorted_ids.end());
+    ASSERT_TRUE(b_offset != sorted_ids.end());
+    ASSERT_LT(a_offset, b_offset);
+  };

+  assert_positive_sequence_pair(2, 7);
+  assert_positive_sequence_pair(7, 3);
+  assert_positive_sequence_pair(4, 3);
+  assert_positive_sequence_pair(0, 4);
+  assert_positive_sequence_pair(0, 5);
+  assert_positive_sequence_pair(1, 6);
+  assert_positive_sequence_pair(4, 5);
+  assert_positive_sequence_pair(4, 7);
+}
+
 }  // namespace analysis
 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/operators/detection/rpn_target_assign_op.cc
+++ b/paddle/fluid/operators/detection/rpn_target_assign_op.cc
@@ -86,8 +86,9 @@ class RpnTargetAssignKernel : public framework::OpKernel<T> {
                         std::minstd_rand engine,
                         std::vector<int>* inds) const {
    std::uniform_real_distribution<float> uniform(0, 1);
-    if (inds->size() > num) {
-      for (int i = num; i < inds->size(); ++i) {
+    const int64_t size = static_cast<int64_t>(inds->size());
+    if (size > num) {
+      for (int64_t i = num; i < size; ++i) {
        int rng_ind = std::floor(uniform(engine) * i);
        if (rng_ind < num)
          std::iter_swap(inds->begin() + rng_ind + offset,

--- a/paddle/fluid/operators/im2sequence_op.cc
+++ b/paddle/fluid/operators/im2sequence_op.cc
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "paddle/fluid/operators/im2sequence_op.h"
+#include <string>
 #include <vector>

 namespace paddle {
@@ -28,20 +29,19 @@ class Im2SequenceOp : public framework::OperatorWithKernel {
                   "Input(X) of Im2SequenceOp should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"),
                   "Output(Out) of Im2SequenceOp op should not be null.");
-
    auto in_dim = ctx->GetInputDim("X");
+
    PADDLE_ENFORCE_EQ(in_dim.size(), 4,
                      "Input(X) format must be 4D tensor, eg., NCHW.");
-
-    auto kernels = ctx->Attrs().Get<std::vector<int>>("kernels");
-    auto strides = ctx->Attrs().Get<std::vector<int>>("strides");
-    auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
-
    int batch_size = in_dim[0];
    int img_channels = in_dim[1];
    int img_height = in_dim[2];
    int img_width = in_dim[3];

+    auto kernels = ctx->Attrs().Get<std::vector<int>>("kernels");
+    auto strides = ctx->Attrs().Get<std::vector<int>>("strides");
+    auto paddings = ctx->Attrs().Get<std::vector<int>>("paddings");
+
    int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
                                         paddings[2], strides[0]);
    int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
@@ -61,6 +61,10 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker {
             "C: channels"
             "H: height"
             "W: width");
+    AddInput("Y",
+             "(Tensor) The input tensor of image real size(H, W)."
+             "2-D with shape [batchsize, 2]")
+        .AsDispensable();
    AddOutput("Out", "(LodTensor) The output data of im2sequence op,");
    AddAttr<std::vector<int>>("kernels",
                              "(vector<int>), the "
@@ -73,6 +77,13 @@ class Im2SequenceOpMaker : public framework::OpProtoAndCheckerMaker {
                              "(vector<int> default:{0, 0, 0, 0}), the "
                              "paddings(up_pad, left_pad, down_pad, right_pad)")
        .SetDefault({0, 0, 0, 0});
+    // Help text follows the "(type default:...), the <attr>(...)" layout of
+    // the neighbouring attributes; every literal ends with a space so the
+    // concatenated message reads correctly.
+    AddAttr<std::vector<int>>("out_stride",
+                              "(vector<int> default:{1, 1}), the out_stride "
+                              "(out_stride_height, out_stride_width). "
+                              "This attribute is valid only when Input(Y) "
+                              "is not NULL. It represents the scaling of "
+                              "the image through the CNN.")
+        .SetDefault({1, 1});
    AddComment(R"DOC(
 This op uses kernels to scan images and converts these images to sequences.
 After expanding, The number of time steps are output_height * output_width
@@ -123,7 +134,7 @@ output.data = [[ 6.  2.  8.  3.  2.  4.  6.  3.]
               [ 7.  1.  7.  9.  2.  1.  3.  5.]
               [ 5.  7.  2.  4.  1.  3.  9.  0.]
               [ 7.  9.  4.  8.  3.  5.  0.  8.]]
-output.dims = {8, 9}
+output.dims = {8, 8}
 output.lod = [[0, 4, 8]]

 )DOC");

--- a/paddle/fluid/operators/im2sequence_op.h
+++ b/paddle/fluid/operators/im2sequence_op.h
@@ -13,6 +13,7 @@
   limitations under the License. */

 #pragma once
+#include <string>
 #include <vector>
 #include "paddle/fluid/framework/data_layout.h"
 #include "paddle/fluid/framework/eigen.h"
@@ -39,51 +40,107 @@ class Im2SequenceKernel : public framework::OpKernel<T> {
  void Compute(const framework::ExecutionContext& ctx) const override {
    const Tensor* in = ctx.Input<Tensor>("X");
    LoDTensor* out = ctx.Output<LoDTensor>("Out");
-    out->mutable_data<T>(ctx.GetPlace());
-    // TODO(wanghaoshuang): Add layout checker after 'set_layout'
-    // being available for python API
-    // PADDLE_ENFORCE_EQ(in->layout(), framework::DataLayout::kNCHW,
-    //                  "Input(X) layout must be NCHW");
    auto in_dim = in->dims();
    int batch_size = in_dim[0];
    int img_channels = in_dim[1];
    int img_height = in_dim[2];
    int img_width = in_dim[3];
-
    auto kernels = ctx.Attr<std::vector<int>>("kernels");
    auto strides = ctx.Attr<std::vector<int>>("strides");
    auto paddings = ctx.Attr<std::vector<int>>("paddings");
+    if (ctx.HasInput("Y") && batch_size > 1) {
+      const Tensor* imgrealsize = ctx.Input<Tensor>("Y");
+      auto out_stride = ctx.Attr<std::vector<int>>("out_stride");
+      Tensor cpu_shape_tensor;
+      TensorCopySync(*imgrealsize, platform::CPUPlace(), &cpu_shape_tensor);
+      std::vector<int> imgreal_h;
+      std::vector<int> imgreal_w;
+      std::vector<int> output_height;
+      std::vector<int> output_width;
+      int result = 0;
+      for (int i = 0; i < batch_size; i++) {
+        int tmp_real_h = static_cast<int>((cpu_shape_tensor.data<T>())[2 * i]);
+        int tmp_real_w =
+            static_cast<int>((cpu_shape_tensor.data<T>())[2 * i + 1]);
+        if (tmp_real_h % out_stride[0] == 0) {
+          tmp_real_h = tmp_real_h / out_stride[0];
+        } else {
+          tmp_real_h = tmp_real_h / out_stride[0] + 1;
+        }
+        if (tmp_real_w % out_stride[1] == 0) {
+          tmp_real_w = tmp_real_w / out_stride[1];
+        } else {
+          tmp_real_w = tmp_real_w / out_stride[1] + 1;
+        }
+        imgreal_h.push_back(tmp_real_h);
+        imgreal_w.push_back(tmp_real_w);
+        output_height.push_back(Im2SeqOutputSize(
+            imgreal_h[i], kernels[0], paddings[0], paddings[2], strides[0]));
+        output_width.push_back(Im2SeqOutputSize(
+            imgreal_w[i], kernels[1], paddings[1], paddings[3], strides[1]));
+        result += output_height[i] * output_width[i];
+      }
+
+      out->mutable_data<T>({result, img_channels * kernels[0] * kernels[1]},
+                           ctx.GetPlace());
+
+      const std::vector<int> dilations({1, 1});
+      int offset_out = 0;
+      for (int i = 0; i < batch_size; i++) {
+        const Tensor src =
+            in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
+        Tensor dst = out->Slice(offset_out,
+                                offset_out + output_height[i] * output_width[i])
+                         .Resize({output_height[i], output_width[i],
+                                  img_channels, kernels[0], kernels[1]});
+        offset_out += output_height[i] * output_width[i];
+
+        math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
+        auto& dev_ctx = ctx.template device_context<DeviceContext>();
+        f(dev_ctx, src, dilations, strides, paddings, &dst);
+      }
+      framework::LoD lod(1);
+      lod[0].reserve(batch_size + 1);
+      int offset = 0;
+      lod[0].push_back(offset);
+      for (int i = 0; i < batch_size; ++i) {
+        offset += output_height[i] * output_width[i];
+        lod[0].push_back(offset);
+      }
+      out->set_lod(lod);
+    } else {
+      out->mutable_data<T>(ctx.GetPlace());
      int output_height = Im2SeqOutputSize(img_height, kernels[0], paddings[0],
                                           paddings[2], strides[0]);
      int output_width = Im2SeqOutputSize(img_width, kernels[1], paddings[1],
                                          paddings[3], strides[1]);

      const std::vector<int> dilations({1, 1});
-
      auto out_dims = out->dims();
      out->Resize({batch_size, out->numel() / batch_size});
      for (int i = 0; i < batch_size; i++) {
        const Tensor src =
            in->Slice(i, i + 1).Resize({img_channels, img_height, img_width});
-      Tensor dst = out->Slice(i, i + 1).Resize(
-          {output_height, output_width, img_channels, kernels[0], kernels[1]});
+        Tensor dst =
+            out->Slice(i, i + 1).Resize({output_height, output_width,
+                                         img_channels, kernels[0], kernels[1]});

        math::Im2ColFunctor<math::ColFormat::kOCF, DeviceContext, T> f;
        auto& dev_ctx = ctx.template device_context<DeviceContext>();
        f(dev_ctx, src, dilations, strides, paddings, &dst);
      }
      out->Resize(out_dims);
-
-    // set lod information
-    // TODO(wanghaoshuang): Move this to InferShape
      framework::LoD lod(1);
      lod[0].reserve(batch_size + 1);
-    for (int i = 0, offset = 0; i < batch_size + 1; ++i) {
+      int offset = 0;
      lod[0].push_back(offset);
+      for (int i = 0; i < batch_size; ++i) {
        offset += output_height * output_width;
+        lod[0].push_back(offset);
      }
      out->set_lod(lod);
    }
+  }
 };

 template <typename DeviceContext, typename T>

--- a/paddle/fluid/operators/math/im2col.cc
+++ b/paddle/fluid/operators/math/im2col.cc
@@ -43,21 +43,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
    int col_height = col->dims()[3];
    int col_width = col->dims()[4];

-    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
-                       ((dilation[0] * (filter_height - 1) + 1))) /
-                              stride[0] +
-                          1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-                       ((dilation[1] * (filter_width - 1) + 1))) /
-                              stride[1] +
-                          1,
-                      col_width,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-
    int channels_col = im_channels * filter_height * filter_width;

    const T* im_data = im.data<T>();
@@ -178,17 +163,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
    int col_height = col->dims()[0];
    int col_width = col->dims()[1];

-    PADDLE_ENFORCE_EQ(
-        (im_height + padding[0] + padding[2] - filter_height) / stride[0] + 1,
-        col_height,
-        "Output_height and padding(padding_up, padding_down) are "
-        "inconsistent.");
-    PADDLE_ENFORCE_EQ(
-        (im_width + padding[1] + padding[3] - filter_width) / stride[1] + 1,
-        col_width,
-        "col_width and padding(padding_left, padding_right) are "
-        "inconsistent.");
-
    const T* im_data = im.data<T>();
    T* col_data = col->data<T>();


--- a/paddle/fluid/operators/math/im2col.cu
+++ b/paddle/fluid/operators/math/im2col.cu
@@ -77,21 +77,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kCFO,
    int col_height = col->dims()[3];
    int col_width = col->dims()[4];

-    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
-                       (dilation[0] * (filter_height - 1) + 1)) /
-                              stride[0] +
-                          1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-                       (dilation[1] * (filter_width - 1) + 1)) /
-                              stride[1] +
-                          1,
-                      col_width,
-                      "col_width and padding(padding_left, padding_right) are "
-                      "inconsistent.");
-
    int num_outputs = im_channels * col_height * col_width;
    int blocks = (num_outputs + 1024 - 1) / 1024;
    int block_x = 512;
@@ -274,21 +259,6 @@ class Im2ColFunctor<paddle::operators::math::ColFormat::kOCF,
    int col_height = col->dims()[0];
    int col_width = col->dims()[1];

-    PADDLE_ENFORCE_EQ((im_height + padding[0] + padding[2] -
-                       (dilation[0] * (filter_height - 1) + 1)) /
-                              stride[0] +
-                          1,
-                      col_height,
-                      "Output_height and padding(padding_up, padding_down) are "
-                      "inconsistent.");
-    PADDLE_ENFORCE_EQ((im_width + padding[1] + padding[3] -
-                       (dilation[1] * (filter_width - 1) + 1)) /
-                              stride[1] +
-                          1,
-                      col_width,
-                      "col_width and padding(padding_left, padding_right) are "
-                      "inconsistent.");
-
    int block_dim_x = 0;
    int block_dim_y = 0;
    if (filter_height <= 4 && filter_width <= 4) {

--- a/paddle/fluid/platform/CMakeLists.txt
+++ b/paddle/fluid/platform/CMakeLists.txt
@@ -46,7 +46,7 @@ ENDIF()
 # memcpy depends on device_context, here add deps individually for
 # avoiding cycle dependencies
 cc_library(device_context SRCS device_context.cc init.cc DEPS malloc
-    place eigen3 stringpiece cpu_helper ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
+    place eigen3 stringpiece cpu_helper framework_proto ${GPU_CTX_DEPS} ${MKLDNN_CTX_DEPS})
 nv_test(device_context_test SRCS device_context_test.cu DEPS device_context gpu_info)

 cc_test(init_test SRCS init_test.cc DEPS device_context)

--- a/python/paddle/fluid/annotations.py
+++ b/python/paddle/fluid/annotations.py
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import functools
+import sys
+
+__all__ = ['deprecated']
+
+
+def deprecated(since, instead, extra_message=""):
+    """
+    Build a decorator that marks an API as deprecated.
+
+    Every call to the decorated function first writes a deprecation notice to
+    stderr and then forwards the call to the original function. The notice is
+    also appended to the wrapper's docstring.
+
+    Args:
+        since: version since which the API is deprecated, e.g. '0.15.0'.
+        instead: name of the API that should be used instead.
+        extra_message(str): optional text appended to the notice on a new line.
+
+    Returns:
+        A decorator to apply to the deprecated function.
+    """
+
+    def decorator(func):
+        err_msg = "API {0} is deprecated since {1}. Please use {2} instead.".format(
+            func.__name__, since, instead)
+        if len(extra_message) != 0:
+            err_msg += "\n"
+            err_msg += extra_message

+        @functools.wraps(func)
+        def wrapper(*args, **kwargs):
+            # write() emits the same text as the Python-2-only `print >>`
+            # statement and also works under Python 3.
+            sys.stderr.write(err_msg + "\n")
+            return func(*args, **kwargs)

+        # func may have no docstring, in which case __doc__ is None and
+        # `+=` would raise TypeError at decoration time.
+        wrapper.__doc__ = (wrapper.__doc__ or "") + "\n    " + err_msg
+        return wrapper

+    return decorator
--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -18,10 +18,7 @@ import collections
 import copy
 import unique_name

-__all__ = [
-    'append_backward',
-    'calc_gradient',
-]
+__all__ = ['append_backward']


 def _rename_arg_(op_descs, old_name, new_name, begin_idx=None, end_idx=None):
@@ -123,7 +120,8 @@ def _append_grad_suffix_(name):
 def _addup_repetitive_outputs_(op_descs):
    """
    In backward part, an variable may be the output of more than one ops.
-    In this case, the variable should be the accumulation of all the outputs.
+    And one op may yield its multiple outputs to the same variable.
+    In these cases, the variable should be the accumulation of all the outputs.
    `sum_op`s are added to implement the accumulate.
    """
    pending_sum_ops = []
@@ -136,7 +134,9 @@ def _addup_repetitive_outputs_(op_descs):
                    "sum", {"X": renamed_vars[var_name]}, {"Out": [var_name]},
                    {"use_mkldnn": False}), idx))
                renamed_vars[var_name] = [var_name]
-        for var_name in op_desc.output_arg_names():
+        for param_idx, param_name in enumerate(op_desc.output_names()):
+            arg_names = op_desc.output(param_name)
+            for arg_idx, var_name in enumerate(arg_names):
                if var_name == core.empty_var_name(
                ) or var_name in op_desc.input_arg_names():
                    # empty variable or inplace op
@@ -154,11 +154,26 @@ def _addup_repetitive_outputs_(op_descs):
                        _rename_arg_(op_descs, var_name, new_name, 0, idx)
                        _rename_arg_(pending_sum_ops, var_name, new_name)

+                        for p in op_desc.output_names()[:param_idx]:
+                            p_arg_names = op_desc.output(p)
+                            if var_name in p_arg_names:
+                                op_desc.set_output(p, [
+                                    new_name if x == var_name else x
+                                    for x in p_arg_names
+                                ])
+
+                        arg_names = [
+                            new_name if x == var_name else x
+                            for x in arg_names[:arg_idx]
+                        ] + arg_names[arg_idx:]
+
                    new_name = var_name + "@RENAME@" + \
                        str(var_rename_count[var_name])
                    var_rename_count[var_name] += 1
-                op_desc.rename_output(var_name, new_name)
+                    arg_names[arg_idx] = new_name
+                    op_desc.set_output(param_name, arg_names)
                    renamed_vars[var_name].append(new_name)
+
    for var_name, inputs in renamed_vars.iteritems():
        if len(inputs) > 1:
            pending_sum_ops.append(

--- a/python/paddle/fluid/layers/device.py
+++ b/python/paddle/fluid/layers/device.py
@@ -18,10 +18,12 @@ All util layers.
 from layer_function_generator import autodoc
 from ..framework import unique_name
 from ..layer_helper import LayerHelper
+from ..annotations import deprecated

-__all__ = ['get_places']
+__all__ = []


+@deprecated(since='0.15.0', instead="ParallelExecutor")
 @autodoc()
 def get_places(device_count=None, device_type=None):
    helper = LayerHelper('get_places', **locals())

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -11,6 +11,20 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+
+#   Copyright (c ) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 All layers just related to the neural network.
 """
@@ -3900,7 +3914,13 @@ def transpose(x, perm, name=None):
    return out


-def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
+def im2sequence(input,
+                filter_size=1,
+                stride=1,
+                padding=0,
+                input_image_size=None,
+                out_stride=1,
+                name=None):
    """
    Extracts image patches from the input tensor to form a tensor of shape
    {input.batch_size * output_height * output_width, filter_size_H *
@@ -3937,6 +3957,15 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
            padding_up = padding_down = padding_left = padding_right = padding
            Default: padding = 0.

+        input_image_size(Variable): The input that contains the real size of
+            each image. Its shape is [batchsize, 2]. It is dispensable and is
+            only used for batch inference.

+        out_stride(int|tuple): The scaling of image through CNN. It is
+            dispensable. It is valid only when input_image_size is not null.
+            If out_stride is tuple, it must contain two integers,
+            (out_stride_H, out_stride_W). Otherwise,
+            the out_stride_H = out_stride_W = out_stride.
+
        name (int): The name of this layer. It is optional.

    Returns:
@@ -3987,7 +4016,7 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
                           [ 5.  7.  2.  4.  1.  3.  9.  0.]
                           [ 7.  9.  4.  8.  3.  5.  0.  8.]]

-            output.dims = {8, 9}
+            output.dims = {8, 8}

            output.lod = [[4, 4]]

@@ -4009,18 +4038,17 @@ def im2sequence(input, filter_size=1, stride=1, padding=0, name=None):
    if len(padding) == 2:
        padding.append(padding[0])
        padding.append(padding[1])
-
+    inputs = {"X": input}
+    # The im2sequence op reads this attribute under the name "paddings".
+    attrs = {"kernels": filter_size, "strides": stride, "paddings": padding}
+    # The real-size input and its out_stride are only passed when provided.
+    if input_image_size:
+        if isinstance(out_stride, int):
+            out_stride = [out_stride, out_stride]
+        inputs["Y"] = input_image_size
+        attrs["out_stride"] = out_stride
    helper = LayerHelper('im2sequence', **locals())
    out = helper.create_tmp_variable(dtype=helper.input_dtype())
    helper.append_op(
-        type='im2sequence',
-        inputs={'X': input},
-        outputs={'Out': out},
-        attrs={
-            'kernels': filter_size,
-            'strides': stride,
-            'paddings': padding,
-        })
+        type='im2sequence', inputs=inputs, outputs={'Out': out}, attrs=attrs)
    return out



--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
@@ -29,7 +29,7 @@ __all__ = [
    'SGD', 'Momentum', 'Adagrad', 'Adam', 'Adamax', 'DecayedAdagrad', 'Ftrl',
    'SGDOptimizer', 'MomentumOptimizer', 'AdagradOptimizer', 'AdamOptimizer',
    'AdamaxOptimizer', 'DecayedAdagradOptimizer', 'RMSPropOptimizer',
-    'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'Optimizer', 'RMSPropOptimizer'
+    'FtrlOptimizer', 'Adadelta', 'ModelAverage', 'RMSPropOptimizer'
 ]


@@ -67,7 +67,7 @@ class Optimizer(object):
        self._LARS_weight_decay = LARS_weight_decay

    def _create_global_learning_rate(self):
-        lr = self.global_learning_rate()
+        lr = self._global_learning_rate()

        if isinstance(lr, framework.Variable):
            return
@@ -86,7 +86,7 @@ class Optimizer(object):
            dtype='float32' if self._dtype == None else self._dtype,
            persistable=True)

-    def global_learning_rate(self, program=None):
+    def _global_learning_rate(self, program=None):
        """
        get global decayed learning rate
        :return:
@@ -110,9 +110,9 @@ class Optimizer(object):
            return param_lr
        else:
            if param_lr == 1.0:
-                return self.global_learning_rate()
+                return self._global_learning_rate()
            else:
-                return self.global_learning_rate() * param_lr
+                return self._global_learning_rate() * param_lr

    def _create_accumulators(self, block, parameters):
        """Create all accumulators needed by the parameters
@@ -185,7 +185,7 @@ class Optimizer(object):
                            format(name, param.name))
        return self._accumulators[name][param.name]

-    def create_optimization_pass(self,
+    def _create_optimization_pass(self,
                                  parameters_and_grads,
                                  loss,
                                  startup_program=None):
@@ -221,7 +221,7 @@ class Optimizer(object):
            self._create_global_learning_rate()
            if self._LARS_weight_decay > 0.0:
                layers.append_LARS(parameters_and_grads,
-                                   self.global_learning_rate(),
+                                   self._global_learning_rate(),
                                   self._LARS_weight_decay)

            optimize_ops = []
@@ -262,7 +262,7 @@ class Optimizer(object):
        params_grads = append_regularization_ops(params_grads,
                                                 self.regularization)

-        optimize_ops = self.create_optimization_pass(params_grads, loss,
+        optimize_ops = self._create_optimization_pass(params_grads, loss,
                                                      startup_program)
        return optimize_ops, params_grads


--- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py
+++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
@@ -12,7 +12,7 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import print_function
-
+from paddle.fluid.layers.device import get_places
 import unittest
 import paddle.fluid as fluid
 import paddle
@@ -144,7 +144,7 @@ def train(word_dict,
        cost, acc_out, prediction = net_method(
            data, label, input_dim=dict_dim, class_dim=class_dim)
    else:
-        places = fluid.layers.get_places()
+        places = get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            cost, acc, _ = net_method(

--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
@@ -12,15 +12,17 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 from __future__ import print_function
-import argparse
-import paddle.fluid as fluid
-import paddle
-import sys
-import numpy
-import unittest
+
 import math
-import sys
 import os
+import sys
+import unittest
+
+import numpy
+
+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places

 BATCH_SIZE = 64

@@ -76,7 +78,7 @@ def train(nn_type,
        net_conf = conv_net

    if parallel:
-        places = fluid.layers.get_places()
+        places = get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            img_ = pd.read_input(img)

--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
@@ -14,6 +14,7 @@

 import paddle
 import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
 import unittest
 import os
 import numpy as np
@@ -80,7 +81,7 @@ def train(use_cuda, is_sparse, is_parallel, save_dirname, is_local=True):
        avg_cost, predict_word = __network__(
            [first_word, second_word, third_word, forth_word, next_word])
    else:
-        places = fluid.layers.get_places()
+        places = get_places()
        pd = fluid.layers.ParallelDo(places)
        with pd.do():
            avg_cost, predict_word = __network__(

--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
@@ -12,12 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.

-import numpy as np
-import paddle
-import paddle.fluid as fluid
 import math
 import sys

+import paddle
+import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
+
 # need to fix random seed and training data to compare the loss
 # value accurately calculated by the default and the memory optimization
 # version.
@@ -34,7 +35,7 @@ if fluid.core.is_compiled_with_cuda():
    use_nccl = False
    place = fluid.CUDAPlace(0)

-places = fluid.layers.get_places(device_count=0, device_type=device_type)
+places = get_places(device_count=0, device_type=device_type)
 pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
 with pd.do():
    x_ = pd.read_input(x)

--- a/python/paddle/fluid/tests/unittests/test_calc_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_calc_gradient.py
@@ -16,8 +16,6 @@ import unittest

 import paddle.fluid as fluid
 import paddle.fluid.layers as layers
-import paddle.fluid.framework as framework
-import paddle.fluid.optimizer as optimizer
 from paddle.fluid.backward import calc_gradient



--- a/python/paddle/fluid/tests/unittests/test_get_places_op.py
+++ b/python/paddle/fluid/tests/unittests/test_get_places_op.py
@@ -13,6 +13,7 @@
 # limitations under the License.

 import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
 import decorators
 import unittest

@@ -20,7 +21,7 @@ import unittest
 class TestGetPlaces(unittest.TestCase):
    @decorators.prog_scope()
    def test_get_places(self):
-        places = fluid.layers.get_places()
+        places = get_places()
        cpu = fluid.CPUPlace()
        exe = fluid.Executor(cpu)
        exe.run(fluid.default_main_program())

--- a/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
+++ b/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
@@ -16,20 +16,45 @@ import numpy as np
 from op_test import OpTest


-def get_output_shape(attrs, in_shape):
+def get_output_shape(attrs, in_shape, img_real_size):
+    batchsize = in_shape[0]
    img_height = in_shape[2]
    img_width = in_shape[3]
+    paddings = np.array(attrs['paddings']).astype("int32")
+    kernels = np.array(attrs['kernels']).astype("int32")
+    strides = np.array(attrs['strides']).astype("int32")
+    output_height = np.zeros((1, batchsize)).astype("int32")
+    output_width = np.zeros((1, batchsize)).astype("int32")
+    if len(img_real_size):
+        out_stride = np.array(attrs['out_stride']).astype("int32")
+        imgreal_h = 0
+        imgreal_w = 0
+        for index in range(batchsize):
+            if img_real_size[index, 0] % out_stride[0] == 0:
+                imgreal_h = img_real_size[index, 0] / out_stride[0]
+            else:
+                imgreal_h = img_real_size[index, 0] / out_stride[0] + 1
+            if img_real_size[index, 0] % out_stride[1] == 0:
+                imgreal_w = img_real_size[index, 1] / out_stride[1]
+            else:
+                imgreal_w = img_real_size[index, 0] / out_stride[1] + 1
+            output_height[0,index] = \
+              1 +  \
+              (imgreal_h + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
+                  strides[0]

-    paddings = attrs['paddings']
-    kernels = attrs['kernels']
-    strides = attrs['strides']
-
-    output_height = \
+            output_width[0,index] = \
+              1 + \
+              (imgreal_w + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
+                  strides[1]
+    else:
+        for index in range(batchsize):
+            output_height[0,index] = \
              1 +  \
              (img_height + paddings[0] + paddings[2] - kernels[0] + strides[0] - 1) / \
                  strides[0]

-    output_width = \
+            output_width[0,index] = \
              1 + \
              (img_width + paddings[1] + paddings[3] - kernels[1] + strides[1] - 1) / \
                  strides[1]
@@ -75,22 +100,25 @@ def im2col(attrs, im, col):
                                    im_row_offset][im_col_offset]


-def Im2Sequence(inputs, attrs):
-    output_height, output_width = get_output_shape(attrs, inputs.shape)
+def Im2Sequence(inputs, img_real_size, attrs):
+    output_height, output_width = get_output_shape(attrs, inputs.shape,
+                                                   img_real_size)
    img_channels = inputs.shape[1]
    batch_size = inputs.shape[0]
-    out = np.zeros([
-        batch_size, output_height, output_width, img_channels,
+    out = []
+    for index in range(batch_size):
+        tmp = np.zeros([
+            output_height[0, index], output_width[0, index], img_channels,
            attrs['kernels'][0], attrs['kernels'][1]
        ]).astype("float32")
-
-    for i in range(len(inputs)):
-        im2col(attrs, inputs[i], out[i])
-
-    out = out.reshape([
-        batch_size * output_height * output_width,
+        out.append(tmp)
+    for index in range(len(inputs)):
+        im2col(attrs, inputs[index], out[index])
+        out[index] = out[index].reshape([
+            output_height[0, index] * output_width[0, index],
            img_channels * attrs['kernels'][0] * attrs['kernels'][1]
        ])
+    out = np.concatenate(out, axis=0)
    return out


@@ -103,7 +131,7 @@ class TestBlockExpandOp(OpTest):
        self.attrs = {
            'kernels': [2, 2],
            'strides': [1, 1],
-            'paddings': [1, 1, 1, 1]
+            'paddings': [1, 1, 1, 1],
        }

    def setUp(self):
@@ -113,7 +141,8 @@ class TestBlockExpandOp(OpTest):
            self.batch_size, self.img_channels, self.img_height, self.img_width
        ]).astype("float32")

-        out = Im2Sequence(x, self.attrs)
+        real_size = np.array([]).astype("float32")
+        out = Im2Sequence(x, real_size, self.attrs)
        self.inputs = {'X': x}
        self.outputs = {'Out': out}

@@ -133,20 +162,20 @@ class TestBlockExpandOpCase2(TestBlockExpandOp):
        self.attrs = {
            'kernels': [2, 1],
            'strides': [2, 1],
-            'paddings': [2, 1, 2, 1]
+            'paddings': [2, 1, 2, 1],
        }


 class TestBlockExpandOpCase3(TestBlockExpandOp):
    def config(self):
-        self.batch_size = 3
+        self.batch_size = 2
        self.img_channels = 1
        self.img_height = 4
        self.img_width = 5
        self.attrs = {
            'kernels': [2, 1],
            'strides': [2, 1],
-            'paddings': [2, 0, 2, 0]
+            'paddings': [2, 0, 2, 0],
        }


@@ -159,9 +188,94 @@ class TestBlockExpandOpCase4(TestBlockExpandOp):
        self.attrs = {
            'kernels': [2, 2],
            'strides': [1, 1],
-            'paddings': [0, 0, 0, 0]
+            'paddings': [0, 0, 0, 0],
+        }
+
+
+class TestBlockExpandOpCase5(OpTest):
+    def config(self):
+        self.batch_size = 1
+        self.img_channels = 3
+        self.img_height = 4
+        self.img_width = 5
+        self.attrs = {
+            'kernels': [2, 1],
+            'strides': [2, 1],
+            'paddings': [2, 1, 2, 1],
+            'out_stride': [2, 2],
+        }
+
+    def setUp(self):
+        self.config()
+        self.op_type = "im2sequence"
+        x = np.random.uniform(0.1, 1, [
+            self.batch_size, self.img_channels, self.img_height, self.img_width
+        ]).astype("float32")
+        real_size = np.array([[8, 10], [5, 8]]).astype("float32")
+        out = np.array(Im2Sequence(x, real_size, self.attrs))
+        self.inputs = {'X': x, 'Y': real_size}  # 'Y' carries the real image sizes
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestBlockExpandOpCase6(OpTest):
+    def config(self):
+        self.batch_size = 3
+        self.img_channels = 1
+        self.img_height = 4
+        self.img_width = 5
+        self.attrs = {
+            'kernels': [2, 1],
+            'strides': [1, 1],
+            'paddings': [0, 0, 0, 0],
+            'out_stride': [1, 1],
+        }
+
+    def setUp(self):
+        self.config()
+        self.op_type = "im2sequence"
+        x = np.random.uniform(0.1, 1, [
+            self.batch_size, self.img_channels, self.img_height, self.img_width
+        ]).astype("float32")
+        real_size = np.array([[8, 10], [5, 8], [5, 8]]).astype("float32")
+        out = np.array(Im2Sequence(x, real_size, self.attrs))
+        self.inputs = {'X': x, 'Y': real_size}  # 'Y' carries the real image sizes
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestBlockExpandOpCase7(OpTest):
+    def config(self):
+        self.batch_size = 2
+        self.img_channels = 2
+        self.img_height = 3
+        self.img_width = 3
+        self.attrs = {
+            'kernels': [2, 2],
+            'strides': [1, 1],
+            'paddings': [1, 0, 1, 0],
+            'out_stride': [2, 2],
        }

+    def setUp(self):
+        self.config()
+        self.op_type = "im2sequence"
+        x = np.random.uniform(0.1, 1, [
+            self.batch_size, self.img_channels, self.img_height, self.img_width
+        ]).astype("float32")
+        real_size = np.array([[6, 6], [4, 4]]).astype("float32")
+        out = np.array(Im2Sequence(x, real_size, self.attrs))
+        self.inputs = {'X': x, 'Y': real_size}
+        self.outputs = {'Out': out}
+
+    def test_check_output(self):
+        self.check_output()
+

 if __name__ == '__main__':
    unittest.main()
+# vim: set shiftwidth=4 expandtab tabstop=4:
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -16,6 +16,7 @@ from __future__ import print_function
 import unittest

 import paddle.fluid.layers as layers
+from paddle.fluid.layers.device import get_places
 import paddle.fluid.nets as nets
 from paddle.fluid.framework import Program, program_guard, default_main_program
 from paddle.fluid.param_attr import ParamAttr
@@ -238,7 +239,7 @@ class TestBook(unittest.TestCase):
    def test_get_places(self):
        program = Program()
        with program_guard(program):
-            x = layers.get_places(device_count=4)
+            x = get_places(device_count=4)
            self.assertIsNotNone(x)
        print(str(program))

@@ -251,12 +252,16 @@ class TestBook(unittest.TestCase):
        print(str(program))

    def test_im2sequence(self):
-        print("test_im2sequence")
        program = Program()
        with program_guard(program):
            x = layers.data(name='x', shape=[3, 128, 128], dtype='float32')
+            y = layers.data(name='y', shape=[], dtype='float32')
            output = layers.im2sequence(
-                input=x, stride=[1, 1], filter_size=[2, 2])
+                input=x,
+                input_image_size=y,
+                stride=[1, 1],
+                filter_size=[2, 2],
+                out_stride=[1, 1])
            self.assertIsNotNone(output)
        print(str(program))


--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
@@ -97,7 +97,7 @@ class TestMomentumOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        opts = momentum_optimizer.create_optimization_pass(
+        opts = momentum_optimizer._create_optimization_pass(
            params_grads, mul_out, init_program)
        self.assertEqual(len(opts), 3)
        sgd_op = opts[-1]
@@ -151,7 +151,7 @@ class TestMomentumOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(momentum_optimizer.get_accumulators()), 0)
-        opts = momentum_optimizer.create_optimization_pass(
+        opts = momentum_optimizer._create_optimization_pass(
            params_grads, mul_out, init_program)
        self.assertEqual(len(opts), 3)
        sgd_op = opts[-1]
@@ -214,8 +214,8 @@ class TestAdagradOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(adagrad_optimizer.get_accumulators()), 0)
-        opts = adagrad_optimizer.create_optimization_pass(params_grads, mul_out,
-                                                          init_program)
+        opts = adagrad_optimizer._create_optimization_pass(
+            params_grads, mul_out, init_program)
        self.assertEqual(len(opts), 3)
        self.assertEqual([op.type for op in opts],
                         ["fill_constant", "elementwise_mul", "adagrad"])
@@ -278,7 +278,7 @@ class TestAdamOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(adam_optimizer.get_accumulators()), 0)
-        opts = adam_optimizer.create_optimization_pass(params_grads, mul_out,
+        opts = adam_optimizer._create_optimization_pass(params_grads, mul_out,
                                                        init_program)
        self.assertEqual(len(opts), 5)
        self.assertEqual(
@@ -345,7 +345,7 @@ class TestAdamaxOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(adamax_optimizer.get_accumulators()), 0)
-        opts = adamax_optimizer.create_optimization_pass(params_grads, mul_out,
+        opts = adamax_optimizer._create_optimization_pass(params_grads, mul_out,
                                                          init_program)
        self.assertEqual(len(opts), 4)
        self.assertEqual(
@@ -409,7 +409,7 @@ class TestDecayedAdagradOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(decayed_adagrad_optimizer.get_accumulators()), 0)
-        opts = decayed_adagrad_optimizer.create_optimization_pass(
+        opts = decayed_adagrad_optimizer._create_optimization_pass(
            params_grads, mul_out, init_program)
        self.assertEqual(len(opts), 3)
        self.assertEqual(
@@ -475,7 +475,7 @@ class TestFtrlOptimizer(unittest.TestCase):
        params_grads = append_backward(mean_out)
        self.assertEqual(len(params_grads), 1)
        self.assertEqual(len(ftrl_optimizer.get_accumulators()), 0)
-        opts = ftrl_optimizer.create_optimization_pass(params_grads, mul_out,
+        opts = ftrl_optimizer._create_optimization_pass(params_grads, mul_out,
                                                        init_program)
        self.assertEqual(len(opts), 3)
        self.assertEqual([op.type for op in opts],

--- a/python/paddle/fluid/tests/unittests/test_parallel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py
@@ -15,6 +15,7 @@
 import unittest

 import paddle.fluid as fluid
+from paddle.fluid.layers.device import get_places
 import paddle.fluid.profiler as profiler
 import numpy

@@ -115,7 +116,7 @@ class BaseParallelForTest(unittest.TestCase):
            if use_parallel:
                thread_num = fluid.core.get_cuda_device_count(
                ) if use_gpu else 8
-                places = fluid.layers.get_places(thread_num)
+                places = get_places(thread_num)
                pd = fluid.layers.ParallelDo(places, use_nccl=use_nccl)
                data = next(generator)


--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -181,6 +181,14 @@ else:
    command = "patchelf --set-rpath '$ORIGIN/../libs/' ${PADDLE_BINARY_DIR}/python/paddle/fluid/core.so"
 if os.system(command) != 0:
    raise Exception("patch core.so failed, command: %s" % command)
+if '${WITH_FLUID_ONLY}'== 'OFF':
+    # change rpath of _swig_paddle.so.
+    if "@APPLE@" == "1":
+        command = "install_name_tool -id \"@loader_path/../paddle/libs/\" ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so"
+    else:
+        command = "patchelf --set-rpath '$ORIGIN/../paddle/libs/' ${PADDLE_BINARY_DIR}/python/py_paddle/_swig_paddle.so"
+    if os.system(command) != 0:
+        raise Exception("patch _swig_paddle.so failed, command: %s" % command)

 setup(name='${PACKAGE_NAME}',
      version='${PADDLE_VERSION}',