merge develop

f55e8901 · nhzlx · 1600ba86 · 4069262f · f55e8901 · f55e8901
447 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -5,6 +5,7 @@ python/paddle/v2/fluid/tests/book/image_classification_resnet.inference.model/
 python/paddle/v2/fluid/tests/book/image_classification_vgg.inference.model/
 python/paddle/v2/fluid/tests/book/label_semantic_roles.inference.model/
 *.DS_Store
+*.vs
 build/
 build_doc/
 *.user
@@ -15,6 +16,7 @@ build_doc/
 .cproject
 .pydevproject
 .settings/
+CMakeSettings.json
 Makefile
 .test_env/
 third_party/

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -204,11 +204,12 @@ include(external/snappy)    # download snappy
 include(external/snappystream)
 include(external/threadpool)
-set(WITH_ANAKIN OFF CACHE STRING "Disable Anakin first, will add it later." FORCE)
 if(WITH_GPU)
    include(cuda)
    include(tensorrt)
    include(external/anakin)
+else()
+    set(WITH_ANAKIN OFF CACHE STRING "Anakin is used in GPU only now." FORCE)
 endif()
 include(cudnn)              # set cudnn libraries, must before configure

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -56,6 +56,10 @@ if(NOT CMAKE_CROSSCOMPILING)
        set(SIMD_FLAG ${SSE3_FLAG})
    endif()
 endif()
+if(UNIX AND NOT APPLE)
+  # Treat every non-Apple member of the *nix OS family as Linux.
+  set(LINUX TRUE)
+endif()
 if(NOT WITH_GOLANG)
    add_definitions(-DPADDLE_WITHOUT_GOLANG)
@@ -104,6 +108,10 @@ if(WITH_GPU)
        if(${CUDNN_MAJOR_VERSION} VERSION_LESS 7)
            message(FATAL_ERROR "Anakin needs CUDNN >= 7.0 to compile")
        endif()
+        set(ENV{CUDNN_INCLUDE_DIR} ${CUDNN_INCLUDE_DIR})
+        set(ENV{CUDNN_LIBRARY} ${CUDNN_LIBRARY})
+        message(STATUS "cudnn include header is ${CUDNN_INCLUDE_DIR}/cudnn.h")
+        message(STATUS "cudnn library is ${CUDNN_LIBRARY}")
    endif()
 elseif(WITH_AMD_GPU)
    add_definitions(-DPADDLE_WITH_HIP)

--- a/cmake/external/anakin.cmake
+++ b/cmake/external/anakin.cmake
@@ -35,9 +35,10 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
 ExternalProject_Add(
    extern_anakin
    ${EXTERNAL_PROJECT_LOG_ARGS}
-    # TODO(luotao): use PaddlePaddle/Anakin later
+    DEPENDS             ${MKLML_PROJECT}
+    # Anakin code errors on Intel(R) Xeon(R) Gold 5117 CPUs, so temporarily do not compile AVX512-related code.
    GIT_REPOSITORY      "https://github.com/luotao1/Anakin"
-    GIT_TAG             "3957ae9263eaa0b1986758dac60a88852afb09be"
+    GIT_TAG             "bcf17aabe7921ceb7bce591244b4f9dce7dba5c8"
    PREFIX              ${ANAKIN_SOURCE_DIR}
    UPDATE_COMMAND      ""
    CMAKE_ARGS          -DUSE_GPU_PLACE=YES

--- a/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
+++ b/doc/fluid/new_docs/advanced_usage/deploy/native_infer.rst
@@ -4,7 +4,7 @@ Paddle 预测 API
 为了更简单方便的预测部署，Fluid 提供了一套高层 API
 用来隐藏底层不同的优化实现。
-`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference>`__
+`预测库相关代码 <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api>`__
 包括
 -  头文件 ``paddle_inference_api.h`` 定义了所有的接口
@@ -104,5 +104,5 @@ engine
 ------------
 -  `inference
-   demos <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/contrib/inference/demo>`__
+   demos <https://github.com/PaddlePaddle/Paddle/tree/develop/paddle/fluid/inference/api/demo_ci>`__
--  `复杂单线程/多线程例子 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/contrib/inference/test_paddle_inference_api_impl.cc>`__
+-  `复杂单线程/多线程例子 <https://github.com/PaddlePaddle/Paddle/blob/develop/paddle/fluid/inference/api/api_impl_tester.cc>`__
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -155,10 +155,11 @@ paddle.fluid.layers.resize_bilinear ArgSpec(args=['input', 'out_shape', 'scale',
 paddle.fluid.layers.gather ArgSpec(args=['input', 'index'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.random_crop ArgSpec(args=['x', 'shape', 'seed'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.mean_iou ArgSpec(args=['input', 'label', 'num_classes'], varargs=None, keywords=None, defaults=None)
-paddle.fluid.layers.relu ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.relu ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
-paddle.fluid.layers.log ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.log ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,))
 paddle.fluid.layers.crop ArgSpec(args=['x', 'shape', 'offsets', 'name'], varargs=None, keywords=None, defaults=(None, None, None))
 paddle.fluid.layers.rank_loss ArgSpec(args=['label', 'left', 'right', 'name'], varargs=None, keywords=None, defaults=(None,))
+paddle.fluid.layers.prelu ArgSpec(args=['x', 'mode', 'param_attr', 'name'], varargs=None, keywords=None, defaults=(None, None))
 paddle.fluid.layers.flatten ArgSpec(args=['x', 'axis', 'name'], varargs=None, keywords=None, defaults=(1, None))
 paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True))
 paddle.fluid.layers.open_recordio_file ArgSpec(args=['filename', 'shapes', 'lod_levels', 'dtypes', 'pass_num', 'for_parallel'], varargs=None, keywords=None, defaults=(1, True))

--- a/paddle/fluid/framework/attribute.h
+++ b/paddle/fluid/framework/attribute.h
@@ -128,7 +128,8 @@ struct ExtractAttribute {
      attr_value = &boost::get<T>(attr);
    } catch (boost::bad_get& bad_get) {
      PADDLE_THROW("Cannot get attribute %s by type %s, its type is %s",
-                   attr_name_, typeid(T).name(), attr.type().name());
+                   attr_name_, paddle::platform::demangle(typeid(T).name()),
+                   paddle::platform::demangle(attr.type().name()));
    }
    return attr_value;
  }
@@ -160,7 +161,7 @@ struct ExtractAttribute<bool> {
      attr_value = &boost::get<bool>(attr);
    } catch (boost::bad_get& bad_get) {
      PADDLE_THROW("Cannot get attribute %s by type bool, its type is %s",
-                   attr_name_, attr.type().name());
+                   attr_name_, paddle::platform::demangle(attr.type().name()));
    }
    return attr_value;
  }
@@ -186,7 +187,7 @@ struct ExtractAttribute<int64_t> {
      attr_value = &boost::get<int64_t>(attr);
    } catch (boost::bad_get& bad_get) {
      PADDLE_THROW("Cannot get attribute %s by type int64_t, its type is %s",
-                   attr_name_, attr.type().name());
+                   attr_name_, paddle::platform::demangle(attr.type().name()));
    }
    return attr_value;
  }

--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -3,7 +3,10 @@ cc_library(graph SRCS graph.cc DEPS node)
 cc_library(graph_helper SRCS graph_helper.cc DEPS graph)
 cc_library(pass SRCS pass.cc DEPS graph node graph_helper)
 cc_library(graph_viz_pass SRCS graph_viz_pass.cc DEPS graph pass graph_helper)
+cc_library(graph_traits SRCS graph_traits.cc DEPS graph)
+cc_library(graph_pattern_detecter SRCS graph_pattern_detecter.cc DEPS graph graph_helper graph_traits)
 cc_test(pass_test SRCS pass_test.cc DEPS graph pass graph_helper)
 cc_test(graph_test SRCS graph_test.cc DEPS graph graph_helper op_registry)
 cc_test(graph_helper_test SRCS graph_helper_test.cc DEPS graph graph_helper op_registry)
+cc_test(test_graph_pattern_detecter SRCS graph_pattern_detecter_tester.cc DEPS graph_pattern_detecter)
--- a/paddle/fluid/framework/ir/graph_pattern_detecter.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detecter.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <array>
+#include <string>
+#include <vector>
+#include "paddle/fluid/framework/ir/graph_helper.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detecter.h"
+#include "paddle/fluid/framework/ir/graph_traits.h"
+#include "paddle/fluid/platform/enforce.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+// Creates a PDNode from `teller` and `name`, appends it to nodes_ and returns
+// a non-owning pointer to the stored node.
+PDNode* PDPattern::NewNode(PDNode::teller_t&& teller, const std::string& name) {
+  nodes_.emplace_back(new PDNode(std::move(teller), name));
+  return nodes_.back().get();
+}
+// Records the directed edge a -> b. Both endpoints must be non-null and must
+// not be the same node.
+void PDPattern::AddEdge(PDNode* a, PDNode* b) {
+  PADDLE_ENFORCE(a);
+  PADDLE_ENFORCE(b);
+  PADDLE_ENFORCE(a != b, "can't connect to the same nodes.");
+  edges_.push_back(edge_t(a, b));
+}
+// Runs the whole detection pipeline on `graph`: mark candidate nodes, grow
+// them into matches, drop duplicated and overlapping matches, then call
+// `handler` once for every match that is left.
+void GraphPatternDetecter::operator()(Graph* graph,
+                                      GraphPatternDetecter::handle_t handler) {
+  const bool any_marked = MarkPDNodesInGraph(*graph);
+  if (!any_marked) return;
+  std::vector<subgraph_t> matches = DetectPatterns();
+  UniquePatterns(&matches);
+  RemoveOverlappedMatch(&matches);
+  for (auto& match : matches) {
+    handler(match, graph);
+  }
+}
+// Fills pdnodes2nodes_ with, for every PDNode of the pattern, the graph nodes
+// accepted by its teller. Returns true when at least one entry was recorded.
+bool GraphPatternDetecter::MarkPDNodesInGraph(const ir::Graph& graph) {
+  // Drop the marks left by any previous run, so that nodes of an earlier graph
+  // (and the empty entries that operator[] lookups create) never leak into
+  // this detection.
+  pdnodes2nodes_.clear();
+  if (graph.Nodes().empty()) return false;
+  for (auto& node : GraphTraits::DFS(graph)) {
+    for (const auto& pdnode : pattern_.nodes()) {
+      if (pdnode->Tell(&node)) {
+        pdnodes2nodes_[pdnode.get()].insert(&node);
+      }
+    }
+  }
+  return !pdnodes2nodes_.empty();
+}
+// A partial match: the graph Node currently assigned to each PDNode (role).
+struct HitGroup {
+  std::unordered_map<PDNode*, Node*> roles;
+  // True when `pat` is still unassigned or is already assigned to `node`.
+  bool Match(Node* node, PDNode* pat) {
+    auto found = roles.find(pat);
+    return found == roles.end() || found->second == node;
+  }
+  // Assigns `node` to the role `pat`, replacing any previous assignment.
+  void Register(Node* node, PDNode* pat) { roles[pat] = node; }
+};
+// Returns true when `b` appears among the direct outputs of `a`.
+bool IsNodesLink(Node* a, Node* b) {
+  bool linked = false;
+  for (auto it = a->outputs.begin(); !linked && it != a->outputs.end(); ++it) {
+    linked = (*it == b);
+  }
+  return linked;
+}
+// Grows candidate matches edge by edge. The groups produced for one pattern
+// edge are the input for the next edge; two buffers are used alternately for
+// the previous and the current generation of groups.
+std::vector<GraphPatternDetecter::subgraph_t>
+GraphPatternDetecter::DetectPatterns() {
+  // Init empty subgraphs.
+  std::vector<GraphPatternDetecter::subgraph_t> result;
+  std::vector<HitGroup> init_groups;
+  PADDLE_ENFORCE(!pattern_.edges().empty(), "At least one edge is needed");
+  auto* first_pnode = pattern_.edges().front().first;
+  if (!pdnodes2nodes_.count(first_pnode)) return result;
+  // Seed one group per node that can play the first role.
+  for (auto* node : pdnodes2nodes_[first_pnode]) {
+    HitGroup group;
+    group.roles[first_pnode] = node;
+    init_groups.emplace_back(group);
+  }
+  int step = 0;
+  std::array<std::vector<HitGroup>, 2> bi_records;
+  bi_records[0] = std::move(init_groups);
+  // Extend a PDNode to subgraphs by deducing the connection relations defined
+  // in edges of PDNodes.
+  for (const auto& edge : pattern_.edges()) {
+    // Each edge has two PDNodes, which indicates two roles.
+    // Detect two Nodes that can match these two roles and they are connected.
+    auto& pre_groups = bi_records[step % 2];
+    auto& cur_groups = bi_records[1 - (step++ % 2)];
+    cur_groups.clear();
+    // source -> target
+    for (Node* source : pdnodes2nodes_[edge.first]) {
+      for (Node* target : pdnodes2nodes_[edge.second]) {
+        // The link test does not depend on the group, so run it once per
+        // (source, target) pair instead of once per group.
+        if (!IsNodesLink(source, target)) continue;
+        // TODO(Superjomn) add some prune strategies.
+        for (const auto& group : pre_groups) {
+          HitGroup new_group = group;
+          if (!new_group.Match(source, edge.first)) continue;
+          new_group.Register(source, edge.first);
+          if (!new_group.Match(target, edge.second)) continue;
+          new_group.Register(target, edge.second);
+          cur_groups.push_back(new_group);
+          // TODO(Superjomn) need to unique
+        }
+      }
+    }
+  }
+  // After the loop, bi_records[step % 2] is the buffer written for the last
+  // edge; convert its groups into the public subgraph representation.
+  for (auto& group : bi_records[step % 2]) {
+    GraphPatternDetecter::subgraph_t subgraph;
+    for (auto& role : group.roles) {
+      subgraph.emplace(role.first, role.second);
+    }
+    result.emplace_back(subgraph);
+  }
+  return result;
+}
+// Keeps the first subgraph seen for each key and drops the later ones. The key
+// is the XOR of the hashes of every PDNode and Node pointer in the subgraph.
+// NOTE(review): XOR keys are order-insensitive and can collide, so two
+// different subgraphs may be treated as duplicates -- confirm this is intended.
+void GraphPatternDetecter::UniquePatterns(
+    std::vector<GraphPatternDetecter::subgraph_t>* subgraphs) {
+  if (subgraphs->empty()) return;
+  std::hash<void*> hasher;
+  std::unordered_set<size_t> seen_keys;
+  std::vector<GraphPatternDetecter::subgraph_t> deduped;
+  for (auto& candidate : *subgraphs) {
+    size_t key = 0;
+    for (auto& role : candidate) {
+      key ^= hasher(role.first);
+      key ^= hasher(role.second);
+    }
+    // insert() reports through `.second` whether the key was new.
+    if (seen_keys.insert(key).second) {
+      deduped.emplace_back(candidate);
+    }
+  }
+  *subgraphs = deduped;
+}
+// Keeps a subgraph only when none of its nodes belongs to a subgraph that was
+// kept earlier, so on overlap the earlier match wins.
+void GraphPatternDetecter::RemoveOverlappedMatch(
+    std::vector<subgraph_t>* subgraphs) {
+  std::vector<subgraph_t> kept;
+  std::unordered_set<Node*> used_nodes;
+  for (const auto& candidate : *subgraphs) {
+    bool overlaps = false;
+    for (auto& role : candidate) {
+      if (used_nodes.count(role.second)) {
+        overlaps = true;
+        break;
+      }
+    }
+    if (overlaps) continue;
+    // Claim the nodes of the accepted match.
+    for (auto& role : candidate) {
+      used_nodes.insert(role.second);
+    }
+    kept.push_back(candidate);
+  }
+  *subgraphs = kept;
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_pattern_detecter.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detecter.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#ifdef PADDLE_WITH_TESTING
+#include <gtest/gtest_prod.h>
+#endif
+#include <functional>
+#include <memory>
+#include <numeric>
+#include <string>
+#include <unordered_map>
+#include <unordered_set>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/node.h"
+#include "paddle/fluid/platform/enforce.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+// Some basic terminologies:
+//   - PDPattern: a pattern defined as a data flow graph.
+//   - PDNode: the node in the pattern, each PDNode represents an `ir::Node`
+//     that meets some conditions defined in `PDNode.teller`.
+//   - A pattern is defined with PDNodes with edges.
+// Pattern detector node. This node helps to build a pattern: it holds the
+// teller that decides which ir::Node can play this role, plus an optional name.
+struct PDNode {
+  // tell whether an ir::Node* is a candidate for a PDNode.
+  using teller_t = std::function<bool(Node*)>;
+  // Takes over `teller` (it is moved into the node, not copied) and rejects an
+  // empty one.
+  PDNode(teller_t&& teller, const std::string& name = "")
+      : teller_(std::move(teller)), name_(name) {
+    PADDLE_ENFORCE(teller_ != nullptr, "invalid teller functer is set.");
+  }
+  PDNode(PDNode&& other) = default;
+  std::vector<PDNode*> inlinks;
+  std::vector<PDNode*> outlinks;
+  // Applies the teller to `node` and returns its verdict.
+  bool Tell(Node* node) const {
+    PADDLE_ENFORCE(teller_ != nullptr, "teller should be set for a PDNode");
+    return teller_(node);
+  }
+  const std::string& name() const { return name_; }
+  // Copying is disabled; only move construction is allowed.
+  PDNode(const PDNode&) = delete;
+  PDNode& operator=(const PDNode&) = delete;
+ private:
+  teller_t teller_;
+  std::string name_;
+};
+/*
+ * A pattern in a graph, which is defined with PDNodes and edges. Most graph
+ * patterns can be divided into PDNodes and link relations between them.
+ *
+ * For example, the FC fusion need to filter the MUL and ELEMENTWISE_ADD
+ * operators from the computation graph, the MUL's output should have only one
+ * consumer which is the ELEMENTWISE_ADD.
+ * This pattern can be defined with the following pseudo code:
+ *
+ *     // Create two operator PDNodes.
+ *     MUL = PDPattern.NewNode()
+ *     ELE = PDPattern.NewNode()
+ *     // Create the variable PDNodes.
+ *     MUL_out = PDPattern.NewNode()
+ *     // Add teller to define some rules that help to filter the target Nodes.
+ *     MUL.teller = lambda(node): node->IsOp() && node->Op()->Type == "mul";
+ *     ELE.teller = lambda(node): \
+ *                        node->IsOp() && node->Op()->Type == "elementwise_add";
+ *     MUL_out.teller = lambda(node): node->IsVar() && (MUL in node->inputs)
+ *                                                  && (ELE in node->outputs)
+ *
+ * One can add more specific tellers for PDNodes or edges, both the Operator
+ * and Variable Nodes can be ruled in PDNode.teller.
+ *
+ * PDPattern can record the general patterns, such as the pattern represents
+ *   - Op in CPU -> Op in GPU -> Op in CPU, to find out the abnormal IO place.
+ *   - Ops whose inputs and outputs share the same variables
+ */
+class PDPattern {
+ public:
+  // A directed edge of the pattern: first -> second.
+  using edge_t = std::pair<PDNode*, PDNode*>;
+  // Adds the edge a -> b to the pattern.
+  void AddEdge(PDNode* a, PDNode* b);
+  // Creates a PDNode with the given teller and optional name, stores it in
+  // this pattern and returns a pointer to it.
+  PDNode* NewNode(PDNode::teller_t&& teller, const std::string& name = "");
+  // Read-only views of the nodes and edges added so far.
+  const std::vector<std::unique_ptr<PDNode>>& nodes() const { return nodes_; }
+  const std::vector<edge_t>& edges() const { return edges_; }
+ private:
+#ifdef PADDLE_WITH_TESTING
+  // Lets the unit tests read nodes_ and edges_ directly.
+  FRIEND_TEST(PDPattern, AddEdge);
+  FRIEND_TEST(PDPattern, NewNode);
+#endif
+  std::vector<std::unique_ptr<PDNode>> nodes_;
+  std::vector<edge_t> edges_;
+};
+/*
+ * GraphPatternDetecter helps to detect the specific patterns in the graph.
+ * Input a pattern, output a list of the matched subgraphs/nodes.
+ * This helper can be used to support fuse(conv+batchnorm => batchnorm e.g.).
+ *
+ * The algorithm has three phases:
+ *   1. Mark the nodes that match the defined PDNodes in a PDPattern,
+ *   2. Extend a PDNode to subgraphs by deducing the connection relation defined
+ *      in PDPattern(the edges),
+ *   3. Get the filtered subgraphs and treat them with a pre-defined handler.
+ *
+ * Usage:
+ *    // Create a detector
+ *    GraphPatternDetecter detector;
+ *    // Define the detector's pattern, by adding PDNode and define the edges.
+ *    // The teller lambda of each node is passed in when the node is created.
+ *    auto* node0 = detector.mutable_pattern()->NewNode(some lambda);
+ *    auto* node1 = detector.mutable_pattern()->NewNode(some lambda);
+ *    detector.mutable_pattern()->AddEdge(node0, node1);
+ *    // Create an handler, to define the behavior of treating the filtered
+ *    // subgraphs that comply with the patterns.
+ *    GraphPatternDetecter::handle_t handler = some lambda
+ *    // Execute the detector.
+ *    detector(&graph, handler);
+ */
+class GraphPatternDetecter {
+ public:
+  // A match: the graph Node found for each PDNode of the pattern.
+  using subgraph_t = std::unordered_map<PDNode*, Node*>;
+  // Operate on the detected pattern.
+  using handle_t =
+      std::function<void(const subgraph_t& /*hitted pattern*/, Graph*)>;
+  // Detects the pattern in `graph` and calls `handler` on every match that is
+  // left after duplicate and overlap removal.
+  void operator()(Graph* graph, handle_t handler);
+  const PDPattern& pattern() const { return pattern_; }
+  PDPattern* mutable_pattern() { return &pattern_; }
+ private:
+  // Mark the nodes that fits the pattern.
+  bool MarkPDNodesInGraph(const ir::Graph& graph);
+  // Detect all the pattern and output the hit records.
+  std::vector<subgraph_t> DetectPatterns();
+  // Remove duplicate patterns.
+  void UniquePatterns(std::vector<subgraph_t>* subgraphs);
+  // Remove overlapped match subgraphs, when overlapped, keep the previous one.
+  void RemoveOverlappedMatch(std::vector<subgraph_t>* subgraphs);
+#ifdef PADDLE_WITH_TESTING
+  // Lets the unit tests call the private phases and read the private state.
+  FRIEND_TEST(GraphPatternDetecter, MarkPDNodesInGraph);
+  FRIEND_TEST(GraphPatternDetecter, DetectPatterns);
+#endif
+ private:
+  using hit_rcd_t =
+      std::pair<Node* /*node in graph*/, PDNode* /*node in pattern*/>;
+  PDPattern pattern_;
+  // NOTE(review): marked_records_ is not referenced by the implementation
+  // visible in this change -- confirm whether it is still needed.
+  std::vector<hit_rcd_t> marked_records_;
+  // For each PDNode, the graph nodes accepted by its teller.
+  std::unordered_map<const PDNode*, std::unordered_set<Node*>> pdnodes2nodes_;
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detecter_tester.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/ir/graph_pattern_detecter.h"
+#include <gtest/gtest.h>
+namespace paddle {
+namespace framework {
+namespace ir {
+// Builds the fixture graph shared by the tests below:
+//   o1 -> v1 -> o2
+//   o2 -> v2 -> {o3, o4}
+//   o2 -> v3 -> o5
+//   o3 -> v4 -> o5
+void BuildGraph(Graph* g) {
+  ir::Node* o1 = g->CreateEmptyNode("op1", Node::Type::kOperation);
+  ir::Node* o2 = g->CreateEmptyNode("op2", Node::Type::kOperation);
+  ir::Node* o3 = g->CreateEmptyNode("op3", Node::Type::kOperation);
+  ir::Node* o4 = g->CreateEmptyNode("op4", Node::Type::kOperation);
+  ir::Node* o5 = g->CreateEmptyNode("op5", Node::Type::kOperation);
+  ir::Node* v1 = g->CreateEmptyNode("var1", Node::Type::kVariable);
+  ir::Node* v2 = g->CreateEmptyNode("var2", Node::Type::kVariable);
+  ir::Node* v3 = g->CreateEmptyNode("var3", Node::Type::kVariable);
+  ir::Node* v4 = g->CreateEmptyNode("var4", Node::Type::kVariable);
+  // Records the edge from -> to on both endpoints.
+  auto link = [](ir::Node* from, ir::Node* to) {
+    from->outputs.push_back(to);
+    to->inputs.push_back(from);
+  };
+  // o1->v1->o2
+  link(o1, v1);
+  link(v1, o2);
+  // o2->v2->o3
+  // o2->v2->o4
+  link(o2, v2);
+  link(v2, o3);
+  link(v2, o4);
+  // o2->v3->o5
+  link(o2, v3);
+  link(v3, o5);
+  // o3->v4->o5
+  link(o3, v4);
+  link(v4, o5);
+}
+// A freshly created node is non-null and is stored in the pattern.
+TEST(PDPattern, NewNode) {
+  PDPattern pattern;
+  auto* created = pattern.NewNode([](Node* node) { return true; });
+  ASSERT_TRUE(created);
+  ASSERT_EQ(pattern.nodes_.size(), 1UL);
+}
+// An added edge is visible both through the private members and through the
+// public accessors, with its endpoints in the order they were given.
+TEST(PDPattern, AddEdge) {
+  PDPattern pattern;
+  auto* src = pattern.NewNode([](Node* node) { return true; });
+  auto* dst = pattern.NewNode([](Node* node) { return true; });
+  ASSERT_TRUE(src);
+  ASSERT_TRUE(dst);
+  pattern.AddEdge(src, dst);
+  // Private state.
+  ASSERT_EQ(pattern.nodes_.size(), 2UL);
+  ASSERT_EQ(pattern.edges_.size(), 1UL);
+  ASSERT_EQ(pattern.edges_.front().first, src);
+  ASSERT_EQ(pattern.edges_.front().second, dst);
+  // Public accessors.
+  ASSERT_EQ(pattern.nodes().size(), 2UL);
+  ASSERT_EQ(pattern.edges().size(), 1UL);
+  ASSERT_EQ(pattern.edges().front().first, src);
+  ASSERT_EQ(pattern.edges().front().second, dst);
+}
+// Marks the fixture graph with a three-node pattern (op2 -> var2 -> op3) and
+// checks that every PDNode gets an entry and that one subgraph is detected.
+TEST(GraphPatternDetecter, MarkPDNodesInGraph) {
+  GraphPatternDetecter x;
+  // mark o2, o3, v2
+  // The pattern is a graph:
+  //   o2(a node named o2) -> v2(a node named v2)
+  //   v2 -> o3(a node named o3)
+  auto* o2 = x.pattern_.NewNode([](Node* node) {
+    // The teller can be any condition, such as op type, or variable's shape.
+    return node && node->Name() == "op2" && node->IsOp();
+  });
+  auto* o3 = x.pattern_.NewNode([](Node* node) {
+    // The teller can be any condition, such as op type, or variable's shape.
+    return node && node->Name() == "op3" && node->IsOp();
+  });
+  auto* v2 = x.pattern_.NewNode([](Node* node) {
+    // The teller can be any condition, such as op type, or variable's shape.
+    return node && node->Name() == "var2" && node->IsVar();
+  });
+  // These tellers guard against null, so a null node is simply rejected.
+  ASSERT_FALSE(o2->Tell(nullptr));
+  ASSERT_FALSE(o3->Tell(nullptr));
+  ASSERT_FALSE(v2->Tell(nullptr));
+  x.pattern_.AddEdge(o2, v2);
+  x.pattern_.AddEdge(v2, o3);
+  ASSERT_EQ(x.pattern_.edges().size(), 2UL);
+  ASSERT_EQ(x.pattern_.edges()[0].first, o2);
+  ASSERT_EQ(x.pattern_.edges()[0].second, v2);
+  ASSERT_EQ(x.pattern_.edges()[1].first, v2);
+  ASSERT_EQ(x.pattern_.edges()[1].second, o3);
+  ProgramDesc program;
+  Graph graph(program);
+  BuildGraph(&graph);
+  x.MarkPDNodesInGraph(graph);
+  // One entry per PDNode: o2, o3 and v2.
+  ASSERT_EQ(x.pdnodes2nodes_.size(), 3UL);
+  auto subgraphs = x.DetectPatterns();
+  ASSERT_EQ(subgraphs.size(), 1UL);
+}
+// Runs the full detector on the fixture graph with an op -> var -> op pattern
+// and checks how many non-overlapping matches reach the handler.
+TEST(GraphPatternDetecter, MultiSubgraph) {
+  ProgramDesc program;
+  Graph graph(program);
+  BuildGraph(&graph);
+  GraphPatternDetecter x;
+  // The pattern is a graph:
+  //   op -> var
+  auto* any_op = x.mutable_pattern()->NewNode(
+      [](Node* node) {
+        return node->IsOp() && (node->Name() == "op2" || node->Name() == "op3");
+      },
+      "OP0");
+  auto* any_var = x.mutable_pattern()->NewNode(
+      [](Node* node) { return node->IsVar(); }, "VAR");
+  auto* any_op1 = x.mutable_pattern()->NewNode(
+      [](Node* node) { return node->IsOp(); }, "OP1");
+  x.mutable_pattern()->AddEdge(any_op, any_var);
+  x.mutable_pattern()->AddEdge(any_var, any_op1);
+  // Counts how many matches are handed to the handler.
+  int count = 0;
+  GraphPatternDetecter::handle_t handle = [&](
+      const GraphPatternDetecter::subgraph_t& s, Graph* g) {
+    LOG(INFO) << "Detect " << s.at(any_op)->Name() << " -> "
+              << s.at(any_var)->Name() << " -> " << s.at(any_op1)->Name();
+    count++;
+  };
+  x(&graph, handle);
+  // 1. Detect op3 -> var4 -> op5
+  // 2. Detect op2 -> var2 -> op3
+  // 3. Detect op2 -> var2 -> op4
+  // 4. Detect op2 -> var3 -> op5
+  // But 2 and 3 and 4 overlapped, so keep 2, so the final choices are 1 and 2
+  // `count` is an int, so compare it with int literals to avoid a
+  // signed/unsigned comparison inside the gtest macros.
+  ASSERT_GE(count, 1);
+  ASSERT_LE(count, 2);
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_traits.cc
+++ b/paddle/fluid/framework/ir/graph_traits.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include "paddle/fluid/framework/ir/graph_traits.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+//
+// NodesDFSIterator
+//
+// Seeds the traversal with `source`; the last element of `source` ends up on
+// top of the stack and is therefore yielded first.
+NodesDFSIterator::NodesDFSIterator(const std::vector<Node *> &source) {
+  for (auto *x : source) stack_.push(x);
+}
+NodesDFSIterator::NodesDFSIterator(NodesDFSIterator &&other) noexcept
+    : stack_(std::move(other.stack_)),
+      visited_(std::move(other.visited_)) {}
+NodesDFSIterator::NodesDFSIterator(const NodesDFSIterator &other)
+    : stack_(other.stack_), visited_(other.visited_) {}
+// Returns the node on top of the stack; the iterator must not be at the end.
+Node &NodesDFSIterator::operator*() {
+  PADDLE_ENFORCE(!stack_.empty());
+  return *stack_.top();
+}
+// Marks the current node as visited, replaces it on the stack with its not yet
+// visited outputs and moves on to the next unvisited node.
+NodesDFSIterator &NodesDFSIterator::operator++() {
+  PADDLE_ENFORCE(!stack_.empty(), "the iterator exceeds range");
+  auto *cur = stack_.top();
+  visited_.insert(cur);
+  stack_.pop();
+  for (auto *x : cur->outputs) {
+    if (!visited_.count(x)) {
+      stack_.push(x);
+    }
+  }
+  // A node reachable through several paths may have been pushed more than
+  // once before it was visited; discard such stale entries so that every node
+  // is yielded only once.
+  while (!stack_.empty() && visited_.count(stack_.top())) {
+    stack_.pop();
+  }
+  return *this;
+}
+// Two iterators are equal when both stacks are empty, or when both are
+// non-empty and have the same node on top.
+bool NodesDFSIterator::operator==(const NodesDFSIterator &other) {
+  if (stack_.empty()) return other.stack_.empty();
+  if ((!stack_.empty()) && (!other.stack_.empty())) {
+    return stack_.top() == other.stack_.top();
+  }
+  return false;
+}
+NodesDFSIterator &NodesDFSIterator::operator=(const NodesDFSIterator &other) {
+  stack_ = other.stack_;
+  visited_ = other.visited_;
+  return *this;
+}
+// Same precondition as operator*: calling top() on an empty stack is
+// undefined behavior, so fail loudly instead.
+Node *NodesDFSIterator::operator->() {
+  PADDLE_ENFORCE(!stack_.empty(), "the iterator exceeds range");
+  return stack_.top();
+}
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_traits.h
+++ b/paddle/fluid/framework/ir/graph_traits.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+#include <iterator>
+#include <stack>
+#include <unordered_set>
+#include <vector>
+#include "paddle/fluid/framework/ir/graph.h"
+#include "paddle/fluid/framework/ir/node.h"
+namespace paddle {
+namespace framework {
+namespace ir {
+// A lightweight pair of iterators exposing begin() and end(), so that it can
+// be used directly in a range-based for loop.
+template <typename IteratorT>
+class iterator_range {
+ public:
+  // Builds the range from any container exposing begin() and end().
+  template <typename Container>
+  explicit iterator_range(Container &&c) : begin_(c.begin()), end_(c.end()) {}
+  // Builds the range from an explicit iterator pair.
+  iterator_range(const IteratorT &begin, const IteratorT &end)
+      : begin_(begin), end_(end) {}
+  const IteratorT &begin() const { return begin_; }
+  const IteratorT &end() const { return end_; }
+ private:
+  IteratorT begin_;
+  IteratorT end_;
+};
+// DFS iterator on nodes.
+// The traversal state is a stack of pending nodes plus the set of nodes that
+// have already been stepped over. A default-constructed iterator has an empty
+// stack and is what GraphTraits::DFS uses as the end of the range.
+struct NodesDFSIterator
+    : public std::iterator<std::forward_iterator_tag, Node *> {
+  NodesDFSIterator() = default;
+  // Starts the traversal from the given nodes.
+  explicit NodesDFSIterator(const std::vector<Node *> &source);
+  NodesDFSIterator(NodesDFSIterator &&other) noexcept;
+  NodesDFSIterator(const NodesDFSIterator &other);
+  Node &operator*();
+  NodesDFSIterator &operator++();
+  NodesDFSIterator &operator=(const NodesDFSIterator &other);
+  // TODO(Superjomn) current implementation just compare the first
+  // element, need to compare the graph and all the elements in the queue and
+  // set.
+  bool operator==(const NodesDFSIterator &other);
+  bool operator!=(const NodesDFSIterator &other) { return !(*this == other); }
+  Node *operator->();
+ private:
+  // Nodes waiting to be yielded; the top is the current node.
+  std::stack<Node *> stack_;
+  // Nodes the iterator has already moved past.
+  std::unordered_set<Node *> visited_;
+};
+/*
+ * GraphTraits contains some graph traversal algorithms.
+ *
+ * Usage:
+ *
+ *   for (auto &node : GraphTraits::DFS(graph)) {
+ *     // `node` is an ir::Node&
+ *   }
+ */
+struct GraphTraits {
+  // Returns a range that walks `g` depth-first, starting from the nodes that
+  // have no input. The end of the range is a default-constructed iterator.
+  static iterator_range<NodesDFSIterator> DFS(const Graph &g) {
+    auto start_points = ExtractStartPoints(g);
+    return iterator_range<NodesDFSIterator>(NodesDFSIterator(start_points),
+                                            NodesDFSIterator());
+  }
+ private:
+  // The nodes those have no input will be treated as start points.
+  static std::vector<Node *> ExtractStartPoints(const Graph &g) {
+    std::vector<Node *> result;
+    for (auto *node : g.Nodes()) {
+      if (node->inputs.empty()) {
+        result.push_back(node);
+      }
+    }
+    return result;
+  }
+};
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/ir/node.h
+++ b/paddle/fluid/framework/ir/node.h
@@ -58,6 +58,9 @@ class Node {
    return op_desc_;
  }
+  bool IsOp() const { return type_ == Type::kOperation; }
+  bool IsVar() const { return type_ == Type::kVariable; }
  std::vector<Node*> inputs;
  std::vector<Node*> outputs;

--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -202,6 +202,52 @@ std::vector<std::string> OpDesc::AttrNames() const {
 }
 void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
+  // NOTICE(minqiyang): pybind11 passes an empty Python list to C++ as an
+  // empty std::vector<int>, so an empty INTS value may really belong to an
+  // attribute of another list type. In that case the declared type is looked
+  // up in the op proto and an empty vector of that type is stored instead.
+  // NOTE(review): `which() - 1` assumes the Attribute variant index is offset
+  // by one from proto::AttrType -- confirm against the Attribute typedef.
+  proto::AttrType attr_type = static_cast<proto::AttrType>(v.which() - 1);
+  if (attr_type == proto::AttrType::INTS &&
+      boost::get<std::vector<int>>(v).size() == 0u) {
+    // Find the attribute declaration by name (GetProtoAttr throws when the
+    // op proto has no such attribute) and store an empty value of its type.
+    const proto::OpProto::Attr &attr = GetProtoAttr(name);
+    switch (attr.type()) {
+      case proto::AttrType::BOOLEANS: {
+        VLOG(11) << "SetAttr: " << Type() << ", " << name
+                 << " from INTS to BOOLEANS";
+        this->attrs_[name] = std::vector<bool>();
+        break;
+      }
+      case proto::AttrType::INTS: {
+        VLOG(11) << "SetAttr: " << Type() << ", " << name
+                 << " from INTS to INTS";
+        this->attrs_[name] = std::vector<int>();
+        break;
+      }
+      case proto::AttrType::FLOATS: {
+        VLOG(11) << "SetAttr: " << Type() << ", " << name
+                 << " from INTS to FLOATS";
+        this->attrs_[name] = std::vector<float>();
+        break;
+      }
+      case proto::AttrType::STRINGS: {
+        VLOG(11) << "SetAttr: " << Type() << ", " << name
+                 << " from INTS to STRINGS";
+        this->attrs_[name] = std::vector<std::string>();
+        break;
+      }
+      case proto::AttrType::BLOCKS: {
+        VLOG(11) << "SetAttr: " << Type() << ", " << name
+                 << " from INTS to BLOCKS";
+        // Delegated to SetBlocksAttr; returns without touching need_update_
+        // here. NOTE(review): presumably SetBlocksAttr does its own
+        // bookkeeping -- verify.
+        this->SetBlocksAttr(name, std::vector<BlockDesc *>());
+        return;
+      }
+      default:
+        // Any non-list declared type cannot hold an empty list.
+        PADDLE_THROW("Wrong attr type %d", attr.type());
+    }
+    need_update_ = true;
+    return;
+  }
  this->attrs_[name] = v;
  need_update_ = true;
 }
@@ -229,6 +275,19 @@ Attribute OpDesc::GetAttr(const std::string &name) const {
  return it->second;
 }
+// Returns the attribute declaration called `name` from the OpProto that is
+// registered for this op's type. Throws when the proto declares no attribute
+// with that name.
+const proto::OpProto::Attr &OpDesc::GetProtoAttr(
+    const std::string &name) const {
+  const proto::OpProto &op_proto = OpInfoMap::Instance().Get(Type()).Proto();
+  // Linear scan over the declared attributes; the first name match wins.
+  for (const proto::OpProto::Attr &candidate : op_proto.attrs()) {
+    if (candidate.name() == name) {
+      return candidate;
+    }
+  }
+  PADDLE_THROW("Attribute %s is not found in proto %s", name, op_proto.type());
+}
 Attribute OpDesc::GetNullableAttr(const std::string &name) const {
  auto it = attrs_.find(name);
  if (it != attrs_.end()) {

--- a/paddle/fluid/framework/op_desc.h
+++ b/paddle/fluid/framework/op_desc.h
@@ -81,6 +81,8 @@ class OpDesc {
  Attribute GetAttr(const std::string &name) const;
+  // Returns the OpProto declaration of attribute `name` for this op's type;
+  // throws if the proto has no attribute with that name.
+  const proto::OpProto::Attr &GetProtoAttr(const std::string &name) const;
  Attribute GetNullableAttr(const std::string &name) const;
  int GetBlockAttrId(const std::string &name) const;

--- a/paddle/fluid/operators/prelu_op.cu
+++ b/paddle/fluid/operators/prelu_op.cu
@@ -4,7 +4,7 @@ Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
+http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
@@ -12,11 +12,35 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/operators/prelu_op.h"
+#pragma once
-REGISTER_OP_CUDA_KERNEL(
+#include <pthread.h>
-    prelu,
-    paddle::operators::PReluKernel<paddle::platform::CUDADeviceContext, float>);
+namespace paddle {
-REGISTER_OP_CUDA_KERNEL(prelu_grad,
+namespace framework {
-                        paddle::operators::PReluGradKernel<
-                            paddle::platform::CUDADeviceContext, float>);
+// Thin wrapper around a POSIX pthread_rwlock_t.
+//
+// RDLock()/WRLock() acquire the lock for reading/writing and UNLock()
+// releases it; each of them enforces that the underlying pthread call
+// returned 0. The lock is initialized with default attributes in the
+// constructor and destroyed in the destructor.
+struct RWLock {
+  RWLock() { pthread_rwlock_init(&lock_, nullptr); }
+  ~RWLock() { pthread_rwlock_destroy(&lock_); }
+  // A pthread_rwlock_t must not be copied: a copy would be destroyed a
+  // second time and using it is undefined, so copying is disabled.
+  RWLock(const RWLock&) = delete;
+  RWLock& operator=(const RWLock&) = delete;
+  // Acquires the lock for shared (read) access.
+  void RDLock() {
+    PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0,
+                      "acquire read lock failed");
+  }
+  // Acquires the lock for exclusive (write) access.
+  void WRLock() {
+    PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0,
+                      "acquire write lock failed");
+  }
+  // Releases a lock previously taken with RDLock() or WRLock().
+  void UNLock() {
+    PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
+  }
+ private:
+  pthread_rwlock_t lock_;
+};
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/framework/selected_rows.cc
+++ b/paddle/fluid/framework/selected_rows.cc
@@ -120,66 +120,76 @@ bool SelectedRows::HasKey(int64_t key) const {
                                                                   : true;
 }
-std::vector<std::pair<int64_t, int64_t>> SelectedRows::Get(
+// Returns the index of `key` recorded in id_to_index_.
+//
+// If the key is missing and `auto_grown` is false, throws. If it is missing
+// and `auto_grown` is true, appends the key to rows_, records its index in
+// id_to_index_ and returns that new index. Throws when id_to_index_ and
+// rows_ have different sizes, or when rows_ already has as many entries as
+// the first dimension of value_. The lock is always released before
+// throwing or returning.
+int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown) {
-    const std::vector<int64_t>& keys, framework::Tensor* value) const {
+  // Fast path: look the key up under the read lock.
+  rwlock_->RDLock();
+  auto iter = id_to_index_.find(key);
+  if (iter != id_to_index_.end()) {
+    auto index = iter->second;
+    rwlock_->UNLock();
+    return index;
+  }
+  rwlock_->UNLock();
+  if (!auto_grown) {
+    PADDLE_THROW("key %d not found", key);
+  }
+  // Slow path: take the write lock. The read lock was released in between,
+  // so another thread may have inserted the key meanwhile; look it up again.
+  rwlock_->WRLock();
+  auto map_size = id_to_index_.size();
+  auto vector_size = rows_.size();
+  if (map_size != vector_size) {
+    rwlock_->UNLock();
+    PADDLE_THROW(
+        "id_to_index_ size %d should have the same size with rows_ %d",
+        map_size, vector_size);
+  }
+  auto write_iter = id_to_index_.find(key);
+  if (write_iter != id_to_index_.end()) {
+    auto index = write_iter->second;
+    rwlock_->UNLock();
+    return index;
+  }
+  size_t row_num = rows_.size();
+  // Compare as signed values and with >= so that a rows_ that is already
+  // larger than the value tensor is rejected too, instead of handing out an
+  // index past the end of value_.
+  if (static_cast<int64_t>(row_num) >= value_->dims()[0]) {
+    rwlock_->UNLock();
+    PADDLE_THROW("selected rows is full, then length exceed %d", row_num);
+  }
+  // key logic to put a key into id_to_index_
+  rows_.push_back(key);
+  auto index = static_cast<int64_t>(rows_.size() - 1);
+  id_to_index_[key] = index;
+  rwlock_->UNLock();
+  return index;
+}
+// Rebuilds id_to_index_ from scratch so that it maps every entry of rows_
+// to its position; runs entirely under the write lock. When rows_ holds the
+// same id more than once, the last position is the one that is kept.
+void SelectedRows::SyncIndex() {
+  rwlock_->WRLock();
+  id_to_index_.clear();
+  const size_t row_count = rows_.size();
+  for (size_t pos = 0; pos != row_count; ++pos) {
+    id_to_index_[rows_[pos]] = pos;
+  }
+  rwlock_->UNLock();
+}
+// Copies the table rows selected by `ids` into `value`.
+//
+// For the i-th id, the row index is obtained from AutoGrownIndex(id,
+// auto_grown) and value_width elements are copied from that row of value_
+// to row i of `value`. `value` must already be initialized and have the
+// same per-row width as value_. An empty `ids` only logs a message.
+// NOTE(review): ids is read through data<int64_t>(), so it is expected to
+// hold int64 ids -- confirm with callers.
+void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
+                       bool auto_grown) {
  PADDLE_ENFORCE(value->IsInitialized(),
                 "The value tensor should be initialized.");
-  std::vector<std::pair<int64_t, int64_t>> non_keys_pair;
+  if (ids.numel() == 0) {
-  if (keys.empty()) {
    VLOG(3) << "keys is empty, please check data!";
  } else {
    int64_t value_width = value_->numel() / value_->dims()[0];
    PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
                      "output tensor should have the same shape with table "
                      "except the dims[0].");
+    // AutoGrownIndex throws for an unknown id unless auto_grown is true, in
+    // which case the id is added to the table first.
+    for (size_t i = 0; i < ids.numel(); ++i) {
-    for (size_t i = 0; i < keys.size(); ++i) {
+      int64_t index = AutoGrownIndex(ids.data<int64_t>()[i], auto_grown);
-      int64_t index = Index(keys[i]);
+      framework::VisitDataType(
-      if (index == -1) {
+          framework::ToDataType(value_->type()),
-        non_keys_pair.push_back(
+          TensorCopyVisitor(value, i * value_width, *value_.get(),
-            std::make_pair(keys[i], static_cast<int64_t>(i)));
+                            index * value_width, value_width));
-      } else {
-        framework::VisitDataType(
-            framework::ToDataType(value_->type()),
-            TensorCopyVisitor(value, i * value_width, *value_.get(),
-                              index * value_width, value_width));
-      }
    }
  }
-  return non_keys_pair;
-}
-bool SelectedRows::Set(int64_t key, const framework::Tensor& value) {
-  PADDLE_ENFORCE(value.IsInitialized(), "The value should be initialized.");
-  if (value_->IsInitialized()) {
-    PADDLE_ENFORCE_EQ(
-        value.type(), value_->type(),
-        "The type of the value should be same with the original value");
-  }
-  PADDLE_ENFORCE_EQ(value.dims()[0], static_cast<size_t>(1),
-                    "The first dim of value should be 1.");
-  std::lock_guard<std::mutex> lock(*auto_grown_mutex_.get());
-  auto index = Index(key);
-  bool is_new_key = false;
-  if (index == -1) {
-    rows_.push_back(key);
-    index = rows_.size() - 1;
-    is_new_key = true;
-    // whether need to resize the table
-    if (static_cast<int64_t>(rows_.size()) > value_->dims()[0]) {
-      auto dims = value_->dims();
-      dims[0] = (dims[0] + 1) << 1;
-      framework::VisitDataType(framework::ToDataType(value.type()),
-                               ReAllocateVisitor(dims, value_.get()));
-    }
-  }
-  framework::VisitDataType(
-      framework::ToDataType(value.type()),
-      TensorCopyVisitor(value_.get(),
-                        index * value_->numel() / value_->dims()[0], value,
-                        static_cast<int64_t>(0), value.numel()));
-  return is_new_key;
 }
 }  // namespace framework

--- a/paddle/fluid/framework/selected_rows.h
+++ b/paddle/fluid/framework/selected_rows.h
@@ -17,10 +17,12 @@ limitations under the License. */
 #include <algorithm>
 #include <memory>
 #include <mutex>  // NOLINT
+#include <unordered_map>
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/lod_tensor.h"
+#include "paddle/fluid/framework/rw_lock.h"
 #include "paddle/fluid/framework/tensor.h"
 #include "paddle/fluid/memory/memcpy.h"
@@ -48,13 +50,13 @@ class SelectedRows {
  SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
      : rows_(rows), height_(height) {
+    // NOTE: id_to_index_ is left empty here even though rows_ is filled.
+    // AutoGrownIndex() rejects a table whose id_to_index_ and rows_ sizes
+    // differ, so SyncIndex() has to be called before growing such a table.
    value_.reset(new Tensor());
-    auto_grown_mutex_.reset(new std::mutex);
+    rwlock_.reset(new RWLock);
  }
  SelectedRows() {
+    // Empty table: height 0, a fresh value tensor and a fresh lock.
    height_ = 0;
    value_.reset(new Tensor());
-    auto_grown_mutex_.reset(new std::mutex);
+    rwlock_.reset(new RWLock);
  }
  platform::Place place() const { return value_->place(); }
@@ -74,47 +76,51 @@ class SelectedRows {
  void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
  /*
-   * @brief wheter has the specified key in the table.
+   * @brief Get the index of key in rows
+   *
+   * @return the index of the key; throws if the key is not in rows_.
+   */
+  int64_t Index(int64_t key) const {
+    // Linear search over rows_; a missing key is an error, not a sentinel.
+    const auto first = rows_.begin();
+    const auto last = rows_.end();
+    const auto pos = std::find(first, last, key);
+    if (pos == last) {
+      PADDLE_THROW("id %s not in table", key);
+    }
+    return static_cast<int64_t>(std::distance(first, pos));
+  }
+  /*
+   * @brief whether has the specified key in the table.
   *
   * @return true if the key is exists.
   */
  bool HasKey(int64_t key) const;
  /*
-   * @brief Get value by the key list, if the
+   * @brief Get value by the key list.
+   * Note!!! this interface is only used when selected_rows is used as
+   * parameters
+   * for distribute lookup table.
   *
   * @return a list of pair which contains the non-exists key and the index in
   * the value
   */
-  std::vector<std::pair<int64_t, int64_t>> Get(const std::vector<int64_t>& keys,
+  void Get(const framework::Tensor& ids, framework::Tensor* value,
-                                               framework::Tensor* value) const;
+           bool auto_grown = false);
  /*
-   * @brief Set a key-value pair into the table.
+   * @brief Get the index of the key from id_to_index_ map. If the key not
-   *  This function will double the value memory if it's not engouth.
+   * exist,
+   * add the key into id_to_index_.
   *
-   * @note:
+   * Note!!! this interface is only used when selected_rows is used as
-   *    1. The first dim of the value should be 1
+   * parameters
-   *    2. The value should be initialized and the data type
+   * for distribute lookup table.
-   *       should be the same with the table.
-   *
-   * @return true if the key is a new one, otherwise false
   *
+   * @return index of the key.
   */
-  bool Set(int64_t key, const Tensor& value);
+  int64_t AutoGrownIndex(int64_t key, bool auto_grown);
-  /*
+  // Clears id_to_index_ and rebuilds it from rows_ (id -> position in rows_)
+  // while holding the write lock.
+  void SyncIndex();
-   * @brief Get the index of key in rows
-   *
-   * @return -1 if the key does not exists.
-   */
-  int64_t Index(int64_t key) const {
-    auto it = std::find(rows_.begin(), rows_.end(), key);
-    if (it == rows_.end()) {
-      return static_cast<int64_t>(-1);
-    }
-    return static_cast<int64_t>(std::distance(rows_.begin(), it));
-  }
  DDim GetCompleteDims() const {
    std::vector<int64_t> dims = vectorize(value_->dims());
@@ -127,9 +133,10 @@ class SelectedRows {
  // SelectedRows are simply concated when adding together. Until a
  // SelectedRows add a Tensor, will the duplicate rows be handled.
  Vector<int64_t> rows_;
+  std::unordered_map<int64_t, int64_t> id_to_index_;
  std::unique_ptr<Tensor> value_{nullptr};
  int64_t height_;
-  std::unique_ptr<std::mutex> auto_grown_mutex_{nullptr};
+  std::unique_ptr<RWLock> rwlock_{nullptr};
 };
 /*

--- a/paddle/fluid/framework/selected_rows_test.cc
+++ b/paddle/fluid/framework/selected_rows_test.cc
@@ -9,8 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
-#include "paddle/fluid/framework/selected_rows.h"
+#include <time.h>
+#include <thread>  // NOLINT
 #include "gtest/gtest.h"
+#include "paddle/fluid/framework/selected_rows.h"
 namespace paddle {
 namespace framework {
@@ -59,39 +62,129 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
  ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
 }
-TEST_F(SelectedRowsTester, SparseTable) {
+// Checks that AutoGrownIndex hands out consecutive indices for new keys and
+// that Get copies the table row belonging to each requested id.
+TEST(SelectedRows, SparseTable) {
  platform::CPUPlace cpu;
  SelectedRows table;
+  int64_t table_size = 100;
+  int64_t embedding_width = 8;
  // initialize a sparse table
-  table.mutable_value()->Resize(framework::make_ddim({1, 100}));
+  table.mutable_value()->Resize(
-  table.mutable_value()->mutable_data<float>(cpu);
+      framework::make_ddim({table_size, embedding_width}));
-  table.mutable_rows()->push_back(1);
+  auto* data = table.mutable_value()->mutable_data<float>(cpu);
+  // Every element of row i holds the value i, so a copied row identifies the
+  // table index it came from.
+  for (int64_t i = 0; i < table_size; ++i) {
+    for (int64_t j = 0; j < embedding_width; ++j) {
+      data[i * embedding_width + j] = static_cast<float>(i);
+    }
+  }
+  // New keys get indices 0, 1, 2 in insertion order; a repeated key keeps
+  // the index it already has.
+  ASSERT_EQ(table.AutoGrownIndex(10, true), 0);
+  ASSERT_EQ(table.AutoGrownIndex(8, true), 1);
+  ASSERT_EQ(table.AutoGrownIndex(8, true), 1);
+  ASSERT_EQ(table.AutoGrownIndex(6, true), 2);
+  ASSERT_TRUE(table.HasKey(10));
+  ASSERT_TRUE(table.HasKey(8));
+  ASSERT_TRUE(table.HasKey(6));
+  ASSERT_EQ(table.rows().size(), 3);
+  framework::Tensor ids;
+  ids.Resize(framework::make_ddim({4}));
+  auto* ids_data = ids.mutable_data<int64_t>(cpu);
+  ids_data[0] = static_cast<int64_t>(6);
+  ids_data[1] = static_cast<int64_t>(6);
+  ids_data[2] = static_cast<int64_t>(8);
+  ids_data[3] = static_cast<int64_t>(10);
-  int64_t key = 10000;
+  framework::Tensor get_value;
-  int64_t non_key = 999;
+  auto* value_data = get_value.mutable_data<float>(
-  framework::Tensor value;
+      framework::make_ddim({4, embedding_width}), cpu);
-  value.Resize(framework::make_ddim({1, 100}));
+  table.Get(ids, &get_value);
-  auto ptr = value.mutable_data<float>(cpu);
-  ptr[0] = static_cast<float>(10);
-  ASSERT_EQ(table.rows().size(), static_cast<size_t>(1));
+  // ids 6, 6, 8, 10 live at table indices 2, 2, 1, 0, so the output rows
+  // must be filled with 2, 2, 1 and 0 respectively.
+  for (int j = 0; j < embedding_width; ++j) {
-  ASSERT_EQ(table.HasKey(key), false);
+    ASSERT_EQ(value_data[0 * embedding_width + j], 2);
+  }
+  for (int j = 0; j < embedding_width; ++j) {
+    ASSERT_EQ(value_data[1 * embedding_width + j], 2);
+  }
+  for (int j = 0; j < embedding_width; ++j) {
+    ASSERT_EQ(value_data[2 * embedding_width + j], 1);
+  }
+  for (int j = 0; j < embedding_width; ++j) {
+    ASSERT_EQ(value_data[3 * embedding_width + j], 0);
+  }
+}
-  table.Set(key, value);
+// Thread body: counts down from 1000000 and, for each key, checks that
+// growing, plain lookup and growing again all report the same index.
+void f1(SelectedRows* table, int table_size) {
+  int remaining = 1000000;
+  while (remaining > 0) {
+    const auto key = remaining % table_size;
+    const int64_t grown_first = table->AutoGrownIndex(key, true);
+    const int64_t looked_up = table->AutoGrownIndex(key, false);
+    const int64_t grown_again = table->AutoGrownIndex(key, true);
+    ASSERT_EQ(grown_first, looked_up);
+    ASSERT_EQ(looked_up, grown_again);
+    --remaining;
+  }
+}
-  ASSERT_EQ(table.rows().size(), static_cast<size_t>(2));
+// Thread body: same checks as f1, but walks the keys in ascending order
+// (i = 0 .. 999999) instead of descending.
+void f2(SelectedRows* table, int table_size) {
-  ASSERT_EQ(table.HasKey(key), true);
+  for (int i = 0; i < 1000000; ++i) {
-  // check re-allocate
+    auto id = i % table_size;
-  ASSERT_EQ(table.value().dims()[0], static_cast<int64_t>(4));
+    // Grow, look up without growing, grow again: all must agree.
+    int64_t index1 = table->AutoGrownIndex(id, true);
+    int64_t index2 = table->AutoGrownIndex(id, false);
+    int64_t index3 = table->AutoGrownIndex(id, true);
+    ASSERT_EQ(index1, index2);
+    ASSERT_EQ(index2, index3);
+  }
+}
-  framework::Tensor get_value;
+// Thread body: for keys taken in descending order, checks that the index
+// from AutoGrownIndex matches the linear-search result of Index, then
+// prints the elapsed clock() ticks (processor time, not wall-clock time).
+void f3(SelectedRows* table, int table_size) {
-  get_value.mutable_data<float>(framework::make_ddim({2, 100}), cpu);
+  clock_t t1 = clock();
-  std::vector<int64_t> keys({non_key, key});
+  for (int i = 100000; i > 0; --i) {
-  auto non_key_pairs = table.Get(keys, &get_value);
+    auto id1 = table->AutoGrownIndex(i % table_size, true);
+    auto id2 = table->Index(i % table_size);
+    ASSERT_EQ(id1, id2);
+  }
+  clock_t t2 = clock();
+  std::cout << "f3 run time:" << t2 - t1 << std::endl;
+}
+// Thread body: for keys taken in ascending order, checks that the index
+// from AutoGrownIndex matches the linear-search result of Index, then
+// prints the elapsed clock() ticks.
+void f4(SelectedRows* table, int table_size) {
+  const clock_t started = clock();
+  for (int step = 0; step != 100000; ++step) {
+    const int key = step % table_size;
+    auto grown = table->AutoGrownIndex(key, true);
+    auto found = table->Index(key);
+    ASSERT_EQ(grown, found);
+  }
+  const clock_t finished = clock();
+  std::cout << "f4 run time:" << finished - started << std::endl;
+}
+// Runs AutoGrownIndex concurrently from several threads on one shared
+// table, then cross-checks AutoGrownIndex against Index from two threads.
+TEST(SelectedRows, MultiThreadAutoIndex) {
+  platform::CPUPlace cpu;
+  SelectedRows table;
+  int64_t table_size = 100000;
+  int64_t embedding_width = 8;
+  // initialize a sparse table
+  table.mutable_value()->Resize(
+      framework::make_ddim({table_size, embedding_width}));
+  auto* data = table.mutable_value()->mutable_data<float>(cpu);
+  for (int64_t i = 0; i < table_size; ++i) {
+    for (int64_t j = 0; j < embedding_width; ++j) {
+      data[i * embedding_width + j] = static_cast<float>(i);
+    }
+  }
+  // Phase 1: four threads insert and look up the same key range at once,
+  // two walking it downwards (f1) and two upwards (f2).
-  ASSERT_EQ(get_value.data<float>()[100], static_cast<float>(10));
+  std::thread t1(f1, &table, table_size);
-  ASSERT_EQ(non_key_pairs.size(), static_cast<size_t>(1));
+  std::thread t11(f1, &table, table_size);
-  ASSERT_EQ(non_key_pairs[0].first, non_key);
+  std::thread t2(f2, &table, table_size);
+  std::thread t22(f2, &table, table_size);
+  t1.join();
+  t11.join();
+  t2.join();
+  t22.join();
+  // Phase 2: started only after phase 1 has finished; compares the map
+  // lookup with the linear search in Index.
+  std::thread t3(f3, &table, table_size);
+  std::thread t4(f4, &table, table_size);
+  t3.join();
+  t4.join();
 }
 }  // namespace framework

--- a/paddle/fluid/framework/threadpool.cc
+++ b/paddle/fluid/framework/threadpool.cc
@@ -20,6 +20,9 @@
 DEFINE_int32(io_threadpool_size, 100,
             "number of threads used for doing IO, default 100");
+// When set to a value > 0, ThreadPool::Init uses it as the number of
+// threads instead of std::thread::hardware_concurrency(); the default 0
+// leaves the hardware-based value in place.
+DEFINE_int32(dist_threadpool_size, 0,
+             "number of threads used for distributed executed.");
 namespace paddle {
 namespace framework {
@@ -35,6 +38,10 @@ void ThreadPool::Init() {
  if (threadpool_.get() == nullptr) {
    // TODO(Yancey1989): specify the max threads number
    int num_threads = std::thread::hardware_concurrency();
+    if (FLAGS_dist_threadpool_size > 0) {
+      num_threads = FLAGS_dist_threadpool_size;
+      VLOG(1) << "set dist_threadpool_size to " << num_threads;
+    }
    PADDLE_ENFORCE_GT(num_threads, 0);
    threadpool_.reset(new ThreadPool(num_threads));
  }

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -60,7 +60,7 @@ cc_library(paddle_inference_tensorrt_subgraph_engine
 inference_api_test(test_api_tensorrt_subgraph_engine SRC api_tensorrt_subgraph_engine_tester.cc ARGS test_word2vec)
 endif()
-if (WITH_ANAKIN) # only needed in CI
+if (WITH_ANAKIN AND WITH_GPU) # only needed in CI
    # compile the libinference_anakin_api.a and anakin.so.
    nv_library(inference_anakin_api SRCS api.cc api_anakin_engine.cc DEPS anakin_shared anakin_saber)
    #nv_library(inference_anakin_api_shared SHARED SRCS api.cc api_anakin_engine.cc DEPS anakin)

--- a/paddle/fluid/operators/CMakeLists.txt
+++ b/paddle/fluid/operators/CMakeLists.txt
@@ -171,6 +171,9 @@ function(op_library TARGET)
        file(APPEND ${pybind_file} "USE_OP(fake_dequantize_max_abs);\n")
      elseif(${TARGET} STREQUAL "tensorrt_engine_op")
          message(STATUS "Pybind skips [tensorrt_engine_op], for this OP is only used in inference")
+      elseif(${TARGET} STREQUAL "fc")
+        # HACK: fc only has mkldnn and cpu kernels, which does not match the cpu-only condition, so register it with USE_CPU_ONLY_OP explicitly
+        file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(${TARGET});\n")
      else()
        file(APPEND ${pybind_file} "USE_OP(${TARGET});\n")
      endif()
@@ -302,12 +305,6 @@ op_library(channel_recv_op DEPS concurrency)
 list(REMOVE_ITEM GENERAL_OPS ${DEPS_OPS})
-# The fully connected layer is deleted when the WITH_MKLDNN flag is OFF
-# Because the fully connected layer has only one MKLDNN's operator
-if(NOT WITH_MKLDNN)
-    list(REMOVE_ITEM GENERAL_OPS fc_op)
-endif(NOT WITH_MKLDNN)
 foreach(src ${GENERAL_OPS})
    op_library(${src})
 endforeach()

--- a/paddle/fluid/operators/distributed/rpc_server_test.cc
+++ b/paddle/fluid/operators/distributed/rpc_server_test.cc
@@ -78,10 +78,9 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
                         int64_t rows_numel) {
  CreateVarsOnScope(scope, place);
  auto w = scope->Var("w")->GetMutable<framework::SelectedRows>();
-  auto rows = w->mutable_rows();
-  for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i);
  auto w_value = w->mutable_value();
  w_value->Resize({rows_numel, 10});
+  for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true);
  auto ptr = w_value->mutable_data<float>(*place);

--- a/paddle/fluid/operators/distributed/variable_response.cc
+++ b/paddle/fluid/operators/distributed/variable_response.cc
@@ -190,12 +190,15 @@ bool VariableResponse::ProcSerializedField(
 #endif
  }
+  VLOG(7) << "ProcSerializedField:" << meta_.varname()
+          << ", type:" << meta_.type() << std::endl;
  framework::DDim dims = GetDims(meta_.dims());
  if (meta_.type() == sendrecv::LOD_TENSOR) {
    PADDLE_ENFORCE(meta_.lod_size() >= 0, "lod info should be got first!");
    if (!CopyLodTensorData(input, *dev_ctx_, dims, num_bytes)) {
      return false;
    }
    return true;
  }
@@ -206,7 +209,9 @@ bool VariableResponse::ProcSerializedField(
    return true;
  }
-  return true;
+  // None of the supported variable types matched. PADDLE_ENFORCE with a
+  // string literal as its condition can never fail, so it reported nothing;
+  // log the problem explicitly and signal failure to the caller.
+  LOG(ERROR) << "not supported var types: " << meta_.varname() << ", "
+             << meta_.type();
+  return false;
 }
 };  // namespace distributed

--- a/paddle/fluid/operators/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/fc_mkldnn_op.cc
@@ -125,13 +125,16 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    auto input = ctx.Input<Tensor>("Input");
    auto w = ctx.Input<Tensor>("W");
+    auto bias = ctx.Input<Tensor>("Bias");
    PADDLE_ENFORCE(input->dims().size() == 2 || input->dims().size() == 4,
                   "Input must be with 2 or 4 dimensions, i.e. NCHW");
+    // TODO(intel friends): the native weight format is io,
+    // but the mkldnn weight format is oihw, so it may need to be transposed.
    PADDLE_ENFORCE(w->dims().size() == 2 || w->dims().size() == 4,
                   "Weights must be with 2 or 4 dimensions, i.e. OI or OIHW");
-    bool with_bias = ctx.Attr<bool>("bias_attr");
+    bool with_bias = bias != nullptr;
    MKLDNNMD<Tensor> md(input, w, with_bias);
    std::shared_ptr<mkldnn::inner_product_forward::primitive_desc> pd =
@@ -154,6 +157,7 @@ class FCMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
    auto dst_memory = mem.dst(output_data);
    auto src_memory = mem.src(input_data);
    auto weights_memory = mem.weights(w_data);
+    // TODO(intel friends): bias memory should also be obtained from bias->data()
    auto bias_memory = mem.bias();
    auto forward = with_bias ? mkldnn::inner_product_forward(
@@ -216,7 +220,8 @@ class FCMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
    const Tensor* out_grad = ctx.Input<Tensor>(framework::GradVarName("Out"));
    const T* out_grad_data = out_grad->data<T>();
-    bool with_bias = ctx.Attr<bool>("bias_attr");
+    auto bias = ctx.Input<Tensor>("Bias");
+    bool with_bias = bias != nullptr;
    MKLDNNMD<Tensor> md(input, w, with_bias);
    MKLDNNMemory mem(&md, mkldnn_engine);

--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -14,6 +14,9 @@ limitations under the License. */
 #include "paddle/fluid/operators/fc_op.h"
 #include <vector>
+#include "paddle/fluid/operators/math/blas.h"
+DECLARE_int32(paddle_num_threads);
 namespace paddle {
 namespace operators {
@@ -25,16 +28,24 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
                 "Out(Output) of Fully Connected should not be null.");
  PADDLE_ENFORCE(ctx->HasInput("W"),
                 "W(Input) of Fully Connected should not be null.");
+  // NCHW
  auto in_dims = ctx->GetInputDim("Input");
+  // IO, I=C*H*W
  auto w_dims = ctx->GetInputDim("W");
  std::vector<int64_t> output_shape({in_dims[0], w_dims[1]});
+  // Bias is optional; when present it must be a 2-D tensor of shape
+  // [1, w_dims[1]]. Check the rank first so that indexing bias_dims[1] below
+  // cannot go out of range for a 1-D bias.
+  if (ctx->HasInput("Bias")) {
+    auto bias_dims = ctx->GetInputDim("Bias");
+    PADDLE_ENFORCE_EQ(bias_dims.size(), 2,
+                      "The shape of Bias must be [1, dim].");
+    PADDLE_ENFORCE_EQ(bias_dims[0], 1, "The shape of Bias must be [1, dim].");
+    PADDLE_ENFORCE_EQ(bias_dims[1], w_dims[1],
+                      "The shape of Bias must be [1, dim].");
+  }
  PADDLE_ENFORCE(in_dims.size() == 2 || in_dims.size() == 4,
                 "Fully Connected input should be 2-D or 4-D tensor.");
+  // The weight is always 2-D; report it as the weight, not the input.
+  PADDLE_ENFORCE_EQ(w_dims.size(), 2UL,
-  PADDLE_ENFORCE(w_dims.size() == 2 || w_dims.size() == 4,
+                    "Fully Connected weight should be 2-D tensor.");
-                 "Fully Connected input should be 2-D or 4-D tensor.");
+  PADDLE_ENFORCE_EQ(framework::product(in_dims) / in_dims[0], w_dims[0],
+                    "Fully Connected input and weigth size do not match.");
  ctx->SetOutputDim("Out", framework::make_ddim(output_shape));
  ctx->ShareLoD("Input", "Out");
@@ -42,9 +53,12 @@ void FCOp::InferShape(framework::InferShapeContext* ctx) const {
 framework::OpKernelType FCOp::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
-  framework::LibraryType library{framework::LibraryType::kMKLDNN};
+  framework::LibraryType library = framework::LibraryType::kPlain;
-  framework::DataLayout layout{framework::DataLayout::kMKLDNN};
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+  if (ctx.Attr<bool>("use_mkldnn")) {
+    library = framework::LibraryType::kMKLDNN;
+    layout = framework::DataLayout::kMKLDNN;
+  }
  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
      layout, library);
@@ -60,27 +74,39 @@ void FCOpGrad::InferShape(framework::InferShapeContext* ctx) const {
  if (ctx->HasOutput(framework::GradVarName("W"))) {
    ctx->SetOutputDim(framework::GradVarName("W"), w_dims);
  }
+  if (ctx->HasInput("Bias")) {
+    PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("Bias")),
+                   "Should have bias grad");
+    auto bias_dims = ctx->GetInputDim("Bias");
+    ctx->SetOutputDim(framework::GradVarName("Bias"), bias_dims);
+  }
 }
 framework::OpKernelType FCOpGrad::GetExpectedKernelType(
    const framework::ExecutionContext& ctx) const {
+  // Default to the plain kernel with any layout; switch both the library and
+  // the layout to MKLDNN only when the "use_mkldnn" attribute is true. The
+  // data type is taken from the "Input" tensor.
-  framework::LibraryType library{framework::LibraryType::kMKLDNN};
+  framework::LibraryType library = framework::LibraryType::kPlain;
-  framework::DataLayout layout{framework::DataLayout::kMKLDNN};
+  framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+  if (ctx.Attr<bool>("use_mkldnn")) {
+    library = framework::LibraryType::kMKLDNN;
+    layout = framework::DataLayout::kMKLDNN;
+  }
  return framework::OpKernelType(
      framework::ToDataType(ctx.Input<Tensor>("Input")->type()), ctx.GetPlace(),
      layout, library);
 }
 void FCOpMaker::Make() {
-  AddInput("Input", "(Tensor) The input tensor of fully connected operator. ");
+  AddInput("Input",
-  AddInput("W", "(Tensor), The second input tensor of fc op.");
+           "(Tensor), The input tensor of fully connected operator with format "
+           "(NCHW). ");
+  AddInput("W", "(Tensor), The weight fc op with shape (I, O).");
+  // Bias is optional (dispensable); the description was missing its closing
+  // parenthesis.
+  AddInput("Bias", "(Tensor, optional) Bias vector with shape (1 x O)")
+      .AsDispensable();
  AddOutput("Out", "(Tensor) The output tensor of fully connected operator. ");
  AddAttr<bool>("use_mkldnn",
                "(bool, default false) Only used in mkldnn kernel")
      .SetDefault(false);
-  AddAttr<bool>("bias_attr", "(bool, default false) Only used in mkldnn kernel")
-      .SetDefault(false);
  AddComment(R"DOC(
  Fully Connected Operator.
@@ -94,9 +120,47 @@ void FCOpMaker::Make() {
 )DOC");
 }
+// Plain CPU kernel of the fc op: computes Out = Input * W with one GEMM and,
+// when the optional Bias input is present, adds it to every output row.
+template <typename T>
+class FCOpKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
+    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
+                   "It must use CPUPlace.");
+    auto* x = ctx.Input<Tensor>("Input");
+    auto* weight = ctx.Input<Tensor>("W");
+    auto* bias_tensor = ctx.Input<Tensor>("Bias");
+    auto* out = ctx.Output<Tensor>("Out");
+    // GEMM shape: (rows x inner) * (inner x cols) -> (rows x cols).
+    const auto x_dims = x->dims();
+    const auto weight_dims = weight->dims();
+    const auto rows = x_dims[0];
+    const auto inner = weight_dims[0];
+    const auto cols = weight_dims[1];
+    auto& cpu_ctx = ctx.template device_context<platform::CPUDeviceContext>();
+    auto blas = math::GetBlas<platform::CPUDeviceContext, T>(cpu_ctx);
+    const T* x_data = x->data<T>();
+    const T* weight_data = weight->data<T>();
+    T* out_data = out->mutable_data<T>(ctx.GetPlace());
+    const T one = static_cast<T>(1);
+    const T zero = static_cast<T>(0);
+    blas.GEMM(CblasNoTrans, CblasNoTrans, rows, cols, inner, one, x_data,
+              weight_data, zero, out_data);
+    if (bias_tensor) {
+      const T* bias_data = bias_tensor->data<T>();
+      // Each output row receives the same bias vector; rows are independent,
+      // so the loop may run in parallel when MKLML is available.
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for if (FLAGS_paddle_num_threads > 1)
+#endif
+      for (int row = 0; row < rows; row++) {
+        blas.AXPY(cols, static_cast<T>(1), bias_data, out_data + row * cols);
+      }
+    }
+  }
+};
 }  // namespace operators
 }  // namespace paddle
-REGISTER_OPERATOR(fc, paddle::operators::FCOp, paddle::operators::FCOpMaker,
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(fc, ops::FCOp, ops::FCOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
-REGISTER_OPERATOR(fc_grad, paddle::operators::FCOpGrad);
+REGISTER_OPERATOR(fc_grad, ops::FCOpGrad);
+REGISTER_OP_CPU_KERNEL(fc, ops::FCOpKernel<float>, ops::FCOpKernel<double>);
--- a/paddle/fluid/operators/gru_op.cc
+++ b/paddle/fluid/operators/gru_op.cc
@@ -14,6 +14,11 @@ limitations under the License. */
 #include "paddle/fluid/operators/gru_op.h"
 #include <string>
+#include "paddle/fluid/operators/math/blas.h"
+#include "paddle/fluid/operators/math/detail/gru_cpu_kernel.h"
+#include "paddle/fluid/operators/math/detail/gru_kernel.h"
+DECLARE_int32(paddle_num_threads);
 namespace paddle {
 namespace operators {
@@ -211,6 +216,158 @@ class GRUGradOp : public framework::OperatorWithKernel {
  }
 };
+// CPU forward kernel of the GRU operator.
+//
+// BatchCompute reorders the LoD input into time-step batches (BatchGate),
+// optionally adds Bias, advances the GRU unit one batch at a time while
+// chaining the previous hidden state, and finally converts BatchHidden back
+// to the sequence layout stored in Hidden.
+//
+// With PADDLE_WITH_MKLML and FLAGS_paddle_num_threads >= 4 the gate and state
+// weights are packed once and reused by GEMM_COMPUTE for every step;
+// otherwise each step goes through math::GRUUnitFunctor.
+template <typename T>
+class GRUCPUKernel : public framework::OpKernel<T> {
+ public:
+  void BatchCompute(const framework::ExecutionContext& context) const {
+    using DeviceContext = paddle::platform::CPUDeviceContext;
+    auto* input = context.Input<LoDTensor>("Input");
+    auto* h0 = context.Input<Tensor>("H0");
+    auto* weight = context.Input<Tensor>("Weight");
+    const T* weight_data = weight->data<T>();
+    auto* bias = context.Input<Tensor>("Bias");
+    auto* batch_gate = context.Output<LoDTensor>("BatchGate");
+    batch_gate->mutable_data<T>(context.GetPlace());
+    auto* batch_reset_hidden_prev =
+        context.Output<LoDTensor>("BatchResetHiddenPrev");
+    batch_reset_hidden_prev->mutable_data<T>(context.GetPlace());
+    auto* batch_hidden = context.Output<LoDTensor>("BatchHidden");
+    batch_hidden->mutable_data<T>(context.GetPlace());
+    auto* hidden = context.Output<LoDTensor>("Hidden");
+    hidden->mutable_data<T>(context.GetPlace());
+    auto hidden_dims = hidden->dims();
+    bool is_reverse = context.Attr<bool>("is_reverse");
+    // Rearrange the sequence-major input into batch_gate; the LoD it sets on
+    // batch_gate is read below (levels 0 and 2).
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    to_batch(dev_ctx, *input, batch_gate, true, is_reverse);
+    if (bias) {
+      math::RowwiseAdd<DeviceContext, T> add_bias;
+      add_bias(dev_ctx, *batch_gate, *bias, batch_gate);
+    }
+    int frame_size = hidden_dims[1];
+    math::GRUMetaValue<T> gru_value;
+    // Weight holds two consecutive blocks: the gate weight occupies the first
+    // 2 * frame_size * frame_size elements, the state weight follows it.
+    gru_value.gate_weight = const_cast<T*>(weight_data);
+    gru_value.state_weight =
+        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
+    Tensor ordered_h0;
+    // NOTE(review): lod()[2] is presumably the sequence order written by
+    // to_batch -- confirm against LoDTensor2BatchFunctor.
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
+    if (h0) {
+      // Batch computing for GRU reorders the input sequences according to
+      // their length, so the initial hidden state has to be reordered the
+      // same way.
+      ReorderInitState<DeviceContext, T>(
+          context.template device_context<DeviceContext>(), *h0, order,
+          &ordered_h0, true);
+      gru_value.prev_out_value = ordered_h0.data<T>();
+    } else {
+      gru_value.prev_out_value = nullptr;
+    }
+    // Rows [batch_starts[n], batch_starts[n + 1]) belong to time step n.
+    auto batch_starts = batch_gate->lod()[0];
+    size_t seq_len = batch_starts.size() - 1;
+    auto active_node = math::detail::GetActivationType(
+        context.Attr<std::string>("activation"));
+    auto active_gate = math::detail::GetActivationType(
+        context.Attr<std::string>("gate_activation"));
+#ifdef PADDLE_WITH_MKLML
+    // use MKL packed to speedup GEMM
+    if (FLAGS_paddle_num_threads >= 4) {
+      auto blas = math::GetBlas<DeviceContext, T>(dev_ctx);
+      T* packed_gate = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
+                                       frame_size * 2 /*width of weight*/,
+                                       frame_size /*height of weight*/);
+      PADDLE_ENFORCE(packed_gate);
+      blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size * 2,
+                     frame_size, T(1.0), gru_value.gate_weight, frame_size * 2,
+                     packed_gate);
+      T* packed_state = blas.GEMM_ALLOC(CblasBMatrix, 1 /*height of C*/,
+                                        frame_size /*width of weight*/,
+                                        frame_size /*height of weight*/);
+      if (packed_state == nullptr) {
+        // PADDLE_ENFORCE below throws; release the first packed buffer
+        // beforehand so it is not leaked.
+        blas.GEMM_FREE(packed_gate);
+      }
+      PADDLE_ENFORCE(packed_state);
+      blas.GEMM_PACK(CblasBMatrix, CblasNoTrans, 1 /*cur bs?*/, frame_size,
+                     frame_size, T(1.0), gru_value.state_weight, frame_size,
+                     packed_state);
+      for (size_t n = 0; n < seq_len; n++) {
+        int bstart = static_cast<int>(batch_starts[n]);
+        int bend = static_cast<int>(batch_starts[n + 1]);
+        int cur_batch_size = bend - bstart;
+        Tensor gate_t = batch_gate->Slice(bstart, bend);
+        Tensor reset_hidden_prev_t =
+            batch_reset_hidden_prev->Slice(bstart, bend);
+        Tensor hidden_t = batch_hidden->Slice(bstart, bend);
+        gru_value.output_value = hidden_t.data<T>();
+        gru_value.gate_value = gate_t.data<T>();
+        gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
+        if (gru_value.prev_out_value) {
+          // gate_value[:, 0 : 2 * frame_size] += prev_out * packed gate weight
+          // (beta = 1 accumulates; rows of gate_value are 3 * frame_size wide).
+          blas.GEMM_COMPUTE(
+              CblasNoTrans, CblasPacked, cur_batch_size, frame_size * 2,
+              frame_size, gru_value.prev_out_value, frame_size, packed_gate,
+              frame_size * 2, T(1), gru_value.gate_value, frame_size * 3);
+        }
+        math::detail::forward_reset_output(
+            math::detail::forward::gru_resetOutput<T>(), gru_value, frame_size,
+            cur_batch_size, active_gate);
+        if (gru_value.prev_out_value) {
+          // gate_value[:, 2 * frame_size :] += reset_output * packed state
+          // weight.
+          blas.GEMM_COMPUTE(
+              CblasNoTrans, CblasPacked, cur_batch_size, frame_size, frame_size,
+              gru_value.reset_output_value, frame_size, packed_state,
+              frame_size, T(1), gru_value.gate_value + frame_size * 2,
+              frame_size * 3);
+        }
+        math::detail::forward_final_output(
+            math::detail::forward::gru_finalOutput<T>(), gru_value, frame_size,
+            cur_batch_size, active_node);
+        // The output of this step is the previous state of the next one.
+        gru_value.prev_out_value = gru_value.output_value;
+      }
+      blas.GEMM_FREE(packed_gate);
+      blas.GEMM_FREE(packed_state);
+    } else {
+#endif
+      for (size_t n = 0; n < seq_len; n++) {
+        int bstart = static_cast<int>(batch_starts[n]);
+        int bend = static_cast<int>(batch_starts[n + 1]);
+        int cur_batch_size = bend - bstart;
+        Tensor gate_t = batch_gate->Slice(bstart, bend);
+        Tensor reset_hidden_prev_t =
+            batch_reset_hidden_prev->Slice(bstart, bend);
+        Tensor hidden_t = batch_hidden->Slice(bstart, bend);
+        gru_value.output_value = hidden_t.data<T>();
+        gru_value.gate_value = gate_t.data<T>();
+        gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
+        math::GRUUnitFunctor<DeviceContext, T>::compute(
+            dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
+            active_gate);
+        // The output of this step is the previous state of the next one.
+        gru_value.prev_out_value = gru_value.output_value;
+      }
+#ifdef PADDLE_WITH_MKLML
+    }
+#endif
+    // Convert the batched hidden states back to the sequence layout; the
+    // functor needs the LoD that to_batch attached to batch_gate.
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
+    batch_hidden->set_lod(batch_gate->lod());
+    to_seq(dev_ctx, *batch_hidden, hidden);
+  }
+  // OpKernel entry point; all work happens in BatchCompute.
+  void Compute(const framework::ExecutionContext& context) const override {
+    BatchCompute(context);
+  }
+};
 }  // namespace operators
 }  // namespace paddle
@@ -218,9 +375,8 @@ namespace ops = paddle::operators;
 REGISTER_OPERATOR(gru, ops::GRUOp, ops::GRUOpMaker,
                  paddle::framework::DefaultGradOpDescMaker<true>);
 REGISTER_OPERATOR(gru_grad, ops::GRUGradOp);
-REGISTER_OP_CPU_KERNEL(
+REGISTER_OP_CPU_KERNEL(gru, ops::GRUCPUKernel<float>,
-    gru, ops::GRUKernel<paddle::platform::CPUDeviceContext, float>,
+                       ops::GRUCPUKernel<double>);
-    ops::GRUKernel<paddle::platform::CPUDeviceContext, double>);
 REGISTER_OP_CPU_KERNEL(
    gru_grad, ops::GRUGradKernel<paddle::platform::CPUDeviceContext, float>,
    ops::GRUGradKernel<paddle::platform::CPUDeviceContext, double>);
--- a/paddle/fluid/operators/gru_op.cu.cc
+++ b/paddle/fluid/operators/gru_op.cu.cc
@@ -14,6 +14,96 @@ limitations under the License. */
 #include "paddle/fluid/operators/gru_op.h"
+namespace paddle {
+namespace operators {
+// Device-generic forward kernel of the GRU operator (this file registers it
+// for CUDADeviceContext).
+template <typename DeviceContext, typename T>
+class GRUKernel : public framework::OpKernel<T> {
+ public:
+  // OpKernel entry point; all work happens in BatchCompute.
+  void Compute(const framework::ExecutionContext& context) const override {
+    BatchCompute(context);
+  }
+  // Reorders the LoD input into time-step batches, advances the GRU unit one
+  // batch at a time while chaining the hidden state, then converts the
+  // batched hidden states back to the sequence layout in Hidden.
+  void BatchCompute(const framework::ExecutionContext& context) const {
+    auto& dev_ctx = context.template device_context<DeviceContext>();
+    // Inputs (H0 and Bias are checked for null before use).
+    auto* seq_input = context.Input<LoDTensor>("Input");
+    auto* init_hidden = context.Input<Tensor>("H0");
+    auto* weight_t = context.Input<Tensor>("Weight");
+    auto* bias_t = context.Input<Tensor>("Bias");
+    const T* weight_data = weight_t->data<T>();
+    // Outputs, each allocated on the kernel's place.
+    auto* batch_gate = context.Output<LoDTensor>("BatchGate");
+    auto* batch_reset_hidden_prev =
+        context.Output<LoDTensor>("BatchResetHiddenPrev");
+    auto* batch_hidden = context.Output<LoDTensor>("BatchHidden");
+    auto* hidden = context.Output<LoDTensor>("Hidden");
+    batch_gate->mutable_data<T>(context.GetPlace());
+    batch_reset_hidden_prev->mutable_data<T>(context.GetPlace());
+    batch_hidden->mutable_data<T>(context.GetPlace());
+    hidden->mutable_data<T>(context.GetPlace());
+    // Sequence layout -> batch layout; this also sets the LoD read below.
+    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
+    to_batch(dev_ctx, *seq_input, batch_gate, true,
+             context.Attr<bool>("is_reverse"));
+    if (bias_t) {
+      math::RowwiseAdd<DeviceContext, T> add_bias;
+      add_bias(dev_ctx, *batch_gate, *bias_t, batch_gate);
+    }
+    int frame_size = hidden->dims()[1];
+    // The gate weight comes first in Weight, the state weight right after it.
+    math::GRUMetaValue<T> gru_value;
+    gru_value.gate_weight = const_cast<T*>(weight_data);
+    gru_value.state_weight =
+        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
+    Tensor ordered_h0;
+    framework::Vector<size_t> order(batch_gate->lod()[2]);
+    gru_value.prev_out_value = nullptr;
+    if (init_hidden) {
+      // The batch layout reorders the sequences by length, so the initial
+      // hidden state is reordered the same way before it is used.
+      ReorderInitState<DeviceContext, T>(dev_ctx, *init_hidden, order,
+                                         &ordered_h0, true);
+      gru_value.prev_out_value = ordered_h0.data<T>();
+    }
+    // Rows [step_offsets[i], step_offsets[i + 1]) form time step i.
+    auto step_offsets = batch_gate->lod()[0];
+    size_t num_steps = step_offsets.size() - 1;
+    auto node_act = math::detail::GetActivationType(
+        context.Attr<std::string>("activation"));
+    auto gate_act = math::detail::GetActivationType(
+        context.Attr<std::string>("gate_activation"));
+    for (size_t step = 0; step < num_steps; ++step) {
+      const int row_begin = static_cast<int>(step_offsets[step]);
+      const int row_end = static_cast<int>(step_offsets[step + 1]);
+      Tensor gate_slice = batch_gate->Slice(row_begin, row_end);
+      Tensor reset_slice = batch_reset_hidden_prev->Slice(row_begin, row_end);
+      Tensor hidden_slice = batch_hidden->Slice(row_begin, row_end);
+      gru_value.gate_value = gate_slice.data<T>();
+      gru_value.reset_output_value = reset_slice.data<T>();
+      gru_value.output_value = hidden_slice.data<T>();
+      math::GRUUnitFunctor<DeviceContext, T>::compute(
+          dev_ctx, gru_value, frame_size, row_end - row_begin, node_act,
+          gate_act);
+      // This step's output is the next step's previous state.
+      gru_value.prev_out_value = gru_value.output_value;
+    }
+    // Batch layout -> sequence layout, using the LoD produced by to_batch.
+    batch_hidden->set_lod(batch_gate->lod());
+    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
+    to_seq(dev_ctx, *batch_hidden, hidden);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
    gru, ops::GRUKernel<paddle::platform::CUDADeviceContext, float>,

--- a/paddle/fluid/operators/gru_op.h
+++ b/paddle/fluid/operators/gru_op.h
@@ -37,90 +37,6 @@ inline void ReorderInitState(const DeviceContext& ctx,
  row_shuffle(ctx, src, index_lod, dst, indexed_src);
 }
-template <typename DeviceContext, typename T>
-class GRUKernel : public framework::OpKernel<T> {
- public:
-  void BatchCompute(const framework::ExecutionContext& context) const {
-    auto* input = context.Input<LoDTensor>("Input");
-    auto* h0 = context.Input<Tensor>("H0");
-    auto* weight = context.Input<Tensor>("Weight");
-    const T* weight_data = weight->data<T>();
-    auto* bias = context.Input<Tensor>("Bias");
-    auto* batch_gate = context.Output<LoDTensor>("BatchGate");
-    batch_gate->mutable_data<T>(context.GetPlace());
-    auto* batch_reset_hidden_prev =
-        context.Output<LoDTensor>("BatchResetHiddenPrev");
-    batch_reset_hidden_prev->mutable_data<T>(context.GetPlace());
-    auto* batch_hidden = context.Output<LoDTensor>("BatchHidden");
-    batch_hidden->mutable_data<T>(context.GetPlace());
-    auto* hidden = context.Output<LoDTensor>("Hidden");
-    hidden->mutable_data<T>(context.GetPlace());
-    auto hidden_dims = hidden->dims();
-    bool is_reverse = context.Attr<bool>("is_reverse");
-    math::LoDTensor2BatchFunctor<DeviceContext, T> to_batch;
-    auto& dev_ctx = context.template device_context<DeviceContext>();
-    to_batch(dev_ctx, *input, batch_gate, true, is_reverse);
-    if (bias) {
-      math::RowwiseAdd<DeviceContext, T> add_bias;
-      add_bias(dev_ctx, *batch_gate, *bias, batch_gate);
-    }
-    int frame_size = hidden_dims[1];
-    math::GRUMetaValue<T> gru_value;
-    gru_value.gate_weight = const_cast<T*>(weight_data);
-    gru_value.state_weight =
-        const_cast<T*>(weight_data + 2 * frame_size * frame_size);
-    Tensor ordered_h0;
-    framework::Vector<size_t> order(batch_gate->lod()[2]);
-    if (h0) {
-      // Since the batch computing for GRU reorders the input sequences
-      // according to their length. The initialized cell state also needs
-      // to reorder.
-      ReorderInitState<DeviceContext, T>(
-          context.template device_context<DeviceContext>(), *h0, order,
-          &ordered_h0, true);
-      gru_value.prev_out_value = ordered_h0.data<T>();
-    } else {
-      gru_value.prev_out_value = nullptr;
-    }
-    auto batch_starts = batch_gate->lod()[0];
-    size_t num_batch = batch_starts.size() - 1;
-    auto active_node = math::detail::GetActivationType(
-        context.Attr<std::string>("activation"));
-    auto active_gate = math::detail::GetActivationType(
-        context.Attr<std::string>("gate_activation"));
-    for (size_t n = 0; n < num_batch; n++) {
-      int bstart = static_cast<int>(batch_starts[n]);
-      int bend = static_cast<int>(batch_starts[n + 1]);
-      int cur_batch_size = bend - bstart;
-      Tensor gate_t = batch_gate->Slice(bstart, bend);
-      Tensor reset_hidden_prev_t = batch_reset_hidden_prev->Slice(bstart, bend);
-      Tensor hidden_t = batch_hidden->Slice(bstart, bend);
-      gru_value.output_value = hidden_t.data<T>();
-      gru_value.gate_value = gate_t.data<T>();
-      gru_value.reset_output_value = reset_hidden_prev_t.data<T>();
-      math::GRUUnitFunctor<DeviceContext, T>::compute(
-          dev_ctx, gru_value, frame_size, cur_batch_size, active_node,
-          active_gate);
-      gru_value.prev_out_value = gru_value.output_value;
-    }
-    math::Batch2LoDTensorFunctor<DeviceContext, T> to_seq;
-    batch_hidden->set_lod(batch_gate->lod());
-    to_seq(dev_ctx, *batch_hidden, hidden);
-  }
-  void Compute(const framework::ExecutionContext& context) const override {
-    BatchCompute(context);
-  }
-};
 template <typename DeviceContext, typename T>
 class GRUGradKernel : public framework::OpKernel<T> {
 public:

--- a/paddle/fluid/operators/listen_and_serv_op.cc
+++ b/paddle/fluid/operators/listen_and_serv_op.cc
@@ -123,8 +123,11 @@ void ListenAndServOp::RunSyncLoop(
      optimize_prepared.begin(),
      std::shared_ptr<framework::ExecutorPrepareContext>(nullptr));
+  // Trainers will get all parameters from pserver in the
+  // startup program, so we will wait RequestGet first
+  rpc_service_->SetCond(distributed::kRequestGet);
+  rpc_service_->WaitBarrier(distributed::kRequestGet);
  rpc_service_->ResetBarrierCounter();
  while (true) {
    rpc_service_->Profiler().OneStep();
    // Get from multiple trainers, we don't care about the order in which

--- a/paddle/fluid/operators/lookup_sparse_table_op.cc
+++ b/paddle/fluid/operators/lookup_sparse_table_op.cc
@@ -17,7 +17,6 @@ limitations under the License. */
 #include "paddle/fluid/framework/data_type.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/math/math_function.h"
-#include "paddle/fluid/platform/device_context.h"
 namespace paddle {
 namespace operators {
@@ -46,10 +45,6 @@ class LookupSparseTableOp : public framework::OperatorBase {
    auto out_var = scope.FindVar(Output("Out"));
    auto w_var = scope.FindVar(Input("W"));
    auto ids_var = scope.FindVar(Input("Ids"));
-    unsigned int seed = static_cast<unsigned int>(Attr<int>("seed"));
-    float min = Attr<float>("min");
-    float max = Attr<float>("max");
-    bool auto_grown_table = Attr<bool>("auto_grown_table");
    PADDLE_ENFORCE(out_var->IsType<framework::LoDTensor>(),
                   "The type of Out var should be LodTensor.");
@@ -60,46 +55,17 @@ class LookupSparseTableOp : public framework::OperatorBase {
    auto &ids_t = ids_var->Get<framework::LoDTensor>();
    auto out_t = out_var->GetMutable<framework::LoDTensor>();
    auto w_t = w_var->GetMutable<framework::SelectedRows>();
-    std::vector<int64_t> keys;
-    keys.resize(ids_t.numel());
-    for (int64_t i = 0; i < ids_t.numel(); ++i) {
-      keys[i] = ids_t.data<int64_t>()[i];
-    }
    // TODO(Yancey1989): support CUDA Place for the sparse table
    platform::CPUPlace cpu;
    auto out_shape = w_t->value().dims();
-    out_shape[0] = keys.size();
+    out_shape[0] = ids_t.numel();
    out_t->Resize(out_shape);
    out_t->mutable_data(cpu, w_t->value().type());
    PADDLE_ENFORCE_EQ(framework::ToDataType(w_t->value().type()),
                      framework::proto::VarType::FP32,
                      "The sparse table only support FP32");
-    auto non_keys_pair = w_t->Get(keys, out_t);
+    w_t->Get(ids_t, out_t, true);
-    if (!auto_grown_table) {
-      PADDLE_ENFORCE_EQ(non_keys_pair.size(), static_cast<size_t>(0),
-                        "there is some keys does exists in the sparse table.");
-    }
-    auto value_shape = w_t->value().dims();
-    value_shape[0] = 1;
-    for (const auto &it : non_keys_pair) {
-      const auto key = it.first;
-      const auto index = it.second;
-      framework::Tensor value;
-      value.Resize(value_shape);
-      auto data = value.mutable_data<float>(cpu);
-      std::minstd_rand engine;
-      engine.seed(seed);
-      std::uniform_real_distribution<float> dist(min, max);
-      int64_t size = value.numel();
-      for (int64_t i = 0; i < size; ++i) {
-        data[i] = dist(engine);
-      }
-      w_t->Set(key, value);
-      memory::Copy(cpu, out_t->mutable_data<float>(cpu) + index * value.numel(),
-                   cpu, value.data<float>(), value.numel() * sizeof(float));
-    }
  }
 };
@@ -121,21 +87,6 @@ class LookupSparseTableOpMaker : public framework::OpProtoAndCheckerMaker {
                     "Otherwise the given value indicates padding the output "
                     "with zeros whenever lookup encounters it in Ids.")
        .SetDefault(kNoPadding);
-    AddAttr<float>("min",
-                   "(float, default -1.0) "
-                   "Minimum value of uniform random")
-        .SetDefault(-1.0f);
-    AddAttr<float>("max",
-                   "(float, default 1.0) "
-                   "Maximum value of uniform random")
-        .SetDefault(1.0f);
-    AddAttr<int>("seed",
-                 "(int, default 0) "
-                 "Random seed used for generating samples. "
-                 "0 means use a seed generated by the system."
-                 "Note that if seed is not 0, this operator will always "
-                 "generate the same random numbers every time.")
-        .SetDefault(0);
    AddAttr<bool>("auto_grown_table",
                  "(bool default false)"
                  "Whether create new value if for nonexistent key.")

--- a/paddle/fluid/operators/math/blas.h
+++ b/paddle/fluid/operators/math/blas.h
@@ -90,6 +90,25 @@ class Blas {
  void GEMM(bool transA, bool transB, int M, int N, int K, T alpha, const T* A,
            int lda, const T* B, int ldb, T beta, T* C, int ldc) const;
+#ifdef PADDLE_WITH_MKLML
+  // Packed-GEMM interface, declared only when building with MKLML.
+  // Usage in gru_op.cc: GEMM_ALLOC and GEMM_PACK once per weight, GEMM_COMPUTE
+  // once per time step, GEMM_FREE at the end.
+  //
+  // Returns the buffer that will hold the packed form of the matrix selected
+  // by `id` for an M x N x K product. T occurs only in the return type, so
+  // callers must spell it out explicitly (BlasT does).
+  template <typename T>
+  T* GEMM_ALLOC(const CBLAS_IDENTIFIER id, const int M, const int N,
+                const int K) const;
+  // Packs `src` (leading dimension `ld`) into `dst`, a buffer obtained from
+  // GEMM_ALLOC. Note that `alpha` is taken here; GEMM_COMPUTE has no alpha.
+  template <typename T>
+  void GEMM_PACK(const CBLAS_IDENTIFIER id, const CBLAS_TRANSPOSE trans, int M,
+                 int N, int K, const T alpha, const T* src, const int ld,
+                 T* dst) const;
+  // Multiplies A and B, one of which may be pre-packed, into C.
+  // transA/transB are plain ints rather than CBLAS_TRANSPOSE; the GRU kernel
+  // passes CblasPacked for the packed operand.
+  // NOTE(review): beta presumably scales the existing contents of C, as in
+  // the MKL packed GEMM API -- confirm.
+  template <typename T>
+  void GEMM_COMPUTE(int transA, int transB, int M, int N, int K, const T* A,
+                    const int lda, const T* B, const int ldb, T beta, T* C,
+                    const int ldc) const;
+  // Releases a buffer returned by GEMM_ALLOC.
+  template <typename T>
+  void GEMM_FREE(T* data) const;
+#endif
  template <typename T>
  void MatMul(const framework::Tensor& mat_a, bool trans_a,
              const framework::Tensor& mat_b, bool trans_b, T alpha,
@@ -146,6 +165,28 @@ class BlasT : private Blas<DeviceContext> {
    Base()->template GEMM<T>(args...);
  }
+#ifdef PADDLE_WITH_MKLML
+  // Typed forwarding wrappers for the packed-GEMM interface of Blas: each one
+  // fixes the element type to T and passes its arguments through unchanged.
+  template <typename... ARGS>
+  T* GEMM_ALLOC(ARGS... args) const {
+    // T cannot be deduced from the arguments, hence the explicit <T>.
+    return Base()->template GEMM_ALLOC<T>(args...);
+  }
+  // Forwards to Blas::GEMM_PACK<T>.
+  template <typename... ARGS>
+  void GEMM_PACK(ARGS... args) const {
+    Base()->template GEMM_PACK<T>(args...);
+  }
+  // Forwards to Blas::GEMM_COMPUTE<T>.
+  template <typename... ARGS>
+  void GEMM_COMPUTE(ARGS... args) const {
+    Base()->template GEMM_COMPUTE<T>(args...);
+  }
+  // Forwards to Blas::GEMM_FREE<T>.
+  template <typename... ARGS>
+  void GEMM_FREE(ARGS... args) const {
+    Base()->template GEMM_FREE<T>(args...);
+  }
+#endif
  template <typename... ARGS>
  void MatMul(ARGS... args) const {
    Base()->template MatMul<T>(args...);

--- a/paddle/fluid/operators/math/blas_impl.h
+++ b/paddle/fluid/operators/math/blas_impl.h
@@ -31,6 +31,26 @@ struct CBlas<float> {
    platform::dynload::cblas_sgemm(args...);
  }
+  // Packed-GEMM entry points for float. Each is a thin forward to the
+  // corresponding dynamically loaded cblas_sgemm_* symbol; arguments are
+  // passed through untouched.
+  template <typename... ARGS>
+  static float *GEMM_ALLOC(ARGS... args) {
+    return platform::dynload::cblas_sgemm_alloc(args...);
+  }
+  // Forwards to cblas_sgemm_pack.
+  template <typename... ARGS>
+  static void GEMM_PACK(ARGS... args) {
+    platform::dynload::cblas_sgemm_pack(args...);
+  }
+  // Forwards to cblas_sgemm_compute.
+  template <typename... ARGS>
+  static void GEMM_COMPUTE(ARGS... args) {
+    platform::dynload::cblas_sgemm_compute(args...);
+  }
+  // Forwards to cblas_sgemm_free.
+  template <typename... ARGS>
+  static void GEMM_FREE(ARGS... args) {
+    platform::dynload::cblas_sgemm_free(args...);
+  }
 #ifdef PADDLE_WITH_LIBXSMM
  template <typename... ARGS>
  static void SMM_GEMM(ARGS... args) {
@@ -71,6 +91,26 @@ struct CBlas<double> {
    platform::dynload::cblas_dgemm(args...);
  }
+  // Packed-GEMM entry points for double. Each is a thin forward to the
+  // corresponding dynamically loaded cblas_dgemm_* symbol; arguments are
+  // passed through untouched.
+  template <typename... ARGS>
+  static double *GEMM_ALLOC(ARGS... args) {
+    return platform::dynload::cblas_dgemm_alloc(args...);
+  }
+  // Forwards to cblas_dgemm_pack.
+  template <typename... ARGS>
+  static void GEMM_PACK(ARGS... args) {
+    platform::dynload::cblas_dgemm_pack(args...);
+  }
+  // Forwards to cblas_dgemm_compute.
+  template <typename... ARGS>
+  static void GEMM_COMPUTE(ARGS... args) {
+    platform::dynload::cblas_dgemm_compute(args...);
+  }
+  // Forwards to cblas_dgemm_free.
+  template <typename... ARGS>
+  static void GEMM_FREE(ARGS... args) {
+    platform::dynload::cblas_dgemm_free(args...);
+  }
 #ifdef PADDLE_WITH_LIBXSMM
  template <typename... ARGS>
  static void SMM_GEMM(ARGS... args) {
@@ -224,6 +264,41 @@ inline void GEMM_WARP(CBLAS_ORDER order, CBLAS_TRANSPOSE transA,
                 beta, C, ldc);
 }
+#ifdef PADDLE_WITH_MKLML
+// CPU specializations of the packed-GEMM interface. Each one forwards to the
+// matching CBlas<T> entry point.
+//
+// GEMM_ALLOC: arguments are forwarded unchanged; returns whatever
+// CBlas<T>::GEMM_ALLOC returns.
+template <>
+template <typename T>
+T *Blas<platform::CPUDeviceContext>::GEMM_ALLOC(const CBLAS_IDENTIFIER id,
+                                                const int M, const int N,
+                                                const int K) const {
+  return CBlas<T>::GEMM_ALLOC(id, M, N, K);
+}
+// GEMM_PACK: prepends CblasRowMajor as the layout argument, then forwards.
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::GEMM_PACK(const CBLAS_IDENTIFIER id,
+                                                 const CBLAS_TRANSPOSE trans,
+                                                 int M, int N, int K,
+                                                 const T alpha, const T *src,
+                                                 const int ld, T *dst) const {
+  CBlas<T>::GEMM_PACK(CblasRowMajor, id, trans, M, N, K, alpha, src, ld, dst);
+}
+// GEMM_COMPUTE: prepends CblasRowMajor as the layout argument, then forwards.
+// There is no alpha parameter; only beta is passed on.
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::GEMM_COMPUTE(
+    int transA, int transB, int M, int N, int K, const T *A, const int lda,
+    const T *B, const int ldb, T beta, T *C, const int ldc) const {
+  CBlas<T>::GEMM_COMPUTE(CblasRowMajor, transA, transB, M, N, K, A, lda, B, ldb,
+                         beta, C, ldc);
+}
+// GEMM_FREE: hands the buffer to CBlas<T>::GEMM_FREE.
+template <>
+template <typename T>
+void Blas<platform::CPUDeviceContext>::GEMM_FREE(T *data) const {
+  CBlas<T>::GEMM_FREE(data);
+}
+#endif
 template <>
 template <typename T>
 void Blas<platform::CPUDeviceContext>::GEMM(CBLAS_TRANSPOSE transA,

--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -26,14 +23,40 @@ class PReluOp : public framework::OperatorWithKernel {
      : OperatorWithKernel(type, inputs, outputs, attrs) {}
  void InferShape(framework::InferShapeContext *ctx) const override {
+    // Validates the size of Alpha against the "mode" attribute and gives Out
+    // the same dims and LoD as X.
+    //   all:     Alpha has exactly one element.
+    //   channel: Alpha has x_dim[1] elements.
+    //   element: Alpha has as many elements as X.
+    // Any other mode raises via PADDLE_THROW.
+    std::string mode = ctx->Attrs().Get<std::string>("mode");
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null");
    PADDLE_ENFORCE(ctx->HasInput("Alpha"), "Input(Alpha) should not be null");
-    PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1,
-                   "Size of weight Alpha must be one.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null");
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    // Read the dims only after the presence checks above have passed, so a
+    // missing X is reported by the enforce message rather than by the lookup.
+    auto x_dim = ctx->GetInputDim("X");
+    if (mode == "all") {
+      PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == 1,
+                     "For mode 'all', size of weight Alpha must be one.");
+    } else if (mode == "channel") {
+      PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == x_dim[1],
+                     "For channel-wise mode, size of weight Alpha must be "
+                     "equal to the number of channels, should be %d",
+                     x_dim[1]);
+    } else if (mode == "element") {
+      PADDLE_ENFORCE(product(ctx->GetInputDim("Alpha")) == product(x_dim),
+                     "For element-wise mode, size of weight Alpha must be "
+                     "equal to the number of input, should be %d",
+                     product(x_dim));
+    } else {
+      PADDLE_THROW("Unknown mode %s", mode);
+    }
+    ctx->SetOutputDim("Out", x_dim);
    ctx->ShareLoD("X", /*->*/ "Out");
  }
+ protected:
+  // Selects the kernel by the data type of input X; the place is always
+  // CPUPlace, regardless of where the op is asked to run.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    const auto x_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    return framework::OpKernelType(x_data_type, platform::CPUPlace());
+  }
 };
 class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
@@ -44,9 +67,7 @@ class PReluOpMaker : public framework::OpProtoAndCheckerMaker {
    AddOutput("Out", "The output tensor of prelu operator.");
    AddComment(R"DOC(
 PRelu Operator.
 The equation is:
 $$
 f(x) =
 \begin{cases}
@@ -54,11 +75,15 @@ f(x) =
 x,         \qquad  \text{if} \ x >= 0
 \end{cases}
 $$
 The input `X` can carry the LoD (Level of Details) information,
 or not. And the output shares the LoD information with input `X`.
+There are modes: 
+  all: all elements share same weight
+  channel: elements in a channel share same weight
+  element: each element has a weight 
 )DOC");
+    AddAttr<std::string>("mode", "The mode for inputs to share weights.")
+        .SetDefault("all");
  }
 };
@@ -71,9 +96,23 @@ class PReluGradOp : public framework::OperatorWithKernel {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must not be null.");
    PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Out")),
                   "Input(Out@GRAD) should not be null");
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    auto x_grad_name = framework::GradVarName("X");
-    ctx->SetOutputDim(framework::GradVarName("Alpha"),
+    auto alpha_grad_name = framework::GradVarName("Alpha");
-                      ctx->GetInputDim("Alpha"));
+    if (ctx->HasOutput(x_grad_name)) {
+      ctx->SetOutputDim(x_grad_name, ctx->GetInputDim("X"));
+    }
+    if (ctx->HasOutput(alpha_grad_name)) {
+      ctx->SetOutputDim(alpha_grad_name, ctx->GetInputDim("Alpha"));
+    }
+  }
+ protected:
+  // Selects the gradient kernel by the data type of input X; the place is
+  // always CPUPlace, regardless of where the op is asked to run.
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    const auto x_data_type =
+        framework::ToDataType(ctx.Input<Tensor>("X")->type());
+    return framework::OpKernelType(x_data_type, platform::CPUPlace());
  }
 };

--- a/paddle/fluid/operators/prelu_op.h
+++ b/paddle/fluid/operators/prelu_op.h
 /* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
 Licensed under the Apache License, Version 2.0 (the "License");
 you may not use this file except in compliance with the License.
 You may obtain a copy of the License at
    http://www.apache.org/licenses/LICENSE-2.0
 Unless required by applicable law or agreed to in writing, software
 distributed under the License is distributed on an "AS IS" BASIS,
 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
@@ -13,32 +10,16 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 #pragma once
+#include <string>
 #include "paddle/fluid/framework/eigen.h"
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/platform/transform.h"
 namespace paddle {
 namespace operators {
 using Tensor = framework::Tensor;
 using platform::Transform;
-template <typename T>
-class PReluFunctor {
- public:
-  explicit PReluFunctor(const T* alpha) : alpha_(alpha) {}
-  HOSTDEVICE T operator()(const T& x) const {
-    if (x > 0)
-      return x;
-    else
-      return x * (*alpha_);
-  }
- private:
-  const T* alpha_;
-};
 template <typename DeviceContext, typename T>
 class PReluKernel : public framework::OpKernel<T> {
 public:
@@ -50,53 +31,93 @@ class PReluKernel : public framework::OpKernel<T> {
    const T* x_ptr = x->data<T>();
    T* o_ptr = out->mutable_data<T>(context.GetPlace());
-    auto* alpha_ptr = alpha->data<T>();
+    const T* alpha_ptr = alpha->data<T>();
+    std::string mode = context.Attr<std::string>("mode");
    int numel = x->numel();
+    auto dim = x->dims();
-    Transform<DeviceContext> trans;
+    int index = 0;
-    trans(context.template device_context<DeviceContext>(), x_ptr,
+    int i = 0;
-          x_ptr + numel, o_ptr, PReluFunctor<T>(alpha_ptr));
+    int temp = 0;
-  }
+    if (mode == "channel") {
-};
+      for (i = 0; i < numel; i++) {
+        temp = numel / (dim[0] * dim[1]);
-template <typename T>
+        index = (i / temp) % dim[1];
-class PReluGradFunctor {
+        o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[index] * x_ptr[i];
- public:
+      }
-  explicit PReluGradFunctor(const T* alpha) : alpha_(alpha) {}
+    } else if (mode == "element") {
+      for (i = 0; i < numel; i++) {
-  HOSTDEVICE T operator()(const T& out, const T& dout) const {
+        o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[i] * x_ptr[i];
-    if (out > 0)
+      }
-      return dout;
+    } else {
-    else
+      for (i = 0; i < numel; i++) {
-      return dout * (*alpha_);
+        o_ptr[i] = x_ptr[i] > 0 ? x_ptr[i] : alpha_ptr[0] * x_ptr[i];
+      }
+    }
  }
- private:
-  const T* alpha_;
 };
 template <typename DeviceContext, typename T>
 class PReluGradKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
+    // CPU backward pass of PReLU. Writes dX and dAlpha when those outputs
+    // are present. The "mode" attribute selects how Alpha is indexed:
+    // "channel" uses one alpha per dim[1] slice, "element" one per element,
+    // and any other value uses the single alpha_ptr[0].
+    auto* x = context.Input<Tensor>("X");
    auto* dx = context.Output<Tensor>(framework::GradVarName("X"));
    auto* dout = context.Input<Tensor>(framework::GradVarName("Out"));
+    auto* dalpha = context.Output<Tensor>(framework::GradVarName("Alpha"));
    auto* out = context.Input<Tensor>("Out");
    auto* alpha = context.Input<Tensor>("Alpha");
-    auto* alpha_ptr = alpha->data<T>();
+    const T* alpha_ptr = alpha->data<T>();
+    const T* x_ptr = x->data<T>();
-    T* dx_ptr = dx->mutable_data<T>(context.GetPlace());
    const T* dout_ptr = dout->data<T>();
    const T* out_ptr = out->data<T>();
-    int numel = dx->numel();
+    std::string mode = context.Attr<std::string>("mode");
+    int numel = x->numel();
-    Transform<DeviceContext> trans;
+    auto dim = x->dims();
-    trans(context.template device_context<DeviceContext>(), out_ptr,
+    int index = 0;
-          out_ptr + numel, dout_ptr, dx_ptr, PReluGradFunctor<T>(alpha_ptr));
+    int i = 0;
+    int temp = 0;
-    // TODO(Zhuoyuan): add dalpha upgrade when GPU kernels ready
+    if (dx) {
+      T* dx_ptr = dx->mutable_data<T>(context.GetPlace());
+      if (mode == "channel") {
+        for (i = 0; i < numel; i++) {
+          temp = numel / (dim[0] * dim[1]);
+          index = (i / temp) % dim[1];
+          dx_ptr[i] =
+              out_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[index] * dout_ptr[i];
+        }
+      } else if (mode == "element") {
+        for (i = 0; i < numel; i++) {
+          dx_ptr[i] = out_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[i] * dout_ptr[i];
+        }
+      } else {
+        for (i = 0; i < numel; i++) {
+          dx_ptr[i] = out_ptr[i] > 0 ? dout_ptr[i] : alpha_ptr[0] * dout_ptr[i];
+        }
+      }
+    }
+    index = 0;
+    if (dalpha) {
+      T* dalpha_ptr = dalpha->mutable_data<T>(context.GetPlace());
+      // The loops below accumulate into dalpha with +=, so the freshly
+      // obtained buffer has to be cleared first; otherwise the gradient is
+      // summed on top of whatever the allocation happened to contain.
+      for (i = 0; i < dalpha->numel(); i++) {
+        dalpha_ptr[i] = static_cast<T>(0);
+      }
+      if (mode == "channel") {
+        for (i = 0; i < numel; i++) {
+          temp = numel / (dim[0] * dim[1]);
+          index = (i / temp) % dim[1];
+          dalpha_ptr[index] += out_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
+        }
+      } else if (mode == "element") {
+        for (i = 0; i < numel; i++) {
+          dalpha_ptr[i] += out_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
+        }
+      } else {
+        for (i = 0; i < numel; i++) {
+          dalpha_ptr[0] += out_ptr[i] > 0 ? 0 : x_ptr[i] * dout_ptr[i];
+        }
+      }
+    }
+    // TODO(Guanzhong): add GPU kernels
  }
 };

--- a/paddle/fluid/operators/sampling_id_op.cc
+++ b/paddle/fluid/operators/sampling_id_op.cc
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/operators/sampling_id_op.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+// Operator definition for sampling_id: validates the input/output and the
+// [min, max) attribute range, and infers the output shape.
+class SamplingIdOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("X"),
+                   "Input(X) of SamplingIdOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of SamplingIdOp should not be null.");
+    PADDLE_ENFORCE(
+        ctx->Attrs().Get<float>("min") < ctx->Attrs().Get<float>("max"),
+        "min must be less than max");
+    auto input_dims = ctx->GetInputDim("X");
+    PADDLE_ENFORCE(input_dims.size() == 2,
+                   "Input(X) should be 2-D tensor.");
+    // SamplingIdKernel resizes Out to a 1-D tensor with one entry per input
+    // row, so the inferred shape is [batch_size] rather than X's 2-D shape.
+    ctx->SetOutputDim("Out", framework::make_ddim({input_dims[0]}));
+    ctx->ShareLoD("X", "Out");
+  }
+};
+// Declares the inputs, outputs, attributes and documentation of sampling_id.
+class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X",
+             "The input tensor of softmax. "
+             "2-D with shape [batch_size, input_feature_dimensions].");
+    AddOutput("Out", "SamplingId data tensor.");
+    AddComment(R"DOC(
+SamplingId Operator.
+A layer for sampling id from multinomial distribution from the
+ input. Sampling one id for one sample.)DOC");
+    AddAttr<float>("min", "Minimum value of random. [default 0.0].")
+        .SetDefault(0.0f);
+    AddAttr<float>("max", "Maximum value of random. [default 1.0].")
+        .SetDefault(1.0f);
+    // Adjacent string literals are concatenated verbatim, so each piece
+    // carries its own trailing space.
+    AddAttr<int>("seed",
+                 "Random seed used for the random number engine. "
+                 "0 means use a seed generated by the system. "
+                 "Note that if seed is not 0, this operator will always "
+                 "generate the same random numbers every time. [default 0].")
+        .SetDefault(0);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+namespace ops = paddle::operators;
+REGISTER_OPERATOR(sampling_id, ops::SamplingIdOp, ops::SamplingIdOpMaker,
+                  paddle::framework::EmptyGradOpMaker);
+REGISTER_OP_CPU_KERNEL(sampling_id, paddle::operators::SamplingIdKernel<float>,
+                       paddle::operators::SamplingIdKernel<double>);
--- a/paddle/fluid/operators/sampling_id_op.cu
+++ b/paddle/fluid/operators/sampling_id_op.cu
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#include "paddle/fluid/operators/sampling_id_op.h"
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(sampling_id, paddle::operators::SamplingIdKernel<float>,
+                        paddle::operators::SamplingIdKernel<double>);
--- a/paddle/fluid/operators/sampling_id_op.h
+++ b/paddle/fluid/operators/sampling_id_op.h
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+ Licensed under the Apache License, Version 2.0 (the "License");
+ you may not use this file except in compliance with the License.
+ You may obtain a copy of the License at
+     http://www.apache.org/licenses/LICENSE-2.0
+ Unless required by applicable law or agreed to in writing, software
+ distributed under the License is distributed on an "AS IS" BASIS,
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ See the License for the specific language governing permissions and
+ limitations under the License. */
+#pragma once
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include <random>
+#include <sstream>
+#include <vector>
+#include "paddle/fluid/framework/op_registry.h"
+namespace paddle {
+namespace operators {
+using Tensor = framework::Tensor;
+// Samples one column index per row of the 2-D input X by inverse-CDF
+// sampling: a uniform draw r in [min, max) is reduced by the row's entries
+// until it becomes negative; the column where that happens is the sample.
+// The result is written to Out as a 1-D tensor of length batch_size.
+template <typename T>
+class SamplingIdKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    const Tensor* input = context.Input<Tensor>("X");
+    const int batch_size = static_cast<int>(input->dims()[0]);
+    const int width = static_cast<int>(input->dims()[1]);
+    PADDLE_ENFORCE_GE(batch_size, 0,
+                      "batch_size(dims[0]) must be nonnegative.");
+    PADDLE_ENFORCE_GE(width, 0, "width(dims[1]) must be nonnegative.");
+    std::vector<T> ins_vector;
+    framework::TensorToVector(*input, context.device_context(), &ins_vector);
+    unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
+    std::minstd_rand engine;
+    if (seed == 0) {
+      // seed == 0 requests a non-deterministic seed.
+      seed = std::random_device()();
+    }
+    engine.seed(seed);
+    std::uniform_real_distribution<T> dist(
+        static_cast<T>(context.Attr<float>("min")),
+        static_cast<T>(context.Attr<float>("max")));
+    std::vector<T> ids(batch_size);
+    for (int i = 0; i < batch_size; ++i) {
+      T r = dist(engine);
+      // Default to the last column so a draw that is never driven below
+      // zero (e.g. due to round-off) still yields an id.
+      int idx = width - 1;
+      for (int j = 0; j < width; ++j) {
+        if ((r -= ins_vector[i * width + j]) < 0) {
+          idx = j;
+          break;
+        }
+      }
+      // Store the sampled id itself, not the probability found at that id.
+      // This also avoids indexing ins_vector with -1 when width is 0.
+      ids[i] = static_cast<T>(idx);
+    }
+    std::vector<int64_t> out_dim;
+    out_dim.push_back(static_cast<int64_t>(batch_size));
+    Tensor* output = context.Output<Tensor>("Out");
+    output->Resize(framework::make_ddim(out_dim));
+    output->mutable_data<T>(context.GetPlace());
+    framework::TensorFromVector(ids, context.device_context(), output);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/scatter_op.h
+++ b/paddle/fluid/operators/scatter_op.h
@@ -35,7 +35,7 @@ class ScatterOpKernel : public framework::OpKernel<T> {
    auto *Out = ctx.Output<Tensor>("Out");
    // In place output: Out = X, Out[Ids] += Updates
-    Out->ShareDataWith(*X);
+    framework::TensorCopySync(*X, ctx.GetPlace(), Out);
    // Apply ScatterUpdate: Out[index] += Updates[:]
    ScatterAssign<T>(ctx.device_context(), *Updates, *Ids, Out);
  }
@@ -53,7 +53,7 @@ class ScatterGradientOpKernel : public framework::OpKernel<T> {
    auto *dOut = ctx.Input<Tensor>(framework::GradVarName("Out"));
    // In place gradient: dX = dO
-    dX->ShareDataWith(*dOut);
+    framework::TensorCopySync(*dOut, ctx.GetPlace(), dX);
    dUpdates->mutable_data<T>(ctx.GetPlace());
    // Gradient by Gather: dUpdates += dO[Ids]
    CPUGather<T>(ctx.device_context(), *dOut, *Ids, dUpdates);

--- a/paddle/fluid/operators/sgd_op.h
+++ b/paddle/fluid/operators/sgd_op.h
@@ -111,7 +111,7 @@ class SGDOpKernel : public framework::OpKernel<T> {
      for (size_t i = 0; i < grad.rows().size(); i++) {
        PADDLE_ENFORCE(grad.rows()[i] < grad.height(),
                       "Input rows index should less than height");
-        int64_t id_index = param.Index(grad.rows()[i]);
+        int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false);
        PADDLE_ENFORCE_GE(id_index, static_cast<int64_t>(0),
                          "id should be in the table");
        for (int64_t j = 0; j < grad_row_width; j++) {

--- a/paddle/fluid/operators/uniform_random_op.cc
+++ b/paddle/fluid/operators/uniform_random_op.cc
@@ -30,8 +30,10 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> {
      tensor = out_var->GetMutable<framework::LoDTensor>();
    } else if (out_var->IsType<framework::SelectedRows>()) {
      auto shape = ctx.Attr<std::vector<int>>("shape");
-      tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value();
+      auto* selected_rows = out_var->GetMutable<framework::SelectedRows>();
+      tensor = selected_rows->mutable_value();
      tensor->Resize(framework::make_ddim(shape));
+      selected_rows->mutable_rows()->reserve(shape[0]);
    } else {
      PADDLE_THROW(
          "uniform_random_op's output only"

--- a/paddle/fluid/platform/dynload/mklml.h
+++ b/paddle/fluid/platform/dynload/mklml.h
@@ -60,6 +60,14 @@ extern void* mklml_dso_handle;
  __macro(cblas_dgemm_batch);       \
  __macro(vsAdd);                   \
  __macro(vdAdd);                   \
+  __macro(cblas_sgemm_alloc);       \
+  __macro(cblas_sgemm_pack);        \
+  __macro(cblas_sgemm_compute);     \
+  __macro(cblas_sgemm_free);        \
+  __macro(cblas_dgemm_alloc);       \
+  __macro(cblas_dgemm_pack);        \
+  __macro(cblas_dgemm_compute);     \
+  __macro(cblas_dgemm_free);        \
  __macro(MKL_Set_Num_Threads)
 MKLML_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_MKLML_WRAP);

--- a/paddle/fluid/platform/enforce.h
+++ b/paddle/fluid/platform/enforce.h
@@ -263,7 +263,8 @@ inline void throw_on_error(T e) {
 *    PADDLE_ENFORCE_EQ(a, b);
 *
 *    will raise an expression described as follows:
- *    "enforce a == b failed, 1 != 2" with detailed stack information.
+ *    "Enforce failed. Expected input a == b, but received a(1) != b(2)."
+ *      with detailed stack information.
 *
 *    extra messages is also supported, for example:
 *    PADDLE_ENFORCE(a, b, "some simple enforce failed between %d numbers", 2)
@@ -292,9 +293,10 @@ inline void throw_on_error(T e) {
 #define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...)  \
  do {                                                                  \
    if (UNLIKELY(!((__VAL0)__CMP(__VAL1)))) {                           \
-      PADDLE_THROW("enforce %s " #__CMP " %s failed, %s " #__INV_CMP    \
+      PADDLE_THROW("Enforce failed. Expected %s " #__CMP                \
-                   " %s\n%s",                                           \
+                   " %s, but received %s:%s " #__INV_CMP " %s:%s.\n%s", \
-                   #__VAL0, #__VAL1, paddle::string::to_string(__VAL0), \
+                   #__VAL0, #__VAL1, #__VAL0,                           \
+                   paddle::string::to_string(__VAL0), #__VAL1,          \
                   paddle::string::to_string(__VAL1),                   \
                   paddle::string::Sprintf("" __VA_ARGS__));            \
    }                                                                   \

--- a/paddle/fluid/platform/enforce_test.cc
+++ b/paddle/fluid/platform/enforce_test.cc
@@ -54,7 +54,9 @@ TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) {
    PADDLE_ENFORCE_EQ(a, 1 + 3);
  } catch (paddle::platform::EnforceNotMet error) {
    caught_exception = true;
-    HasPrefix(StringPiece(error.what()), "enforce a == 1 + 3 failed, 2 != 4");
+    HasPrefix(
+        StringPiece(error.what()),
+        "Enforce failed. Expected a == 1 + 3, but received a:2 != 1 + 3:4.");
  }
  EXPECT_TRUE(caught_exception);
 }
@@ -67,7 +69,8 @@ TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) {
  } catch (paddle::platform::EnforceNotMet error) {
    caught_exception = true;
    HasPrefix(StringPiece(error.what()),
-              "enforce a == 1 + 3 failed, 2 != 4\ntheir size not match");
+              "Enforce failed. Expected a == 1 + 3, but received a:2 != 1 + "
+              "3:4.\ntheir size not match");
  }
  EXPECT_TRUE(caught_exception);
 }
@@ -84,8 +87,9 @@ TEST(ENFORCE_NE, FAIL) {
    PADDLE_ENFORCE_NE(1.0, 1UL);
  } catch (paddle::platform::EnforceNotMet error) {
    caught_exception = true;
-    EXPECT_TRUE(HasPrefix(StringPiece(error.what()),
+    EXPECT_TRUE(HasPrefix(
-                          "enforce 1.0 != 1UL failed, 1 == 1"))
+        StringPiece(error.what()),
+        "Enforce failed. Expected 1.0 != 1UL, but received 1.0:1 == 1UL:1."))
        << error.what() << " does not have expected prefix";
  }
  EXPECT_TRUE(caught_exception);
@@ -98,8 +102,9 @@ TEST(ENFORCE_GT, FAIL) {
    PADDLE_ENFORCE_GT(1, 2UL);
  } catch (paddle::platform::EnforceNotMet error) {
    caught_exception = true;
-    EXPECT_TRUE(
+    EXPECT_TRUE(HasPrefix(
-        HasPrefix(StringPiece(error.what()), "enforce 1 > 2UL failed, 1 <= 2"));
+        StringPiece(error.what()),
+        "Enforce failed. Expected 1 > 2UL, but received 1:1 <= 2UL:2."));
  }
  EXPECT_TRUE(caught_exception);
 }
@@ -116,8 +121,9 @@ TEST(ENFORCE_GE, FAIL) {
    PADDLE_ENFORCE_GE(1, 2UL);
  } catch (paddle::platform::EnforceNotMet error) {
    caught_exception = true;
-    EXPECT_TRUE(
+    EXPECT_TRUE(HasPrefix(
-        HasPrefix(StringPiece(error.what()), "enforce 1 >= 2UL failed, 1 < 2"));
+        StringPiece(error.what()),
+        "Enforce failed. Expected 1 >= 2UL, but received 1:1 < 2UL:2."));
  }
  EXPECT_TRUE(caught_exception);
 }
@@ -135,8 +141,9 @@ TEST(ENFORCE_LE, FAIL) {
    PADDLE_ENFORCE_GT(1, 2UL);
  } catch (paddle::platform::EnforceNotMet error) {
    caught_exception = true;
-    EXPECT_TRUE(
+    EXPECT_TRUE(HasPrefix(
-        HasPrefix(StringPiece(error.what()), "enforce 1 > 2UL failed, 1 <= 2"));
+        StringPiece(error.what()),
+        "Enforce failed. Expected 1 > 2UL, but received 1:1 <= 2UL:2."));
  }
  EXPECT_TRUE(caught_exception);
 }
@@ -153,7 +160,8 @@ TEST(ENFORCE_LT, FAIL) {
  } catch (paddle::platform::EnforceNotMet error) {
    caught_exception = true;
    EXPECT_TRUE(HasPrefix(StringPiece(error.what()),
-                          "enforce 1UL < 0.12 failed, 1 >= 0.12"));
+                          "Enforce failed. Expected 1UL < 0.12, but "
+                          "received 1UL:1 >= 0.12:0.12."));
  }
  EXPECT_TRUE(caught_exception);
 }

--- a/paddle/fluid/platform/gpu_info.cc
+++ b/paddle/fluid/platform/gpu_info.cc
@@ -116,7 +116,8 @@ size_t GpuMaxChunkSize() {
  size_t allocating = static_cast<size_t>(FLAGS_fraction_of_gpu_memory_to_use *
                                          (total - reserving));
-  PADDLE_ENFORCE_LE(allocating, available);
+  PADDLE_ENFORCE_LE(allocating, available,
+                    "Insufficient GPU memory to allocation.");
  return allocating;
 }

--- a/paddle/fluid/pybind/protobuf.cc
+++ b/paddle/fluid/pybind/protobuf.cc
@@ -205,12 +205,7 @@ void BindBlockDesc(pybind11::module *m) {
 void BindVarDsec(pybind11::module *m) {
  pybind11::class_<pd::VarDesc> var_desc(*m, "VarDesc", "");
  var_desc
-      .def("name",
+      .def("name", &pd::VarDesc::Name, pybind11::return_value_policy::reference)
-           [](pd::VarDesc &self) {
-             pybind11::bytes name = self.Name();
-             return name;
-           },
-           pybind11::return_value_policy::reference)
      .def("set_name", &pd::VarDesc::SetName)
      .def("set_shape", &pd::VarDesc::SetShape)
      .def("set_shapes", &pd::VarDesc::SetShapes)

--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -54,6 +54,8 @@ limitations under the License. */
 #include "paddle/fluid/platform/gpu_info.h"
 #endif
+#include "pybind11/stl.h"
 // disable auto conversion to list in Python
 PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
@@ -247,6 +249,7 @@ PYBIND11_PLUGIN(core) {
        self.set_rows(new_rows);
 #endif
           })
+      .def("sync_index", [](SelectedRows &instance) { instance.SyncIndex(); })
      .def("rows", [](SelectedRows &self) {
        auto rows = self.rows();
        std::vector<int64_t> new_rows;

--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -97,10 +97,11 @@ if(APPLE)
  if(NOT INSTALL_NAME_TOOL_EXECUTABLE)
    message(FATAL_ERROR "install_name_tool not found, please check.\n")
  endif()
-else(APPLE)
+endif()
+if(LINUX)
  find_program(PATCHELF_EXECUTABLE patchelf)
  if(NOT PATCHELF_EXECUTABLE)
    message(FATAL_ERROR "patchelf not found, please install it.\n"
            "For Ubuntu, the command is: apt-get install -y patchelf.")
  endif()
-endif(APPLE)
+endif(LINUX)
--- a/python/paddle/__init__.py
+++ b/python/paddle/__init__.py
@@ -24,4 +24,5 @@ except ImportError:
 import paddle.reader
 import paddle.dataset
 import paddle.batch
+import paddle.compat
 batch = batch.batch
--- a/python/paddle/compat.py
+++ b/python/paddle/compat.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+import six
+import math
+__all__ = [
+    'long_type',
+    'to_text',
+    'to_bytes',
+    'round',
+    'floor_division',
+    'get_exception_message',
+]
+if six.PY2:
+    int_type = int
+    long_type = long
+else:
+    int_type = int
+    long_type = int
+#  str and bytes related functions
+def to_text(obj, encoding='utf-8', inplace=False):
+    """
+      All string in PaddlePaddle should be represented as a literal string.
+    This function converts an object to a text string. If the object is a
+    list or a set, every item in it is converted to a text string.
+    In Python3:
+        Decode the bytes type object to str type with specific encoding
+    In Python2:
+        Decode the str type object to unicode type with specific encoding
+    Args:
+        obj(unicode|str|bytes|list|set) : The object to be decoded.
+        encoding(str) : The encoding format to decode a string
+        inplace(bool) : Only used for list and set inputs. If True, the
+            container is modified and returned; otherwise a new container
+            is returned.
+    Returns:
+        Decoded result of obj (None is returned unchanged)
+    """
+    if obj is None:
+        return obj
+    if isinstance(obj, list):
+        if inplace:
+            for i in six.moves.xrange(len(obj)):
+                obj[i] = _to_text(obj[i], encoding)
+            return obj
+        else:
+            return [_to_text(item, encoding) for item in obj]
+    elif isinstance(obj, set):
+        if inplace:
+            # Convert everything first and only then replace the contents:
+            # removing/adding items while iterating over the same set may
+            # skip items, visit them twice, or raise RuntimeError.
+            converted = [_to_text(item, encoding) for item in obj]
+            obj.clear()
+            obj.update(converted)
+            return obj
+        else:
+            return set([_to_text(item, encoding) for item in obj])
+    else:
+        return _to_text(obj, encoding)
+def _to_text(obj, encoding):
+    """
+    Convert a single (non-container) object to a text string.
+    In Python3:
+        Decode the bytes type object to str type with specific encoding
+    In Python2:
+        Decode the str type object to unicode type with specific encoding,
+        or we just return the unicode string of object
+    Args:
+        obj(unicode|str|bytes) : The object to be decoded.
+        encoding(str) : The encoding format
+    Returns:
+        decoded result of obj (None is returned unchanged)
+    """
+    if obj is None:
+        return obj
+    if isinstance(obj, six.binary_type):
+        return obj.decode(encoding)
+    elif isinstance(obj, six.text_type):
+        # Already text: returned as is.
+        return obj
+    else:
+        # Neither bytes nor text: delegate to six.u().
+        return six.u(obj)
+def to_bytes(obj, encoding='utf-8', inplace=False):
+    """
+      All string in PaddlePaddle should be represented as a literal string.
+    This function will convert object to a bytes with specific encoding.
+    Especially, if the object type is a list or set container, we will iterate
+    all items in the object and convert them to bytes.
+    In Python3:
+        Encode the str type object to bytes type with specific encoding
+    In Python2:
+        Encode the unicode type object to str type with specific encoding,
+        or we just return the 8-bit string of object
+    Args:
+        obj(unicode|str|bytes|list|set) : The object to be encoded.
+        encoding(str) : The encoding format to encode a string
+        inplace(bool) : Only used for list and set inputs. If True, the
+            container is modified and returned; otherwise a new container
+            is returned.
+    Returns:
+        Encoded result of obj (None is returned unchanged)
+    """
+    if obj is None:
+        return obj
+    if isinstance(obj, list):
+        if inplace:
+            for i in six.moves.xrange(len(obj)):
+                obj[i] = _to_bytes(obj[i], encoding)
+            return obj
+        else:
+            return [_to_bytes(item, encoding) for item in obj]
+    elif isinstance(obj, set):
+        if inplace:
+            # Convert everything first and only then replace the contents:
+            # removing/adding items while iterating over the same set may
+            # skip items, visit them twice, or raise RuntimeError.
+            converted = [_to_bytes(item, encoding) for item in obj]
+            obj.clear()
+            obj.update(converted)
+            return obj
+        else:
+            return set([_to_bytes(item, encoding) for item in obj])
+    else:
+        return _to_bytes(obj, encoding)
+def _to_bytes(obj, encoding):
+    """
+    Convert a single (non-container) object to bytes.
+    In Python3:
+        Encode the str type object to bytes type with specific encoding
+    In Python2:
+        Encode the unicode type object to str type with specific encoding,
+        or we just return the 8-bit string of object
+    Args:
+        obj(unicode|str|bytes) : The object to be encoded.
+        encoding(str) : The encoding format; must not be None
+    Returns:
+        encoded result of obj (None is returned unchanged)
+    """
+    if obj is None:
+        return obj
+    assert encoding is not None
+    if isinstance(obj, six.text_type):
+        return obj.encode(encoding)
+    elif isinstance(obj, six.binary_type):
+        # Already bytes: returned as is.
+        return obj
+    else:
+        # Neither text nor bytes: delegate to six.b().
+        return six.b(obj)
+# math related functions
+def round(x, d=0):
+    """
+    Compatible round that gives Python3 callers a hand-written rounding and
+    delegates to the builtin round in Python2.
+    In Python3 the result is computed as floor(x * 10**d + 0.5) / 10**d for
+    positive x and ceil(x * 10**d - 0.5) / 10**d for negative x, i.e. ties
+    are rounded away from zero; a zero input returns a zero with x's sign.
+    Args:
+        x(float) : The number to round.
+        d(int) : The number of decimal digits to keep. [default 0]
+    Returns:
+        round result of x
+    """
+    if six.PY3:
+        # The official workaround of round in Python3 is incorrect;
+        # we implement it according to this answer: https://www.techforgeek.info/round_python.html
+        if x > 0.0:
+            p = 10**d
+            return float(math.floor((x * p) + math.copysign(0.5, x))) / p
+        elif x < 0.0:
+            p = 10**d
+            return float(math.ceil((x * p) + math.copysign(0.5, x))) / p
+        else:
+            # Keeps the sign of the zero (0.0 vs -0.0).
+            return math.copysign(0.0, x)
+    else:
+        import __builtin__
+        return __builtin__.round(x, d)
+def floor_division(x, y):
+    """
+    Floor division that is spelled the same way in Python3 and Python2:
+    it always evaluates x // y, so the result does not depend on the
+    meaning of the / operator in the running interpreter.
+    Args:
+        x(int|float) : The number to be divided (dividend).
+        y(int|float) : The number to divide by (divisor).
+    Returns:
+        division result of x // y
+    """
+    return x // y
+# exception related functions
+def get_exception_message(exc):
+    """
+    Get the error message of a specific exception
+    Args:
+        exc(Exception) : The exception to get error message; must not be None.
+    Returns:
+        the error message of exc: exc.message in Python2, str(exc) otherwise
+    """
+    assert exc is not None
+    if six.PY2:
+        return exc.message
+    else:
+        return str(exc)
--- a/python/paddle/dataset/cifar.py
+++ b/python/paddle/dataset/cifar.py
@@ -28,11 +28,13 @@ images per class.
 """
+from __future__ import print_function
 import itertools
 import numpy
 import paddle.dataset.common
 import tarfile
-from six.moves import zip
+import six
 from six.moves import cPickle as pickle
 __all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
@@ -46,10 +48,11 @@ CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
 def reader_creator(filename, sub_name, cycle=False):
    def read_batch(batch):
-        data = batch['data']
+        data = batch[six.b('data')]
-        labels = batch.get('labels', batch.get('fine_labels', None))
+        labels = batch.get(
+            six.b('labels'), batch.get(six.b('fine_labels'), None))
        assert labels is not None
-        for sample, label in zip(data, labels):
+        for sample, label in six.moves.zip(data, labels):
            yield (sample / 255.0).astype(numpy.float32), int(label)
    def reader():
@@ -59,7 +62,11 @@ def reader_creator(filename, sub_name, cycle=False):
            while True:
                for name in names:
-                    batch = pickle.load(f.extractfile(name))
+                    if six.PY2:
+                        batch = pickle.load(f.extractfile(name))
+                    else:
+                        batch = pickle.load(
+                            f.extractfile(name), encoding='bytes')
                    for item in read_batch(batch):
                        yield item
                if not cycle:

--- a/python/paddle/dataset/common.py
+++ b/python/paddle/dataset/common.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import requests
 import hashlib
 import os
@@ -85,10 +87,10 @@ def download(url, module_name, md5sum, save_name=None):
        total_length = r.headers.get('content-length')
        if total_length is None:
-            with open(filename, 'w') as f:
+            with open(filename, 'wb') as f:
                shutil.copyfileobj(r.raw, f)
        else:
-            with open(filename, 'w') as f:
+            with open(filename, 'wb') as f:
                dl = 0
                total_length = int(total_length)
                for data in r.iter_content(chunk_size=4096):

--- a/python/paddle/dataset/conll05.py
+++ b/python/paddle/dataset/conll05.py
@@ -20,15 +20,18 @@ dataset. And a pre-trained word vector model based on Wikipedia corpus is used
 to initialize SRL model.
 """
+from __future__ import print_function
 import tarfile
 import gzip
 import itertools
 import paddle.dataset.common
-from six.moves import zip
+import paddle.compat as cpt
+from six.moves import zip, range
 __all__ = ['test, get_dict', 'get_embedding', 'convert']
-DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
+DATA_URL = 'http://paddlemodels.bj.bcebos.com/conll05st/conll05st-tests.tar.gz'
 DATA_MD5 = '387719152ae52d60422c016e92a742fc'
 WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt'
 WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
@@ -89,8 +92,8 @@ def corpus_reader(data_path, words_name, props_name):
            labels = []
            one_seg = []
            for word, label in zip(words_file, props_file):
-                word = word.strip()
+                word = cpt.to_text(word.strip())
-                label = label.strip().split()
+                label = cpt.to_text(label.strip().split())
                if len(label) == 0:  # end of sentence
                    for i in range(len(one_seg[0])):

--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -28,6 +28,9 @@ Graphics and Image Processing (2008)
 http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
 """
+from __future__ import print_function
 import itertools
 import functools
 from .common import download
@@ -116,7 +119,7 @@ def reader_creator(data_file,
            for file in open(file_list):
                file = file.strip()
                batch = None
-                with open(file, 'r') as f:
+                with open(file, 'rb') as f:
                    batch = pickle.load(f)
                data = batch['data']
                labels = batch['label']

--- a/python/paddle/dataset/image.py
+++ b/python/paddle/dataset/image.py
@@ -29,10 +29,18 @@ the image layout as follows.
  formats can be used for training. Noted that, the format should
-  be keep consistent between the training and inference peroid.
+  be kept consistent between the training and inference period.
 """
+from __future__ import print_function
 import numpy as np
 try:
    import cv2
 except ImportError:
+    import sys
+    sys.stderr.write(
+        '''Warning with paddle image module: opencv-python should be imported,
+    or paddle image module could NOT work; please install opencv-python first.'''
+    )
    cv2 = None
 import os
 import tarfile
@@ -56,7 +64,7 @@ def batch_images_from_tar(data_file,
    :type data_file: string
    :param dataset_name: 'train','test' or 'valid'
    :type dataset_name: string
-    :param img2label: a dic with image file name as key 
+    :param img2label: a dic with image file name as key
                    and image's label as value
    :type img2label: dic
    :param num_per_batch: image number per batch file
@@ -88,7 +96,7 @@ def batch_images_from_tar(data_file,
                output['data'] = data
                pickle.dump(
                    output,
-                    open('%s/batch_%d' % (out_path, file_id), 'w'),
+                    open('%s/batch_%d' % (out_path, file_id), 'wb'),
                    protocol=pickle.HIGHEST_PROTOCOL)
                file_id += 1
                data = []
@@ -99,7 +107,7 @@ def batch_images_from_tar(data_file,
        output['data'] = data
        pickle.dump(
            output,
-            open('%s/batch_%d' % (out_path, file_id), 'w'),
+            open('%s/batch_%d' % (out_path, file_id), 'wb'),
            protocol=pickle.HIGHEST_PROTOCOL)
    with open(meta_file, 'a') as meta:
@@ -113,7 +121,7 @@ def load_image_bytes(bytes, is_color=True):
-    Load an color or gray image from bytes array.
+    Load a color or gray image from bytes array.
    Example usage:
    .. code-block:: python
        with open('cat.jpg') as f:
@@ -126,6 +134,8 @@ def load_image_bytes(bytes, is_color=True):
                     load and return a gray image.
    :type is_color: bool
    """
+    assert cv2 is not None
    flag = 1 if is_color else 0
    file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
    img = cv2.imdecode(file_bytes, flag)
@@ -137,7 +147,7 @@ def load_image(file, is_color=True):
-    Load an color or gray image from the file path.
+    Load a color or gray image from the file path.
    Example usage:
    .. code-block:: python
        im = load_image('cat.jpg')
@@ -149,6 +159,8 @@ def load_image(file, is_color=True):
                     load and return a gray image.
    :type is_color: bool
    """
+    assert cv2 is not None
    # cv2.IMAGE_COLOR for OpenCV3
    # cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version
    # cv2.IMAGE_GRAYSCALE for OpenCV3
@@ -161,27 +173,29 @@ def load_image(file, is_color=True):
 def resize_short(im, size):
-    """ 
+    """
    Resize an image so that the length of shorter edge is size.
    Example usage:
    .. code-block:: python
        im = load_image('cat.jpg')
        im = resize_short(im, 256)
    :param im: the input image with HWC layout.
    :type im: ndarray
    :param size: the shorter edge size of image after resizing.
    :type size: int
+    :return: the resized image; the longer edge is scaled by the same ratio.
    """
+    assert cv2 is not None
    h, w = im.shape[:2]
    h_new, w_new = size, size
    if h > w:
-        h_new = size * h / w
+        h_new = size * h // w
    else:
-        w_new = size * w / h
+        w_new = size * w // h
-    im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC)
+    # cv2.resize takes dsize as (width, height), so the width comes first.
+    im = cv2.resize(im, (w_new, h_new), interpolation=cv2.INTER_CUBIC)
    return im
@@ -193,17 +207,17 @@ def to_chw(im, order=(2, 0, 1)):
    according the order (2,0,1).
    Example usage:
    .. code-block:: python
        im = load_image('cat.jpg')
        im = resize_short(im, 256)
        im = to_chw(im)
    :param im: the input image with HWC layout.
    :type im: ndarray
    :param order: the transposed order.
-    :type order: tuple|list 
+    :type order: tuple|list
    """
    assert len(im.shape) == len(order)
    im = im.transpose(order)
@@ -215,11 +229,11 @@ def center_crop(im, size, is_color=True):
    Crop the center of image with size.
    Example usage:
    .. code-block:: python
        im = center_crop(im, 224)
    :param im: the input image with HWC layout.
    :type im: ndarray
    :param size: the cropping size.
@@ -228,8 +242,8 @@ def center_crop(im, size, is_color=True):
    :type is_color: bool
    """
    h, w = im.shape[:2]
-    h_start = (h - size) / 2
+    h_start = (h - size) // 2
-    w_start = (w - size) / 2
+    w_start = (w - size) // 2
    h_end, w_end = h_start + size, w_start + size
    if is_color:
        im = im[h_start:h_end, w_start:w_end, :]
@@ -243,11 +257,11 @@ def random_crop(im, size, is_color=True):
    Randomly crop input image with size.
    Example usage:
    .. code-block:: python
        im = random_crop(im, 224)
    :param im: the input image with HWC layout.
    :type im: ndarray
    :param size: the cropping size.
@@ -272,11 +286,11 @@ def left_right_flip(im, is_color=True):
    Return the flipped image.
    Example usage:
    .. code-block:: python
        im = left_right_flip(im)
    :param im: input image with HWC layout or HW layout for gray image
    :type im: ndarray
    :param is_color: whether input image is color or not
@@ -299,7 +313,7 @@ def simple_transform(im,
-    resizing, croping and flipping.
+    resizing, cropping and flipping.
    Example usage:
    .. code-block:: python
        im = simple_transform(im, 256, 224, True)
@@ -314,7 +328,7 @@ def simple_transform(im,
    :type is_train: bool
    :param is_color: whether the image is color or not.
    :type is_color: bool
-    :param mean: the mean values, which can be element-wise mean values or 
+    :param mean: the mean values, which can be element-wise mean values or
                 mean values per channel.
    :type mean: numpy array | list
    """
@@ -332,7 +346,7 @@ def simple_transform(im,
    im = im.astype('float32')
    if mean is not None:
        mean = np.array(mean, dtype=np.float32)
-        # mean value, may be one value per channel 
+        # mean value, may be one value per channel
        if mean.ndim == 1 and is_color:
            mean = mean[:, np.newaxis, np.newaxis]
        elif mean.ndim == 1:
@@ -357,7 +371,7 @@ def load_and_transform(filename,
    for the transform operations.
    Example usage:
    .. code-block:: python
        im = load_and_transform('cat.jpg', 256, 224, True)
@@ -372,7 +386,7 @@ def load_and_transform(filename,
    :type is_train: bool
    :param is_color: whether the image is color or not.
    :type is_color: bool
-    :param mean: the mean values, which can be element-wise mean values or 
+    :param mean: the mean values, which can be element-wise mean values or
                 mean values per channel.
    :type mean: numpy array | list
    """

--- a/python/paddle/dataset/imdb.py
+++ b/python/paddle/dataset/imdb.py
@@ -20,11 +20,14 @@ of 25,000 highly polar movie reviews for training, and 25,000 for testing.
 Besides, this module also provides API for building dictionary.
 """
+from __future__ import print_function
 import paddle.dataset.common
 import collections
 import tarfile
 import re
 import string
+import six
 __all__ = ['build_dict', 'train', 'test', 'convert']
@@ -42,13 +45,14 @@ def tokenize(pattern):
        # sequential access of member files, other than
        # tarfile.extractfile, which does random access and might
        # destroy hard disks.
-        tf = next(tarf)
+        tf = tarf.next()
        while tf != None:
            if bool(pattern.match(tf.name)):
                # newline and punctuations removal and ad-hoc tokenization.
-                yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
+                yield tarf.extractfile(tf).read().rstrip(six.b(
-                    None, string.punctuation).lower().split()
+                    "\n\r")).translate(
-            tf = next(tarf)
+                        None, six.b(string.punctuation)).lower().split()
+            tf = tarf.next()
 def build_dict(pattern, cutoff):
@@ -62,11 +66,11 @@ def build_dict(pattern, cutoff):
            word_freq[word] += 1
    # Not sure if we should prune less-frequent words here.
-    word_freq = [x for x in list(word_freq.items()) if x[1] > cutoff]
+    word_freq = [x for x in six.iteritems(word_freq) if x[1] > cutoff]
    dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
    words, _ = list(zip(*dictionary))
-    word_idx = dict(list(zip(words, list(range(len(words))))))
+    word_idx = dict(list(zip(words, six.moves.range(len(words)))))
    word_idx['<unk>'] = len(words)
    return word_idx

--- a/python/paddle/dataset/imikolov.py
+++ b/python/paddle/dataset/imikolov.py
@@ -14,13 +14,17 @@
 """
 imikolov's simple dataset.
-This module will download dataset from 
+This module will download dataset from
 http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
 into paddle reader creators.
 """
+from __future__ import print_function
 import paddle.dataset.common
 import collections
 import tarfile
+import six
 __all__ = ['train', 'test', 'build_dict', 'convert']
@@ -64,11 +68,13 @@ def build_dict(min_word_freq=50):
            # remove <unk> for now, since we will set it as last index
            del word_freq['<unk>']
-        word_freq = [x for x in list(word_freq.items()) if x[1] > min_word_freq]
+        word_freq = [
+            x for x in six.iteritems(word_freq) if x[1] > min_word_freq
+        ]
        word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
        words, _ = list(zip(*word_freq_sorted))
-        word_idx = dict(list(zip(words, list(range(len(words))))))
+        word_idx = dict(list(zip(words, six.moves.range(len(words)))))
        word_idx['<unk>'] = len(words)
    return word_idx
@@ -89,7 +95,7 @@ def reader_creator(filename, word_idx, n, data_type):
                    l = ['<s>'] + l.strip().split() + ['<e>']
                    if len(l) >= n:
                        l = [word_idx.get(w, UNK) for w in l]
-                        for i in range(n, len(l) + 1):
+                        for i in six.moves.range(n, len(l) + 1):
                            yield tuple(l[i - n:i])
                elif DataType.SEQ == data_type:
                    l = l.strip().split()

--- a/python/paddle/dataset/mnist.py
+++ b/python/paddle/dataset/mnist.py
@@ -17,10 +17,16 @@ MNIST dataset.
 This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
 parse training set and test set into paddle reader creators.
 """
+from __future__ import print_function
 import paddle.dataset.common
 import subprocess
 import numpy
 import platform
+import six
+import tempfile
+from six.moves import range
 __all__ = ['train', 'test', 'convert']
 URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/'
@@ -45,23 +51,28 @@ def reader_creator(image_filename, label_filename, buffer_size):
        # According to http://stackoverflow.com/a/38061619/724872, we
        # cannot use standard package gzip here.
-        m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE)
+        tmp_image_file = tempfile.TemporaryFile(prefix='paddle_dataset')
-        m.stdout.read(16)  # skip some magic bytes
+        m = subprocess.Popen(
+            [zcat_cmd, image_filename], stdout=tmp_image_file).communicate()
+        tmp_image_file.seek(16)  # skip some magic bytes
-        l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE)
+        # Python3 will not take stdout as file
-        l.stdout.read(8)  # skip some magic bytes
+        tmp_label_file = tempfile.TemporaryFile(prefix='paddle_dataset')
+        l = subprocess.Popen(
+            [zcat_cmd, label_filename], stdout=tmp_label_file).communicate()
+        tmp_label_file.seek(8)  # skip some magic bytes
        try:  # reader could be break.
            while True:
                labels = numpy.fromfile(
-                    l.stdout, 'ubyte', count=buffer_size).astype("int")
+                    tmp_label_file, 'ubyte', count=buffer_size).astype("int")
                if labels.size != buffer_size:
                    break  # numpy.fromfile returns empty slice after EOF.
                images = numpy.fromfile(
-                    m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape(
+                    tmp_image_file, 'ubyte', count=buffer_size * 28 *
-                        (buffer_size, 28 * 28)).astype('float32')
+                    28).reshape((buffer_size, 28 * 28)).astype('float32')
                images = images / 255.0 * 2.0 - 1.0

--- a/python/paddle/dataset/movielens.py
+++ b/python/paddle/dataset/movielens.py
@@ -22,11 +22,15 @@ set and test set into paddle reader creators.
 """
+from __future__ import print_function
 import zipfile
 import paddle.dataset.common
 import re
 import random
 import functools
+import six
+import paddle.compat as cpt
 __all__ = [
    'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
@@ -112,6 +116,7 @@ def __initialize_meta_info__():
                categories_set = set()
                with package.open('ml-1m/movies.dat') as movie_file:
                    for i, line in enumerate(movie_file):
+                        line = cpt.to_text(line, encoding='latin')
                        movie_id, title, categories = line.strip().split('::')
                        categories = categories.split('|')
                        for c in categories:
@@ -136,6 +141,7 @@ def __initialize_meta_info__():
                USER_INFO = dict()
                with package.open('ml-1m/users.dat') as user_file:
                    for line in user_file:
+                        line = cpt.to_text(line, encoding='latin')
                        uid, gender, age, job, _ = line.strip().split("::")
                        USER_INFO[int(uid)] = UserInfo(
                            index=uid, gender=gender, age=age, job_id=job)
@@ -148,6 +154,7 @@ def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
    with zipfile.ZipFile(file=fn) as package:
        with package.open('ml-1m/ratings.dat') as rating:
            for line in rating:
+                line = cpt.to_text(line, encoding='latin')
                if (rand.random() < test_ratio) == is_test:
                    uid, mov_id, rating, _ = line.strip().split("::")
                    uid = int(uid)
@@ -187,7 +194,7 @@ def max_movie_id():
    Get the maximum value of movie id.
    """
    __initialize_meta_info__()
-    return reduce(__max_index_info__, list(MOVIE_INFO.values())).index
+    return six.moves.reduce(__max_index_info__, list(MOVIE_INFO.values())).index
 def max_user_id():
@@ -195,7 +202,7 @@ def max_user_id():
    Get the maximum value of user id.
    """
    __initialize_meta_info__()
-    return reduce(__max_index_info__, list(USER_INFO.values())).index
+    return six.moves.reduce(__max_index_info__, list(USER_INFO.values())).index
 def __max_job_id_impl__(a, b):
@@ -210,7 +217,8 @@ def max_job_id():
    Get the maximum value of job id.
    """
    __initialize_meta_info__()
-    return reduce(__max_job_id_impl__, list(USER_INFO.values())).job_id
+    return six.moves.reduce(__max_job_id_impl__,
+                            list(USER_INFO.values())).job_id
 def movie_categories():

--- a/python/paddle/dataset/mq2007.py
+++ b/python/paddle/dataset/mq2007.py
@@ -23,6 +23,8 @@ http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ20
 """
+from __future__ import print_function
 import os
 import functools
 import rarfile

--- a/python/paddle/dataset/sentiment.py
+++ b/python/paddle/dataset/sentiment.py
@@ -20,6 +20,9 @@ The script fetch and preprocess movie_reviews data set that provided by NLTK
 TODO(yuyang18): Complete dataset.
 """
+from __future__ import print_function
+import six
 import collections
 from itertools import chain
@@ -64,7 +67,9 @@ def get_word_dict():
        for field in movie_reviews.fileids(category):
            for words in movie_reviews.words(field):
                word_freq_dict[words] += 1
-    words_sort_list = list(word_freq_dict.items())
-    words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
+    # six.iteritems() yields an iterator, which has no .sort(); build the
+    # list with sorted(). key=/reverse= replaces cmp=, which Python 3 dropped.
+    words_sort_list = sorted(
+        six.iteritems(word_freq_dict), key=lambda kv: kv[1], reverse=True)
    for index, word in enumerate(words_sort_list):
        words_freq_sorted.append((word[0], index))

--- a/python/paddle/dataset/tests/cifar_test.py
+++ b/python/paddle/dataset/tests/cifar_test.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import paddle.dataset.cifar
 import unittest

--- a/python/paddle/dataset/tests/common_test.py
+++ b/python/paddle/dataset/tests/common_test.py
@@ -12,10 +12,13 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import paddle.dataset.common
 import unittest
 import tempfile
 import glob
+from six.moves import range
 class TestCommon(unittest.TestCase):

--- a/python/paddle/dataset/tests/flowers_test.py
+++ b/python/paddle/dataset/tests/flowers_test.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import paddle.dataset.flowers
 import unittest

--- a/python/paddle/dataset/tests/imdb_test.py
+++ b/python/paddle/dataset/tests/imdb_test.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import paddle.dataset.imdb
 import unittest
 import re

--- a/python/paddle/dataset/tests/imikolov_test.py
+++ b/python/paddle/dataset/tests/imikolov_test.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import paddle.dataset.imikolov
 import unittest

--- a/python/paddle/dataset/tests/mnist_test.py
+++ b/python/paddle/dataset/tests/mnist_test.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import paddle.dataset.mnist
 import unittest

--- a/python/paddle/dataset/tests/mq2007_test.py
+++ b/python/paddle/dataset/tests/mq2007_test.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import paddle.dataset.mq2007
 import unittest

--- a/python/paddle/dataset/tests/test_image.py
+++ b/python/paddle/dataset/tests/test_image.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import unittest
 import numpy as np

--- a/python/paddle/dataset/tests/test_sentiment.py
+++ b/python/paddle/dataset/tests/test_sentiment.py
@@ -15,6 +15,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import unittest
 import nltk
 import paddle.dataset.sentiment as st

--- a/python/paddle/dataset/tests/voc2012_test.py
+++ b/python/paddle/dataset/tests/voc2012_test.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import paddle.dataset.voc2012
 import unittest

--- a/python/paddle/dataset/tests/wmt16_test.py
+++ b/python/paddle/dataset/tests/wmt16_test.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import paddle.dataset.wmt16
 import unittest

--- a/python/paddle/dataset/uci_housing.py
+++ b/python/paddle/dataset/uci_housing.py
@@ -19,9 +19,10 @@ https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
 parse training set and test set into paddle reader creators.
 """
-import os
+from __future__ import print_function
 import numpy as np
+import six
 import tempfile
 import tarfile
 import os
@@ -70,11 +71,11 @@ def load_data(filename, feature_num=14, ratio=0.8):
        return
    data = np.fromfile(filename, sep=' ')
-    data = data.reshape(data.shape[0] / feature_num, feature_num)
+    data = data.reshape(data.shape[0] // feature_num, feature_num)
    maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
        axis=0) / data.shape[0]
    feature_range(maximums[:-1], minimums[:-1])
-    for i in range(feature_num - 1):
+    for i in six.moves.range(feature_num - 1):
        data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
    offset = int(data.shape[0] * ratio)
    UCI_TRAIN_DATA = data[:offset]
@@ -137,7 +138,7 @@ def predict_reader():
    It returns just one tuple data to do inference.
    :return: one tuple data
-    :rtype: tuple 
+    :rtype: tuple
    """
    global UCI_TEST_DATA
    load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5))

--- a/python/paddle/dataset/voc2012.py
+++ b/python/paddle/dataset/voc2012.py
@@ -19,6 +19,8 @@ to training/test sets has been maintained. The total number of images
 with segmentation has been increased from 7,062 to 9,993.
 """
+from __future__ import print_function
 import tarfile
 import io
 import numpy as np

--- a/python/paddle/dataset/wmt14.py
+++ b/python/paddle/dataset/wmt14.py
@@ -19,10 +19,15 @@ http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
 parse training set and test set into paddle reader creators.
 """
+from __future__ import print_function
+import six
 import tarfile
 import gzip
 import paddle.dataset.common
+import paddle.compat as cpt
 __all__ = [
    'train',
@@ -53,7 +58,7 @@ def __read_to_dict(tar_file, dict_size):
        out_dict = dict()
        for line_count, line in enumerate(fd):
            if line_count < size:
-                out_dict[line.strip()] = line_count
+                out_dict[cpt.to_text(line.strip())] = line_count
            else:
                break
        return out_dict
@@ -84,7 +89,7 @@ def reader_creator(tar_file, file_name, dict_size):
            ]
            for name in names:
                for line in f.extractfile(name):
-                    line_split = line.strip().split('\t')
+                    line_split = line.strip().split(six.b('\t'))
                    if len(line_split) != 2:
                        continue
                    src_seq = line_split[0]  # one source sequence
@@ -153,8 +158,8 @@ def get_dict(dict_size, reverse=True):
    tar_file = paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
    src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
    if reverse:
-        src_dict = {v: k for k, v in list(src_dict.items())}
+        src_dict = {v: k for k, v in six.iteritems(src_dict)}
-        trg_dict = {v: k for k, v in list(trg_dict.items())}
+        trg_dict = {v: k for k, v in six.iteritems(trg_dict)}
    return src_dict, trg_dict

--- a/python/paddle/dataset/wmt16.py
+++ b/python/paddle/dataset/wmt16.py
@@ -28,12 +28,16 @@ Multi30K: Multilingual English-German Image Descriptions.
 }
 """
+from __future__ import print_function
 import os
+import six
 import tarfile
 import gzip
 from collections import defaultdict
 import paddle.dataset.common
+import paddle.compat as cpt
 __all__ = [
    "train",
@@ -60,7 +64,7 @@ def __build_dict(tar_file, dict_size, save_path, lang):
    word_dict = defaultdict(int)
    with tarfile.open(tar_file, mode="r") as f:
        for line in f.extractfile("wmt16/train"):
-            line_split = line.strip().split("\t")
+            line_split = line.strip().split(six.b("\t"))
            if len(line_split) != 2: continue
            sen = line_split[0] if lang == "en" else line_split[1]
            for w in sen.split():
@@ -70,8 +74,7 @@ def __build_dict(tar_file, dict_size, save_path, lang):
        fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK))
        for idx, word in enumerate(
                sorted(
-                    iter(list(word_dict.items())),
+                    six.iteritems(word_dict), key=lambda x: x[1],
-                    key=lambda x: x[1],
                    reverse=True)):
            if idx + 3 == dict_size: break
            fout.write("%s\n" % (word[0]))
@@ -81,16 +84,16 @@ def __load_dict(tar_file, dict_size, lang, reverse=False):
    dict_path = os.path.join(paddle.dataset.common.DATA_HOME,
                             "wmt16/%s_%d.dict" % (lang, dict_size))
    if not os.path.exists(dict_path) or (
-            len(open(dict_path, "r").readlines()) != dict_size):
+            len(open(dict_path, "rb").readlines()) != dict_size):
        __build_dict(tar_file, dict_size, dict_path, lang)
    word_dict = {}
-    with open(dict_path, "r") as fdict:
+    with open(dict_path, "rb") as fdict:
        for idx, line in enumerate(fdict):
            if reverse:
-                word_dict[idx] = line.strip()
+                word_dict[idx] = cpt.to_text(line.strip())
            else:
-                word_dict[line.strip()] = idx
+                word_dict[cpt.to_text(line.strip())] = idx
    return word_dict
@@ -120,7 +123,7 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
        with tarfile.open(tar_file, mode="r") as f:
            for line in f.extractfile(file_name):
-                line_split = line.strip().split("\t")
+                line_split = line.strip().split(six.b("\t"))
                if len(line_split) != 2:
                    continue
                src_words = line_split[src_col].split()

--- a/python/paddle/fluid/__init__.py
+++ b/python/paddle/fluid/__init__.py
@@ -122,7 +122,7 @@ def __bootstrap__():
        'use_pinned_memory', 'check_nan_inf', 'benchmark', 'warpctc_dir',
        'eager_delete_scope', 'use_mkldnn', 'initial_cpu_memory_in_mb',
        'init_allocated_mem', 'free_idle_memory', 'paddle_num_threads',
-        'cpu_deterministic'
+        "dist_threadpool_size", 'cpu_deterministic'
    ]
    if core.is_compiled_with_dist():
        read_env_flags.append('rpc_deadline')

--- a/python/paddle/fluid/average.py
+++ b/python/paddle/fluid/average.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import numpy as np
 import warnings
 """

--- a/python/paddle/fluid/backward.py
+++ b/python/paddle/fluid/backward.py
@@ -12,11 +12,14 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 from paddle.fluid import framework as framework
 from . import core
 import collections
 import copy
 import six
+from .. import compat as cpt
 from . import unique_name
 __all__ = ['append_backward']
@@ -45,13 +48,13 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
    """
    op_desc = core.OpDesc()
    op_desc.set_type(op_type)
-    for para, args in list(inputs.items()):
+    for para, args in six.iteritems(inputs):
        op_desc.set_input(
            para,
            list(
                map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg,
                    args)))
-    for para, args in list(outputs.items()):
+    for para, args in six.iteritems(outputs):
        op_desc.set_output(
            para,
            list(
@@ -63,7 +66,7 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
    if op_role_attr_name not in attrs:
        attrs[
            op_role_attr_name] = core.op_proto_and_checker_maker.OpRole.Backward
-    for name, val in list(attrs.items()):
+    for name, val in six.iteritems(attrs):
        if isinstance(val, framework.Block):
            op_desc.set_block_attr(name, val.desc)
        else:
@@ -75,10 +78,10 @@ def _infer_var_data_type_(grad_var_name, block):
    """
    Infer the data type of given grad variable
    """
-    grad_var = block.desc.find_var(grad_var_name.encode("ascii"))
+    grad_var = block.desc.find_var(cpt.to_bytes(grad_var_name))
-    fwd_name = _strip_grad_suffix_(grad_var_name.encode("ascii"))
+    fwd_name = _strip_grad_suffix_(grad_var_name)
-    if block.desc.has_var_recursive(fwd_name):
+    if block.desc.has_var_recursive(cpt.to_bytes(fwd_name)):
-        fwd_var = block.desc.find_var_recursive(fwd_name.encode("ascii"))
+        fwd_var = block.desc.find_var_recursive(cpt.to_bytes(fwd_name))
        grad_var.set_dtype(fwd_var.dtype())
    else:
        grad_var.set_dtype(core.VarDesc.VarType.FP32)
@@ -102,8 +105,10 @@ def _some_in_set_(cands, s):
    """
    if len(cands) == 0:
        return False
-    for c in cands:
+    literal_set = cpt.to_text(s)
-        if c in s:
+    literal_cands = cpt.to_text(cands)
+    for c in literal_cands:
+        if c in literal_set:
            return True
    return False
@@ -114,9 +119,8 @@ def _strip_grad_suffix_(name):
    e.g. x@GRAD ==> x
         y@GRAD@RENAME@1 ==> y
    """
-    if isinstance(name, six.text_type):
+    name = cpt.to_text(name)
-        name = name.encode()
+    pos = name.find(core.grad_var_suffix())
-    pos = name.find(six.b(core.grad_var_suffix()))
    return name[:pos] if pos != -1 else name
@@ -125,9 +129,7 @@ def _append_grad_suffix_(name):
    Append grad suffix to the given variable name
    e.g. x ==> x@GRAD
    """
-    if isinstance(name, six.text_type):
+    return cpt.to_text(name) + core.grad_var_suffix()
-        name = name.encode()
-    return name + six.b(core.grad_var_suffix())
 def _addup_repetitive_outputs_(op_descs):
@@ -187,7 +189,7 @@ def _addup_repetitive_outputs_(op_descs):
                    op_desc.set_output(param_name, arg_names)
                    renamed_vars[var_name].append(new_name)
-    for var_name, inputs in list(renamed_vars.items()):
+    for var_name, inputs in six.iteritems(renamed_vars):
        if len(inputs) > 1:
            pending_sum_ops.append(
                (_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]},
@@ -243,7 +245,7 @@ from .proto import framework_pb2
 def serialize_op_decs(op_desc):
    protostr = op_desc.serialize_to_string()
-    proto = framework_pb2.OpDesc.FromString(str(protostr))
+    proto = framework_pb2.OpDesc.FromString(six.binary_type(protostr))
    return proto.__str__()
@@ -364,7 +366,7 @@ def _append_backward_ops_(block,
        # Getting op's corresponding grad_op
        grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
-            op.desc, no_grad_dict[block.idx], grad_sub_block_list)
+            op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list)
        grad_op_descs.extend(grad_op_desc)
        grad_to_var.update(op_grad_to_var)
@@ -411,11 +413,10 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
        new_vars = set()
        # create new gradient variables
        for grad_var_name in op_desc.output_arg_names():
-            grad_var_name = grad_var_name.encode("ascii")
+            if block.desc.has_var_recursive(cpt.to_bytes(
-            if block.desc.has_var_recursive(
+                    grad_var_name)) or grad_var_name == core.empty_var_name():
-                    grad_var_name) or grad_var_name == core.empty_var_name():
                continue
-            block.desc.var(grad_var_name)
+            block.desc.var(cpt.to_bytes(grad_var_name))
            new_vars.add(grad_var_name)
            if grad_var_name not in grad_to_var:
                continue
@@ -445,7 +446,7 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
                op_desc.rename_output(name, new_name)
                var_map[name] = new_name
-    for g, ng in list(var_map.items()):
+    for g, ng in six.iteritems(var_map):
        if g in grad_to_var:
            grad_to_var[ng] = grad_to_var[g]
            grad_to_var.pop(g)
@@ -595,11 +596,14 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
        parameters = parameter_list
    else:
        params = program.global_block().all_parameters()
        parameters = [param.name for param in params]
    params_and_grads = []
    for param in parameters:
-        if param not in grad_info_map:
+        # Use the text form of the name for both the membership test and the
+        # lookup, so a bytes name cannot pass the check and then miss the key.
+        param_key = cpt.to_text(param)
+        if param_key not in grad_info_map:
            continue
-        grad_info = grad_info_map[param]
+        grad_info = grad_info_map[param_key]
        grad_block = grad_info[1]

--- a/python/paddle/fluid/clip.py
+++ b/python/paddle/fluid/clip.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import copy
 import six

--- a/python/paddle/fluid/concurrency.py
+++ b/python/paddle/fluid/concurrency.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 from .layers.control_flow import BlockGuard, equal
 from .framework import Operator
 from .layer_helper import LayerHelper, unique_name

--- a/python/paddle/fluid/contrib/__init__.py
+++ b/python/paddle/fluid/contrib/__init__.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 from . import decoder
 from .decoder import *
 from . import memory_usage_calc

--- a/python/paddle/fluid/contrib/decoder/__init__.py
+++ b/python/paddle/fluid/contrib/decoder/__init__.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 from . import beam_search_decoder
 from .beam_search_decoder import *

--- a/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
+++ b/python/paddle/fluid/contrib/decoder/beam_search_decoder.py
@@ -20,6 +20,8 @@ without using the low level API such as while ops.
 This API is still under active development and may change drastically.
 """
+from __future__ import print_function
 import contextlib
 import numpy as np
 import six

--- a/python/paddle/fluid/contrib/memory_usage_calc.py
+++ b/python/paddle/fluid/contrib/memory_usage_calc.py
@@ -14,12 +14,16 @@
 """
 This module privides a memory usage calculate function for user.
 The purpose of this API is to allow users to estimate memory usage of
-a program under a special batch size, then user can set appropriate 
+a program under a special batch size, then user can set appropriate
-batch size to fully utilize a GPU. 
+batch size to fully utilize a GPU.
 This API is still under active development and may change drastically.
 """
+from __future__ import print_function
+import six
 from .. import core
 from ..framework import Program, Variable
@@ -45,15 +49,15 @@ def memory_usage(program, batch_size):
    Args:
        program(Program): The current Program.
-        batch_size(int): The current input data batch_size.  
+        batch_size(int): The current input data batch_size.
    Returns:
        min_total_memory(float): the estimate memory usage lower bound.
        max_total_memory(float): the estimate memory usage upper bound.
        unit_str(string): the unit of estimate usage result.
    Examples:
        >>> import paddle.fluid as fluid
        >>> lower_usage, upper_usage, unit = fluid.contrib.memory_usage(
                fluid.default_main_program(), batch_size=10)
@@ -72,7 +76,7 @@ def memory_usage(program, batch_size):
    # Get the var_name list of first block and calculate
    total_memory = 0.0
-    for var in program.global_block().vars.itervalues():
+    for var in six.itervalues(program.global_block().vars):
        data_count = 1
        for x in var.shape:
            if x == -1:
@@ -81,10 +85,10 @@ def memory_usage(program, batch_size):
                data_count *= x
        var_memory = data_count * dtype_to_size[var.dtype]
        if DEBUG:
-            print "%s memory usage: %d" % (var.name, var_memory)
+            print("%s memory usage: %d" % (var.name, var_memory))
        total_memory += var_memory
    if DEBUG:
-        print "total memory usage: %.2f" % (total_memory)
+        print("total memory usage: %.2f" % (total_memory))
    # Convert appropriate unit
    unit_str = "B"

--- a/python/paddle/fluid/data_feeder.py
+++ b/python/paddle/fluid/data_feeder.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 from . import core
 import numpy
 import os

--- a/python/paddle/fluid/debugger.py
+++ b/python/paddle/fluid/debugger.py
@@ -12,7 +12,10 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import sys
+import six
 import re
 from .graphviz import GraphPreviewGenerator
 from .proto import framework_pb2
@@ -225,7 +228,7 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
    graph = GraphPreviewGenerator("some graph")
    # collect parameters and args
    protostr = block.desc.serialize_to_string()
-    desc = framework_pb2.BlockDesc.FromString(str(protostr))
+    desc = framework_pb2.BlockDesc.FromString(six.binary_type(protostr))
    def need_highlight(name):
        if highlights is None: return False

--- a/python/paddle/fluid/default_scope_funcs.py
+++ b/python/paddle/fluid/default_scope_funcs.py
@@ -26,6 +26,8 @@ A `scoped_function` will take a `function` as input. That function will be
 invoked in a new local scope.
 """
+from __future__ import print_function
 import paddle.fluid.core
 import threading

--- a/python/paddle/fluid/evaluator.py
+++ b/python/paddle/fluid/evaluator.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import warnings
 import numpy as np

--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import numpy as np
 import contextlib
 import six
@@ -320,8 +322,9 @@ class Executor(object):
        # append fetch_operators
        if not has_fetch_operators(global_block, fetch_list, fetch_var_name):
            for i, var in enumerate(fetch_list):
-                assert isinstance(var, Variable) or isinstance(var, str), (
+                assert isinstance(var, Variable) or isinstance(
-                    "Wrong type for fetch_list[%s]: %s" % (i, type(var)))
+                    var, six.string_types), (
+                        "Wrong type for fetch_list[%s]: %s" % (i, type(var)))
                global_block.append_op(
                    type='fetch',
                    inputs={'X': [var]},
@@ -346,7 +349,7 @@ class Executor(object):
    def _fetch_data(self, fetch_list, fetch_var_name, scope):
        outs = [
            core.get_fetch_variable(scope, fetch_var_name, i)
-            for i in range(len(fetch_list))
+            for i in six.moves.range(len(fetch_list))
        ]
        return outs

--- a/python/paddle/fluid/framework.py
+++ b/python/paddle/fluid/framework.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import collections
 import contextlib
 import re
@@ -19,6 +21,7 @@ import six
 import numpy as np
+from .. import compat as cpt
 from .proto import framework_pb2
 try:
    from . import core
@@ -27,7 +30,7 @@ except ImportError as e:
        """NOTE: You may need to run \"export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH\"
    if you encounters \"libmkldnn.so not found\" errors. If you have python
    installed in other directory, replace \"/usr/local/lib\" with your own
-    directory. The original error is: \n""" + e.message)
+    directory. The original error is: \n""" + cpt.get_exception_message(e))
 except Exception as e:
    raise e
 from . import unique_name
@@ -87,7 +90,7 @@ def convert_np_dtype_to_dtype_(np_dtype):
    elif dtype == np.uint8:
        return core.VarDesc.VarType.UINT8
    else:
-        raise ValueError("Not supported numpy dtype " + six.binary_type(dtype))
+        raise ValueError("Not supported numpy dtype %s" % dtype)
 def dtype_is_floating(dtype):
@@ -198,11 +201,11 @@ class Variable(object):
        if name is None:
            name = unique_name.generate('_generated_var')
        is_new_var = False
-        name = name if isinstance(name, six.binary_type) else name.encode()
+        name = cpt.to_text(name)
-        self.desc = self.block.desc.find_var(name)
+        self.desc = self.block.desc.find_var(cpt.to_bytes(name))
        if self.desc is None:
-            self.desc = self.block.desc.var(name)
+            self.desc = self.block.desc.var(cpt.to_bytes(name))
            is_new_var = True
        if is_new_var:
@@ -325,7 +328,7 @@ class Variable(object):
    @property
    def name(self):
-        return self.desc.name()
+        return cpt.to_text(self.desc.name())
    @name.setter
    def name(self, new_name):
@@ -531,14 +534,7 @@ class Operator(object):
                        elif isinstance(arg, six.binary_type):
                            in_arg_names.append(arg.decode())
                        else:
-                            if isinstance(arg.name, six.string_types):
+                            in_arg_names.append(cpt.to_text(arg.name))
-                                in_arg_names.append(arg.name)
-                            elif isinstance(arg.name, six.binary_type):
-                                in_arg_names.append(arg.name.decode())
-                            else:
-                                raise TypeError(
-                                    "arguments require unicode, str or bytes, but get %s instead."
-                                    % (type(arg.name)))
                    self.desc.set_input(in_proto.name, in_arg_names)
                else:
                    self.desc.set_input(in_proto.name, [])
@@ -567,14 +563,7 @@ class Operator(object):
                        (out_proto.name, len(out_args)))
                out_arg_names = []
                for arg in out_args:
-                    if isinstance(arg.name, six.string_types):
+                    out_arg_names.append(cpt.to_text(arg.name))
-                        out_arg_names.append(arg.name)
-                    elif isinstance(arg.name, six.binary_type):
-                        out_arg_names.append(arg.name.decode())
-                    else:
-                        raise TypeError(
-                            "arguments require unicode, str or bytes, but get %s instead."
-                            % (type(arg.name)))
                    arg.op = self
                self.desc.set_output(out_proto.name, out_arg_names)
@@ -970,10 +959,9 @@ class Block(object):
            Variable: the Variable with the giving name.
        """
        if not isinstance(name, six.string_types):
-            if not isinstance(name, six.binary_type):
+            raise TypeError(
-                raise TypeError(
+                "var require string as parameter, but get %s instead." %
-                    "var require string as parameter, but get %s instead." %
+                (type(name)))
-                    (type(name)))
        v = self.vars.get(name, None)
        if v is None:
            raise ValueError("var %s not in this block" % name)
@@ -1024,7 +1012,7 @@ class Block(object):
        return list(self.iter_parameters())
    def iter_parameters(self):
-        return (item[1] for item in list(self.vars.items())
+        return (item[1] for item in six.iteritems(self.vars)
                if isinstance(item[1], Parameter))
    def create_var(self, *args, **kwargs):
@@ -1052,6 +1040,9 @@ class Block(object):
        Returns:
            Variable: the Variable with the giving name.
        """
+        name = cpt.to_text(name)
+        new_name = cpt.to_text(new_name)
        if not self.has_var(name):
            raise ValueError("var %s is not in current block" % name)
        v = self.var(name)
@@ -1070,9 +1061,9 @@ class Block(object):
        else:
-            raise ValueError("unsupported var type: %s", type(v))
+            raise ValueError("unsupported var type: %s" % type(v))
        orig_var_type = v.type
-        self.desc._rename_var(name, new_name)
+        self.desc._rename_var(cpt.to_bytes(name), cpt.to_bytes(new_name))
        # NOTE: v is destroyed by C++ after calling _rename_var.
-        d = self.desc.find_var(new_name)
+        d = self.desc.find_var(cpt.to_bytes(new_name))
        if var_type == "Parameter":
            var = Parameter(
                self,
@@ -1103,7 +1094,7 @@ class Block(object):
    def _remove_var(self, name):
        self._sync_with_cpp()
-        self.desc._remove_var(name)
+        self.desc._remove_var(cpt.to_bytes(name))
        del self.vars[name]
    def create_parameter(self, *args, **kwargs):
@@ -1205,7 +1196,7 @@ class Block(object):
        # sync variables removed from c++ end
        for var in list(self.vars.keys()):
-            if not self.desc.find_var(var):
+            if not self.desc.find_var(cpt.to_bytes(var)):
                self.vars.pop(var)
        # sync operators from cpp
@@ -1576,7 +1567,9 @@ class Program(object):
            p.current_block_idx = self.current_block_idx
            p._seed = self._seed
            p.desc = core.ProgramDesc(self.desc)
-            p.blocks = [Block(p, i) for i in xrange(self.desc.num_blocks())]
+            p.blocks = [
+                Block(p, i) for i in six.moves.range(self.desc.num_blocks())
+            ]
            p._current_role = self._current_role
            p._op_role_var = self._op_role_var
@@ -1632,7 +1625,9 @@ class Program(object):
            targets_idx.append([t.block.idx, t.idx])
        res = Program()
        res.desc = core.prune(self.desc, targets_idx)
-        res.blocks = [Block(res, i) for i in range(res.desc.num_blocks())]
+        res.blocks = [
+            Block(res, i) for i in six.moves.range(res.desc.num_blocks())
+        ]
        res._sync_with_cpp()
        return res
@@ -1675,16 +1670,18 @@ class Program(object):
                root_block._remove_op(0, read_op_idx + 1)
            for var in root_block.all_vars():
                if var.type() == core.VarDesc.VarType.READER:
-                    root_block._remove_var(var.name())
+                    root_block._remove_var(cpt.to_bytes(var.name()))
        # change all `is_test` attributes to True
-        for i in range(res.desc.num_blocks()):
+        for i in six.moves.range(res.desc.num_blocks()):
            block = res.desc.block(i)
-            for j in range(block.op_size()):
+            for j in six.moves.range(block.op_size()):
                op = block.op(j)
                if op.has_attr('is_test'):
                    op.set_attr('is_test', True)
-        res.blocks = [Block(res, i) for i in range(res.desc.num_blocks())]
+        res.blocks = [
+            Block(res, i) for i in six.moves.range(res.desc.num_blocks())
+        ]
        res._sync_with_cpp()
        return res
@@ -1704,7 +1701,7 @@ class Program(object):
        """
        p = Program()
        p.desc = core.ProgramDesc(binary_str)
-        p.blocks = [Block(p, i) for i in range(p.desc.num_blocks())]
+        p.blocks = [Block(p, i) for i in six.moves.range(p.desc.num_blocks())]
        p._sync_with_cpp()
        return p

--- a/python/paddle/fluid/graphviz.py
+++ b/python/paddle/fluid/graphviz.py
@@ -12,9 +12,12 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import os
 import random
 import six
+import functools
 import subprocess
 import logging
@@ -105,8 +108,9 @@ class Graph(object):
    def _rank_repr(self):
        ranks = sorted(
-            list(self.rank_groups.items()),
+            six.iteritems(self.rank_groups),
-            cmp=lambda a, b: a[1].priority > b[1].priority)
+            # Order by priority; a bool-returning cmp can never say "less".
+            key=lambda item: item[1].priority)
        repr = []
        for x in ranks:
            repr.append(str(x[1]))
@@ -149,7 +153,7 @@ class Node(object):
            name=self.name,
            label=self.label,
            extra=',' + ','.join("%s=%s" % (key, crepr(value))
-                                 for key, value in list(self.attrs.items()))
+                                 for key, value in six.iteritems(self.attrs))
            if self.attrs else "")
        return reprs
@@ -173,7 +177,7 @@ class Edge(object):
            target=self.target.name,
            extra="" if not self.attrs else
            "[" + ','.join("{}={}".format(attr[0], crepr(attr[1]))
-                           for attr in list(self.attrs.items())) + "]")
+                           for attr in six.iteritems(self.attrs)) + "]")
        return repr

--- a/python/paddle/fluid/inferencer.py
+++ b/python/paddle/fluid/inferencer.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import contextlib
 from . import core

--- a/python/paddle/fluid/initializer.py
+++ b/python/paddle/fluid/initializer.py
@@ -12,10 +12,11 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 from . import framework
 import numpy as np
 import contextlib
-from .framework import convert_np_dtype_to_dtype_
 from .core import VarDesc
 __all__ = [

--- a/python/paddle/fluid/io.py
+++ b/python/paddle/fluid/io.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import os
 import errno
 import time
@@ -603,25 +605,15 @@ def save_inference_model(dirname,
            # "./infer_model".
    """
-    if isinstance(feeded_var_names, six.binary_type):
+    if isinstance(feeded_var_names, six.string_types):
        feeded_var_names = [feeded_var_names]
-    elif isinstance(feeded_var_names, six.text_type):
-        feeded_var_names = [feeded_var_names.encode()]
    else:
        if len(feeded_var_names) > 0:
            # TODO(paddle-dev): polish these code blocks
            if not (bool(feeded_var_names) and all(
-                    isinstance(name, six.binary_type)
+                    isinstance(name, six.string_types)
                    for name in feeded_var_names)):
-                if not (all(
+                raise ValueError("'feed_var_names' should be a list of str.")
-                        isinstance(name, six.text_type)
-                        for name in feeded_var_names)):
-                    raise ValueError(
-                        "'feed_var_names' should be a list of str.")
-                else:
-                    feeded_var_names = [
-                        name.encode() for name in feeded_var_names
-                    ]
    if isinstance(target_vars, Variable):
        target_vars = [target_vars]

--- a/python/paddle/fluid/layer_helper.py
+++ b/python/paddle/fluid/layer_helper.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import copy
 import itertools
 import six
@@ -85,7 +87,7 @@ class LayerHelper(object):
            raise ValueError("parameter number mismatch")
        elif len(param_attr) == 1 and length != 1:
            tmp = [None] * length
-            for i in range(length):
+            for i in six.moves.range(length):
                tmp[i] = copy.deepcopy(param_attr[0])
            param_attr = tmp
        return param_attr

--- a/python/paddle/fluid/layers/__init__.py
+++ b/python/paddle/fluid/layers/__init__.py
@@ -12,6 +12,8 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 from . import ops
 from .ops import *
 from . import nn

--- a/python/paddle/fluid/layers/control_flow.py
+++ b/python/paddle/fluid/layers/control_flow.py
@@ -11,6 +11,8 @@
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
+from __future__ import print_function
 import contextlib
 from .layer_function_generator import autodoc, templatedoc
@@ -22,6 +24,7 @@ from ..initializer import force_init_on_cpu
 from .ops import logical_and, logical_not, logical_or
 import numpy
 import warnings
+import six
 from functools import reduce
 __all__ = [
@@ -602,7 +605,7 @@ class StaticRNN(object):
        boot_memories = []
        pre_memories = []
        memories = []
-        for _, mem in list(self.memories.items()):
+        for _, mem in six.iteritems(self.memories):
            boot_memories.append(mem.init)
            pre_memories.append(mem.pre_mem.name)
            mem_var = rnn_block.var(mem.mem.name)

--- a/python/paddle/fluid/layers/detection.py
+++ b/python/paddle/fluid/layers/detection.py
@@ -15,13 +15,17 @@
 All layers just related to the detection neural network.
 """
+from __future__ import print_function
 from .layer_function_generator import generate_layer_fn
 from .layer_function_generator import autodoc, templatedoc
 from ..layer_helper import LayerHelper
 from . import tensor
 from . import nn
 from . import ops
+from ... import compat as cpt
 import math
+import six
 import numpy
 from functools import reduce
@@ -104,7 +108,7 @@ def rpn_target_assign(loc,
            examples.
    Returns:
-        tuple: 
+        tuple:
               A tuple(predicted_scores, predicted_location, target_label,
               target_bbox) is returned. The predicted_scores and
               predicted_location is the predicted result of the RPN.
@@ -115,7 +119,7 @@ def rpn_target_assign(loc,
               anchors. The predicted_scores is a 2D Tensor with shape
               [F + B, 1], and the shape of target_label is same as the shape
               of the predicted_scores, B is the number of the background
-               anchors, the F and B is depends on the input of this operator. 
+               anchors, the F and B depend on the input of this operator.
    Examples:
        .. code-block:: python
@@ -232,8 +236,8 @@ def detection_output(loc,
        nms_eta(float): The parameter for adaptive NMS.
    Returns:
-        Variable: 
+        Variable:
            The detection outputs is a LoDTensor with shape [No, 6].
            Each row has six values: [label, confidence, xmin, ymin, xmax, ymax].
            `No` is the total number of detections in this mini-batch. For each
@@ -504,7 +508,7 @@ def target_assign(input,
    Assumed that the row offset for each instance in `neg_indices` is called neg_lod,
    for i-th instance and each `id` of neg_indices in this instance:
    .. code-block:: text
        out[i][id][0 : K] = {mismatch_value, mismatch_value, ...}
@@ -522,11 +526,11 @@ def target_assign(input,
       mismatch_value (float32): Fill this value to the mismatched location.
    Returns:
-        tuple: 
+        tuple:
-               A tuple(out, out_weight) is returned. out is a 3D Tensor with 
+               A tuple(out, out_weight) is returned. out is a 3D Tensor with
-               shape [N, P, K], N and P is the same as they are in 
+               shape [N, P, K], N and P is the same as they are in
-               `neg_indices`, K is the same as it in input of X. If 
+               `neg_indices`, K is the same as it in input of X. If
-               `match_indices[i][j]`. out_weight is the weight for output with 
+               `match_indices[i][j]`. out_weight is the weight for output with
               the shape of [N, P, 1].
    Examples:
@@ -834,7 +838,7 @@ def prior_box(input,
       offset(float): Prior boxes center offset. Default: 0.5
       name(str): Name of the prior box op. Default: None.
       min_max_aspect_ratios_order(bool): If set True, the output prior box is
-            in order of [min, max, aspect_ratios], which is consistent with 
+            in order of [min, max, aspect_ratios], which is consistent with
            Caffe. Please note, this order affects the weights order of
            convolution layer followed by and does not affect the final
            detection results. Default: False.
@@ -977,7 +981,7 @@ def multi_box_head(inputs,
       stride(int|list|tuple): The stride of conv2d. Default:1,
       name(str): Name of the prior box layer. Default: None.
       min_max_aspect_ratios_order(bool): If set True, the output prior box is
-            in order of [min, max, aspect_ratios], which is consistent with 
+            in order of [min, max, aspect_ratios], which is consistent with
            Caffe. Please note, this order affects the weights order of
            convolution layer followed by and does not affect the fininal
            detection results. Default: False.
@@ -1039,7 +1043,7 @@ def multi_box_head(inputs,
        min_sizes = []
        max_sizes = []
        step = int(math.floor(((max_ratio - min_ratio)) / (num_layer - 2)))
-        for ratio in range(min_ratio, max_ratio + 1, step):
+        for ratio in six.moves.range(min_ratio, max_ratio + 1, step):
            min_sizes.append(base_size * ratio / 100.)
            max_sizes.append(base_size * (ratio + step) / 100.)
        min_sizes = [base_size * .10] + min_sizes
@@ -1108,8 +1112,8 @@ def multi_box_head(inputs,
        mbox_loc = nn.transpose(mbox_loc, perm=[0, 2, 3, 1])
        compile_shape = [
-            mbox_loc.shape[0],
+            mbox_loc.shape[0], cpt.floor_division(
-            mbox_loc.shape[1] * mbox_loc.shape[2] * mbox_loc.shape[3] / 4, 4
+                mbox_loc.shape[1] * mbox_loc.shape[2] * mbox_loc.shape[3], 4), 4
        ]
        run_shape = tensor.assign(numpy.array([0, -1, 4]).astype("int32"))
        mbox_loc_flatten = nn.reshape(
@@ -1127,8 +1131,9 @@ def multi_box_head(inputs,
        conf_loc = nn.transpose(conf_loc, perm=[0, 2, 3, 1])
        new_shape = [0, -1, num_classes]
        compile_shape = [
-            conf_loc.shape[0], conf_loc.shape[1] * conf_loc.shape[2] *
+            conf_loc.shape[0],
-            conf_loc.shape[3] / num_classes, num_classes
+            cpt.floor_division(conf_loc.shape[1] * conf_loc.shape[2] *
+                               conf_loc.shape[3], num_classes), num_classes
        ]
        run_shape = tensor.assign(
            numpy.array([0, -1, num_classes]).astype("int32"))

--- a/python/paddle/fluid/layers/device.py
+++ b/python/paddle/fluid/layers/device.py
--- a/python/paddle/fluid/layers/io.py
+++ b/python/paddle/fluid/layers/io.py
--- a/python/paddle/fluid/layers/layer_function_generator.py
+++ b/python/paddle/fluid/layers/layer_function_generator.py
--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
--- a/python/paddle/fluid/layers/math_op_patch.py
+++ b/python/paddle/fluid/layers/math_op_patch.py
--- a/python/paddle/fluid/layers/metric_op.py
+++ b/python/paddle/fluid/layers/metric_op.py
--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
--- a/python/paddle/fluid/layers/ops.py
+++ b/python/paddle/fluid/layers/ops.py
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
--- a/python/paddle/fluid/layers/utils.py
+++ b/python/paddle/fluid/layers/utils.py
--- a/python/paddle/fluid/lod_tensor.py
+++ b/python/paddle/fluid/lod_tensor.py
--- a/python/paddle/fluid/metrics.py
+++ b/python/paddle/fluid/metrics.py
--- a/python/paddle/fluid/net_drawer.py
+++ b/python/paddle/fluid/net_drawer.py
--- a/python/paddle/fluid/nets.py
+++ b/python/paddle/fluid/nets.py
--- a/python/paddle/fluid/op.py
+++ b/python/paddle/fluid/op.py
--- a/python/paddle/fluid/optimizer.py
+++ b/python/paddle/fluid/optimizer.py
--- a/python/paddle/fluid/parallel_executor.py
+++ b/python/paddle/fluid/parallel_executor.py
--- a/python/paddle/fluid/param_attr.py
+++ b/python/paddle/fluid/param_attr.py
--- a/python/paddle/fluid/profiler.py
+++ b/python/paddle/fluid/profiler.py
--- a/python/paddle/fluid/recordio_writer.py
+++ b/python/paddle/fluid/recordio_writer.py
--- a/python/paddle/fluid/regularizer.py
+++ b/python/paddle/fluid/regularizer.py
--- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py
--- a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
+++ b/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py
--- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
+++ b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py
--- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py
--- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py
--- a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
+++ b/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py
--- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
+++ b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py
--- a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
+++ b/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py
--- a/python/paddle/fluid/tests/book/notest_understand_sentiment.py
+++ b/python/paddle/fluid/tests/book/notest_understand_sentiment.py
--- a/python/paddle/fluid/tests/book/test_fit_a_line.py
+++ b/python/paddle/fluid/tests/book/test_fit_a_line.py
--- a/python/paddle/fluid/tests/book/test_image_classification.py
+++ b/python/paddle/fluid/tests/book/test_image_classification.py
--- a/python/paddle/fluid/tests/book/test_label_semantic_roles.py
+++ b/python/paddle/fluid/tests/book/test_label_semantic_roles.py
--- a/python/paddle/fluid/tests/book/test_machine_translation.py
+++ b/python/paddle/fluid/tests/book/test_machine_translation.py
--- a/python/paddle/fluid/tests/book/test_recognize_digits.py
+++ b/python/paddle/fluid/tests/book/test_recognize_digits.py
--- a/python/paddle/fluid/tests/book/test_recommender_system.py
+++ b/python/paddle/fluid/tests/book/test_recommender_system.py
--- a/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
+++ b/python/paddle/fluid/tests/book/test_rnn_encoder_decoder.py
--- a/python/paddle/fluid/tests/book/test_word2vec.py
+++ b/python/paddle/fluid/tests/book/test_word2vec.py
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_fit_a_line.py
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_image_classification_train.py
--- a/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
+++ b/python/paddle/fluid/tests/book_memory_optimization/test_memopt_machine_translation.py
--- a/python/paddle/fluid/tests/demo/fc_gan.py
+++ b/python/paddle/fluid/tests/demo/fc_gan.py
--- a/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
+++ b/python/paddle/fluid/tests/demo/file_reader/convert_data_to_recordio.py
--- a/python/paddle/fluid/tests/demo/file_reader/train.py
+++ b/python/paddle/fluid/tests/demo/file_reader/train.py
--- a/python/paddle/fluid/tests/demo/pyreader.py
+++ b/python/paddle/fluid/tests/demo/pyreader.py
--- a/python/paddle/fluid/tests/no_test_concurrency.py
+++ b/python/paddle/fluid/tests/no_test_concurrency.py
--- a/python/paddle/fluid/tests/notest_concurrency.py
+++ b/python/paddle/fluid/tests/notest_concurrency.py
--- a/python/paddle/fluid/tests/test_beam_search_decoder.py
+++ b/python/paddle/fluid/tests/test_beam_search_decoder.py
--- a/python/paddle/fluid/tests/test_cpp_reader.py
+++ b/python/paddle/fluid/tests/test_cpp_reader.py
--- a/python/paddle/fluid/tests/test_data_feeder.py
+++ b/python/paddle/fluid/tests/test_data_feeder.py
--- a/python/paddle/fluid/tests/test_detection.py
+++ b/python/paddle/fluid/tests/test_detection.py
--- a/python/paddle/fluid/tests/test_error_clip.py
+++ b/python/paddle/fluid/tests/test_error_clip.py
--- a/python/paddle/fluid/tests/test_gradient_clip.py
+++ b/python/paddle/fluid/tests/test_gradient_clip.py
--- a/python/paddle/fluid/tests/test_if_else_op.py
+++ b/python/paddle/fluid/tests/test_if_else_op.py
--- a/python/paddle/fluid/tests/test_lod_tensor.py
+++ b/python/paddle/fluid/tests/test_lod_tensor.py
--- a/python/paddle/fluid/tests/test_python_operator_overriding.py
+++ b/python/paddle/fluid/tests/test_python_operator_overriding.py
--- a/python/paddle/fluid/tests/unittests/CMakeLists.txt
+++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt
--- a/python/paddle/fluid/tests/unittests/benchmark.py
+++ b/python/paddle/fluid/tests/unittests/benchmark.py
--- a/python/paddle/fluid/tests/unittests/benchmark_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/benchmark_sum_op.py
--- a/python/paddle/fluid/tests/unittests/decorators.py
+++ b/python/paddle/fluid/tests/unittests/decorators.py
--- a/python/paddle/fluid/tests/unittests/dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/dist_mnist.py
--- a/python/paddle/fluid/tests/unittests/dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/dist_se_resnext.py
--- a/python/paddle/fluid/tests/unittests/dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/dist_transformer.py
--- a/python/paddle/fluid/tests/unittests/dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/dist_word2vec.py
--- a/python/paddle/fluid/tests/unittests/op_test.py
+++ b/python/paddle/fluid/tests/unittests/op_test.py
--- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
+++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py
--- a/python/paddle/fluid/tests/unittests/test_accuracy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_accuracy_op.py
--- a/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_mkldnn_op.py
--- a/python/paddle/fluid/tests/unittests/test_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_activation_op.py
--- a/python/paddle/fluid/tests/unittests/test_adadelta_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adadelta_op.py
--- a/python/paddle/fluid/tests/unittests/test_adagrad_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adagrad_op.py
--- a/python/paddle/fluid/tests/unittests/test_adam_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adam_op.py
--- a/python/paddle/fluid/tests/unittests/test_adamax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_adamax_op.py
--- a/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py
+++ b/python/paddle/fluid/tests/unittests/test_anchor_generator_op.py
--- a/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
+++ b/python/paddle/fluid/tests/unittests/test_arg_min_max_op.py
--- a/python/paddle/fluid/tests/unittests/test_argsort_op.py
+++ b/python/paddle/fluid/tests/unittests/test_argsort_op.py
--- a/python/paddle/fluid/tests/unittests/test_array_read_write_op.py
+++ b/python/paddle/fluid/tests/unittests/test_array_read_write_op.py
--- a/python/paddle/fluid/tests/unittests/test_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_assign_op.py
--- a/python/paddle/fluid/tests/unittests/test_assign_value_op.py
+++ b/python/paddle/fluid/tests/unittests/test_assign_value_op.py
--- a/python/paddle/fluid/tests/unittests/test_auc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_auc_op.py
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_mkldnn_op.py
--- a/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_batch_norm_op.py
--- a/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_decode_op.py
--- a/python/paddle/fluid/tests/unittests/test_beam_search_op.py
+++ b/python/paddle/fluid/tests/unittests/test_beam_search_op.py
--- a/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_interp_op.py
--- a/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bilinear_tensor_product_op.py
--- a/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
+++ b/python/paddle/fluid/tests/unittests/test_bipartite_match_op.py
--- a/python/paddle/fluid/tests/unittests/test_box_coder_op.py
+++ b/python/paddle/fluid/tests/unittests/test_box_coder_op.py
--- a/python/paddle/fluid/tests/unittests/test_calc_gradient.py
+++ b/python/paddle/fluid/tests/unittests/test_calc_gradient.py
--- a/python/paddle/fluid/tests/unittests/test_cast_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cast_op.py
--- a/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
+++ b/python/paddle/fluid/tests/unittests/test_chunk_eval_op.py
--- a/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_clip_by_norm_op.py
--- a/python/paddle/fluid/tests/unittests/test_clip_op.py
+++ b/python/paddle/fluid/tests/unittests/test_clip_op.py
--- a/python/paddle/fluid/tests/unittests/test_compare_op.py
+++ b/python/paddle/fluid/tests/unittests/test_compare_op.py
--- a/python/paddle/fluid/tests/unittests/test_compat.py
+++ b/python/paddle/fluid/tests/unittests/test_compat.py
--- a/python/paddle/fluid/tests/unittests/test_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_concat_op.py
--- a/python/paddle/fluid/tests/unittests/test_conditional_block.py
+++ b/python/paddle/fluid/tests/unittests/test_conditional_block.py
--- a/python/paddle/fluid/tests/unittests/test_const_value.py
+++ b/python/paddle/fluid/tests/unittests/test_const_value.py
--- a/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_mkldnn_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv2d_transpose_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv3d_transpose_op.py
--- a/python/paddle/fluid/tests/unittests/test_conv_shift_op.py
+++ b/python/paddle/fluid/tests/unittests/test_conv_shift_op.py
--- a/python/paddle/fluid/tests/unittests/test_cos_sim_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cos_sim_op.py
--- a/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py
+++ b/python/paddle/fluid/tests/unittests/test_create_op_doc_string.py
--- a/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
+++ b/python/paddle/fluid/tests/unittests/test_crf_decoding_op.py
--- a/python/paddle/fluid/tests/unittests/test_crop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_crop_op.py
--- a/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cross_entropy_op.py
--- a/python/paddle/fluid/tests/unittests/test_ctc_align.py
+++ b/python/paddle/fluid/tests/unittests/test_ctc_align.py
--- a/python/paddle/fluid/tests/unittests/test_cumsum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_cumsum_op.py
--- a/python/paddle/fluid/tests/unittests/test_data_balance.py
+++ b/python/paddle/fluid/tests/unittests/test_data_balance.py
--- a/python/paddle/fluid/tests/unittests/test_debugger.py
+++ b/python/paddle/fluid/tests/unittests/test_debugger.py
--- a/python/paddle/fluid/tests/unittests/test_decayed_adagrad_op.py
+++ b/python/paddle/fluid/tests/unittests/test_decayed_adagrad_op.py
--- a/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py
+++ b/python/paddle/fluid/tests/unittests/test_default_scope_funcs.py
--- a/python/paddle/fluid/tests/unittests/test_desc_clone.py
+++ b/python/paddle/fluid/tests/unittests/test_desc_clone.py
--- a/python/paddle/fluid/tests/unittests/test_detection_map_op.py
+++ b/python/paddle/fluid/tests/unittests/test_detection_map_op.py
--- a/python/paddle/fluid/tests/unittests/test_dist_base.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_base.py
--- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py
--- a/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_se_resnext.py
--- a/python/paddle/fluid/tests/unittests/test_dist_train.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_train.py
--- a/python/paddle/fluid/tests/unittests/test_dist_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transformer.py
--- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py
--- a/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
+++ b/python/paddle/fluid/tests/unittests/test_dist_word2vec.py
--- a/python/paddle/fluid/tests/unittests/test_dropout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_dropout_op.py
--- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
+++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_gradient_check.py
--- a/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
+++ b/python/paddle/fluid/tests/unittests/test_dynrnn_static_input.py
--- a/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
+++ b/python/paddle/fluid/tests/unittests/test_edit_distance_op.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_mkldnn_op.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_add_op.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_div_op.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_gradient_op.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_max_op.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_min_op.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_pow_op.py
--- a/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_sub_op.py
--- a/python/paddle/fluid/tests/unittests/test_exception.py
+++ b/python/paddle/fluid/tests/unittests/test_exception.py
--- a/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
+++ b/python/paddle/fluid/tests/unittests/test_executor_and_mul.py
--- a/python/paddle/fluid/tests/unittests/test_expand_op.py
+++ b/python/paddle/fluid/tests/unittests/test_expand_op.py
--- a/python/paddle/fluid/tests/unittests/test_extract_rows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_extract_rows_op.py
--- a/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_dequantize_op.py
--- a/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fake_quantize_op.py
--- a/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fc_mkldnn_op.py
--- a/python/paddle/fluid/tests/unittests/test_fc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fc_op.py
--- a/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
+++ b/python/paddle/fluid/tests/unittests/test_feed_fetch_method.py
--- a/python/paddle/fluid/tests/unittests/test_fetch_var.py
+++ b/python/paddle/fluid/tests/unittests/test_fetch_var.py
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_batch_size_like_op.py
--- a/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_constant_op.py
--- a/python/paddle/fluid/tests/unittests/test_fill_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_op.py
--- a/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fill_zeros_like_op.py
--- a/python/paddle/fluid/tests/unittests/test_flatten_op.py
+++ b/python/paddle/fluid/tests/unittests/test_flatten_op.py
--- a/python/paddle/fluid/tests/unittests/test_framework_debug_str.py
+++ b/python/paddle/fluid/tests/unittests/test_framework_debug_str.py
--- a/python/paddle/fluid/tests/unittests/test_ftrl_op.py
+++ b/python/paddle/fluid/tests/unittests/test_ftrl_op.py
--- a/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fused_elemwise_activation_op.py
--- a/python/paddle/fluid/tests/unittests/test_gather_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gather_op.py
--- a/python/paddle/fluid/tests/unittests/test_gaussian_random_batch_size_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_batch_size_like_op.py
--- a/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_mkldnn_op.py
--- a/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gaussian_random_op.py
--- a/python/paddle/fluid/tests/unittests/test_get_places_op.py
+++ b/python/paddle/fluid/tests/unittests/test_get_places_op.py
--- a/python/paddle/fluid/tests/unittests/test_gru_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gru_op.py
--- a/python/paddle/fluid/tests/unittests/test_gru_unit_op.py
+++ b/python/paddle/fluid/tests/unittests/test_gru_unit_op.py
--- a/python/paddle/fluid/tests/unittests/test_hinge_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hinge_loss_op.py
--- a/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
+++ b/python/paddle/fluid/tests/unittests/test_hsigmoid_op.py
--- a/python/paddle/fluid/tests/unittests/test_huber_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_huber_loss_op.py
--- a/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
+++ b/python/paddle/fluid/tests/unittests/test_im2sequence_op.py
--- a/python/paddle/fluid/tests/unittests/test_image_classification_layer.py
+++ b/python/paddle/fluid/tests/unittests/test_image_classification_layer.py
--- a/python/paddle/fluid/tests/unittests/test_infer_shape.py
+++ b/python/paddle/fluid/tests/unittests/test_infer_shape.py
--- a/python/paddle/fluid/tests/unittests/test_inference_model_io.py
+++ b/python/paddle/fluid/tests/unittests/test_inference_model_io.py
--- a/python/paddle/fluid/tests/unittests/test_initializer.py
+++ b/python/paddle/fluid/tests/unittests/test_initializer.py
--- a/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
+++ b/python/paddle/fluid/tests/unittests/test_iou_similarity_op.py
--- a/python/paddle/fluid/tests/unittests/test_is_empty_op.py
+++ b/python/paddle/fluid/tests/unittests/test_is_empty_op.py
--- a/python/paddle/fluid/tests/unittests/test_l1_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_l1_norm_op.py
--- a/python/paddle/fluid/tests/unittests/test_label_smooth_op.py
+++ b/python/paddle/fluid/tests/unittests/test_label_smooth_op.py
--- a/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_layer_norm_op.py
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
--- a/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
+++ b/python/paddle/fluid/tests/unittests/test_learning_rate_scheduler.py
--- a/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
+++ b/python/paddle/fluid/tests/unittests/test_linear_chain_crf_op.py
--- a/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_listen_and_serv_op.py
--- a/python/paddle/fluid/tests/unittests/test_lod_array_length_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_array_length_op.py
--- a/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_rank_table.py
--- a/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_reset_op.py
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array.py
--- a/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
+++ b/python/paddle/fluid/tests/unittests/test_lod_tensor_array_ops.py
--- a/python/paddle/fluid/tests/unittests/test_log_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_log_loss_op.py
--- a/python/paddle/fluid/tests/unittests/test_logical_op.py
+++ b/python/paddle/fluid/tests/unittests/test_logical_op.py
--- a/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_sparse_table_op.py
--- a/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lookup_table_op.py
--- a/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lrn_mkldnn_op.py
--- a/python/paddle/fluid/tests/unittests/test_lrn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lrn_op.py
--- a/python/paddle/fluid/tests/unittests/test_lstm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_op.py
--- a/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstm_unit_op.py
--- a/python/paddle/fluid/tests/unittests/test_lstmp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_lstmp_op.py
--- a/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_margin_rank_loss_op.py
--- a/python/paddle/fluid/tests/unittests/test_math_op_patch.py
+++ b/python/paddle/fluid/tests/unittests/test_math_op_patch.py
--- a/python/paddle/fluid/tests/unittests/test_matmul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_matmul_op.py
--- a/python/paddle/fluid/tests/unittests/test_maxout_op.py
+++ b/python/paddle/fluid/tests/unittests/test_maxout_op.py
--- a/python/paddle/fluid/tests/unittests/test_mean_iou.py
+++ b/python/paddle/fluid/tests/unittests/test_mean_iou.py
--- a/python/paddle/fluid/tests/unittests/test_mean_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mean_op.py
--- a/python/paddle/fluid/tests/unittests/test_memory_usage.py
+++ b/python/paddle/fluid/tests/unittests/test_memory_usage.py
--- a/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
+++ b/python/paddle/fluid/tests/unittests/test_merge_ids_op.py
--- a/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mine_hard_examples_op.py
--- a/python/paddle/fluid/tests/unittests/test_minus_op.py
+++ b/python/paddle/fluid/tests/unittests/test_minus_op.py
--- a/python/paddle/fluid/tests/unittests/test_modified_huber_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_modified_huber_loss_op.py
--- a/python/paddle/fluid/tests/unittests/test_momentum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_momentum_op.py
--- a/python/paddle/fluid/tests/unittests/test_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_mul_op.py
--- a/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_file_reader.py
--- a/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_multi_pass_reader.py
--- a/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiclass_nms_op.py
--- a/python/paddle/fluid/tests/unittests/test_multihead_attention.py
+++ b/python/paddle/fluid/tests/unittests/test_multihead_attention.py
--- a/python/paddle/fluid/tests/unittests/test_multiplex_op.py
+++ b/python/paddle/fluid/tests/unittests/test_multiplex_op.py
--- a/python/paddle/fluid/tests/unittests/test_nce.py
+++ b/python/paddle/fluid/tests/unittests/test_nce.py
--- a/python/paddle/fluid/tests/unittests/test_network_with_dtype.py
+++ b/python/paddle/fluid/tests/unittests/test_network_with_dtype.py
--- a/python/paddle/fluid/tests/unittests/test_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_norm_op.py
--- a/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py
+++ b/python/paddle/fluid/tests/unittests/test_normalization_wrapper.py
--- a/python/paddle/fluid/tests/unittests/test_nvprof.py
+++ b/python/paddle/fluid/tests/unittests/test_nvprof.py
--- a/python/paddle/fluid/tests/unittests/test_one_hot_op.py
+++ b/python/paddle/fluid/tests/unittests/test_one_hot_op.py
--- a/python/paddle/fluid/tests/unittests/test_op_support_gpu.py
+++ b/python/paddle/fluid/tests/unittests/test_op_support_gpu.py
--- a/python/paddle/fluid/tests/unittests/test_operator.py
+++ b/python/paddle/fluid/tests/unittests/test_operator.py
--- a/python/paddle/fluid/tests/unittests/test_operator_desc.py
+++ b/python/paddle/fluid/tests/unittests/test_operator_desc.py
--- a/python/paddle/fluid/tests/unittests/test_optimizer.py
+++ b/python/paddle/fluid/tests/unittests/test_optimizer.py
--- a/python/paddle/fluid/tests/unittests/test_pad_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pad_op.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_crf.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_fetch_feed.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_mnist.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_test_while_train.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py
--- a/python/paddle/fluid/tests/unittests/test_parallel_op.py
+++ b/python/paddle/fluid/tests/unittests/test_parallel_op.py
--- a/python/paddle/fluid/tests/unittests/test_parameter.py
+++ b/python/paddle/fluid/tests/unittests/test_parameter.py
--- a/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
+++ b/python/paddle/fluid/tests/unittests/test_polygon_box_transform.py
--- a/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_mkldnn_op.py
--- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py
--- a/python/paddle/fluid/tests/unittests/test_pool3d_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool3d_op.py
--- a/python/paddle/fluid/tests/unittests/test_pool_max_op.py
+++ b/python/paddle/fluid/tests/unittests/test_pool_max_op.py
--- a/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py
+++ b/python/paddle/fluid/tests/unittests/test_positive_negative_pair_op.py
--- a/python/paddle/fluid/tests/unittests/test_precision_recall_op.py
+++ b/python/paddle/fluid/tests/unittests/test_precision_recall_op.py
--- a/python/paddle/fluid/tests/unittests/test_prelu_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prelu_op.py
--- a/python/paddle/fluid/tests/unittests/test_preprocessor.py
+++ b/python/paddle/fluid/tests/unittests/test_preprocessor.py
--- a/python/paddle/fluid/tests/unittests/test_print_op.py
+++ b/python/paddle/fluid/tests/unittests/test_print_op.py
--- a/python/paddle/fluid/tests/unittests/test_prior_box_op.py
+++ b/python/paddle/fluid/tests/unittests/test_prior_box_op.py
--- a/python/paddle/fluid/tests/unittests/test_profiler.py
+++ b/python/paddle/fluid/tests/unittests/test_profiler.py
--- a/python/paddle/fluid/tests/unittests/test_protobuf.py
+++ b/python/paddle/fluid/tests/unittests/test_protobuf.py
--- a/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
+++ b/python/paddle/fluid/tests/unittests/test_protobuf_descs.py
--- a/python/paddle/fluid/tests/unittests/test_proximal_adagrad_op.py
+++ b/python/paddle/fluid/tests/unittests/test_proximal_adagrad_op.py
--- a/python/paddle/fluid/tests/unittests/test_proximal_gd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_proximal_gd_op.py
--- a/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_push_pop.py
--- a/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
+++ b/python/paddle/fluid/tests/unittests/test_py_reader_using_executor.py
--- a/python/paddle/fluid/tests/unittests/test_random_crop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_random_crop_op.py
--- a/python/paddle/fluid/tests/unittests/test_rank_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rank_loss_op.py
--- a/python/paddle/fluid/tests/unittests/test_reader_reset.py
+++ b/python/paddle/fluid/tests/unittests/test_reader_reset.py
--- a/python/paddle/fluid/tests/unittests/test_recordio_reader.py
+++ b/python/paddle/fluid/tests/unittests/test_recordio_reader.py
--- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py
+++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
--- a/python/paddle/fluid/tests/unittests/test_registry.py
+++ b/python/paddle/fluid/tests/unittests/test_registry.py
--- a/python/paddle/fluid/tests/unittests/test_regularizer.py
+++ b/python/paddle/fluid/tests/unittests/test_regularizer.py
--- a/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_reorder_lod_tensor.py
--- a/python/paddle/fluid/tests/unittests/test_reshape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reshape_op.py
--- a/python/paddle/fluid/tests/unittests/test_reverse_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reverse_op.py
--- a/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rmsprop_op.py
--- a/python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rnn_memory_helper_op.py
--- a/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_roi_pool_op.py
--- a/python/paddle/fluid/tests/unittests/test_row_conv_op.py
+++ b/python/paddle/fluid/tests/unittests/test_row_conv_op.py
--- a/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_rpn_target_assign_op.py
--- a/python/paddle/fluid/tests/unittests/test_sampling_id_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sampling_id_op.py
--- a/python/paddle/fluid/tests/unittests/test_scale_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scale_op.py
--- a/python/paddle/fluid/tests/unittests/test_scatter_op.py
+++ b/python/paddle/fluid/tests/unittests/test_scatter_op.py
--- a/python/paddle/fluid/tests/unittests/test_scope.py
+++ b/python/paddle/fluid/tests/unittests/test_scope.py
--- a/python/paddle/fluid/tests/unittests/test_selected_rows.py
+++ b/python/paddle/fluid/tests/unittests/test_selected_rows.py
--- a/python/paddle/fluid/tests/unittests/test_seq_concat_op.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_concat_op.py
--- a/python/paddle/fluid/tests/unittests/test_seq_conv.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_conv.py
--- a/python/paddle/fluid/tests/unittests/test_seq_pool.py
+++ b/python/paddle/fluid/tests/unittests/test_seq_pool.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_erase_op.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_expand.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_expand.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_reshape.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_slice_op.py
--- a/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sequence_softmax_op.py
--- a/python/paddle/fluid/tests/unittests/test_sgd_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sgd_op.py
--- a/python/paddle/fluid/tests/unittests/test_shape_op.py
+++ b/python/paddle/fluid/tests/unittests/test_shape_op.py
--- a/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
+++ b/python/paddle/fluid/tests/unittests/test_shrink_rnn_memory.py
--- a/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sigmoid_cross_entropy_with_logits_op.py
--- a/python/paddle/fluid/tests/unittests/test_sign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sign_op.py
--- a/python/paddle/fluid/tests/unittests/test_slice_op.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_op.py
--- a/python/paddle/fluid/tests/unittests/test_slice_var.py
+++ b/python/paddle/fluid/tests/unittests/test_slice_var.py
--- a/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py
+++ b/python/paddle/fluid/tests/unittests/test_smooth_l1_loss_op.py
--- a/python/paddle/fluid/tests/unittests/test_softmax_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_op.py
--- a/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
+++ b/python/paddle/fluid/tests/unittests/test_softmax_with_cross_entropy_op.py
--- a/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_and_merge_lod_tensor_op.py
--- a/python/paddle/fluid/tests/unittests/test_split_ids_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_ids_op.py
--- a/python/paddle/fluid/tests/unittests/test_split_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_op.py
--- a/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
+++ b/python/paddle/fluid/tests/unittests/test_split_selected_rows_op.py
--- a/python/paddle/fluid/tests/unittests/test_spp_op.py
+++ b/python/paddle/fluid/tests/unittests/test_spp_op.py
--- a/python/paddle/fluid/tests/unittests/test_squared_l2_distance_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squared_l2_distance_op.py
--- a/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squared_l2_norm_op.py
--- a/python/paddle/fluid/tests/unittests/test_squeeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_squeeze_op.py
--- a/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_mkldnn_op.py
--- a/python/paddle/fluid/tests/unittests/test_sum_op.py
+++ b/python/paddle/fluid/tests/unittests/test_sum_op.py
--- a/python/paddle/fluid/tests/unittests/test_switch.py
+++ b/python/paddle/fluid/tests/unittests/test_switch.py
--- a/python/paddle/fluid/tests/unittests/test_target_assign_op.py
+++ b/python/paddle/fluid/tests/unittests/test_target_assign_op.py
--- a/python/paddle/fluid/tests/unittests/test_tensor.py
+++ b/python/paddle/fluid/tests/unittests/test_tensor.py
--- a/python/paddle/fluid/tests/unittests/test_top_k_op.py
+++ b/python/paddle/fluid/tests/unittests/test_top_k_op.py
--- a/python/paddle/fluid/tests/unittests/test_transpose_op.py
+++ b/python/paddle/fluid/tests/unittests/test_transpose_op.py
--- a/python/paddle/fluid/tests/unittests/test_uniform_random_batch_size_like_op.py
+++ b/python/paddle/fluid/tests/unittests/test_uniform_random_batch_size_like_op.py
--- a/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
+++ b/python/paddle/fluid/tests/unittests/test_uniform_random_op.py
--- a/python/paddle/fluid/tests/unittests/test_unique_name.py
+++ b/python/paddle/fluid/tests/unittests/test_unique_name.py
--- a/python/paddle/fluid/tests/unittests/test_unpool_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unpool_op.py
--- a/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
+++ b/python/paddle/fluid/tests/unittests/test_unsqueeze_op.py
--- a/python/paddle/fluid/tests/unittests/test_variable.py
+++ b/python/paddle/fluid/tests/unittests/test_variable.py
--- a/python/paddle/fluid/tests/unittests/test_version.py
+++ b/python/paddle/fluid/tests/unittests/test_version.py
--- a/python/paddle/fluid/tests/unittests/test_warpctc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_warpctc_op.py
--- a/python/paddle/fluid/tests/unittests/test_weight_normalization.py
+++ b/python/paddle/fluid/tests/unittests/test_weight_normalization.py
--- a/python/paddle/fluid/tests/unittests/test_while_op.py
+++ b/python/paddle/fluid/tests/unittests/test_while_op.py
--- a/python/paddle/fluid/tests/unittests/testsuite.py
+++ b/python/paddle/fluid/tests/unittests/testsuite.py
--- a/python/paddle/fluid/tests/unittests/transformer_model.py
+++ b/python/paddle/fluid/tests/unittests/transformer_model.py
--- a/python/paddle/fluid/trainer.py
+++ b/python/paddle/fluid/trainer.py
--- a/python/paddle/fluid/transpiler/__init__.py
+++ b/python/paddle/fluid/transpiler/__init__.py
--- a/python/paddle/fluid/transpiler/details/__init__.py
+++ b/python/paddle/fluid/transpiler/details/__init__.py
--- a/python/paddle/fluid/transpiler/details/program_utils.py
+++ b/python/paddle/fluid/transpiler/details/program_utils.py
--- a/python/paddle/fluid/transpiler/details/ufind.py
+++ b/python/paddle/fluid/transpiler/details/ufind.py
--- a/python/paddle/fluid/transpiler/distribute_transpiler.py
+++ b/python/paddle/fluid/transpiler/distribute_transpiler.py
--- a/python/paddle/fluid/transpiler/inference_transpiler.py
+++ b/python/paddle/fluid/transpiler/inference_transpiler.py
--- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
+++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py
--- a/python/paddle/fluid/transpiler/ps_dispatcher.py
+++ b/python/paddle/fluid/transpiler/ps_dispatcher.py
--- a/python/paddle/fluid/unique_name.py
+++ b/python/paddle/fluid/unique_name.py
--- a/python/paddle/reader/decorator.py
+++ b/python/paddle/reader/decorator.py
--- a/python/paddle/reader/tests/creator_test.py
+++ b/python/paddle/reader/tests/creator_test.py
--- a/python/paddle/utils/dump_config.py
+++ b/python/paddle/utils/dump_config.py
--- a/python/paddle/utils/image_multiproc.py
+++ b/python/paddle/utils/image_multiproc.py
--- a/python/paddle/utils/image_util.py
+++ b/python/paddle/utils/image_util.py
--- a/python/paddle/utils/make_model_diagram.py
+++ b/python/paddle/utils/make_model_diagram.py
--- a/python/paddle/utils/merge_model.py
+++ b/python/paddle/utils/merge_model.py
--- a/python/paddle/utils/plotcurve.py
+++ b/python/paddle/utils/plotcurve.py
--- a/python/paddle/utils/predefined_net.py
+++ b/python/paddle/utils/predefined_net.py
--- a/python/paddle/utils/preprocess_img.py
+++ b/python/paddle/utils/preprocess_img.py
--- a/python/paddle/utils/preprocess_util.py
+++ b/python/paddle/utils/preprocess_util.py
--- a/python/paddle/utils/show_pb.py
+++ b/python/paddle/utils/show_pb.py
--- a/python/paddle/utils/torch2paddle.py
+++ b/python/paddle/utils/torch2paddle.py
--- a/tools/manylinux1/Dockerfile.x64
+++ b/tools/manylinux1/Dockerfile.x64
--- a/tools/manylinux1/build_scripts/build.sh
+++ b/tools/manylinux1/build_scripts/build.sh
--- a/tools/manylinux1/build_scripts/install_nccl2.sh
+++ b/tools/manylinux1/build_scripts/install_nccl2.sh
--- a/tools/test_runner.py
+++ b/tools/test_runner.py