Commit b0904794 authored by: T tensor-tang

Merge remote-tracking branch 'ups/develop' into feature/op/fusion_lstm

......@@ -35,8 +35,10 @@ set(ANAKIN_COMPILE_EXTRA_FLAGS
ExternalProject_Add(
extern_anakin
${EXTERNAL_PROJECT_LOG_ARGS}
GIT_REPOSITORY "https://github.com/PaddlePaddle/Anakin"
GIT_TAG "04256ba78fa3da0beb74e8036c8efd68c12824d6"
DEPENDS ${MKLML_PROJECT}
# Anakin code errors on Intel(R) Xeon(R) Gold 5117 CPUs; temporarily do not compile avx512-related code.
GIT_REPOSITORY "https://github.com/luotao1/Anakin"
GIT_TAG "bcf17aabe7921ceb7bce591244b4f9dce7dba5c8"
PREFIX ${ANAKIN_SOURCE_DIR}
UPDATE_COMMAND ""
CMAKE_ARGS -DUSE_GPU_PLACE=YES
......
......@@ -115,6 +115,8 @@ cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc)
# cc_test(channel_test SRCS channel_test.cc)
cc_test(tuple_test SRCS tuple_test.cc )
cc_test(rw_lock_test SRCS rw_lock_test.cc)
# disable test temporarily.
# TODO https://github.com/PaddlePaddle/Paddle/issues/11971
# cc_test(concurrency_test SRCS concurrency_test.cc DEPS go_op channel_close_op channel_create_op
......
......@@ -12,6 +12,7 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include <array>
#include <string>
#include <vector>
......
......@@ -12,7 +12,11 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <stack>
#include <vector>
#include "paddle/fluid/framework/ir/graph.h"
#include "paddle/fluid/framework/ir/node.h"
......
......@@ -202,6 +202,52 @@ std::vector<std::string> OpDesc::AttrNames() const {
}
void OpDesc::SetAttr(const std::string &name, const Attribute &v) {
// NOTICE(minqiyang): pybind11 passes an empty list in Python as
// the std::vector<int> type in C++, so we have to correct the attr's type
// here when we hit this case
proto::AttrType attr_type = static_cast<proto::AttrType>(v.which() - 1);
if (attr_type == proto::AttrType::INTS &&
boost::get<std::vector<int>>(v).size() == 0u) {
// Find current attr via attr name and set the correct attribute value
const proto::OpProto::Attr &attr = GetProtoAttr(name);
switch (attr.type()) {
case proto::AttrType::BOOLEANS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to BOOLEANS";
this->attrs_[name] = std::vector<bool>();
break;
}
case proto::AttrType::INTS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to INTS";
this->attrs_[name] = std::vector<int>();
break;
}
case proto::AttrType::FLOATS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to FLOATS";
this->attrs_[name] = std::vector<float>();
break;
}
case proto::AttrType::STRINGS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to STRINGS";
this->attrs_[name] = std::vector<std::string>();
break;
}
case proto::AttrType::BLOCKS: {
VLOG(11) << "SetAttr: " << Type() << ", " << name
<< " from INTS to BLOCKS";
this->SetBlocksAttr(name, std::vector<BlockDesc *>());
return;
}
default:
PADDLE_THROW("Wrong attr type %d", attr.type());
}
need_update_ = true;
return;
}
this->attrs_[name] = v;
need_update_ = true;
}
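For reference, a minimal standalone sketch of the `v.which() - 1` trick above: boost::variant::which() returns the zero-based index of the alternative currently held, and the Attribute variant lists its alternatives (after a boost::blank placeholder) in the same order as proto::AttrType. The variant below is hypothetical and much shorter than the real Attribute typedef in framework/type_defs.h.

#include <boost/blank.hpp>
#include <boost/variant.hpp>
#include <cassert>
#include <string>
#include <vector>

int main() {
  // Hypothetical, abbreviated stand-in for the real Attribute variant.
  using Attr = boost::variant<boost::blank, int, float, std::string,
                              std::vector<int>>;
  Attr a = std::vector<int>{};  // an empty Python list arrives as this type
  assert(a.which() == 4);       // zero-based index of the held alternative
  // which() - 1 == 3, matching proto::AttrType::INTS in the real enum;
  // this is why an empty list always reports INTS and needs the correction
  // applied in SetAttr above.
  return 0;
}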
......@@ -229,6 +275,19 @@ Attribute OpDesc::GetAttr(const std::string &name) const {
return it->second;
}
const proto::OpProto::Attr &OpDesc::GetProtoAttr(
const std::string &name) const {
const proto::OpProto &proto = OpInfoMap::Instance().Get(Type()).Proto();
for (int i = 0; i != proto.attrs_size(); ++i) {
const proto::OpProto::Attr &attr = proto.attrs(i);
if (attr.name() == name) {
return attr;
}
}
PADDLE_THROW("Attribute %s is not found in proto %s", name, proto.type());
}
Attribute OpDesc::GetNullableAttr(const std::string &name) const {
auto it = attrs_.find(name);
if (it != attrs_.end()) {
......
......@@ -81,6 +81,8 @@ class OpDesc {
Attribute GetAttr(const std::string &name) const;
const proto::OpProto::Attr &GetProtoAttr(const std::string &name) const;
Attribute GetNullableAttr(const std::string &name) const;
int GetBlockAttrId(const std::string &name) const;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <pthread.h>
#include "paddle/fluid/platform/enforce.h"
namespace paddle {
namespace framework {
struct RWLock {
RWLock() { pthread_rwlock_init(&lock_, nullptr); }
~RWLock() { pthread_rwlock_destroy(&lock_); }
void RDLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_rdlock(&lock_), 0,
"acquire read lock failed");
}
void WRLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_wrlock(&lock_), 0,
"acquire write lock failed");
}
void UNLock() {
PADDLE_ENFORCE_EQ(pthread_rwlock_unlock(&lock_), 0, "unlock failed");
}
private:
pthread_rwlock_t lock_;
};
} // namespace framework
} // namespace paddle
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/rw_lock.h"
#include <gtest/gtest.h>
#include <chrono> // NOLINT
#include <thread> // NOLINT
#include <vector>
namespace f = paddle::framework;
void f1(f::RWLock *lock) {
lock->RDLock();
lock->UNLock();
}
TEST(RWLOCK, read_read) {
f::RWLock lock;
lock.RDLock();
std::thread t1(f1, &lock);
std::thread t2(f1, &lock);
t1.join();
t2.join();
lock.UNLock();
}
void f2(f::RWLock *lock, std::vector<int> *result) {
lock->RDLock();
ASSERT_EQ(result->size(), 0UL);
lock->UNLock();
}
void f3(f::RWLock *lock, std::vector<int> *result) {
lock->WRLock();
result->push_back(1);
lock->UNLock();
}
TEST(RWLOCK, read_write) {
f::RWLock lock;
std::vector<int> result;
lock.RDLock();
std::thread t1(f2, &lock, &result);
t1.join();
std::thread t2(f3, &lock, &result);
std::this_thread::sleep_for(std::chrono::seconds(1));
ASSERT_EQ(result.size(), 0UL);
lock.UNLock();
t2.join();
ASSERT_EQ(result.size(), 1UL);
}
void f4(f::RWLock *lock, std::vector<int> *result) {
lock->RDLock();
ASSERT_EQ(result->size(), 1UL);
lock->UNLock();
}
TEST(RWLOCK, write_read) {
f::RWLock lock;
std::vector<int> result;
lock.WRLock();
std::thread t1(f4, &lock, &result);
std::this_thread::sleep_for(std::chrono::seconds(1));
result.push_back(1);
lock.UNLock();
t1.join();
}
......@@ -120,66 +120,76 @@ bool SelectedRows::HasKey(int64_t key) const {
: true;
}
std::vector<std::pair<int64_t, int64_t>> SelectedRows::Get(
const std::vector<int64_t>& keys, framework::Tensor* value) const {
int64_t SelectedRows::AutoGrownIndex(int64_t key, bool auto_grown) {
rwlock_->RDLock();
auto iter = id_to_index_.find(key);
if (iter == id_to_index_.end()) {
rwlock_->UNLock();
if (!auto_grown) {
PADDLE_THROW("key %d not found", key);
}
rwlock_->WRLock();
auto map_size = id_to_index_.size();
auto vector_size = rows_.size();
if (map_size != vector_size) {
rwlock_->UNLock();
PADDLE_THROW(
"id_to_index_ size %d should have the same size with rows_ %d",
map_size, vector_size);
}
auto write_iter = id_to_index_.find(key);
if (write_iter == id_to_index_.end()) {
size_t row_num = rows_.size();
if (row_num == value_->dims()[0]) {
rwlock_->UNLock();
PADDLE_THROW("selected rows is full, then length exceed %d", row_num);
}
// core logic to insert a new key into id_to_index_
rows_.push_back(key);
auto index = static_cast<int64_t>(rows_.size() - 1);
id_to_index_[key] = index;
rwlock_->UNLock();
return index;
} else {
auto index = write_iter->second;
rwlock_->UNLock();
return index;
}
} else {
auto index = iter->second;
rwlock_->UNLock();
return index;
}
}
void SelectedRows::SyncIndex() {
rwlock_->WRLock();
id_to_index_.clear();
for (size_t i = 0; i < rows_.size(); ++i) {
id_to_index_[rows_[i]] = i;
}
rwlock_->UNLock();
}
void SelectedRows::Get(const framework::Tensor& ids, framework::Tensor* value,
bool auto_grown) {
PADDLE_ENFORCE(value->IsInitialized(),
"The value tensor should be initialized.");
std::vector<std::pair<int64_t, int64_t>> non_keys_pair;
if (keys.empty()) {
if (ids.numel() == 0) {
VLOG(3) << "keys is empty, please check data!";
} else {
int64_t value_width = value_->numel() / value_->dims()[0];
PADDLE_ENFORCE_EQ(value_width, value->numel() / value->dims()[0],
"output tensor should have the same shape with table "
"except the dims[0].");
for (size_t i = 0; i < keys.size(); ++i) {
int64_t index = Index(keys[i]);
if (index == -1) {
non_keys_pair.push_back(
std::make_pair(keys[i], static_cast<int64_t>(i)));
} else {
framework::VisitDataType(
framework::ToDataType(value_->type()),
TensorCopyVisitor(value, i * value_width, *value_.get(),
index * value_width, value_width));
}
for (size_t i = 0; i < ids.numel(); ++i) {
int64_t index = AutoGrownIndex(ids.data<int64_t>()[i], auto_grown);
framework::VisitDataType(
framework::ToDataType(value_->type()),
TensorCopyVisitor(value, i * value_width, *value_.get(),
index * value_width, value_width));
}
}
return non_keys_pair;
}
bool SelectedRows::Set(int64_t key, const framework::Tensor& value) {
PADDLE_ENFORCE(value.IsInitialized(), "The value should be initialized.");
if (value_->IsInitialized()) {
PADDLE_ENFORCE_EQ(
value.type(), value_->type(),
"The type of the value should be same with the original value");
}
PADDLE_ENFORCE_EQ(value.dims()[0], static_cast<size_t>(1),
"The first dim of value should be 1.");
std::lock_guard<std::mutex> lock(*auto_grown_mutex_.get());
auto index = Index(key);
bool is_new_key = false;
if (index == -1) {
rows_.push_back(key);
index = rows_.size() - 1;
is_new_key = true;
// whether need to resize the table
if (static_cast<int64_t>(rows_.size()) > value_->dims()[0]) {
auto dims = value_->dims();
dims[0] = (dims[0] + 1) << 1;
framework::VisitDataType(framework::ToDataType(value.type()),
ReAllocateVisitor(dims, value_.get()));
}
}
framework::VisitDataType(
framework::ToDataType(value.type()),
TensorCopyVisitor(value_.get(),
index * value_->numel() / value_->dims()[0], value,
static_cast<int64_t>(0), value.numel()));
return is_new_key;
}
} // namespace framework
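To summarize the new interface, a minimal usage sketch with hypothetical keys and shapes (the tests further below exercise the full behavior):

// Sketch only: SelectedRows as an auto-growing key -> row-index table.
paddle::framework::SelectedRows table;
table.mutable_value()->Resize(paddle::framework::make_ddim({4, 8}));
table.mutable_value()->mutable_data<float>(paddle::platform::CPUPlace());

int64_t i0 = table.AutoGrownIndex(/*key=*/42, /*auto_grown=*/true);  // inserts key 42 at row 0
int64_t i1 = table.AutoGrownIndex(42, /*auto_grown=*/false);         // found, returns 0 again
// table.AutoGrownIndex(7, false) would PADDLE_THROW: key 7 not found.
table.SyncIndex();  // rebuilds id_to_index_ from rows_ after external edits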
......
......@@ -17,10 +17,12 @@ limitations under the License. */
#include <algorithm>
#include <memory>
#include <mutex> // NOLINT
#include <unordered_map>
#include <utility>
#include <vector>
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/rw_lock.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/memory/memcpy.h"
......@@ -48,13 +50,13 @@ class SelectedRows {
SelectedRows(const std::vector<int64_t>& rows, const int64_t& height)
: rows_(rows), height_(height) {
value_.reset(new Tensor());
auto_grown_mutex_.reset(new std::mutex);
rwlock_.reset(new RWLock);
}
SelectedRows() {
height_ = 0;
value_.reset(new Tensor());
auto_grown_mutex_.reset(new std::mutex);
rwlock_.reset(new RWLock);
}
platform::Place place() const { return value_->place(); }
......@@ -74,47 +76,51 @@ class SelectedRows {
void set_rows(const Vector<int64_t>& rows) { rows_ = rows; }
/*
* @brief whether has the specified key in the table.
* @brief Get the index of key in rows
*
* @return -1 if the key does not exist.
*/
int64_t Index(int64_t key) const {
auto it = std::find(rows_.begin(), rows_.end(), key);
if (it == rows_.end()) {
PADDLE_THROW("id %s not in table", key);
}
return static_cast<int64_t>(std::distance(rows_.begin(), it));
}
/*
* @brief whether has the specified key in the table.
*
* @return true if the key exists.
*/
bool HasKey(int64_t key) const;
/*
* @brief Get value by the key list, if the
* @brief Get value by the key list.
* Note!!! this interface is only used when SelectedRows is used as a
* parameter for the distributed lookup table.
*
* @return a list of pairs containing the non-existent keys and their
* indices in the value
*/
std::vector<std::pair<int64_t, int64_t>> Get(const std::vector<int64_t>& keys,
framework::Tensor* value) const;
void Get(const framework::Tensor& ids, framework::Tensor* value,
bool auto_grown = false);
/*
* @brief Set a key-value pair into the table.
* This function will double the value memory if it's not enough.
* @brief Get the index of the key from the id_to_index_ map. If the key
* does not exist, add it into id_to_index_.
*
* @note:
* 1. The first dim of the value should be 1
* 2. The value should be initialized and the data type
* should be the same with the table.
*
* @return true if the key is a new one, otherwise false
* Note!!! this interface is only used when SelectedRows is used as a
* parameter for the distributed lookup table.
*
* @return index of the key.
*/
bool Set(int64_t key, const Tensor& value);
int64_t AutoGrownIndex(int64_t key, bool auto_grown);
/*
* @brief Get the index of key in rows
*
* @return -1 if the key does not exist.
*/
int64_t Index(int64_t key) const {
auto it = std::find(rows_.begin(), rows_.end(), key);
if (it == rows_.end()) {
return static_cast<int64_t>(-1);
}
return static_cast<int64_t>(std::distance(rows_.begin(), it));
}
void SyncIndex();
DDim GetCompleteDims() const {
std::vector<int64_t> dims = vectorize(value_->dims());
......@@ -127,9 +133,10 @@ class SelectedRows {
// SelectedRows are simply concatenated when added together; duplicate
// rows are only handled when a SelectedRows is added to a Tensor.
Vector<int64_t> rows_;
std::unordered_map<int64_t, int64_t> id_to_index_;
std::unique_ptr<Tensor> value_{nullptr};
int64_t height_;
std::unique_ptr<std::mutex> auto_grown_mutex_{nullptr};
std::unique_ptr<RWLock> rwlock_{nullptr};
};
/*
......
......@@ -9,8 +9,11 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/selected_rows.h"
#include <time.h>
#include <thread> // NOLINT
#include "gtest/gtest.h"
#include "paddle/fluid/framework/selected_rows.h"
namespace paddle {
namespace framework {
......@@ -59,39 +62,129 @@ TEST_F(SelectedRowsTester, SerializeAndDeseralize) {
ASSERT_EQ(selected_rows_->GetCompleteDims(), dst_tensor.GetCompleteDims());
}
TEST_F(SelectedRowsTester, SparseTable) {
TEST(SelectedRows, SparseTable) {
platform::CPUPlace cpu;
SelectedRows table;
int64_t table_size = 100;
int64_t embedding_width = 8;
// initialize a sparse table
table.mutable_value()->Resize(framework::make_ddim({1, 100}));
table.mutable_value()->mutable_data<float>(cpu);
table.mutable_rows()->push_back(1);
table.mutable_value()->Resize(
framework::make_ddim({table_size, embedding_width}));
auto* data = table.mutable_value()->mutable_data<float>(cpu);
for (int64_t i = 0; i < table_size; ++i) {
for (int64_t j = 0; j < embedding_width; ++j) {
data[i * embedding_width + j] = static_cast<float>(i);
}
}
ASSERT_EQ(table.AutoGrownIndex(10, true), 0);
ASSERT_EQ(table.AutoGrownIndex(8, true), 1);
ASSERT_EQ(table.AutoGrownIndex(8, true), 1);
ASSERT_EQ(table.AutoGrownIndex(6, true), 2);
ASSERT_TRUE(table.HasKey(10));
ASSERT_TRUE(table.HasKey(8));
ASSERT_TRUE(table.HasKey(6));
ASSERT_EQ(table.rows().size(), static_cast<size_t>(3));
framework::Tensor ids;
ids.Resize(framework::make_ddim({4}));
auto* ids_data = ids.mutable_data<int64_t>(cpu);
ids_data[0] = static_cast<int64_t>(6);
ids_data[1] = static_cast<int64_t>(6);
ids_data[2] = static_cast<int64_t>(8);
ids_data[3] = static_cast<int64_t>(10);
int64_t key = 10000;
int64_t non_key = 999;
framework::Tensor value;
value.Resize(framework::make_ddim({1, 100}));
auto ptr = value.mutable_data<float>(cpu);
ptr[0] = static_cast<float>(10);
framework::Tensor get_value;
auto* value_data = get_value.mutable_data<float>(
framework::make_ddim({4, embedding_width}), cpu);
table.Get(ids, &get_value);
ASSERT_EQ(table.rows().size(), static_cast<size_t>(1));
ASSERT_EQ(table.HasKey(key), false);
for (int j = 0; j < embedding_width; ++j) {
ASSERT_EQ(value_data[0 * embedding_width + j], 2);
}
for (int j = 0; j < embedding_width; ++j) {
ASSERT_EQ(value_data[1 * embedding_width + j], 2);
}
for (int j = 0; j < embedding_width; ++j) {
ASSERT_EQ(value_data[2 * embedding_width + j], 1);
}
for (int j = 0; j < embedding_width; ++j) {
ASSERT_EQ(value_data[3 * embedding_width + j], 0);
}
}
table.Set(key, value);
void f1(SelectedRows* table, int table_size) {
for (int i = 1000000; i > 0; --i) {
auto id = i % table_size;
int64_t index1 = table->AutoGrownIndex(id, true);
int64_t index2 = table->AutoGrownIndex(id, false);
int64_t index3 = table->AutoGrownIndex(id, true);
ASSERT_EQ(index1, index2);
ASSERT_EQ(index2, index3);
}
}
ASSERT_EQ(table.rows().size(), static_cast<size_t>(2));
ASSERT_EQ(table.HasKey(key), true);
// check re-allocate
ASSERT_EQ(table.value().dims()[0], static_cast<int64_t>(4));
void f2(SelectedRows* table, int table_size) {
for (int i = 0; i < 1000000; ++i) {
auto id = i % table_size;
int64_t index1 = table->AutoGrownIndex(id, true);
int64_t index2 = table->AutoGrownIndex(id, false);
int64_t index3 = table->AutoGrownIndex(id, true);
ASSERT_EQ(index1, index2);
ASSERT_EQ(index2, index3);
}
}
framework::Tensor get_value;
get_value.mutable_data<float>(framework::make_ddim({2, 100}), cpu);
std::vector<int64_t> keys({non_key, key});
auto non_key_pairs = table.Get(keys, &get_value);
void f3(SelectedRows* table, int table_size) {
clock_t t1 = clock();
for (int i = 100000; i > 0; --i) {
auto id1 = table->AutoGrownIndex(i % table_size, true);
auto id2 = table->Index(i % table_size);
ASSERT_EQ(id1, id2);
}
clock_t t2 = clock();
std::cout << "f3 run time:" << t2 - t1 << std::endl;
}
void f4(SelectedRows* table, int table_size) {
clock_t t1 = clock();
for (int i = 0; i < 100000; ++i) {
auto id1 = table->AutoGrownIndex(i % table_size, true);
auto id2 = table->Index(i % table_size);
ASSERT_EQ(id1, id2);
}
clock_t t2 = clock();
std::cout << "f4 run time:" << t2 - t1 << std::endl;
}
TEST(SelectedRows, MultiThreadAutoIndex) {
platform::CPUPlace cpu;
SelectedRows table;
int64_t table_size = 100000;
int64_t embedding_width = 8;
// initialize a sparse table
table.mutable_value()->Resize(
framework::make_ddim({table_size, embedding_width}));
auto* data = table.mutable_value()->mutable_data<float>(cpu);
for (int64_t i = 0; i < table_size; ++i) {
for (int64_t j = 0; j < embedding_width; ++j) {
data[i * embedding_width + j] = static_cast<float>(i);
}
}
ASSERT_EQ(get_value.data<float>()[100], static_cast<float>(10));
ASSERT_EQ(non_key_pairs.size(), static_cast<size_t>(1));
ASSERT_EQ(non_key_pairs[0].first, non_key);
std::thread t1(f1, &table, table_size);
std::thread t11(f1, &table, table_size);
std::thread t2(f2, &table, table_size);
std::thread t22(f2, &table, table_size);
t1.join();
t11.join();
t2.join();
t22.join();
std::thread t3(f3, &table, table_size);
std::thread t4(f4, &table, table_size);
t3.join();
t4.join();
}
} // namespace framework
......
......@@ -8,7 +8,7 @@ cc_library(analysis SRCS pass_manager.cc dot.cc node.cc data_flow_graph.cc graph
helper.cc
model_store_pass.cc
DEPS framework_proto proto_desc)
cc_test(test_node SRCS node_tester.cc DEPS analysis)
cc_test(test_node SRCS node_tester.cc DEPS analysis gflags glog gtest)
cc_test(test_dot SRCS dot_tester.cc DEPS analysis)
cc_binary(inference_analyzer SRCS analyzer_main.cc DEPS analysis)
......
......@@ -20,17 +20,6 @@ namespace paddle {
namespace inference {
namespace analysis {
template <>
std::string &NodeAttr::As<std::string>() {
if (data_.empty()) {
type_index_ = std::type_index(typeid(std::string));
}
PADDLE_ENFORCE_EQ(type_index_, std::type_index(typeid(std::string)));
return data_;
}
std::string &NodeAttr::String() { return As<std::string>(); }
std::vector<Dot::Attr> Value::dot_attrs() const {
return std::vector<Dot::Attr>({Dot::Attr("style", "filled,rounded"),
Dot::Attr("shape", "box"),
......
......@@ -29,6 +29,7 @@ limitations under the License. */
#include "paddle/fluid/inference/analysis/device.h"
#include "paddle/fluid/inference/analysis/dot.h"
#include "paddle/fluid/inference/analysis/helper.h"
#include "paddle/fluid/platform/variant.h"
namespace paddle {
namespace inference {
......@@ -38,39 +39,35 @@ class NodeMap;
// A helper class to maintain the status from Pass.
struct NodeAttr {
using any_t =
boost::variant<bool, float, int32_t, int64_t, void *, std::string>;
// NOTE T should be a primitive type or a struct composed of several
// primitive types.
// NOTE STL containers should not be used here.
// Some usages
// Attr attr;
// attr.Bool() = true;
bool &Bool() { return As<bool>(); }
float &Float() { return As<float>(); }
int32_t &Int32() { return As<int32_t>(); }
int64_t &Int64() { return As<int64_t>(); }
void *&Pointer() { return As<void *>(); }
std::string &String();
std::string &String() { return As<std::string>(); }
private:
template <typename T>
T &As() {
// init storage in the first usage.
if (data_.empty()) {
VLOG(4) << "resize data to " << sizeof(T);
type_index_ = std::type_index(typeid(T));
data_.resize(sizeof(T));
if (type_index_ == typeid(NodeAttr)) {
type_index_ = typeid(T);
any_data_ = T();
} else {
PADDLE_ENFORCE(type_index_ == typeid(T), "fetch error type");
}
PADDLE_ENFORCE(framework::IsType<T>(type_index_),
"type not matched, origin is %s, want %s",
DataTypeNamer::Global().repr(type_index_),
DataTypeNamer::Global().repr<T>());
PADDLE_ENFORCE_EQ(data_.size(), sizeof(T), "Node attr type recast error");
return *reinterpret_cast<T *>(&data_[0]);
return boost::get<T>(any_data_);
}
private:
std::string data_;
any_t any_data_;
std::type_index type_index_{typeid(NodeAttr)};
};
......
......@@ -20,6 +20,24 @@ namespace paddle {
namespace inference {
namespace analysis {
TEST(NodeAttr, bool) {
NodeAttr x;
x.Bool() = true;
ASSERT_EQ(x.Bool(), true);
}
TEST(NodeAttr, int32) {
NodeAttr x;
x.Int32() = 32;
ASSERT_EQ(x.Int32(), 32);
}
TEST(NodeAttr, string) {
NodeAttr x;
x.String() = "Hello";
ASSERT_EQ(x.String(), "Hello");
}
TEST(Node, Attr) {
// Node is an abstract class; use Value instead, since they share the same
// Attr logic.
......@@ -27,6 +45,9 @@ TEST(Node, Attr) {
auto* node = nodes.Create(Node::Type::kValue);
node->attr("v0").Int32() = 2008;
ASSERT_EQ(node->attr("v0").Int32(), 2008);
node->attr("str").String() = "hello world";
ASSERT_EQ(node->attr("str").String(), "hello world");
}
} // namespace analysis
......
......@@ -13,16 +13,22 @@ else
use_gpu_list='false'
fi
PREFIX=inference-vis-demos%2F
URL_ROOT=http://paddlemodels.bj.bcebos.com/${PREFIX}
# download vis_demo data
function download() {
dir_name=$1
mkdir -p $dir_name
cd $dir_name
wget -q ${URL_ROOT}$dir_name.tar.gz
tar xzf *.tar.gz
if [[ -e "${PREFIX}${dir_name}.tar.gz" ]]; then
echo "${PREFIX}{dir_name}.tar.gz has been downloaded."
else
wget -q ${URL_ROOT}$dir_name.tar.gz
tar xzf *.tar.gz
fi
cd ..
}
URL_ROOT=http://paddlemodels.bj.bcebos.com/inference-vis-demos%2F
mkdir -p data
cd data
vis_demo_list='se_resnext50 ocr mobilenet'
......
......@@ -26,8 +26,6 @@ namespace plat = paddle::platform;
act_type##_grad, ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<float>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<double>>, \
ops::ActivationGradKernel<plat::CUDADeviceContext, \
ops::grad_functor<plat::float16>>);
ops::grad_functor<double>>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_ACTIVATION_CUDA_KERNEL);
......@@ -333,7 +333,8 @@ struct SqrtGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = static_cast<T>(0.5) * dout / out;
const Out out_conj = Eigen::numext::conj(out);
dx.device(d) = static_cast<T>(0.5) * dout / out_conj;
}
};
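For reference, the rule this implements: out = √x, so ∂out/∂x = 1/(2·out), and Eigen::numext::conj is a no-op for real scalar types:

\frac{\partial L}{\partial x}
  = \frac{\partial L}{\partial \mathrm{out}} \cdot \frac{1}{2\,\overline{\mathrm{out}}}
  = \tfrac{1}{2}\,\mathrm{dout}/\overline{\mathrm{out}}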
......@@ -739,7 +740,7 @@ struct PowGradFunctor : public BaseActivationFunctor<T> {
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
dx.device(d) = dout * static_cast<T>(factor) *
x.pow(static_cast<T>(factor) - static_cast<T>(1));
x.pow(static_cast<T>(factor - static_cast<T>(1)));
}
};
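In both variants the quantity computed is the power rule:

\frac{\partial}{\partial x}\,x^{f} = f\,x^{f-1},
\qquad
dx = \mathrm{dout}\cdot f\cdot x^{f-1}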
......@@ -862,11 +863,10 @@ struct SwishGradFunctor : public BaseActivationFunctor<T> {
template <typename Device, typename X, typename Out, typename dOut,
typename dX>
void operator()(Device d, X x, Out out, dOut dout, dX dx) const {
T b = static_cast<T>(beta);
auto temp1 = static_cast<T>(1) /
(static_cast<T>(1) + (static_cast<T>(-b) * x).exp());
auto temp2 = temp1 * (static_cast<T>(1) - (b * out));
dx.device(d) = dout * ((b * out) + temp2);
(static_cast<T>(1) + (static_cast<T>(-beta) * x).exp());
auto temp2 = temp1 * (static_cast<T>(1) - (beta * out));
dx.device(d) = dout * ((beta * out) + temp2);
}
};
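For the record, the identity the functor relies on, with σ the logistic sigmoid and out = x·σ(βx):

\frac{d}{dx}\,x\,\sigma(\beta x)
  = \sigma(\beta x) + \beta x\,\sigma(\beta x)\bigl(1 - \sigma(\beta x)\bigr)
  = \beta\,\mathrm{out} + \sigma(\beta x)\bigl(1 - \beta\,\mathrm{out}\bigr)

where σ(βx) is temp1 and the trailing product is temp2 in the code above.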
......
......@@ -13,10 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/assign_value_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(assign_value, ops::AssignValueKernel<int>,
ops::AssignValueKernel<float>,
ops::AssignValueKernel<plat::float16>);
ops::AssignValueKernel<float>);
......@@ -39,27 +39,6 @@ using ScalingParamType = typename platform::CudnnDataType<T>::ScalingParamType;
static constexpr size_t kCONV_CUDNN_WORKSPACE_LIMIT_BYTES =
static_cast<size_t>(1024) * 1024 * 1024;
template <typename T, typename DeviceContext>
// bool EnableFp16(const T& dummy, const DeviceContext& dev_ctx,
bool EnableFp16(const DeviceContext& dev_ctx,
cudnnConvolutionDescriptor_t cudnn_conv_desc) {
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor cores are supported since the Volta GPUs and
// are only enabled when input and filter data are float16
if (dev_ctx.GetComputeCapability() >= 70 &&
std::type_index(typeid(T)) ==
std::type_index(typeid(platform::float16))) {
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
return true;
} else {
PADDLE_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
}
#endif
return false;
}
template <typename T>
class CUDNNConvOpKernel : public framework::OpKernel<T> {
public:
......@@ -149,14 +128,27 @@ class CUDNNConvOpKernel : public framework::OpKernel<T> {
cudnnConvolutionFwdAlgo_t algo;
auto& dev_ctx = ctx.template device_context<platform::CUDADeviceContext>();
auto handle = dev_ctx.cudnn_handle();
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
#if CUDA_VERSION >= 9000 && CUDNN_VERSION_MIN(7, 0, 1)
// Tensor cores are supported since the Volta GPUs and
// are only enabled when input and filter data are float16
if (dev_ctx.GetComputeCapability() >= 70 &&
std::type_index(typeid(T)) ==
std::type_index(typeid(platform::float16))) {
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_TENSOR_OP_MATH));
// Currently tensor core is only enabled using this algo
algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM;
} else {
PADDLE_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm(
handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc,
cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT,
workspace_size_limit, &algo));
CUDNN_ENFORCE(platform::dynload::cudnnSetConvolutionMathType(
cudnn_conv_desc, CUDNN_DEFAULT_MATH));
}
#endif
// get workspace size able to allocate
CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardWorkspaceSize(
......@@ -296,9 +288,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
} else {
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
}
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
data_algo = CUDNN_CONVOLUTION_BWD_DATA_ALGO_1;
}
CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardDataWorkspaceSize(
......@@ -318,9 +307,6 @@ class CUDNNConvGradOpKernel : public framework::OpKernel<T> {
} else {
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
}
if (EnableFp16<T>(dev_ctx, cudnn_conv_desc)) {
filter_algo = CUDNN_CONVOLUTION_BWD_FILTER_ALGO_1;
}
CUDNN_ENFORCE(
platform::dynload::cudnnGetConvolutionBackwardFilterWorkspaceSize(
......@@ -376,8 +362,7 @@ REGISTER_OP_KERNEL(conv2d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv2d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<double>,
paddle::operators::CUDNNConvGradOpKernel<plat::float16>);
paddle::operators::CUDNNConvGradOpKernel<double>);
REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<float>,
......@@ -385,5 +370,4 @@ REGISTER_OP_KERNEL(conv3d, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvOpKernel<plat::float16>);
REGISTER_OP_KERNEL(conv3d_grad, CUDNN, plat::CUDAPlace,
paddle::operators::CUDNNConvGradOpKernel<float>,
paddle::operators::CUDNNConvGradOpKernel<double>,
paddle::operators::CUDNNConvGradOpKernel<plat::float16>)
paddle::operators::CUDNNConvGradOpKernel<double>);
......@@ -13,16 +13,12 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/cross_entropy_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
using CUDACtx = paddle::platform::CUDADeviceContext;
REGISTER_OP_CUDA_KERNEL(cross_entropy,
ops::CrossEntropyOpKernel<CUDACtx, float>,
ops::CrossEntropyOpKernel<CUDACtx, double>,
ops::CrossEntropyOpKernel<CUDACtx, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
cross_entropy_grad, ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
ops::CrossEntropyGradientOpKernel<CUDACtx, double>,
ops::CrossEntropyGradientOpKernel<CUDACtx, plat::float16>);
ops::CrossEntropyOpKernel<CUDACtx, double>);
REGISTER_OP_CUDA_KERNEL(cross_entropy_grad,
ops::CrossEntropyGradientOpKernel<CUDACtx, float>,
ops::CrossEntropyGradientOpKernel<CUDACtx, double>);
......@@ -78,10 +78,9 @@ void InitTensorsOnServer(framework::Scope* scope, platform::CPUPlace* place,
int64_t rows_numel) {
CreateVarsOnScope(scope, place);
auto w = scope->Var("w")->GetMutable<framework::SelectedRows>();
auto rows = w->mutable_rows();
for (int64_t i = 0; i < rows_numel; ++i) rows->push_back(i);
auto w_value = w->mutable_value();
w_value->Resize({rows_numel, 10});
for (int64_t i = 0; i < rows_numel; ++i) w->AutoGrownIndex(i, true);
auto ptr = w_value->mutable_data<float>(*place);
......
......@@ -30,5 +30,4 @@ REGISTER_OP_CUDA_KERNEL(
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, float>,
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, double>,
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int>,
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>,
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, plat::float16>);
ops::ElementwiseAddGradKernel<plat::CUDADeviceContext, int64_t>);
......@@ -14,24 +14,19 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_div_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
elementwise_div,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
ops::ElementwiseDivKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
elementwise_div_grad,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::ElementwiseDivGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
int64_t>);
......@@ -14,25 +14,19 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_mul_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
elementwise_mul,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
ops::ElementwiseMulKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
elementwise_mul_grad,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::ElementwiseMulGradKernel<paddle::platform::CUDADeviceContext,
int64_t>);
......@@ -350,7 +350,7 @@ static __global__ void ElemwiseGradBroadcast1CUDAKernel(
int j = blockIdx.x;
int i = threadIdx.x;
int tid = threadIdx.x;
T val(0);
T val = 0;
do {
int x_offset = i * w + j;
......@@ -418,7 +418,7 @@ static __global__ void ElemwiseGradBroadcast2CUDAKernel(
int tid = threadIdx.x;
int j = blockIdx.x;
T val(0);
T val = 0;
int ttid = tid;
while (true) {
......
......@@ -14,25 +14,19 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/elementwise_sub_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
elementwise_sub,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
ops::ElementwiseSubKernel<paddle::platform::CUDADeviceContext, int64_t>);
REGISTER_OP_CUDA_KERNEL(
elementwise_sub_grad,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext, int>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
plat::float16>,
ops::ElementwiseSubGradKernel<paddle::platform::CUDADeviceContext,
int64_t>);
......@@ -12,28 +12,48 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
class FillConstantOp : public framework::OperatorWithKernel {
class FillConstantInferShape : public framework::InferShapeBase {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
void operator()(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FillConstantOp should not be null.");
auto& shape = ctx->Attrs().Get<std::vector<int>>("shape");
auto &shape = ctx->Attrs().Get<std::vector<int>>("shape");
ctx->SetOutputDim("Out", framework::make_ddim(shape));
}
};
class FillConstantOp : public framework::OperatorBase {
public:
using framework::OperatorBase::OperatorBase;
private:
void RunImpl(const framework::Scope &scope,
const platform::Place &dev_place) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(Attr<int>("dtype"));
auto value = Attr<float>("value");
auto force_cpu = Attr<bool>("force_cpu");
auto &out =
*scope.FindVar(Output("Out"))->GetMutable<framework::LoDTensor>();
out.Resize(framework::make_ddim(Attr<std::vector<int>>("shape")));
if (force_cpu) {
auto cpu = platform::CPUPlace();
out.mutable_data(cpu, framework::ToTypeIndex(data_type));
} else {
out.mutable_data(dev_place, framework::ToTypeIndex(data_type));
}
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext& ctx) const override {
return framework::OpKernelType(
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype")),
ctx.device_context());
platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance();
auto &dev_ctx = *pool.Get(dev_place);
math::set_constant(dev_ctx, &out, value);
}
};
......@@ -67,11 +87,6 @@ Fill up a variable with specified constant value.
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(fill_constant, ops::FillConstantOp, ops::FillConstantOpMaker,
REGISTER_OPERATOR(fill_constant, ops::FillConstantOp,
ops::FillConstantInferShape, ops::FillConstantOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(
fill_constant,
ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, float>,
ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, double>,
ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, int>,
ops::FillConstantOpKernel<paddle::platform::CPUDeviceContext, int64_t>)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/fill_constant_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(
fill_constant,
ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, float>,
ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, double>,
ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, int>,
ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::FillConstantOpKernel<paddle::platform::CUDADeviceContext,
paddle::platform::float16>)
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
namespace paddle {
namespace operators {
template <typename DeviceContext, typename T>
class FillConstantOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto data_type =
static_cast<framework::proto::VarType::Type>(ctx.Attr<int>("dtype"));
auto value = ctx.Attr<float>("value");
auto force_cpu = ctx.Attr<bool>("force_cpu");
auto* out = ctx.Output<framework::Tensor>("Out");
out->Resize(framework::make_ddim(ctx.Attr<std::vector<int>>("shape")));
if (force_cpu) {
auto cpu = platform::CPUPlace();
out->mutable_data(cpu, framework::ToTypeIndex(data_type));
} else {
out->mutable_data(ctx.GetPlace(), framework::ToTypeIndex(data_type));
}
math::set_constant(ctx.template device_context<DeviceContext>(), out,
value);
}
};
} // namespace operators
} // namespace paddle
......@@ -16,7 +16,6 @@ limitations under the License. */
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/detail/safe_ref.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
......@@ -70,6 +69,7 @@ class FillOp : public framework::OperatorBase {
framework::VisitDataType(
dtype, FillOpVisitor(&tensor, Attr<std::vector<float>>("value")));
if (!force_cpu && platform::is_gpu_place(place)) {
// Copy tensor to out
platform::DeviceContextPool &pool =
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#include <thrust/transform.h>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
......@@ -61,7 +60,6 @@ class GPUGaussianRandomKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(gaussian_random,
paddle::operators::GPUGaussianRandomKernel<float>,
paddle::operators::GPUGaussianRandomKernel<double>);
......
......@@ -17,7 +17,6 @@ limitations under the License. */
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/platform/device_context.h"
namespace paddle {
namespace operators {
......@@ -46,10 +45,6 @@ class LookupSparseTableOp : public framework::OperatorBase {
auto out_var = scope.FindVar(Output("Out"));
auto w_var = scope.FindVar(Input("W"));
auto ids_var = scope.FindVar(Input("Ids"));
unsigned int seed = static_cast<unsigned int>(Attr<int>("seed"));
float min = Attr<float>("min");
float max = Attr<float>("max");
bool auto_grown_table = Attr<bool>("auto_grown_table");
PADDLE_ENFORCE(out_var->IsType<framework::LoDTensor>(),
"The type of Out var should be LodTensor.");
......@@ -60,46 +55,17 @@ class LookupSparseTableOp : public framework::OperatorBase {
auto &ids_t = ids_var->Get<framework::LoDTensor>();
auto out_t = out_var->GetMutable<framework::LoDTensor>();
auto w_t = w_var->GetMutable<framework::SelectedRows>();
std::vector<int64_t> keys;
keys.resize(ids_t.numel());
for (int64_t i = 0; i < ids_t.numel(); ++i) {
keys[i] = ids_t.data<int64_t>()[i];
}
// TODO(Yancey1989): support CUDA Place for the sparse table
platform::CPUPlace cpu;
auto out_shape = w_t->value().dims();
out_shape[0] = keys.size();
out_shape[0] = ids_t.numel();
out_t->Resize(out_shape);
out_t->mutable_data(cpu, w_t->value().type());
PADDLE_ENFORCE_EQ(framework::ToDataType(w_t->value().type()),
framework::proto::VarType::FP32,
"The sparse table only support FP32");
auto non_keys_pair = w_t->Get(keys, out_t);
if (!auto_grown_table) {
PADDLE_ENFORCE_EQ(non_keys_pair.size(), static_cast<size_t>(0),
"there is some keys does exists in the sparse table.");
}
auto value_shape = w_t->value().dims();
value_shape[0] = 1;
for (const auto &it : non_keys_pair) {
const auto key = it.first;
const auto index = it.second;
framework::Tensor value;
value.Resize(value_shape);
auto data = value.mutable_data<float>(cpu);
std::minstd_rand engine;
engine.seed(seed);
std::uniform_real_distribution<float> dist(min, max);
int64_t size = value.numel();
for (int64_t i = 0; i < size; ++i) {
data[i] = dist(engine);
}
w_t->Set(key, value);
memory::Copy(cpu, out_t->mutable_data<float>(cpu) + index * value.numel(),
cpu, value.data<float>(), value.numel() * sizeof(float));
}
w_t->Get(ids_t, out_t, true);
}
};
......@@ -121,21 +87,6 @@ class LookupSparseTableOpMaker : public framework::OpProtoAndCheckerMaker {
"Otherwise the given value indicates padding the output "
"with zeros whenever lookup encounters it in Ids.")
.SetDefault(kNoPadding);
AddAttr<float>("min",
"(float, default -1.0) "
"Minimum value of uniform random")
.SetDefault(-1.0f);
AddAttr<float>("max",
"(float, default 1.0) "
"Maximum value of uniform random")
.SetDefault(1.0f);
AddAttr<int>("seed",
"(int, default 0) "
"Random seed used for generating samples. "
"0 means use a seed generated by the system."
"Note that if seed is not 0, this operator will always "
"generate the same random numbers every time.")
.SetDefault(0);
AddAttr<bool>("auto_grown_table",
"(bool default false)"
"Whether create new value if for nonexistent key.")
......
......@@ -15,25 +15,11 @@ limitations under the License. */
#include "paddle/fluid/operators/math/cross_entropy.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
namespace math {
template <typename T>
HOSTDEVICE T log(const T& val) {
return std::log(val);
}
template <>
HOSTDEVICE platform::float16 log(const platform::float16& val) {
// strange bug: hlog does not exist.
return static_cast<float16>(0);
// half tmp = static_cast<half>(val);
// return static_cast<platform::float16>(hlog(tmp));
}
namespace {
template <typename T>
__global__ void CrossEntropyKernel(T* Y, const T* X, const int64_t* label,
......@@ -49,12 +35,12 @@ template <typename T>
__global__ void SoftCrossEntropyKernel(T* Y, const T* X, const T* label,
const int class_num) {
int tid = threadIdx.x;
T val(0);
T val = 0;
int idx = blockIdx.x * class_num + tid;
int end = blockIdx.x * class_num + class_num;
for (; idx < end; idx += blockDim.x) {
val += math::TolerableValue<T>()(log(X[idx])) * label[idx];
val += math::TolerableValue<T>()(std::log(X[idx])) * label[idx];
}
val = paddle::platform::reduceSum(val, tid, blockDim.x);
......@@ -98,8 +84,6 @@ class CrossEntropyFunctor<platform::CUDADeviceContext, T> {
template class CrossEntropyFunctor<platform::CUDADeviceContext, float>;
template class CrossEntropyFunctor<platform::CUDADeviceContext, double>;
template class CrossEntropyFunctor<platform::CUDADeviceContext,
platform::float16>;
} // namespace math
} // namespace operators
} // namespace paddle
......@@ -13,10 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <limits>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/hostdevice.h"
namespace paddle {
......@@ -35,21 +33,6 @@ struct TolerableValue {
}
};
// float16 value clipping behaves differently.
using paddle::platform::float16;
using paddle::platform::isfinite;
template <>
struct TolerableValue<float16> {
HOSTDEVICE float16 operator()(const float16& x) const {
if (isfinite(x))
return x;
else if (x > static_cast<float16>(0))
return std::numeric_limits<float16>::max();
else
return std::numeric_limits<float16>::min();
}
};
template <typename DeviceContext, typename T>
class CrossEntropyFunctor {
public:
......
......@@ -18,7 +18,6 @@ limitations under the License. */
#include "paddle/fluid/operators/math/math_function.h"
#include "paddle/fluid/operators/math/selected_rows_functor.h"
#include "paddle/fluid/platform/cuda_primitives.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
......@@ -77,7 +76,6 @@ struct SelectedRowsAdd<platform::CUDADeviceContext, T> {
template struct SelectedRowsAdd<platform::CUDADeviceContext, float>;
template struct SelectedRowsAdd<platform::CUDADeviceContext, double>;
template struct SelectedRowsAdd<platform::CUDADeviceContext, platform::float16>;
namespace {
template <typename T, int block_size>
......@@ -122,7 +120,7 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
auto* out_data = output->data<T>();
SetConstant<platform::CUDADeviceContext, T> functor;
functor(context, output, static_cast<T>(0));
functor(context, output, 0.0);
const int block_size = 256;
dim3 threads(block_size, 1);
......@@ -140,8 +138,6 @@ struct SelectedRowsAddTensor<platform::CUDADeviceContext, T> {
template struct SelectedRowsAddTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAddTensor<platform::CUDADeviceContext,
platform::float16>;
template <typename T>
struct SelectedRowsAddTo<platform::CUDADeviceContext, T> {
......@@ -181,8 +177,6 @@ template struct SelectedRowsAddTo<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddTo<platform::CUDADeviceContext, double>;
template struct SelectedRowsAddTo<platform::CUDADeviceContext, int>;
template struct SelectedRowsAddTo<platform::CUDADeviceContext, int64_t>;
template struct SelectedRowsAddTo<platform::CUDADeviceContext,
platform::float16>;
namespace {
template <typename T, int block_size>
......@@ -235,8 +229,6 @@ template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, float>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, double>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext, int64_t>;
template struct SelectedRowsAddToTensor<platform::CUDADeviceContext,
platform::float16>;
namespace scatter {
......@@ -284,7 +276,7 @@ struct MergeAdd<platform::CUDADeviceContext, T> {
context.GetPlace());
math::SetConstant<platform::CUDADeviceContext, T> constant_functor;
constant_functor(context, out.mutable_value(), static_cast<T>(0));
constant_functor(context, out.mutable_value(), 0.0);
auto* out_data = out.mutable_value()->data<T>();
auto* input_data = input.value().data<T>();
......@@ -308,7 +300,6 @@ template struct MergeAdd<platform::CUDADeviceContext, float>;
template struct MergeAdd<platform::CUDADeviceContext, double>;
template struct MergeAdd<platform::CUDADeviceContext, int>;
template struct MergeAdd<platform::CUDADeviceContext, int64_t>;
template struct MergeAdd<platform::CUDADeviceContext, platform::float16>;
template <typename T, int block_size>
__global__ void UpdateToTensorKernel(const T* selected_rows,
......
......@@ -94,15 +94,12 @@ void SoftmaxGradCUDNNFunctor<T>::operator()(
template class SoftmaxCUDNNFunctor<platform::float16>;
template class SoftmaxCUDNNFunctor<float>;
template class SoftmaxCUDNNFunctor<double>;
template class SoftmaxGradCUDNNFunctor<platform::float16>;
template class SoftmaxGradCUDNNFunctor<float>;
template class SoftmaxGradCUDNNFunctor<double>;
template class SoftmaxFunctor<platform::CUDADeviceContext, platform::float16>;
template class SoftmaxFunctor<platform::CUDADeviceContext, float>;
template class SoftmaxFunctor<platform::CUDADeviceContext, double>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext,
platform::float16>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, float>;
template class SoftmaxGradFunctor<platform::CUDADeviceContext, double>;
......
......@@ -12,16 +12,14 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/mean_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
mean, ops::MeanKernel<paddle::platform::CUDADeviceContext, float>,
ops::MeanKernel<paddle::platform::CUDADeviceContext, double>,
ops::MeanKernel<paddle::platform::CUDADeviceContext, plat::float16>);
ops::MeanKernel<paddle::platform::CUDADeviceContext, double>);
REGISTER_OP_CUDA_KERNEL(
mean_grad, ops::MeanGradKernel<paddle::platform::CUDADeviceContext, float>,
ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>,
ops::MeanGradKernel<paddle::platform::CUDADeviceContext, plat::float16>);
ops::MeanGradKernel<paddle::platform::CUDADeviceContext, double>);
......@@ -55,7 +55,7 @@ class MeanGradKernel : public framework::OpKernel<T> {
IG->mutable_data<T>(context.GetPlace());
T ig_size = static_cast<T>(IG->numel());
Eigen::DSizes<int, 1> bcast(static_cast<int>(ig_size));
Eigen::DSizes<int, 1> bcast(ig_size);
EigenVector<T>::Flatten(*IG).device(
*context.template device_context<DeviceContext>().eigen_device()) =
......
......@@ -20,7 +20,6 @@ namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(mul, ops::MulKernel<plat::CUDADeviceContext, float>,
ops::MulKernel<plat::CUDADeviceContext, double>,
ops::MulKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
mul_grad, ops::MulGradKernel<plat::CUDADeviceContext, float>,
ops::MulGradKernel<plat::CUDADeviceContext, double>,
ops::MulGradKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(mul_grad,
ops::MulGradKernel<plat::CUDADeviceContext, float>,
ops::MulGradKernel<plat::CUDADeviceContext, double>);
......@@ -174,8 +174,7 @@ REGISTER_OP_KERNEL(pool2d, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNOpKernel<plat::float16>);
REGISTER_OP_KERNEL(pool2d_grad, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNGradOpKernel<float>,
ops::PoolCUDNNGradOpKernel<double>,
ops::PoolCUDNNGradOpKernel<plat::float16>);
ops::PoolCUDNNGradOpKernel<double>);
REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNOpKernel<float>,
......@@ -183,5 +182,4 @@ REGISTER_OP_KERNEL(pool3d, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNOpKernel<plat::float16>);
REGISTER_OP_KERNEL(pool3d_grad, CUDNN, plat::CUDAPlace,
ops::PoolCUDNNGradOpKernel<float>,
ops::PoolCUDNNGradOpKernel<double>,
ops::PoolCUDNNGradOpKernel<plat::float16>);
ops::PoolCUDNNGradOpKernel<double>);
......@@ -57,6 +57,8 @@ class RecvOp : public framework::OperatorBase {
class RecvOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() {
AddInput("X", "(Any) Dummy inputs, used for control dependency")
.AsDuplicable();
AddOutput("Out", "(Tensor) Variables to get from server.").AsDuplicable();
AddComment(R"DOC(
Recv operator
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sampling_id_op.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
class SamplingIdOp : public framework::OperatorWithKernel {
public:
using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override {
PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of SamplingIdOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SamplingIdOp should not be null.");
PADDLE_ENFORCE(
ctx->Attrs().Get<float>("min") < ctx->Attrs().Get<float>("max"),
"min must less then max");
auto input_dims = ctx->GetInputDim("X");
PADDLE_ENFORCE(input_dims.size() == 2,
"Input(X, Filter) should be 2-D tensor.");
framework::DDim dims = input_dims;
ctx->SetOutputDim("Out", dims);
ctx->ShareLoD("X", "Out");
}
};
class SamplingIdOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X",
"The input tensor of softmax. "
"2-D with shape [batch_size, input_feature_dimensions].");
AddOutput("Out", "SamplingId data tensor.");
AddComment(R"DOC(
SamplingId Operator.
A layer for sampling an id from the multinomial distribution given by the
input. One id is sampled per input sample.)DOC");
AddAttr<float>("min", "Minimum value of random. [default 0.0].")
.SetDefault(0.0f);
AddAttr<float>("max", "Maximun value of random. [default 1.0].")
.SetDefault(1.0f);
AddAttr<int>("seed",
"Random seed used for the random number engine. "
"0 means use a seed generated by the system."
"Note that if seed is not 0, this operator will always "
"generate the same random numbers every time. [default 0].")
.SetDefault(0);
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OPERATOR(sampling_id, ops::SamplingIdOp, ops::SamplingIdOpMaker,
paddle::framework::EmptyGradOpMaker);
REGISTER_OP_CPU_KERNEL(sampling_id, paddle::operators::SamplingIdKernel<float>,
paddle::operators::SamplingIdKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/sampling_id_op.h"
namespace ops = paddle::operators;
REGISTER_OP_CUDA_KERNEL(sampling_id, paddle::operators::SamplingIdKernel<float>,
paddle::operators::SamplingIdKernel<double>);
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <iostream>
#include <iterator>
#include <random>
#include <sstream>
#include <vector>
#include "paddle/fluid/framework/op_registry.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
template <typename T>
class SamplingIdKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& context) const override {
const Tensor* input = context.Input<Tensor>("X");
const int batch_size = static_cast<int>(input->dims()[0]);
const int width = static_cast<int>(input->dims()[1]);
PADDLE_ENFORCE_GE(batch_size, 0,
"batch_size(dims[0]) must be nonnegative.");
PADDLE_ENFORCE_GE(width, 0, "width(dims[1]) must be nonnegative.");
std::vector<T> ins_vector;
framework::TensorToVector(*input, context.device_context(), &ins_vector);
unsigned int seed = static_cast<unsigned int>(context.Attr<int>("seed"));
std::minstd_rand engine;
if (seed == 0) {
seed = std::random_device()();
}
engine.seed(seed);
std::uniform_real_distribution<T> dist(
static_cast<T>(context.Attr<float>("min")),
static_cast<T>(context.Attr<float>("max")));
std::vector<T> ids(batch_size);
for (size_t i = 0; i < batch_size; ++i) {
T r = dist(engine);
int idx = width - 1;
for (int j = 0; j < width; ++j) {
if ((r -= ins_vector[i * width + j]) < 0) {
idx = j;
break;
}
}
ids[i] = ins_vector[i * width + idx];
}
std::vector<int64_t> out_dim;
out_dim.push_back(static_cast<int64_t>(batch_size));
Tensor* output = context.Output<Tensor>("Out");
output->Resize(framework::make_ddim(out_dim));
output->mutable_data<T>(context.GetPlace());
framework::TensorFromVector(ids, context.device_context(), output);
}
};
} // namespace operators
} // namespace paddle
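The sampling loop in SamplingIdKernel::Compute above is an inverse-CDF walk over each row: draw r uniformly, subtract the row entries until r goes negative, and take that column. A minimal NumPy sketch of the same logic, purely illustrative (the function name and the use of numpy.random.RandomState are assumptions, not part of Paddle's API):

    import numpy as np

    def sampling_id_reference(x, low=0.0, high=1.0, seed=0):
        # x: [batch_size, width] array of (unnormalized) probabilities.
        rng = np.random.RandomState(seed if seed != 0 else None)
        batch_size, width = x.shape
        out = np.empty(batch_size, dtype=x.dtype)
        for i in range(batch_size):
            r = rng.uniform(low, high)  # one draw per sample
            idx = width - 1             # fall back to the last column
            for j in range(width):
                r -= x[i, j]            # walk the cumulative mass
                if r < 0:
                    idx = j
                    break
            out[i] = x[i, idx]          # note: the kernel stores the value at idx
        return out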
......@@ -13,15 +13,11 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/scale_op.h"
#include "paddle/fluid/platform/float16.h"
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
scale,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, float>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, double>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext, int>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
int64_t>,
paddle::operators::ScaleKernel<paddle::platform::CUDADeviceContext,
plat::float16>);
int64_t>);
......@@ -37,22 +37,19 @@ class SendBarrierOp : public framework::OperatorBase {
void RunImpl(const framework::Scope& scope,
const platform::Place& place) const override {
std::vector<std::string> eps = Attr<std::vector<std::string>>("endpoints");
bool sync_mode = Attr<bool>("sync_mode");
distributed::RPCClient* rpc_client =
distributed::RPCClient::GetInstance<RPCCLIENT_T>();
VLOG(3) << "SendBarrierOp sync_mode:" << sync_mode;
VLOG(3) << "SendBarrierOp sync";
// need to wait before sending send_barrier message
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
if (sync_mode) {
for (auto& ep : eps) {
VLOG(3) << "send barrier, ep: " << ep;
rpc_client->AsyncSendBatchBarrier(ep);
}
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
for (auto& ep : eps) {
VLOG(3) << "send barrier, ep: " << ep;
rpc_client->AsyncSendBatchBarrier(ep);
}
PADDLE_ENFORCE(rpc_client->Wait(), "internal error in RPCClient");
}
};
......@@ -70,7 +67,6 @@ the Parameter Server would know all variables have been sent.
"(string vector, default 127.0.0.1:6164)"
"Server endpoints to send variables to.")
.SetDefault({"127.0.0.1:6164"});
AddAttr<bool>("sync_mode", "work in sync_mode or not").SetDefault(true);
}
};
......
......@@ -66,6 +66,8 @@ class SendOpMaker : public framework::OpProtoAndCheckerMaker {
void Make() {
AddInput("X", "(Tensor, SelectedRows) Input variables to be sent")
.AsDuplicable();
AddOutput("Out", "(Any) Dummy outputs, used for control dependency")
.AsDuplicable();
AddComment(R"DOC(
Send operator
......
......@@ -111,7 +111,7 @@ class SGDOpKernel : public framework::OpKernel<T> {
for (size_t i = 0; i < grad.rows().size(); i++) {
PADDLE_ENFORCE(grad.rows()[i] < grad.height(),
"Input rows index should less than height");
int64_t id_index = param.Index(grad.rows()[i]);
int64_t id_index = param_out->AutoGrownIndex(grad.rows()[i], false);
PADDLE_ENFORCE_GE(id_index, static_cast<int64_t>(0),
"id should be in the table");
for (int64_t j = 0; j < grad_row_width; j++) {
......
......@@ -78,5 +78,4 @@ REGISTER_OP_KERNEL(softmax, CUDNN, plat::CUDAPlace,
ops::SoftmaxCUDNNKernel<float>,
ops::SoftmaxCUDNNKernel<plat::float16>);
REGISTER_OP_KERNEL(softmax_grad, CUDNN, plat::CUDAPlace,
ops::SoftmaxGradCUDNNKernel<float>,
ops::SoftmaxGradCUDNNKernel<plat::float16>);
ops::SoftmaxGradCUDNNKernel<float>);
......@@ -23,5 +23,4 @@ REGISTER_OP_CUDA_KERNEL(
ops::SoftmaxKernel<plat::CUDADeviceContext, plat::float16>);
REGISTER_OP_CUDA_KERNEL(
softmax_grad, ops::SoftmaxGradKernel<plat::CUDADeviceContext, float>,
ops::SoftmaxGradKernel<plat::CUDADeviceContext, double>,
ops::SoftmaxGradKernel<plat::CUDADeviceContext, plat::float16>);
ops::SoftmaxGradKernel<plat::CUDADeviceContext, double>);
......@@ -11,13 +11,10 @@ limitations under the License. */
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/sum_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
sum, ops::SumKernel<paddle::platform::CUDADeviceContext, float>,
ops::SumKernel<paddle::platform::CUDADeviceContext, double>,
ops::SumKernel<paddle::platform::CUDADeviceContext, int>,
ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>,
ops::SumKernel<paddle::platform::CUDADeviceContext, plat::float16>);
ops::SumKernel<paddle::platform::CUDADeviceContext, int64_t>);
......@@ -46,7 +46,7 @@ class SumKernel : public framework::OpKernel<T> {
if (!in_place) {
math::SetConstant<DeviceContext, T> constant_functor;
constant_functor(context.template device_context<DeviceContext>(), out,
static_cast<T>(0));
0.0);
}
math::SelectedRowsAddToTensor<DeviceContext, T> functor;
......
......@@ -11,19 +11,16 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <limits>
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/operators/top_k_op.h"
#include "paddle/fluid/platform/assert.h"
#include "paddle/fluid/platform/cuda_device_function.h"
#include "paddle/fluid/platform/float16.h"
namespace paddle {
namespace operators {
using Tensor = framework::Tensor;
using paddle::platform::float16;
template <typename T>
struct Pair {
......@@ -35,11 +32,6 @@ struct Pair {
id = id;
}
__device__ __forceinline__ void clear() {
v = -INFINITY;
id = -1;
}
__device__ __forceinline__ void operator=(const Pair<T>& in) {
v = in.v;
id = in.id;
......@@ -61,12 +53,6 @@ struct Pair {
int64_t id;
};
template <>
__device__ __forceinline__ void Pair<float16>::clear() {
v = platform::raw_uint16_to_float16(0x400);
id = -1;
}
template <typename T>
__device__ __forceinline__ void AddTo(Pair<T> topk[], const Pair<T>& p,
int beam_size) {
......@@ -164,7 +150,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
if (k < MaxLength - (*beam)) {
topk[k] = topk[k + *beam];
} else {
topk[k].clear();
topk[k].set(-INFINITY, -1);
}
}
if (!(*is_empty)) {
......@@ -174,7 +160,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
}
*max = topk[MaxLength - 1];
if ((*max).v == static_cast<T>(-1)) *is_empty = true;
if ((*max).v == -1) *is_empty = true;
*beam = 0;
}
}
......@@ -195,7 +181,7 @@ __device__ __forceinline__ void ThreadGetTopK(Pair<T> topk[], int* beam,
if (k < MaxLength - *beam) {
topk[k] = topk[k + *beam];
} else {
topk[k].set(std::numeric_limits<T>::min(), -1);
topk[k].set(-INFINITY, -1);
}
}
if (!(*is_empty)) {
......@@ -287,7 +273,7 @@ __global__ void KeMatrixTopK(T* output, int output_stride, int64_t* indices,
bool firststep = true;
for (int k = 0; k < MaxLength; k++) {
topk[k].clear();
topk[k].set(-INFINITY, -1);
}
while (k) {
ThreadGetTopK<T, MaxLength, BlockSize>(topk, &beam, k,
......@@ -339,7 +325,5 @@ class TopkOpCUDAKernel : public framework::OpKernel<T> {
} // namespace operators
} // namespace paddle
REGISTER_OP_CUDA_KERNEL(
top_k, paddle::operators::TopkOpCUDAKernel<float>,
paddle::operators::TopkOpCUDAKernel<double>,
paddle::operators::TopkOpCUDAKernel<paddle::platform::float16>);
REGISTER_OP_CUDA_KERNEL(top_k, paddle::operators::TopkOpCUDAKernel<float>,
paddle::operators::TopkOpCUDAKernel<double>);
......@@ -30,8 +30,10 @@ class CPUUniformRandomKernel : public framework::OpKernel<T> {
tensor = out_var->GetMutable<framework::LoDTensor>();
} else if (out_var->IsType<framework::SelectedRows>()) {
auto shape = ctx.Attr<std::vector<int>>("shape");
tensor = out_var->GetMutable<framework::SelectedRows>()->mutable_value();
auto* selected_rows = out_var->GetMutable<framework::SelectedRows>();
tensor = selected_rows->mutable_value();
tensor->Resize(framework::make_ddim(shape));
selected_rows->mutable_rows()->reserve(shape[0]);
} else {
PADDLE_THROW(
"uniform_random_op's output only"
......
......@@ -11,14 +11,10 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <glog/logging.h>
#include <thrust/random.h>
#include <thrust/transform.h>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace operators {
......@@ -40,11 +36,6 @@ struct UniformGenerator {
}
};
template <typename T, typename V>
struct CastFunctor {
HOSTDEVICE V operator()(const T& a) { return static_cast<V>(a); }
};
// It seems that Eigen::Tensor::random in GPU will SEGFAULT.
// Use std::random and thrust::random(thrust is a std library in CUDA) to
// implement uniform random.
......@@ -75,50 +66,18 @@ class GPUUniformRandomKernel : public framework::OpKernel<T> {
T max = static_cast<T>(context.Attr<float>("max"));
thrust::counting_iterator<unsigned int> index_sequence_begin(0);
int64_t size = tensor->numel();
if (out_var->IsType<framework::LoDTensor>() &&
std::type_index(typeid(T)) ==
std::type_index(typeid(platform::float16))) {
framework::Tensor master_copy_tensor;
master_copy_tensor.Resize(tensor->dims());
float* master_copy_tensor_data =
master_copy_tensor.mutable_data<float>(context.GetPlace());
thrust::transform(index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<float>(master_copy_tensor_data),
UniformGenerator<float>(static_cast<float>(min),
static_cast<float>(max), seed));
platform::Transform<platform::CUDADeviceContext> trans;
auto* in_begin = master_copy_tensor.data<float>();
auto* in_end = in_begin + master_copy_tensor.numel();
auto* out_begin = tensor->mutable_data<T>(context.GetPlace());
trans(context.template device_context<platform::CUDADeviceContext>(),
in_begin, in_end, out_begin, CastFunctor<float, T>());
} else {
thrust::transform(index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<T>(data),
UniformGenerator<T>(min, max, seed));
}
if (VLOG_IS_ON(5)) {
framework::Tensor cpu_tensor;
framework::TensorCopySync(*tensor, platform::CPUPlace(), &cpu_tensor);
auto& dev_ctx =
*platform::DeviceContextPool::Instance().Get(context.GetPlace());
dev_ctx.Wait();
auto x = framework::EigenVector<T>::Flatten(cpu_tensor);
VLOG(5) << "The Uniform output " << x;
}
thrust::transform(index_sequence_begin, index_sequence_begin + size,
thrust::device_ptr<T>(data),
UniformGenerator<T>(min, max, seed));
}
};
} // namespace operators
} // namespace paddle
namespace plat = paddle::platform;
REGISTER_OP_CUDA_KERNEL(
uniform_random, paddle::operators::GPUUniformRandomKernel<float>,
paddle::operators::GPUUniformRandomKernel<double>,
paddle::operators::GPUUniformRandomKernel<plat::float16>);
REGISTER_OP_CUDA_KERNEL(
uniform_random_batch_size_like,
paddle::operators::GPUUniformRandomKernel<float>,
paddle::operators::GPUUniformRandomKernel<double>,
paddle::operators::GPUUniformRandomKernel<plat::float16>);
REGISTER_OP_CUDA_KERNEL(uniform_random,
paddle::operators::GPUUniformRandomKernel<float>,
paddle::operators::GPUUniformRandomKernel<double>);
REGISTER_OP_CUDA_KERNEL(uniform_random_batch_size_like,
paddle::operators::GPUUniformRandomKernel<float>,
paddle::operators::GPUUniformRandomKernel<double>);
......@@ -36,7 +36,7 @@ __forceinline__ __device__ T CudaShuffleDownSync(unsigned mask, T val,
#if CUDA_VERSION < 9000
return __shfl_down(val, delta, width);
#else
return __shfl_down_sync(mask, val, delta, width);
return __shfl_down_sync(mask, val, static_cast<unsigned>(delta), width);
#endif
}
......@@ -46,9 +46,16 @@ template <>
__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
float16 val, int delta,
int width) {
half tmp = static_cast<half>(val);
__shfl_down(tmp, static_cast<unsigned>(delta), width);
return float16(tmp);
return float16(
__shfl_down(static_cast<half>(val), static_cast<unsigned>(delta), width));
}
#else
template <>
__forceinline__ __device__ float16 CudaShuffleDownSync(unsigned mask,
float16 val, int delta,
int width) {
return float16(__shfl_down_sync(mask, static_cast<half>(val),
static_cast<unsigned>(delta), width));
}
#endif
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include <gtest/gtest.h>
#include <algorithm>
#include <iostream>
#include <random>
......@@ -123,7 +124,7 @@ void TestUnalign(size_t num, const int shift_bit) {
cudaMemcpy(out, d_in2, array_size, cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
for (size_t i = 0; i < num / 2; ++i) {
// NOTE(dzhwinter): the float16 add has small underflow/overflow
// NOTE(dzhwinter): the float16 add has a small truncation error.
// so we use EXPECT_NEAR to check the result.
EXPECT_NEAR(static_cast<float>(out[i]),
static_cast<float>(AddFunctor<float16>()(r_in1[i], r_in2[i])),
......@@ -151,3 +152,83 @@ TEST(CudaAtomic, float16Unalign) {
TestUnalign(static_cast<size_t>(1024), /*shift_bit*/ 3);
TestUnalign(static_cast<size_t>(1024 * 1024), /*shift_bit*/ 3);
}
// https://devblogs.nvidia.com/faster-parallel-reductions-kepler/
template <typename T>
static __forceinline__ __device__ T WarpReduceSum(T val) {
unsigned mask = 0u;
CREATE_SHFL_MASK(mask, true);
for (int offset = warpSize / 2; offset > 0; offset /= 2) {
val += paddle::platform::CudaShuffleDownSync(mask, val, offset);
}
return val;
}
template <typename T>
__forceinline__ __device__ T BlockReduce(T val) {
static __shared__ T shared[32]; // Shared mem for 32 partial sums
int lane = threadIdx.x % warpSize;
int wid = threadIdx.x / warpSize;
val = WarpReduceSum(val); // Each warp performs partial reduction
if (lane == 0) shared[wid] = val; // Write reduced value to shared memory
__syncthreads(); // Wait for all partial reductions
// read from shared memory only if that warp existed
val =
(threadIdx.x < blockDim.x / warpSize) ? shared[lane] : static_cast<T>(0);
if (wid == 0) val = WarpReduceSum(val); // Final reduce within first warp
return val;
}
template <typename T>
__global__ void DeviceReduceSum(T* in, T* out, size_t N) {
T sum(0);
for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < N;
i += blockDim.x * gridDim.x) {
sum += in[i];
}
sum = BlockReduce<T>(sum);
__syncthreads();
if (threadIdx.x == 0) out[blockIdx.x] = sum;
}
template <typename T>
void TestReduce(size_t num, float atol = 0.01) {
T* in1;
T *d_in1, *d_in2;
size_t size = sizeof(T) * num;
cudaMalloc(reinterpret_cast<void**>(&d_in1), size);
cudaMalloc(reinterpret_cast<void**>(&d_in2), sizeof(T));
in1 = reinterpret_cast<T*>(malloc(size));
std::minstd_rand engine;
std::uniform_real_distribution<double> dist(0.0, 1.0);
for (size_t i = 0; i < num; ++i) {
in1[i] = static_cast<T>(dist(engine));
}
auto out = std::accumulate(in1, in1 + num, static_cast<T>(0));
cudaMemcpy(d_in1, in1, size, cudaMemcpyHostToDevice);
cudaDeviceSynchronize();
DeviceReduceSum<T><<<1, PADDLE_CUDA_NUM_THREADS>>>(d_in1, d_in2, num);
cudaMemcpy(in1, d_in2, sizeof(T), cudaMemcpyDeviceToHost);
cudaDeviceSynchronize();
// NOTE(dzhwinter): the float16 add has small underflow/overflow
// so we use EXPECT_NEAR to check the result.
EXPECT_NEAR(static_cast<float>(in1[0]), static_cast<float>(out), atol);
free(in1);
cudaFree(d_in1);
cudaFree(d_in2);
}
TEST(CudaShuffleSync, float16) {
TestReduce<float>(10);
TestReduce<float>(1000);
// float16 will overflow or accumulate truncation errors at larger sizes.
TestReduce<float16>(10);
TestReduce<float16>(100, /*atol error*/ 1.0);
}
......@@ -205,12 +205,7 @@ void BindBlockDesc(pybind11::module *m) {
void BindVarDsec(pybind11::module *m) {
pybind11::class_<pd::VarDesc> var_desc(*m, "VarDesc", "");
var_desc
.def("name",
[](pd::VarDesc &self) {
pybind11::bytes name = self.Name();
return name;
},
pybind11::return_value_policy::reference)
.def("name", &pd::VarDesc::Name, pybind11::return_value_policy::reference)
.def("set_name", &pd::VarDesc::SetName)
.def("set_shape", &pd::VarDesc::SetShape)
.def("set_shapes", &pd::VarDesc::SetShapes)
......
......@@ -54,6 +54,8 @@ limitations under the License. */
#include "paddle/fluid/platform/gpu_info.h"
#endif
#include "pybind11/stl.h"
// disable auto conversion to list in Python
PYBIND11_MAKE_OPAQUE(paddle::framework::LoDTensorArray);
......@@ -247,6 +249,7 @@ PYBIND11_PLUGIN(core) {
self.set_rows(new_rows);
#endif
})
.def("sync_index", [](SelectedRows &instance) { instance.SyncIndex(); })
.def("rows", [](SelectedRows &self) {
auto rows = self.rows();
std::vector<int64_t> new_rows;
......
......@@ -54,7 +54,7 @@ function cpu_config() {
if [ $platform == "Linux" ]; then
ht=`lscpu |grep "per core"|awk -F':' '{print $2}'|xargs`
elif [ $platform == "Darwin" ]; then
if [`sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu`]; then
if [ `sysctl -n hw.physicalcpu` -eq `sysctl -n hw.logicalcpu` ]; then
# HT is OFF
ht=1
fi
......
......@@ -24,4 +24,5 @@ except ImportError:
import paddle.reader
import paddle.dataset
import paddle.batch
import paddle.compat
batch = batch.batch
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import six
import math
__all__ = [
'long_type',
'to_text',
'to_bytes',
'round',
'floor_division',
'get_exception_message',
]
if six.PY2:
int_type = int
long_type = long
else:
int_type = int
long_type = int
# str and bytes related functions
def to_text(obj, encoding='utf-8', inplace=False):
"""
All strings in PaddlePaddle should be represented as literal strings.
This function converts the given object to a literal string without any encoding.
In particular, if the object is a list or set container, we iterate over
all items in the object and convert each of them to a literal string.
In Python3:
Decode the bytes type object to str type with specific encoding
In Python2:
Decode the str type object to unicode type with specific encoding
Args:
obj(unicode|str|bytes|list|set) : The object to be decoded.
encoding(str) : The encoding format to decode a string
inplace(bool) : Whether to modify the original object in place instead of creating a new one
Returns:
Decoded result of obj
"""
if obj is None:
return obj
if isinstance(obj, list):
if inplace:
for i in six.moves.xrange(len(obj)):
obj[i] = _to_text(obj[i], encoding)
return obj
else:
return [_to_text(item, encoding) for item in obj]
elif isinstance(obj, set):
if inplace:
for item in obj:
obj.remove(item)
obj.add(_to_text(item, encoding))
return obj
else:
return set([_to_text(item, encoding) for item in obj])
else:
return _to_text(obj, encoding)
def _to_text(obj, encoding):
"""
In Python3:
Decode the bytes type object to str type with specific encoding
In Python2:
Decode the str type object to unicode type with specific encoding,
or we just return the unicode string of object
Args:
obj(unicode|str|bytes) : The object to be decoded.
encoding(str) : The encoding format
Returns:
decoded result of obj
"""
if obj is None:
return obj
if isinstance(obj, six.binary_type):
return obj.decode(encoding)
elif isinstance(obj, six.text_type):
return obj
else:
return six.u(obj)
def to_bytes(obj, encoding='utf-8', inplace=False):
"""
All strings in PaddlePaddle should be represented as literal strings.
This function converts the given object to bytes with the specified encoding.
In particular, if the object is a list or set container, we iterate over
all items in the object and convert each of them to bytes.
In Python3:
Encode the str type object to bytes type with specific encoding
In Python2:
Encode the unicode type object to str type with specific encoding,
or we just return the 8-bit string of object
Args:
obj(unicode|str|bytes|list|set) : The object to be encoded.
encoding(str) : The encoding format to encode a string
inplace(bool) : Whether to modify the original object in place instead of creating a new one
Returns:
Encoded result of obj
"""
if obj is None:
return obj
if isinstance(obj, list):
if inplace:
for i in six.moves.xrange(len(obj)):
obj[i] = _to_bytes(obj[i], encoding)
return obj
else:
return [_to_bytes(item, encoding) for item in obj]
elif isinstance(obj, set):
if inplace:
for item in obj:
obj.remove(item)
obj.add(_to_bytes(item, encoding))
return obj
else:
return set([_to_bytes(item, encoding) for item in obj])
else:
return _to_bytes(obj, encoding)
def _to_bytes(obj, encoding):
"""
In Python3:
Encode the str type object to bytes type with specific encoding
In Python2:
Encode the unicode type object to str type with specific encoding,
or we just return the 8-bit string of object
Args:
obj(unicode|str|bytes) : The object to be encoded.
encoding(str) : The encoding format
Returns:
encoded result of obj
"""
if obj is None:
return obj
assert encoding is not None
if isinstance(obj, six.text_type):
return obj.encode(encoding)
elif isinstance(obj, six.binary_type):
return obj
else:
return six.b(obj)
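A few quick checks of the expected round-trip behavior under Python 3 (a hedged sketch; it assumes the module is importable as paddle.compat):

    import paddle.compat as cpt

    assert cpt.to_text(b'hello') == 'hello'          # bytes -> str
    assert cpt.to_bytes('hello') == b'hello'         # str -> bytes
    assert cpt.to_text([b'a', b'b']) == ['a', 'b']   # list items converted one by one
    assert cpt.to_bytes({'a'}) == {b'a'}             # sets are rebuilt element-wise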
# math related functions
def round(x, d=0):
"""
Compatible round which behaves the same way in Python 2 and Python 3,
rounding halfway cases away from zero as Python 2 does.
Args:
x(float) : The number to round.
d(int) : The number of decimal places, defaulting to 0.
Returns:
round result of x
"""
if six.PY3:
# The official workaround for round in Python 3 is insufficient here;
# we implement it according to this answer: https://www.techforgeek.info/round_python.html
if x > 0.0:
p = 10**d
return float(math.floor((x * p) + math.copysign(0.5, x))) / p
elif x < 0.0:
p = 10**d
return float(math.ceil((x * p) + math.copysign(0.5, x))) / p
else:
return math.copysign(0.0, x)
else:
import __builtin__
return __builtin__.round(x, d)
def floor_division(x, y):
"""
Compatible division which behaves the same way in Python 2 and Python 3:
the result is an int value of floor(x / y), matching Python 2's integer
division and Python 3's // operator.
Args:
x(int|float) : The number to divide.
y(int|float) : The number to be divided
Returns:
division result of x // y
"""
return x // y
# exception related functions
def get_exception_message(exc):
"""
Get the error message of a specific exception
Args:
exc(Exception) : The exception to get the error message from.
Returns:
the error message of exc
"""
assert exc is not None
if six.PY2:
return exc.message
else:
return str(exc)
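For the math and exception helpers, a short illustrative check of the Python 2 style semantics they provide (again assuming the paddle.compat import path):

    from paddle.compat import round, floor_division, get_exception_message

    assert round(2.5) == 3.0          # Python 3's builtin round(2.5) would give 2
    assert round(-2.5) == -3.0        # halfway cases round away from zero
    assert floor_division(7, 2) == 3  # floor semantics on both interpreters

    try:
        raise ValueError('bad value')
    except ValueError as e:
        assert get_exception_message(e) == 'bad value'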
......@@ -28,11 +28,13 @@ images per class.
"""
from __future__ import print_function
import itertools
import numpy
import paddle.dataset.common
import tarfile
from six.moves import zip
import six
from six.moves import cPickle as pickle
__all__ = ['train100', 'test100', 'train10', 'test10', 'convert']
......@@ -46,10 +48,11 @@ CIFAR100_MD5 = 'eb9058c3a382ffc7106e4002c42a8d85'
def reader_creator(filename, sub_name, cycle=False):
def read_batch(batch):
data = batch['data']
labels = batch.get('labels', batch.get('fine_labels', None))
data = batch[six.b('data')]
labels = batch.get(
six.b('labels'), batch.get(six.b('fine_labels'), None))
assert labels is not None
for sample, label in zip(data, labels):
for sample, label in six.moves.zip(data, labels):
yield (sample / 255.0).astype(numpy.float32), int(label)
def reader():
......@@ -59,7 +62,11 @@ def reader_creator(filename, sub_name, cycle=False):
while True:
for name in names:
batch = pickle.load(f.extractfile(name))
if six.PY2:
batch = pickle.load(f.extractfile(name))
else:
batch = pickle.load(
f.extractfile(name), encoding='bytes')
for item in read_batch(batch):
yield item
if not cycle:
......
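The six.b(...) keys above are needed because a pickle written by Python 2 stores its dict keys as 8-bit strings, and loading it under Python 3 with encoding='bytes' surfaces them as bytes. A small stand-in illustration (the dumped dict below merely simulates a Python 2 pickle):

    import pickle
    import six

    blob = pickle.dumps({b'data': [0, 1], b'labels': [1]}, protocol=2)
    batch = pickle.loads(blob, encoding='bytes')
    assert six.b('data') in batch  # six.b('data') is 'data' on py2 and b'data' on py3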
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import requests
import hashlib
import os
......@@ -85,10 +87,10 @@ def download(url, module_name, md5sum, save_name=None):
total_length = r.headers.get('content-length')
if total_length is None:
with open(filename, 'w') as f:
with open(filename, 'wb') as f:
shutil.copyfileobj(r.raw, f)
else:
with open(filename, 'w') as f:
with open(filename, 'wb') as f:
dl = 0
total_length = int(total_length)
for data in r.iter_content(chunk_size=4096):
......
......@@ -20,15 +20,18 @@ dataset. And a pre-trained word vector model based on Wikipedia corpus is used
to initialize SRL model.
"""
from __future__ import print_function
import tarfile
import gzip
import itertools
import paddle.dataset.common
from six.moves import zip
import paddle.compat as cpt
from six.moves import zip, range
__all__ = ['test', 'get_dict', 'get_embedding', 'convert']
DATA_URL = 'http://www.cs.upc.edu/~srlconll/conll05st-tests.tar.gz'
DATA_URL = 'http://paddlemodels.bj.bcebos.com/conll05st/conll05st-tests.tar.gz'
DATA_MD5 = '387719152ae52d60422c016e92a742fc'
WORDDICT_URL = 'http://paddlemodels.bj.bcebos.com/conll05st%2FwordDict.txt'
WORDDICT_MD5 = 'ea7fb7d4c75cc6254716f0177a506baa'
......@@ -89,8 +92,8 @@ def corpus_reader(data_path, words_name, props_name):
labels = []
one_seg = []
for word, label in zip(words_file, props_file):
word = word.strip()
label = label.strip().split()
word = cpt.to_text(word.strip())
label = cpt.to_text(label.strip().split())
if len(label) == 0: # end of sentence
for i in range(len(one_seg[0])):
......
......@@ -28,6 +28,9 @@ Graphics and Image Processing (2008)
http://www.robots.ox.ac.uk/~vgg/publications/papers/nilsback08.{pdf,ps.gz}.
"""
from __future__ import print_function
import itertools
import functools
from .common import download
......@@ -116,7 +119,7 @@ def reader_creator(data_file,
for file in open(file_list):
file = file.strip()
batch = None
with open(file, 'r') as f:
with open(file, 'rb') as f:
batch = pickle.load(f)
data = batch['data']
labels = batch['label']
......
......@@ -29,10 +29,18 @@ the image layout as follows.
formats can be used for training. Note that the format should be
kept consistent between the training and inference periods.
"""
from __future__ import print_function
import numpy as np
try:
import cv2
except ImportError:
import sys
sys.stderr.write(
'''Warning with paddle image module: opencv-python should be installed,
otherwise the paddle image module can NOT work; please install opencv-python first.'''
)
cv2 = None
import os
import tarfile
......@@ -56,7 +64,7 @@ def batch_images_from_tar(data_file,
:type data_file: string
:param dataset_name: 'train','test' or 'valid'
:type dataset_name: string
:param img2label: a dict with image file name as key
and image's label as value
:type img2label: dict
:param num_per_batch: image number per batch file
......@@ -88,7 +96,7 @@ def batch_images_from_tar(data_file,
output['data'] = data
pickle.dump(
output,
open('%s/batch_%d' % (out_path, file_id), 'w'),
open('%s/batch_%d' % (out_path, file_id), 'wb'),
protocol=pickle.HIGHEST_PROTOCOL)
file_id += 1
data = []
......@@ -99,7 +107,7 @@ def batch_images_from_tar(data_file,
output['data'] = data
pickle.dump(
output,
open('%s/batch_%d' % (out_path, file_id), 'w'),
open('%s/batch_%d' % (out_path, file_id), 'wb'),
protocol=pickle.HIGHEST_PROTOCOL)
with open(meta_file, 'a') as meta:
......@@ -113,7 +121,7 @@ def load_image_bytes(bytes, is_color=True):
Load an color or gray image from bytes array.
Example usage:
.. code-block:: python
with open('cat.jpg') as f:
......@@ -126,6 +134,8 @@ def load_image_bytes(bytes, is_color=True):
load and return a gray image.
:type is_color: bool
"""
assert cv2 is not None
flag = 1 if is_color else 0
file_bytes = np.asarray(bytearray(bytes), dtype=np.uint8)
img = cv2.imdecode(file_bytes, flag)
......@@ -137,7 +147,7 @@ def load_image(file, is_color=True):
Load an color or gray image from the file path.
Example usage:
.. code-block:: python
im = load_image('cat.jpg')
......@@ -149,6 +159,8 @@ def load_image(file, is_color=True):
load and return a gray image.
:type is_color: bool
"""
assert cv2 is not None
# cv2.IMAGE_COLOR for OpenCV3
# cv2.CV_LOAD_IMAGE_COLOR for older OpenCV Version
# cv2.IMAGE_GRAYSCALE for OpenCV3
......@@ -161,27 +173,29 @@ def load_image(file, is_color=True):
def resize_short(im, size):
"""
"""
Resize an image so that the length of shorter edge is size.
Example usage:
.. code-block:: python
im = load_image('cat.jpg')
im = resize_short(im, 256)
:param im: the input image with HWC layout.
:type im: ndarray
:param size: the shorter edge size of image after resizing.
:type size: int
"""
assert cv2 is not None
h, w = im.shape[:2]
h_new, w_new = size, size
if h > w:
h_new = size * h / w
h_new = size * h // w
else:
w_new = size * w / h
w_new = size * w // h
im = cv2.resize(im, (h_new, w_new), interpolation=cv2.INTER_CUBIC)
return im
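With floor division the computed shape stays integral while preserving the aspect ratio; for example, with illustrative numbers:

    h, w, size = 480, 640, 256
    if h > w:
        h_new, w_new = size * h // w, size
    else:
        h_new, w_new = size, size * w // h
    # -> (256, 341): the shorter edge (h) becomes 256, the longer edge scales to match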
......@@ -193,17 +207,17 @@ def to_chw(im, order=(2, 0, 1)):
according the order (2,0,1).
Example usage:
.. code-block:: python
im = load_image('cat.jpg')
im = resize_short(im, 256)
im = to_chw(im)
:param im: the input image with HWC layout.
:type im: ndarray
:param order: the transposed order.
:type order: tuple|list
"""
assert len(im.shape) == len(order)
im = im.transpose(order)
......@@ -215,11 +229,11 @@ def center_crop(im, size, is_color=True):
Crop the center of image with size.
Example usage:
.. code-block:: python
im = center_crop(im, 224)
:param im: the input image with HWC layout.
:type im: ndarray
:param size: the cropping size.
......@@ -228,8 +242,8 @@ def center_crop(im, size, is_color=True):
:type is_color: bool
"""
h, w = im.shape[:2]
h_start = (h - size) / 2
w_start = (w - size) / 2
h_start = (h - size) // 2
w_start = (w - size) // 2
h_end, w_end = h_start + size, w_start + size
if is_color:
im = im[h_start:h_end, w_start:w_end, :]
......@@ -243,11 +257,11 @@ def random_crop(im, size, is_color=True):
Randomly crop input image with size.
Example usage:
.. code-block:: python
im = random_crop(im, 224)
:param im: the input image with HWC layout.
:type im: ndarray
:param size: the cropping size.
......@@ -272,11 +286,11 @@ def left_right_flip(im, is_color=True):
Return the flipped image.
Example usage:
.. code-block:: python
im = left_right_flip(im)
:param im: input image with HWC layout or HW layout for gray image
:type im: ndarray
:param is_color: whether input image is color or not
......@@ -299,7 +313,7 @@ def simple_transform(im,
resizing, croping and flipping.
Example usage:
.. code-block:: python
im = simple_transform(im, 256, 224, True)
......@@ -314,7 +328,7 @@ def simple_transform(im,
:type is_train: bool
:param is_color: whether the image is color or not.
:type is_color: bool
:param mean: the mean values, which can be element-wise mean values or
mean values per channel.
:type mean: numpy array | list
"""
......@@ -332,7 +346,7 @@ def simple_transform(im,
im = im.astype('float32')
if mean is not None:
mean = np.array(mean, dtype=np.float32)
# mean value, may be one value per channel
if mean.ndim == 1 and is_color:
mean = mean[:, np.newaxis, np.newaxis]
elif mean.ndim == 1:
......@@ -357,7 +371,7 @@ def load_and_transform(filename,
for the transform operations.
Example usage:
.. code-block:: python
im = load_and_transform('cat.jpg', 256, 224, True)
......@@ -372,7 +386,7 @@ def load_and_transform(filename,
:type is_train: bool
:param is_color: whether the image is color or not.
:type is_color: bool
:param mean: the mean values, which can be element-wise mean values or
mean values per channel.
:type mean: numpy array | list
"""
......
......@@ -20,11 +20,14 @@ of 25,000 highly polar movie reviews for training, and 25,000 for testing.
Besides, this module also provides API for building dictionary.
"""
from __future__ import print_function
import paddle.dataset.common
import collections
import tarfile
import re
import string
import six
__all__ = ['build_dict', 'train', 'test', 'convert']
......@@ -42,13 +45,14 @@ def tokenize(pattern):
# sequential access of member files, rather than
# tarfile.extractfile, which does random access and might
# destroy hard disks.
tf = next(tarf)
tf = tarf.next()
while tf != None:
if bool(pattern.match(tf.name)):
# newline and punctuations removal and ad-hoc tokenization.
yield tarf.extractfile(tf).read().rstrip("\n\r").translate(
None, string.punctuation).lower().split()
tf = next(tarf)
yield tarf.extractfile(tf).read().rstrip(six.b(
"\n\r")).translate(
None, six.b(string.punctuation)).lower().split()
tf = tarf.next()
def build_dict(pattern, cutoff):
......@@ -62,11 +66,11 @@ def build_dict(pattern, cutoff):
word_freq[word] += 1
# Not sure if we should prune less-frequent words here.
word_freq = [x for x in list(word_freq.items()) if x[1] > cutoff]
word_freq = [x for x in six.iteritems(word_freq) if x[1] > cutoff]
dictionary = sorted(word_freq, key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*dictionary))
word_idx = dict(list(zip(words, list(range(len(words))))))
word_idx = dict(list(zip(words, six.moves.range(len(words)))))
word_idx['<unk>'] = len(words)
return word_idx
......
......@@ -14,13 +14,17 @@
"""
imikolov's simple dataset.
This module will download dataset from
http://www.fit.vutbr.cz/~imikolov/rnnlm/ and parse training set and test set
into paddle reader creators.
"""
from __future__ import print_function
import paddle.dataset.common
import collections
import tarfile
import six
__all__ = ['train', 'test', 'build_dict', 'convert']
......@@ -64,11 +68,13 @@ def build_dict(min_word_freq=50):
# remove <unk> for now, since we will set it as last index
del word_freq['<unk>']
word_freq = [x for x in list(word_freq.items()) if x[1] > min_word_freq]
word_freq = [
x for x in six.iteritems(word_freq) if x[1] > min_word_freq
]
word_freq_sorted = sorted(word_freq, key=lambda x: (-x[1], x[0]))
words, _ = list(zip(*word_freq_sorted))
word_idx = dict(list(zip(words, list(range(len(words))))))
word_idx = dict(list(zip(words, six.moves.range(len(words)))))
word_idx['<unk>'] = len(words)
return word_idx
......@@ -89,7 +95,7 @@ def reader_creator(filename, word_idx, n, data_type):
l = ['<s>'] + l.strip().split() + ['<e>']
if len(l) >= n:
l = [word_idx.get(w, UNK) for w in l]
for i in range(n, len(l) + 1):
for i in six.moves.range(n, len(l) + 1):
yield tuple(l[i - n:i])
elif DataType.SEQ == data_type:
l = l.strip().split()
......
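For the n-gram branch above, the window loop yields every length-n slice of the id sequence; a toy illustration with hypothetical ids:

    l = [0, 5, 8, 2, 1]  # e.g. ids for ['<s>', 'w1', 'w2', 'w3', '<e>']
    n = 3
    ngrams = [tuple(l[i - n:i]) for i in range(n, len(l) + 1)]
    assert ngrams == [(0, 5, 8), (5, 8, 2), (8, 2, 1)]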
......@@ -17,10 +17,15 @@ MNIST dataset.
This module will download dataset from http://yann.lecun.com/exdb/mnist/ and
parse training set and test set into paddle reader creators.
"""
from __future__ import print_function
import paddle.dataset.common
import subprocess
import numpy
import platform
import tempfile
from six.moves import range
__all__ = ['train', 'test', 'convert']
URL_PREFIX = 'http://yann.lecun.com/exdb/mnist/'
......@@ -45,23 +50,28 @@ def reader_creator(image_filename, label_filename, buffer_size):
# According to http://stackoverflow.com/a/38061619/724872, we
# cannot use standard package gzip here.
m = subprocess.Popen([zcat_cmd, image_filename], stdout=subprocess.PIPE)
m.stdout.read(16) # skip some magic bytes
tmp_image_file = tempfile.TemporaryFile(prefix='paddle_dataset')
m = subprocess.Popen(
[zcat_cmd, image_filename], stdout=tmp_image_file).communicate()
tmp_image_file.seek(16) # skip some magic bytes
l = subprocess.Popen([zcat_cmd, label_filename], stdout=subprocess.PIPE)
l.stdout.read(8) # skip some magic bytes
# Python 3 will not take stdout as a file
tmp_label_file = tempfile.TemporaryFile(prefix='paddle_dataset')
l = subprocess.Popen(
[zcat_cmd, label_filename], stdout=tmp_label_file).communicate()
tmp_label_file.seek(8) # skip some magic bytes
try: # reader could be break.
while True:
labels = numpy.fromfile(
l.stdout, 'ubyte', count=buffer_size).astype("int")
tmp_label_file, 'ubyte', count=buffer_size).astype("int")
if labels.size != buffer_size:
break # numpy.fromfile returns empty slice after EOF.
images = numpy.fromfile(
m.stdout, 'ubyte', count=buffer_size * 28 * 28).reshape(
(buffer_size, 28 * 28)).astype('float32')
tmp_image_file, 'ubyte', count=buffer_size * 28 *
28).reshape((buffer_size, 28 * 28)).astype('float32')
images = images / 255.0 * 2.0 - 1.0
......
......@@ -22,11 +22,15 @@ set and test set into paddle reader creators.
"""
from __future__ import print_function
import zipfile
import paddle.dataset.common
import re
import random
import functools
import six
import paddle.compat as cpt
__all__ = [
'train', 'test', 'get_movie_title_dict', 'max_movie_id', 'max_user_id',
......@@ -112,6 +116,7 @@ def __initialize_meta_info__():
categories_set = set()
with package.open('ml-1m/movies.dat') as movie_file:
for i, line in enumerate(movie_file):
line = cpt.to_text(line, encoding='latin')
movie_id, title, categories = line.strip().split('::')
categories = categories.split('|')
for c in categories:
......@@ -136,6 +141,7 @@ def __initialize_meta_info__():
USER_INFO = dict()
with package.open('ml-1m/users.dat') as user_file:
for line in user_file:
line = cpt.to_text(line, encoding='latin')
uid, gender, age, job, _ = line.strip().split("::")
USER_INFO[int(uid)] = UserInfo(
index=uid, gender=gender, age=age, job_id=job)
......@@ -148,6 +154,7 @@ def __reader__(rand_seed=0, test_ratio=0.1, is_test=False):
with zipfile.ZipFile(file=fn) as package:
with package.open('ml-1m/ratings.dat') as rating:
for line in rating:
line = cpt.to_text(line, encoding='latin')
if (rand.random() < test_ratio) == is_test:
uid, mov_id, rating, _ = line.strip().split("::")
uid = int(uid)
......@@ -187,7 +194,7 @@ def max_movie_id():
Get the maximum value of movie id.
"""
__initialize_meta_info__()
return reduce(__max_index_info__, list(MOVIE_INFO.values())).index
return six.moves.reduce(__max_index_info__, list(MOVIE_INFO.values())).index
def max_user_id():
......@@ -195,7 +202,7 @@ def max_user_id():
Get the maximum value of user id.
"""
__initialize_meta_info__()
return reduce(__max_index_info__, list(USER_INFO.values())).index
return six.moves.reduce(__max_index_info__, list(USER_INFO.values())).index
def __max_job_id_impl__(a, b):
......@@ -210,7 +217,8 @@ def max_job_id():
Get the maximum value of job id.
"""
__initialize_meta_info__()
return reduce(__max_job_id_impl__, list(USER_INFO.values())).job_id
return six.moves.reduce(__max_job_id_impl__,
list(USER_INFO.values())).job_id
def movie_categories():
......
......@@ -23,6 +23,8 @@ http://research.microsoft.com/en-us/um/beijing/projects/letor/LETOR4.0/Data/MQ20
"""
from __future__ import print_function
import os
import functools
import rarfile
......
......@@ -20,6 +20,9 @@ The script fetch and preprocess movie_reviews data set that provided by NLTK
TODO(yuyang18): Complete dataset.
"""
from __future__ import print_function
import six
import collections
from itertools import chain
......@@ -64,7 +67,7 @@ def get_word_dict():
for field in movie_reviews.fileids(category):
for words in movie_reviews.words(field):
word_freq_dict[words] += 1
words_sort_list = list(word_freq_dict.items())
words_sort_list = six.iteritems(word_freq_dict)
words_sort_list.sort(cmp=lambda a, b: b[1] - a[1])
for index, word in enumerate(words_sort_list):
words_freq_sorted.append((word[0], index))
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.dataset.cifar
import unittest
......
......@@ -12,10 +12,13 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.dataset.common
import unittest
import tempfile
import glob
from six.moves import range
class TestCommon(unittest.TestCase):
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.dataset.flowers
import unittest
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.dataset.imdb
import unittest
import re
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.dataset.imikolov
import unittest
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.dataset.mnist
import unittest
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.dataset.mq2007
import unittest
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
......
......@@ -15,6 +15,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import nltk
import paddle.dataset.sentiment as st
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.dataset.voc2012
import unittest
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import paddle.dataset.wmt16
import unittest
......
......@@ -19,9 +19,10 @@ https://archive.ics.uci.edu/ml/machine-learning-databases/housing/ and
parse training set and test set into paddle reader creators.
"""
import os
from __future__ import print_function
import numpy as np
import six
import tempfile
import tarfile
import os
......@@ -70,11 +71,11 @@ def load_data(filename, feature_num=14, ratio=0.8):
return
data = np.fromfile(filename, sep=' ')
data = data.reshape(data.shape[0] / feature_num, feature_num)
data = data.reshape(data.shape[0] // feature_num, feature_num)
maximums, minimums, avgs = data.max(axis=0), data.min(axis=0), data.sum(
axis=0) / data.shape[0]
feature_range(maximums[:-1], minimums[:-1])
for i in range(feature_num - 1):
for i in six.moves.range(feature_num - 1):
data[:, i] = (data[:, i] - avgs[i]) / (maximums[i] - minimums[i])
offset = int(data.shape[0] * ratio)
UCI_TRAIN_DATA = data[:offset]
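The per-feature scaling above centers each column at its mean and divides by its range; roughly, with made-up numbers:

    import numpy as np

    data = np.array([[1.0, 10.0], [3.0, 30.0]])
    maximums, minimums = data.max(axis=0), data.min(axis=0)
    avgs = data.sum(axis=0) / data.shape[0]
    normalized = (data - avgs) / (maximums - minimums)
    # each feature is now centered on zero with unit range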
......@@ -137,7 +138,7 @@ def predict_reader():
It returns just one tuple data to do inference.
:return: one tuple data
:rtype: tuple
"""
global UCI_TEST_DATA
load_data(paddle.dataset.common.download(URL, 'uci_housing', MD5))
......
......@@ -19,6 +19,8 @@ to training/test sets has been maintained. The total number of images
with segmentation has been increased from 7,062 to 9,993.
"""
from __future__ import print_function
import tarfile
import io
import numpy as np
......
......@@ -19,10 +19,15 @@ http://paddlepaddle.cdn.bcebos.com/demo/wmt_shrinked_data/wmt14.tgz and
parse training set and test set into paddle reader creators.
"""
from __future__ import print_function
import six
import tarfile
import gzip
import paddle.dataset.common
import paddle.compat as cpt
__all__ = [
'train',
......@@ -53,7 +58,7 @@ def __read_to_dict(tar_file, dict_size):
out_dict = dict()
for line_count, line in enumerate(fd):
if line_count < size:
out_dict[line.strip()] = line_count
out_dict[cpt.to_text(line.strip())] = line_count
else:
break
return out_dict
......@@ -84,7 +89,7 @@ def reader_creator(tar_file, file_name, dict_size):
]
for name in names:
for line in f.extractfile(name):
line_split = line.strip().split('\t')
line_split = line.strip().split(six.b('\t'))
if len(line_split) != 2:
continue
src_seq = line_split[0] # one source sequence
......@@ -153,8 +158,8 @@ def get_dict(dict_size, reverse=True):
tar_file = paddle.dataset.common.download(URL_TRAIN, 'wmt14', MD5_TRAIN)
src_dict, trg_dict = __read_to_dict(tar_file, dict_size)
if reverse:
src_dict = {v: k for k, v in list(src_dict.items())}
trg_dict = {v: k for k, v in list(trg_dict.items())}
src_dict = {v: k for k, v in six.iteritems(src_dict)}
trg_dict = {v: k for k, v in six.iteritems(trg_dict)}
return src_dict, trg_dict
......
......@@ -28,12 +28,16 @@ Multi30K: Multilingual English-German Image Descriptions.
}
"""
from __future__ import print_function
import os
import six
import tarfile
import gzip
from collections import defaultdict
import paddle.dataset.common
import paddle.compat as cpt
__all__ = [
"train",
......@@ -60,7 +64,7 @@ def __build_dict(tar_file, dict_size, save_path, lang):
word_dict = defaultdict(int)
with tarfile.open(tar_file, mode="r") as f:
for line in f.extractfile("wmt16/train"):
line_split = line.strip().split("\t")
line_split = line.strip().split(six.b("\t"))
if len(line_split) != 2: continue
sen = line_split[0] if lang == "en" else line_split[1]
for w in sen.split():
......@@ -70,8 +74,7 @@ def __build_dict(tar_file, dict_size, save_path, lang):
fout.write("%s\n%s\n%s\n" % (START_MARK, END_MARK, UNK_MARK))
for idx, word in enumerate(
sorted(
iter(list(word_dict.items())),
key=lambda x: x[1],
six.iteritems(word_dict), key=lambda x: x[1],
reverse=True)):
if idx + 3 == dict_size: break
fout.write("%s\n" % (word[0]))
......@@ -81,16 +84,16 @@ def __load_dict(tar_file, dict_size, lang, reverse=False):
dict_path = os.path.join(paddle.dataset.common.DATA_HOME,
"wmt16/%s_%d.dict" % (lang, dict_size))
if not os.path.exists(dict_path) or (
len(open(dict_path, "r").readlines()) != dict_size):
len(open(dict_path, "rb").readlines()) != dict_size):
__build_dict(tar_file, dict_size, dict_path, lang)
word_dict = {}
with open(dict_path, "r") as fdict:
with open(dict_path, "rb") as fdict:
for idx, line in enumerate(fdict):
if reverse:
word_dict[idx] = line.strip()
word_dict[idx] = cpt.to_text(line.strip())
else:
word_dict[line.strip()] = idx
word_dict[cpt.to_text(line.strip())] = idx
return word_dict
......@@ -120,7 +123,7 @@ def reader_creator(tar_file, file_name, src_dict_size, trg_dict_size, src_lang):
with tarfile.open(tar_file, mode="r") as f:
for line in f.extractfile(file_name):
line_split = line.strip().split("\t")
line_split = line.strip().split(six.b("\t"))
if len(line_split) != 2:
continue
src_words = line_split[src_col].split()
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import numpy as np
import warnings
"""
......
......@@ -12,11 +12,14 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from paddle.fluid import framework as framework
from . import core
import collections
import copy
import six
from .. import compat as cpt
from . import unique_name
__all__ = ['append_backward']
......@@ -45,13 +48,13 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
"""
op_desc = core.OpDesc()
op_desc.set_type(op_type)
for para, args in list(inputs.items()):
for para, args in six.iteritems(inputs):
op_desc.set_input(
para,
list(
map(lambda arg: arg.decode() if isinstance(arg, six.binary_type) else arg,
args)))
for para, args in list(outputs.items()):
for para, args in six.iteritems(outputs):
op_desc.set_output(
para,
list(
......@@ -63,7 +66,7 @@ def _create_op_desc_(op_type, inputs, outputs, attrs):
if op_role_attr_name not in attrs:
attrs[
op_role_attr_name] = core.op_proto_and_checker_maker.OpRole.Backward
for name, val in list(attrs.items()):
for name, val in six.iteritems(attrs):
if isinstance(val, framework.Block):
op_desc.set_block_attr(name, val.desc)
else:
......@@ -75,10 +78,10 @@ def _infer_var_data_type_(grad_var_name, block):
"""
Infer the data type of given grad variable
"""
grad_var = block.desc.find_var(grad_var_name.encode("ascii"))
fwd_name = _strip_grad_suffix_(grad_var_name.encode("ascii"))
if block.desc.has_var_recursive(fwd_name):
fwd_var = block.desc.find_var_recursive(fwd_name.encode("ascii"))
grad_var = block.desc.find_var(cpt.to_bytes(grad_var_name))
fwd_name = _strip_grad_suffix_(grad_var_name)
if block.desc.has_var_recursive(cpt.to_bytes(fwd_name)):
fwd_var = block.desc.find_var_recursive(cpt.to_bytes(fwd_name))
grad_var.set_dtype(fwd_var.dtype())
else:
grad_var.set_dtype(core.VarDesc.VarType.FP32)
......@@ -102,8 +105,10 @@ def _some_in_set_(cands, s):
"""
if len(cands) == 0:
return False
for c in cands:
if c in s:
literal_set = cpt.to_text(s)
literal_cands = cpt.to_text(cands)
for c in literal_cands:
if c in literal_set:
return True
return False
......@@ -114,9 +119,8 @@ def _strip_grad_suffix_(name):
e.g. x@GRAD ==> x
y@GRAD@RENAME@1 ==> y
"""
if isinstance(name, six.text_type):
name = name.encode()
pos = name.find(six.b(core.grad_var_suffix()))
name = cpt.to_text(name)
pos = name.find(core.grad_var_suffix())
return name[:pos] if pos != -1 else name
......@@ -125,9 +129,7 @@ def _append_grad_suffix_(name):
Append grad suffix to the given variable name
e.g. x ==> x@GRAD
"""
if isinstance(name, six.text_type):
name = name.encode()
return name + six.b(core.grad_var_suffix())
return cpt.to_text(name) + core.grad_var_suffix()
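Assuming core.grad_var_suffix() is '@GRAD' (as the docstrings above suggest), the two helpers round-trip a name like this:

    # illustrative only
    assert _strip_grad_suffix_('x@GRAD') == 'x'
    assert _strip_grad_suffix_('y@GRAD@RENAME@1') == 'y'
    assert _append_grad_suffix_('x') == 'x@GRAD'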
def _addup_repetitive_outputs_(op_descs):
......@@ -187,7 +189,7 @@ def _addup_repetitive_outputs_(op_descs):
op_desc.set_output(param_name, arg_names)
renamed_vars[var_name].append(new_name)
for var_name, inputs in list(renamed_vars.items()):
for var_name, inputs in six.iteritems(renamed_vars):
if len(inputs) > 1:
pending_sum_ops.append(
(_create_op_desc_("sum", {"X": inputs}, {"Out": [var_name]},
......@@ -243,7 +245,7 @@ from .proto import framework_pb2
def serialize_op_decs(op_desc):
protostr = op_desc.serialize_to_string()
proto = framework_pb2.OpDesc.FromString(str(protostr))
proto = framework_pb2.OpDesc.FromString(six.binary_type(protostr))
return proto.__str__()
......@@ -364,7 +366,7 @@ def _append_backward_ops_(block,
# Getting op's corresponding grad_op
grad_op_desc, op_grad_to_var = core.get_grad_op_desc(
op.desc, no_grad_dict[block.idx], grad_sub_block_list)
op.desc, cpt.to_text(no_grad_dict[block.idx]), grad_sub_block_list)
grad_op_descs.extend(grad_op_desc)
grad_to_var.update(op_grad_to_var)
......@@ -411,11 +413,10 @@ def _append_backward_vars_(block, start_op_idx, grad_to_var, grad_info_map):
new_vars = set()
# create new gradient variables
for grad_var_name in op_desc.output_arg_names():
grad_var_name = grad_var_name.encode("ascii")
if block.desc.has_var_recursive(
grad_var_name) or grad_var_name == core.empty_var_name():
if block.desc.has_var_recursive(cpt.to_bytes(
grad_var_name)) or grad_var_name == core.empty_var_name():
continue
block.desc.var(grad_var_name)
block.desc.var(cpt.to_bytes(grad_var_name))
new_vars.add(grad_var_name)
if grad_var_name not in grad_to_var:
continue
......@@ -445,7 +446,7 @@ def _rename_grad_(block, start_op_idx, grad_to_var, target_grad_map):
op_desc.rename_output(name, new_name)
var_map[name] = new_name
for g, ng in list(var_map.items()):
for g, ng in six.iteritems(var_map):
if g in grad_to_var:
grad_to_var[ng] = grad_to_var[g]
grad_to_var.pop(g)
......@@ -595,11 +596,12 @@ def append_backward(loss, parameter_list=None, no_grad_set=None,
parameters = parameter_list
else:
params = program.global_block().all_parameters()
program.global_block().iter_parameters()
parameters = [param.name for param in params]
params_and_grads = []
for param in parameters:
if param not in grad_info_map:
if cpt.to_text(param) not in grad_info_map:
continue
grad_info = grad_info_map[param]
grad_block = grad_info[1]
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import copy
import six
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from .layers.control_flow import BlockGuard, equal
from .framework import Operator
from .layer_helper import LayerHelper, unique_name
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from . import decoder
from .decoder import *
from . import memory_usage_calc
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from . import beam_search_decoder
from .beam_search_decoder import *
......
......@@ -20,6 +20,8 @@ without using the low level API such as while ops.
This API is still under active development and may change drastically.
"""
from __future__ import print_function
import contextlib
import numpy as np
import six
......
......@@ -14,12 +14,16 @@
"""
This module provides a memory usage calculation function for users.
The purpose of this API is to allow users to estimate the memory usage of
a program under a specific batch size, so they can set an appropriate
batch size to fully utilize a GPU.
This API is still under active development and may change drastically.
"""
from __future__ import print_function
import six
from .. import core
from ..framework import Program, Variable
......@@ -45,15 +49,15 @@ def memory_usage(program, batch_size):
Args:
program(Program): The current Program.
batch_size(int): The current input data batch_size.
Returns:
min_total_memory(float): the estimated lower bound of memory usage.
max_total_memory(float): the estimated upper bound of memory usage.
unit_str(string): the unit of the estimated usage result.
Examples:
>>> import paddle.fluid as fluid
>>> lower_usage, upper_usage, unit = fluid.contrib.memory_usage(
fluid.default_main_program(), batch_size=10)
......@@ -72,7 +76,7 @@ def memory_usage(program, batch_size):
# Get the var_name list of first block and calculate
total_memory = 0.0
for var in program.global_block().vars.itervalues():
for var in six.itervalues(program.global_block().vars):
data_count = 1
for x in var.shape:
if x == -1:
......@@ -81,10 +85,10 @@ def memory_usage(program, batch_size):
data_count *= x
var_memory = data_count * dtype_to_size[var.dtype]
if DEBUG:
print "%s memory usage: %d" % (var.name, var_memory)
print("%s memory usage: %d" % (var.name, var_memory))
total_memory += var_memory
if DEBUG:
print "total memory usage: %.2f" % (total_memory)
print("total memory usage: %.2f" % (total_memory))
# Convert appropriate unit
unit_str = "B"
......
......@@ -12,6 +12,8 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
from . import core
import numpy
import os
......
......@@ -12,7 +12,10 @@
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import sys
import six
import re
from .graphviz import GraphPreviewGenerator
from .proto import framework_pb2
......@@ -225,7 +228,7 @@ def draw_block_graphviz(block, highlights=None, path="./temp.dot"):
graph = GraphPreviewGenerator("some graph")
# collect parameters and args
protostr = block.desc.serialize_to_string()
desc = framework_pb2.BlockDesc.FromString(str(protostr))
desc = framework_pb2.BlockDesc.FromString(six.binary_type(protostr))
def need_highlight(name):
if highlights is None: return False
......
......@@ -26,6 +26,8 @@ A `scoped_function` will take a `function` as input. That function will be
invoked in a new local scope.
"""
from __future__ import print_function
import paddle.fluid.core
import threading
......