Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into fix_concat_shape_2

db0518bb · phlrain · 3f0d047d · 06809ebb · db0518bb · db0518bb
36 changed file
--- a/cmake/external/protobuf.cmake
+++ b/cmake/external/protobuf.cmake
@@ -221,6 +221,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST)
            -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE}
            -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR}
            -DCMAKE_INSTALL_LIBDIR=lib
+            -DBUILD_SHARED_LIBS=OFF
        CMAKE_CACHE_ARGS
            -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR}
            -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE}

--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -118,6 +118,8 @@ paddle.fluid.layers.reduce_mean (ArgSpec(args=['input', 'dim', 'keep_dim', 'name
 paddle.fluid.layers.reduce_max (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '66a622db727551761ce4eb73eaa7f6a4'))
 paddle.fluid.layers.reduce_min (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'd50ac552b5d131468ed466d08bb2d38c'))
 paddle.fluid.layers.reduce_prod (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'fcd8301a0ce15f219c7a4bcd0c1e8eca'))
+paddle.fluid.layers.reduce_all (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '646ca4d4a2cc16084f59de44b6927eca'))
+paddle.fluid.layers.reduce_any (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'f36661060aeeaf6c6b1331e41b3726fa'))
 paddle.fluid.layers.sequence_first_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '2b290d3d77882bfe9bb8d331cac8cdd3'))
 paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'c16a892f44f7fe71bfa5afc32d3f34ce'))
 paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'fdcea0e8b5bc7d8d4b1b072c521014e6'))
@@ -125,7 +127,7 @@ paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed
 paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '652625345c2acb900029c78cc75f8aa6'))
 paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbf2adbd79683dc93db03454dfa18c2'))
 paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)), ('document', '97f0262f97602644c83142789d784571'))
-paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', '6e428384ce6a77207fa2c70d9f011990'))
+paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', '35c6a241bcc1a1fc89508860d82ad62b'))
 paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', 'b4cbe1ac451005df6dad12e9ffdccca9'))
 paddle.fluid.layers.topk (ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd3570c02f71bcd78e60b3f31dc8f5b32'))
 paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False)), ('document', 'aaba49c038ba927f0a8e45c0c9a686ab'))
@@ -204,6 +206,7 @@ paddle.fluid.layers.gaussian_random_batch_size_like (ArgSpec(args=['input', 'sha
 paddle.fluid.layers.sum (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'a418e3ccb5e2ac21bd60f5cc221d5860'))
 paddle.fluid.layers.slice (ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None), ('document', '01dbb91e7c74cb11336cd531013de51a'))
 paddle.fluid.layers.shape (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '17db0f814eb7bb5a3fac1ca6e60e16d8'))
+paddle.fluid.layers.rank (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'ee1386c42ecc8f424fe3fb21862fefc2'))
 paddle.fluid.layers.logical_and (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'cdcf20c494c92060d10feb9374532f42'))
 paddle.fluid.layers.logical_or (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '0eae3f726a4afe590757552fa3ced012'))
 paddle.fluid.layers.logical_xor (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b0daaa3fa4a0aa62f9b58c43d959eb25'))
@@ -236,7 +239,7 @@ paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], vararg
 paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '776d536cac47c89073abc7ee524d5aec'))
 paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607'))
 paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329'))
-paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', 'ad669cdf83e72a69ebc5ed79e36486de'))
+paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', '731b21c62a4add60a33bd76d802ffc5c'))
 paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393'))
 paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139'))
 paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc'))
@@ -272,6 +275,7 @@ paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, de
 paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '2e53e83127dbfd86e7098bdfe9a549e8'))
 paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '0a437011c3906079fd8947ed3e52d292'))
 paddle.fluid.layers.range (ArgSpec(args=['start', 'end', 'step', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '2ec937ede953ded2fdff2675883900bb'))
+paddle.fluid.layers.linspace (ArgSpec(args=['start', 'stop', 'num', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '495e21e9a848c2d075a102802fc67756'))
 paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
@@ -361,7 +365,7 @@ paddle.fluid.layers.inverse_time_decay (ArgSpec(args=['learning_rate', 'decay_st
 paddle.fluid.layers.polynomial_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False)), ('document', '882634f420f626642f0874481263da40'))
 paddle.fluid.layers.piecewise_decay (ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None), ('document', 'c717d9d1d78a53c809d01b8bc56f3cae'))
 paddle.fluid.layers.noam_decay (ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None), ('document', 'd9a95746353fd574be36dc28d8726c28'))
-paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', '9588c64c26ffaef3c466e404a6af9d9b'))
+paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', 'f8b2727bccf0f368c997d7cf05847e49'))
 paddle.fluid.layers.linear_lr_warmup (ArgSpec(args=['learning_rate', 'warmup_steps', 'start_lr', 'end_lr'], varargs=None, keywords=None, defaults=None), ('document', '2ef3f5ca5cd71ea4217c418e5a7a0565'))
 paddle.fluid.contrib.InitState.__init__ (ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))
 paddle.fluid.contrib.StateCell.__init__ (ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754'))

--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@@ -72,7 +72,6 @@ bool DataFeed::PickOneFile(std::string* filename) {
  }
  VLOG(3) << "file_idx_=" << *file_idx_;
  *filename = filelist_[(*file_idx_)++];
-  // LOG(ERROR) << "pick file:" << *filename;
  return true;
 }

@@ -466,6 +465,17 @@ void MultiSlotDataFeed::Init(
    if (slot.is_used()) {
      use_slots_.push_back(all_slots_[i]);
      use_slots_is_dense_.push_back(slot.is_dense());
+      std::vector<int> local_shape;
+      if (slot.is_dense()) {
+        // for batch size holder if is_dense
+        if (slot.shape(0) > 0) {
+          local_shape.push_back(0);
+        }
+      }
+      for (size_t i = 0; i < slot.shape_size(); ++i) {
+        local_shape.push_back(slot.shape(i));
+      }
+      use_slots_shape_.push_back(local_shape);
    }
  }
  feed_vec_.resize(use_slots_.size());
@@ -752,8 +762,8 @@ void MultiSlotDataFeed::PutToFeedVec(
    LoD data_lod{offset};
    feed_vec_[i]->set_lod(data_lod);
    if (use_slots_is_dense_[i]) {
-      int dim = total_instance / batch_size_;
-      feed_vec_[i]->Resize({batch_size_, dim});
+      use_slots_shape_[i][0] = batch_size_;
+      feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i]));
    }
  }
 #endif
@@ -785,6 +795,16 @@ void MultiSlotInMemoryDataFeed::Init(
    if (slot.is_used()) {
      use_slots_.push_back(all_slots_[i]);
      use_slots_is_dense_.push_back(slot.is_dense());
+      std::vector<int> local_shape;
+      if (slot.is_dense()) {
+        if (slot.shape(0) > 0) {
+          local_shape.push_back(0);
+        }
+      }
+      for (size_t i = 0; i < slot.shape_size(); ++i) {
+        local_shape.push_back(slot.shape(i));
+      }
+      use_slots_shape_.push_back(local_shape);
    }
  }
  feed_vec_.resize(use_slots_.size());
@@ -940,8 +960,8 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec(
    LoD data_lod{offset};
    feed_vec_[i]->set_lod(data_lod);
    if (use_slots_is_dense_[i]) {
-      int dim = total_instance / batch_size_;
-      feed_vec_[i]->Resize({batch_size_, dim});
+      use_slots_shape_[i][0] = batch_size_;
+      feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i]));
    }
  }
 #endif

--- a/paddle/fluid/framework/data_feed.h
+++ b/paddle/fluid/framework/data_feed.h
@@ -142,6 +142,7 @@ class DataFeed {
  // object)
  std::vector<std::string> all_slots_;
  std::vector<std::string> all_slots_type_;
+  std::vector<std::vector<int>> use_slots_shape_;
  std::vector<int>
      use_slots_index_;  // -1: not used; >=0: the index of use_slots_


--- a/paddle/fluid/framework/data_feed.proto
+++ b/paddle/fluid/framework/data_feed.proto
@@ -19,6 +19,7 @@ message Slot {
  required string type = 2;
  optional bool is_dense = 3 [ default = false ];
  optional bool is_used = 4 [ default = false ];
+  repeated int32 shape = 5; // we can define N-D Tensor
 }

 message MultiSlotDesc { repeated Slot slots = 1; }

--- a/paddle/fluid/framework/downpour_worker.cc
+++ b/paddle/fluid/framework/downpour_worker.cc
@@ -21,40 +21,40 @@ namespace framework {

 void DownpourWorker::Initialize(const TrainerDesc& desc) {
  param_ = desc.downpour_param();
-  for (size_t i = 0; i < param_.sparse_table_size(); ++i) {
+  for (int i = 0; i < param_.sparse_table_size(); ++i) {
    uint64_t table_id =
        static_cast<uint64_t>(param_.sparse_table(i).table_id());
    TableParameter table = param_.sparse_table(i);
    sparse_key_names_[table_id].resize(table.sparse_key_name_size());
-    for (size_t j = 0; j < table.sparse_key_name_size(); ++j) {
+    for (int j = 0; j < table.sparse_key_name_size(); ++j) {
      sparse_key_names_[table_id][j] = table.sparse_key_name(j);
    }
    sparse_value_names_[table_id].resize(table.sparse_value_name_size());
-    for (size_t j = 0; j < table.sparse_value_name_size(); ++j) {
+    for (int j = 0; j < table.sparse_value_name_size(); ++j) {
      sparse_value_names_[table_id][j] = table.sparse_value_name(j);
    }
    sparse_grad_names_[table_id].resize(table.sparse_grad_name_size());
-    for (size_t j = 0; j < table.sparse_grad_name_size(); ++j) {
+    for (int j = 0; j < table.sparse_grad_name_size(); ++j) {
      sparse_grad_names_[table_id][j] = table.sparse_grad_name(j);
    }
    label_var_name_[table_id] = table.label_var_name();
  }

-  for (size_t i = 0; i < param_.dense_table_size(); ++i) {
+  for (int i = 0; i < param_.dense_table_size(); ++i) {
    uint64_t table_id = static_cast<uint64_t>(param_.dense_table(i).table_id());
    auto table = param_.dense_table(i);
    dense_value_names_[table_id].resize(table.dense_value_name_size());
-    for (size_t j = 0; j < table.dense_value_name_size(); ++j) {
+    for (int j = 0; j < table.dense_value_name_size(); ++j) {
      dense_value_names_[table_id][j] = table.dense_value_name(j);
    }
    dense_grad_names_[table_id].resize(table.dense_grad_name_size());
-    for (size_t j = 0; j < table.dense_grad_name_size(); ++j) {
+    for (int j = 0; j < table.dense_grad_name_size(); ++j) {
      dense_grad_names_[table_id][j] = table.dense_grad_name(j);
    }
  }

  skip_ops_.resize(param_.skip_ops_size());
-  for (size_t i = 0; i < param_.skip_ops_size(); ++i) {
+  for (int i = 0; i < param_.skip_ops_size(); ++i) {
    skip_ops_[i] = param_.skip_ops(i);
  }

@@ -83,14 +83,14 @@ void DownpourWorker::CollectLabelInfo(size_t table_idx) {
  LoDTensor* tensor = var->GetMutable<LoDTensor>();
  int64_t* label_ptr = tensor->data<int64_t>();

-  int global_index = 0;
+  size_t global_index = 0;
  for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) {
    VLOG(3) << "sparse_key_names_[" << i
            << "]: " << sparse_key_names_[table_id][i];
    Variable* fea_var = thread_scope_->FindVar(sparse_key_names_[table_id][i]);
    LoDTensor* tensor = fea_var->GetMutable<LoDTensor>();
    int64_t* ids = tensor->data<int64_t>();
-    int fea_idx = 0;
+    size_t fea_idx = 0;
    // tensor->lod()[0].size() == batch_size + 1
    for (auto lod_idx = 1u; lod_idx < tensor->lod()[0].size(); ++lod_idx) {
      for (; fea_idx < tensor->lod()[0][lod_idx]; ++fea_idx) {
@@ -138,7 +138,7 @@ void DownpourWorker::FillSparseValue(size_t table_idx) {
    auto& tensor_lod = tensor->lod()[0];
    LoD data_lod{tensor_lod};
    tensor_emb->set_lod(data_lod);
-    for (auto index = 0u; index < len; ++index) {
+    for (int index = 0; index < len; ++index) {
      if (ids[index] == 0u) {
        memcpy(ptr + table.emb_dim() * index, init_value.data() + 2,
               sizeof(float) * table.emb_dim());
@@ -192,7 +192,7 @@ void DownpourWorker::TrainFilesWithProfiler() {
    read_time += timeline.ElapsedSec();
    total_time += timeline.ElapsedSec();
    VLOG(3) << "program config size: " << param_.program_config_size();
-    for (size_t i = 0; i < param_.program_config(0).pull_sparse_table_id_size();
+    for (int i = 0; i < param_.program_config(0).pull_sparse_table_id_size();
         ++i) {
      uint64_t tid = static_cast<uint64_t>(
          param_.program_config(0).pull_sparse_table_id(i));
@@ -244,8 +244,8 @@ void DownpourWorker::TrainFilesWithProfiler() {
    }

    if (need_to_push_sparse_) {
-      for (size_t i = 0;
-           i < param_.program_config(0).push_sparse_table_id_size(); ++i) {
+      for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size();
+           ++i) {
        uint64_t tid = static_cast<uint64_t>(
            param_.program_config(0).push_sparse_table_id(i));
        TableParameter table;
@@ -268,8 +268,8 @@ void DownpourWorker::TrainFilesWithProfiler() {

    if (need_to_push_dense_) {
      timeline.Start();
-      for (size_t i = 0;
-           i < param_.program_config(0).push_dense_table_id_size(); ++i) {
+      for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
+           ++i) {
        uint64_t tid = static_cast<uint64_t>(
            param_.program_config(0).push_dense_table_id(i));
        fleet_ptr_->PushDenseVarsAsync(
@@ -315,8 +315,8 @@ void DownpourWorker::TrainFilesWithProfiler() {
    }

    if (need_to_push_dense_) {
-      for (size_t i = 0;
-           i < param_.program_config(0).push_dense_table_id_size(); ++i) {
+      for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
+           ++i) {
        uint64_t tid = static_cast<uint64_t>(
            param_.program_config(0).push_dense_table_id(i));
        pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid);
@@ -362,7 +362,7 @@ void DownpourWorker::TrainFiles() {
  int cur_batch;
  while ((cur_batch = device_reader_->Next()) > 0) {
    // pull sparse here
-    for (size_t i = 0; i < param_.program_config(0).pull_sparse_table_id_size();
+    for (int i = 0; i < param_.program_config(0).pull_sparse_table_id_size();
         ++i) {
      uint64_t tid = static_cast<uint64_t>(
          param_.program_config(0).pull_sparse_table_id(i));
@@ -397,8 +397,8 @@ void DownpourWorker::TrainFiles() {

    if (need_to_push_sparse_) {
      // push gradients here
-      for (size_t i = 0;
-           i < param_.program_config(0).push_sparse_table_id_size(); ++i) {
+      for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size();
+           ++i) {
        uint64_t tid = static_cast<uint64_t>(
            param_.program_config(0).push_sparse_table_id(i));
        TableParameter table;
@@ -416,8 +416,8 @@ void DownpourWorker::TrainFiles() {
    }

    if (need_to_push_dense_) {
-      for (size_t i = 0;
-           i < param_.program_config(0).push_dense_table_id_size(); ++i) {
+      for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
+           ++i) {
        uint64_t tid = static_cast<uint64_t>(
            param_.program_config(0).push_dense_table_id(i));
        fleet_ptr_->PushDenseVarsAsync(
@@ -461,8 +461,8 @@ void DownpourWorker::TrainFiles() {
    }

    if (need_to_push_dense_) {
-      for (size_t i = 0;
-           i < param_.program_config(0).push_dense_table_id_size(); ++i) {
+      for (int i = 0; i < param_.program_config(0).push_dense_table_id_size();
+           ++i) {
        uint64_t tid = static_cast<uint64_t>(
            param_.program_config(0).push_dense_table_id(i));
        pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid);

--- a/paddle/fluid/framework/ir/graph_helper.cc
+++ b/paddle/fluid/framework/ir/graph_helper.cc
@@ -31,10 +31,10 @@ namespace paddle {
 namespace framework {
 namespace ir {
 namespace {
-void SortHelper(
-    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
-    ir::Node *node, std::unordered_set<ir::Node *> *visited,
-    std::vector<ir::Node *> *ret) {
+void SortHelper(const std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>,
+                               ir::NodeComp> &adj_list,
+                ir::Node *node, std::unordered_set<ir::Node *> *visited,
+                std::vector<ir::Node *> *ret) {
  visited->insert(node);

  for (auto adj : adj_list.at(node)) {
@@ -50,7 +50,8 @@ void SortHelper(

 bool HasCircleHelper(
    ir::Node *node,
-    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
+    const std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
+        &adj_list,
    std::unordered_set<ir::Node *> *visited,
    std::unordered_set<ir::Node *> *in_trace,
    std::vector<std::vector<ir::Node *>> *circles) {
@@ -84,7 +85,8 @@ bool HasCircleHelper(
 }

 bool HasCircleInternal(
-    const std::map<ir::Node *, std::unordered_set<ir::Node *>> &adj_list,
+    const std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
+        &adj_list,
    std::vector<std::vector<ir::Node *>> *circles) {
  std::unordered_set<ir::Node *> visited;
  std::unordered_set<ir::Node *> in_trace;
@@ -107,8 +109,8 @@ bool FindCircleSubGraph(const Graph &graph,
 }

 std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
-  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list =
-      BuildOperationAdjList(graph);
+  std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
+      adj_list = BuildOperationAdjList(graph);
  PADDLE_ENFORCE(!HasCircleInternal(adj_list, nullptr));
  std::unordered_set<ir::Node *> visited;
  std::vector<ir::Node *> ret;
@@ -117,34 +119,30 @@ std::vector<ir::Node *> TopologySortOperations(const Graph &graph) {
      SortHelper(adj_list, adj.first, &visited, &ret);
    }
  }
+
  return ret;
 }

 // Build operator inlink edge table.
-std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
-    const Graph &graph) {
-  std::map<ir::Node *, std::unordered_set<ir::Node *>> adj_list;
+std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
+BuildOperationAdjList(const Graph &graph) {
+  std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
+      adj_list;

  for (auto &n : graph.Nodes()) {
    if (!n->IsOp()) continue;
    if (adj_list.find(n) == adj_list.end()) {
-      adj_list[n] = std::unordered_set<ir::Node *>();
+      adj_list[n] = std::set<ir::Node *, ir::NodeComp>();
    }
-    std::vector<ir::Node *> nodes;
    for (auto &var : n->inputs) {
      for (auto &adj_n : var->inputs) {
        PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation);
        VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast<void *>(adj_n)
                << " -> " << n->Name() << reinterpret_cast<void *>(n)
                << "  via " << var->Name() << reinterpret_cast<void *>(var);
-        nodes.push_back(adj_n);
+        adj_list[n].insert(adj_n);
      }
    }
-    std::sort(nodes.begin(), nodes.end(), [](ir::Node *node1, ir::Node *node2) {
-      return node1->id() > node2->id();
-    });
-    adj_list[n].insert(std::make_move_iterator(nodes.begin()),
-                       std::make_move_iterator(nodes.end()));
  }
  return adj_list;
 }

--- a/paddle/fluid/framework/ir/graph_helper.h
+++ b/paddle/fluid/framework/ir/graph_helper.h
@@ -16,6 +16,7 @@ limitations under the License. */

 #include <map>
 #include <memory>
+#include <set>
 #include <vector>

 #include "paddle/fluid/framework/ir/graph.h"
@@ -25,6 +26,13 @@ namespace paddle {
 namespace framework {
 namespace ir {

+// Compare nodes via node id.
+struct NodeComp {
+  bool operator()(ir::Node *const &node1, ir::Node *const &node2) const {
+    return node1->id() < node2->id();
+  }
+};
+
 // Test if the graph contains circle.
 bool HasCircle(const Graph &graph);

@@ -57,8 +65,8 @@ std::vector<Node *> TopologyVarientSort(const Graph &graph, SortKind sort_kind);
 void CleanIndividualNodes(Graph *graph);

 // Build an adjacency list of operations for the `graph`.
-std::map<ir::Node *, std::unordered_set<ir::Node *>> BuildOperationAdjList(
-    const Graph &graph);
+std::map<ir::Node *, std::set<ir::Node *, ir::NodeComp>, ir::NodeComp>
+BuildOperationAdjList(const Graph &graph);

 template <typename T>
 std::vector<T *> FilterByNodeWrapper(const Graph &graph) {

--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -241,6 +241,7 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs,
  outputs_ = outputs;
  attrs_ = attrs;
  need_update_ = true;
+  block_ = nullptr;
 }

 OpDesc::OpDesc(const OpDesc &other, BlockDesc *block) {

--- a/paddle/fluid/framework/parallel_executor.cc
+++ b/paddle/fluid/framework/parallel_executor.cc
@@ -221,7 +221,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
    PADDLE_ENFORCE(!member_->use_cuda_,
                   "gpu mode does not support async_mode_ now!");
    graphs.push_back(graph);
-    for (int i = 1; i < places.size(); ++i) {
+    for (size_t i = 1; i < places.size(); ++i) {
      auto *tmp_graph = new ir::Graph(graph->OriginProgram());
      async_graphs_.emplace_back(tmp_graph);
      graphs.push_back(tmp_graph);
@@ -315,7 +315,7 @@ ParallelExecutor::ParallelExecutor(const std::vector<platform::Place> &places,
    graph = build_strategy.Apply(graph, {member_->places_[0]}, loss_var_name,
                                 {member_->local_scopes_[0]}, 1,
                                 member_->use_cuda_, member_->nccl_ctxs_.get());
-    for (int i = 1; i < member_->places_.size(); ++i) {
+    for (size_t i = 1; i < member_->places_.size(); ++i) {
      graphs[i] =
          build_strategy.Apply(graphs[i], {member_->places_[i]}, loss_var_name,
                               {member_->local_scopes_[i]}, 1,

--- a/paddle/fluid/framework/trainer_desc.proto
+++ b/paddle/fluid/framework/trainer_desc.proto
@@ -76,7 +76,7 @@ message PullDenseWorkerParameter {

 message TableParameter {
  // dense table only
-  optional int64 table_id = 1;
+  optional uint64 table_id = 1;
  repeated string dense_value_name = 2;
  repeated string dense_grad_name = 3;
  repeated int32 push_dense_wait_times = 5;

--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -259,6 +259,9 @@ bool AnalysisPredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
      return false;
    }

+    PADDLE_ENFORCE_NOT_NULL(input_ptr);
+    PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data());
+
    if (platform::is_cpu_place(place_)) {
      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),

--- a/paddle/fluid/inference/api/api.cc
+++ b/paddle/fluid/inference/api/api.cc
@@ -54,6 +54,7 @@ PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) {
    memory_owned_ = other.memory_owned_;
  } else {
    Resize(other.length());
+    PADDLE_ENFORCE(!(other.length() > 0 && other.data() == nullptr));
    memcpy(data_, other.data(), other.length());
    length_ = other.length();
    memory_owned_ = true;

--- a/paddle/fluid/inference/api/api_impl.cc
+++ b/paddle/fluid/inference/api/api_impl.cc
@@ -169,6 +169,7 @@ std::unique_ptr<PaddlePredictor> NativePaddlePredictor::Clone() {
  std::unique_ptr<PaddlePredictor> cls(new NativePaddlePredictor(config_));
  // Hot fix the bug that result diff in multi-thread.
  // TODO(Superjomn) re-implement a real clone here.
+  PADDLE_ENFORCE_NOT_NULL(dynamic_cast<NativePaddlePredictor *>(cls.get()));
  if (!dynamic_cast<NativePaddlePredictor *>(cls.get())->Init(nullptr)) {
    LOG(ERROR) << "fail to call Init";
    return nullptr;
@@ -210,6 +211,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector<PaddleTensor> &inputs,
      return false;
    }

+    PADDLE_ENFORCE_NOT_NULL(input_ptr);
+    PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data());
    if (platform::is_cpu_place(place_)) {
      // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy.
      std::memcpy(static_cast<void *>(input_ptr), inputs[i].data.data(),
@@ -316,6 +319,8 @@ std::unique_ptr<PaddlePredictor> CreatePaddlePredictor<
  }

  std::unique_ptr<PaddlePredictor> predictor(new NativePaddlePredictor(config));
+  PADDLE_ENFORCE_NOT_NULL(
+      dynamic_cast<NativePaddlePredictor *>(predictor.get()));
  if (!dynamic_cast<NativePaddlePredictor *>(predictor.get())->Init(nullptr)) {
    return nullptr;
  }

--- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc
@@ -47,6 +47,7 @@ struct DataRecord {
      num_lines++;
      std::vector<std::string> data;
      split(line, '\t', &data);
+      PADDLE_ENFORCE(data.size() >= 4);
      // load title1 data
      std::vector<int64_t> title1_data;
      split_to_int64(data[0], ' ', &title1_data);

--- a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc
@@ -214,28 +214,23 @@ TEST(Analyzer_Transformer, fuse_statis) {
 }

 // Compare result of NativeConfig and AnalysisConfig
-// void compare(bool use_mkldnn = false) {
-//   AnalysisConfig cfg;
-//   SetConfig(&cfg);
-//   if (use_mkldnn) {
-//     cfg.EnableMKLDNN();
-//   }
-//
-//   std::vector<std::vector<PaddleTensor>> input_slots_all;
-//   SetInput(&input_slots_all);
-//   CompareNativeAndAnalysis(
-//       reinterpret_cast<const PaddlePredictor::Config *>(&cfg),
-//       input_slots_all);
-// }
-
-// TODO(yihuaxu):
-//    Disable compare and compare_mkldnn temporary, see
-//    https://github.com/paddlePaddle/Paddle/issues/16316 for details.
-// TEST(Analyzer_Transformer, compare) { compare(); }
-// #ifdef PADDLE_WITH_MKLDNN
-// TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */);
-// }
-// #endif
+void compare(bool use_mkldnn = false) {
+  AnalysisConfig cfg;
+  SetConfig(&cfg);
+  if (use_mkldnn) {
+    cfg.EnableMKLDNN();
+  }
+
+  std::vector<std::vector<PaddleTensor>> input_slots_all;
+  SetInput(&input_slots_all);
+  CompareNativeAndAnalysis(
+      reinterpret_cast<const PaddlePredictor::Config *>(&cfg), input_slots_all);
+}
+
+TEST(Analyzer_Transformer, compare) { compare(); }
+#ifdef PADDLE_WITH_MKLDNN
+TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); }
+#endif

 }  // namespace inference
 }  // namespace paddle
--- a/paddle/fluid/op_use_default_grad_op_maker.spec
+++ b/paddle/fluid/op_use_default_grad_op_maker.spec
@@ -29,6 +29,8 @@ pool3d
 prelu
 quantize
 rank_loss
+reduce_all
+reduce_any
 reduce_max
 reduce_mean
 reduce_min

--- a/paddle/fluid/operators/detection/gpc.cc
+++ b/paddle/fluid/operators/detection/gpc.cc
@@ -24,6 +24,7 @@
 **/

 #include "paddle/fluid/operators/detection/gpc.h"
+#include "paddle/fluid/platform/enforce.h"

 namespace gpc {

@@ -689,6 +690,7 @@ static bbox *create_contour_bboxes(gpc_polygon *p) {

  gpc_malloc<bbox>(box, p->num_contours * sizeof(bbox),
                   const_cast<char *>("Bounding box creation"));
+  PADDLE_ENFORCE_NOT_NULL(box);

  /* Construct contour bounding boxes */
  for (c = 0; c < p->num_contours; c++) {
@@ -852,6 +854,7 @@ void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) {
  /* Create an extended hole array */
  gpc_malloc<int>(extended_hole, (p->num_contours + 1) * sizeof(int),
                  const_cast<char *>("contour hole addition"));
+  PADDLE_ENFORCE_NOT_NULL(extended_hole);

  /* Create an extended contour array */
  gpc_malloc<gpc_vertex_list>(extended_contour,
@@ -969,6 +972,7 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
  /* Build scanbeam table from scanbeam tree */
  gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
                     const_cast<char *>("sbt creation"));
+  PADDLE_ENFORCE_NOT_NULL(sbt);
  build_sbt(&scanbeam, sbt, sbtree);
  scanbeam = 0;
  free_sbtree(&sbtree);
@@ -1604,6 +1608,7 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip,
  /* Build scanbeam table from scanbeam tree */
  gpc_malloc<double>(sbt, sbt_entries * sizeof(double),
                     const_cast<char *>("sbt creation"));
+  PADDLE_ENFORCE_NOT_NULL(sbt);
  build_sbt(&scanbeam, sbt, sbtree);
  scanbeam = 0;
  free_sbtree(&sbtree);

--- a/paddle/fluid/operators/linspace_op.cc
+++ b/paddle/fluid/operators/linspace_op.cc
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/operators/linspace_op.h"
+
+namespace paddle {
+namespace operators {
+
+class LinspaceOp : public framework::OperatorWithKernel {
+ public:
+  using framework::OperatorWithKernel::OperatorWithKernel;
+
+  void InferShape(framework::InferShapeContext* ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInput("Start"),
+                   "Input(Start) of LinspaceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Stop"),
+                   "Input(Stop) of LinspaceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasInput("Num"),
+                   "Input(Num) of LinspaceOp should not be null.");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(OUt) of LinspaceOp should not be null.");
+
+    auto s_dims = ctx->GetInputDim("Start");
+    PADDLE_ENFORCE((s_dims.size() == 1) && (s_dims[0] == 1),
+                   "The shape of Input(Start) should be [1].");
+
+    auto e_dims = ctx->GetInputDim("Stop");
+    PADDLE_ENFORCE((e_dims.size() == 1) && (e_dims[0] == 1),
+                   "The shape of Input(Stop) should be [1].");
+
+    auto step_dims = ctx->GetInputDim("Num");
+    PADDLE_ENFORCE((step_dims.size() == 1) && (step_dims[0] == 1),
+                   "The shape of Input(Num) should be [1].");
+
+    ctx->SetOutputDim("Out", {-1});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext& ctx) const override {
+    framework::LibraryType library_{framework::LibraryType::kPlain};
+    framework::DataLayout layout_ = framework::DataLayout::kAnyLayout;
+    return framework::OpKernelType(
+        ctx.Input<framework::Tensor>("Start")->type(), ctx.device_context(),
+        layout_, library_);
+  }
+};
+
+class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("Start",
+             "First entry in the sequence. It is a tensor of shape [1], should "
+             "be of type float32 or float64.");
+    AddInput("Stop",
+             "Last entry in the sequence. It is a tensor of shape [1], should "
+             "be of type float32 or float64.");
+    AddInput("Num",
+             "Number of entry in the sequence. It is a tensor of shape [1], "
+             "should be of type int32.");
+    AddOutput("Out", "A sequence of numbers.");
+    AddComment(R"DOC(
+    Return fixed number of evenly spaced values within a given interval. First entry is start, and last entry is stop. In the case when Num is 1, only Start is returned. Like linspace function of numpy.
+)DOC");
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_WITHOUT_GRADIENT(linspace, ops::LinspaceOp, ops::LinspaceOpMaker);
+REGISTER_OP_CPU_KERNEL(linspace, ops::CPULinspaceKernel<float>,
+                       ops::CPULinspaceKernel<double>);
--- a/paddle/fluid/operators/linspace_op.cu
+++ b/paddle/fluid/operators/linspace_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/linspace_op.h"
+#include "paddle/fluid/platform/cuda_primitives.h"
+
+namespace paddle {
+namespace operators {
+
+#define CUDA_1D_KERNEL_LOOP(i, n)                              \
+  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
+       i += blockDim.x * gridDim.x)
+
+template <typename T>
+__global__ void LinspaceKernel(T start, T step, int64_t size, T* out) {
+  CUDA_1D_KERNEL_LOOP(index, size) { out[index] = start + step * index; }
+}
+
+template <typename T>
+__global__ void LinspaceSpecialKernel(T start, T* out) {
+  out[0] = start;
+}
+
+template <typename T>
+class CUDALinspaceKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* start_t = context.Input<framework::Tensor>("Start");
+    auto* stop_t = context.Input<framework::Tensor>("Stop");
+    auto* num_t = context.Input<framework::Tensor>("Num");
+    auto* out = context.Output<framework::Tensor>("Out");
+
+    framework::Tensor n;
+    framework::TensorCopy(*start_t, platform::CPUPlace(), &n);
+    T start = n.data<T>()[0];
+    framework::TensorCopy(*stop_t, platform::CPUPlace(), &n);
+    T stop = n.data<T>()[0];
+    framework::TensorCopy(*num_t, platform::CPUPlace(), &n);
+    int32_t num = n.data<int32_t>()[0];
+
+    PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0.");
+
+    out->Resize(framework::make_ddim({num}));
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+
+    T step = 0;
+    if (num != 1) {
+      step = (stop - start) / (num - 1);
+    }
+
+    auto stream = context.cuda_device_context().stream();
+    int block = 512;
+    int grid = (num + block - 1) / block;
+    LinspaceKernel<T><<<grid, block, 0, stream>>>(start, step, num, out_data);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP_CUDA_KERNEL(linspace, ops::CUDALinspaceKernel<float>,
+                        ops::CUDALinspaceKernel<double>);
--- a/paddle/fluid/operators/linspace_op.h
+++ b/paddle/fluid/operators/linspace_op.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <functional>
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/operators/math/math_function.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class CPULinspaceKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    T start = context.Input<framework::Tensor>("Start")->data<T>()[0];
+    T stop = context.Input<framework::Tensor>("Stop")->data<T>()[0];
+    int32_t num = context.Input<framework::Tensor>("Num")->data<int32_t>()[0];
+    auto* out = context.Output<framework::Tensor>("Out");
+    PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0.");
+
+    out->Resize(framework::make_ddim({num}));
+
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+
+    if (num > 1) {
+      T step = (stop - start) / (num - 1);
+      T value = start;
+      for (int i = 0; i < num; ++i) {
+        out_data[i] = value;
+        value += step;
+      }
+    } else {
+      out_data[0] = start;
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h"
+
+REGISTER_REDUCE_OP(reduce_all);
+REGISTER_OP_CPU_KERNEL(reduce_all,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         bool, ops::AllFunctor>);
--- a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_all,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          bool, ops::AllFunctor>);
--- a/paddle/fluid/operators/reduce_ops/reduce_all_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct AllFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->all(dim);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc
+++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc
+// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h"
+
+REGISTER_REDUCE_OP(reduce_any);
+REGISTER_OP_CPU_KERNEL(reduce_any,
+                       ops::ReduceKernel<paddle::platform::CPUDeviceContext,
+                                         bool, ops::AnyFunctor>);
--- a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu
+++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu
+// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h"
+
+REGISTER_OP_CUDA_KERNEL(reduce_any,
+                        ops::ReduceKernel<paddle::platform::CUDADeviceContext,
+                                          bool, ops::AnyFunctor>);
--- a/paddle/fluid/operators/reduce_ops/reduce_any_op.h
+++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#pragma once
+
+#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
+
+namespace paddle {
+namespace operators {
+
+struct AnyFunctor {
+  template <typename DeviceContext, typename X, typename Y, typename Dim>
+  void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) {
+    y->device(place) = x->any(dim);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/fluid/operators/squared_l2_distance_op.h
+++ b/paddle/fluid/operators/squared_l2_distance_op.h
@@ -77,6 +77,9 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel<T> {
    auto* x_g = context.Output<Tensor>(framework::GradVarName("X"));
    auto* y_g = context.Output<Tensor>(framework::GradVarName("Y"));

+    PADDLE_ENFORCE_NOT_NULL(x_g);
+    PADDLE_ENFORCE_NOT_NULL(y_g);
+
    auto sub_result = EigenMatrix<T>::From(*in0);
    auto out_grad = EigenMatrix<T>::From(*in1);

@@ -92,31 +95,28 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel<T> {
    // propagate back to input
    auto& eigen_place =
        *context.template device_context<DeviceContext>().eigen_device();
-    if (x_g) {
-      x_g->mutable_data<T>(context.GetPlace());
-      // eigen matrix
-      auto x_grad =
-          EigenMatrix<T>::From(*x_g, framework::make_ddim({x_dims[0], cols}));
-      // dimensions are same with subResult
-      x_grad.device(eigen_place) = grad_mat;
-    }

-    if (y_g) {
-      y_g->mutable_data<T>(context.GetPlace());
-
-      PADDLE_ENFORCE_GE(sub_result.dimensions()[0], y_dims[0],
-                        "First dimension of gradient must be greater or "
-                        "equal than first dimension of target.");
-
-      if (sub_result.dimensions()[0] == y_dims[0]) {
-        auto y_grad =
-            EigenMatrix<T>::From(*y_g, framework::make_ddim({y_dims[0], cols}));
-        y_grad.device(eigen_place) = -1 * grad_mat;
-      } else {
-        auto col_sum_res = -1 * (grad_mat.sum(Eigen::array<int, 1>({{0}})));
-        auto y_grad = EigenVector<T>::Flatten(*y_g);
-        y_grad.device(eigen_place) = col_sum_res;
-      }
+    x_g->mutable_data<T>(context.GetPlace());
+    // eigen matrix
+    auto x_grad =
+        EigenMatrix<T>::From(*x_g, framework::make_ddim({x_dims[0], cols}));
+    // dimensions are same with subResult
+    x_grad.device(eigen_place) = grad_mat;
+
+    y_g->mutable_data<T>(context.GetPlace());
+
+    PADDLE_ENFORCE_GE(sub_result.dimensions()[0], y_dims[0],
+                      "First dimension of gradient must be greater or "
+                      "equal than first dimension of target.");
+
+    if (sub_result.dimensions()[0] == y_dims[0]) {
+      auto y_grad =
+          EigenMatrix<T>::From(*y_g, framework::make_ddim({y_dims[0], cols}));
+      y_grad.device(eigen_place) = -1 * grad_mat;
+    } else {
+      auto col_sum_res = -1 * (grad_mat.sum(Eigen::array<int, 1>({{0}})));
+      auto y_grad = EigenVector<T>::Flatten(*y_g);
+      y_grad.device(eigen_place) = col_sum_res;
    }
  }
 };

--- a/python/paddle/fluid/dataset.py
+++ b/python/paddle/fluid/dataset.py
@@ -136,6 +136,7 @@ class DatasetBase(object):
            slot_var.name = var.name
            if var.lod_level == 0:
                slot_var.is_dense = True
+                slot_var.shape.extend(var.shape)
            if var.dtype == core.VarDesc.VarType.FP32:
                slot_var.type = "float"
            elif var.dtype == core.VarDesc.VarType.INT64:

--- a/python/paddle/fluid/executor.py
+++ b/python/paddle/fluid/executor.py
@@ -712,10 +712,6 @@ class Executor(object):
        if dataset == None:
            raise RuntimeError("dataset is needed and should be initialized")

-        if not isinstance(self.place, core.CPUPlace):
-            raise RuntimeError("infer_from_dataset is verified on CPUPlace"
-                               "We will open CUDAPlace in the future")
-
        scope, trainer = self._prepare_trainer(
            program=program,
            dataset=dataset,
@@ -796,10 +792,6 @@ class Executor(object):
        if dataset == None:
            raise RuntimeError("dataset is need and should be initialized")

-        if not isinstance(self.place, core.CPUPlace):
-            raise RuntimeError("train_from_dataset is verified on CPUPlace"
-                               "We will open CUDAPlace in the future")
-
        scope, trainer = self._prepare_trainer(
            program=program,
            dataset=dataset,

--- a/python/paddle/fluid/layers/learning_rate_scheduler.py
+++ b/python/paddle/fluid/layers/learning_rate_scheduler.py
@@ -349,24 +349,26 @@ def cosine_decay(learning_rate, step_each_epoch, epochs):
    training progresses. By using this function, the learning rate will be decayed by
    following cosine decay strategy.

-    decayed_lr = learning_rate * 0.5 * (math.cos(epoch * math.pi / epochs) + 1)
+    .. math::
+
+	decayed\_lr = learning\_rate * 0.5 * (math.cos * (epoch * \\frac{math.pi}{epochs} ) + 1)
    
    Args:
        learning_rate(Variable|float): The initial learning rate.
        step_each_epoch(int): the number of steps in an epoch.
        epochs(int): the number of epochs.

-     Returns:
-        Variable: The decayed learning rate.
-
-     Examples:
+    Returns:
+	Variable: The decayed learning rate.

-    ..code-block:: python
+    Examples:
+	.. code-block:: python

-  	base_lr = 0.1
-	lr = fluid.layers.cosine_decay(
-	learning_rate = base_lr, step_each_epoch=10000, epochs=120)
+  	    base_lr = 0.1
+	    lr = fluid.layers.cosine_decay(
+	    learning_rate = base_lr, step_each_epoch=10000, epochs=120)
    """
+
    with default_main_program()._lr_schedule_guard():
        if imperative_base.enabled():
            decay = imperate_lr.CosineDecay(learning_rate, step_each_epoch,

--- a/python/paddle/fluid/layers/nn.py
+++ b/python/paddle/fluid/layers/nn.py
@@ -73,6 +73,8 @@ __all__ = [
    'reduce_max',
    'reduce_min',
    'reduce_prod',
+    'reduce_all',
+    'reduce_any',
    'sequence_first_step',
    'sequence_last_step',
    'sequence_slice',
@@ -159,6 +161,7 @@ __all__ = [
    'sum',
    'slice',
    'shape',
+    'rank',
    'logical_and',
    'logical_or',
    'logical_xor',
@@ -4738,6 +4741,106 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None):
    return out


+def reduce_all(input, dim=None, keep_dim=False, name=None):
+    """
+    Computes the ``logical and`` of tensor elements over the given dimension.
+
+    Args:
+        input (Variable): The input variable which is a Tensor or LoDTensor.
+        dim (list|int|None): The dimension along which the logical and is computed.
+            If :attr:`None`, compute the logical and over all elements of
+            :attr:`input` and return a Tensor variable with a single element,
+            otherwise must be in the range :math:`[-rank(input), rank(input))`.
+            If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
+        keep_dim (bool): Whether to reserve the reduced dimension in the
+            output Tensor. The result tensor will have one fewer dimension
+            than the :attr:`input` unless :attr:`keep_dim` is true.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
+
+    Returns:
+        Variable: The reduced Tensor variable.
+
+    Examples:
+        .. code-block:: python
+        
+            # x is a bool Tensor variable with following elements:
+            #    [[True, False]
+            #     [True, True]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_all(x)  # False 
+            fluid.layers.reduce_all(x, dim=0)  # [True, False]
+            fluid.layers.reduce_all(x, dim=-1)  # [False, True]
+            fluid.layers.reduce_all(x, dim=1,
+                                     keep_dim=True)  # [[False], [True]]
+
+    """
+    helper = LayerHelper('reduce_all', **locals())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
+    if dim is not None and not isinstance(dim, list):
+        dim = [dim]
+    helper.append_op(
+        type='reduce_all',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={
+            'dim': dim if dim != None else [0],
+            'keep_dim': keep_dim,
+            'reduce_all': True if dim == None else False
+        })
+    return out
+
+
+def reduce_any(input, dim=None, keep_dim=False, name=None):
+    """
+    Computes the ``logical or`` of tensor elements over the given dimension.
+
+    Args:
+        input (Variable): The input variable which is a Tensor or LoDTensor.
+        dim (list|int|None): The dimension along which the logical or is computed.
+            If :attr:`None`, compute the logical or over all elements of
+            :attr:`input` and return a Tensor variable with a single element,
+            otherwise must be in the range :math:`[-rank(input), rank(input))`.
+            If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`.
+        keep_dim (bool): Whether to reserve the reduced dimension in the
+            output Tensor. The result tensor will have one fewer dimension
+            than the :attr:`input` unless :attr:`keep_dim` is true.
+        name(str|None): A name for this layer(optional). If set None, the layer
+                       will be named automatically.
+
+    Returns:
+        Variable: The reduced Tensor variable.
+
+    Examples:
+        .. code-block:: python
+
+            # x is a bool Tensor variable with following elements:
+            #    [[True, False]
+            #     [False, False]]
+            # Each example is followed by the correspending output tensor.
+            fluid.layers.reduce_any(x)  # True
+            fluid.layers.reduce_any(x, dim=0)  # [True, False]
+            fluid.layers.reduce_any(x, dim=-1)  # [True, False]
+            fluid.layers.reduce_any(x, dim=1,
+                                     keep_dim=True)  # [[True], [False]]
+
+    """
+    helper = LayerHelper('reduce_any', **locals())
+    out = helper.create_variable_for_type_inference(dtype=helper.input_dtype())
+    if dim is not None and not isinstance(dim, list):
+        dim = [dim]
+    helper.append_op(
+        type='reduce_any',
+        inputs={'X': input},
+        outputs={'Out': out},
+        attrs={
+            'dim': dim if dim != None else [0],
+            'keep_dim': keep_dim,
+            'reduce_all': True if dim == None else False
+        })
+    return out
+
+
 def split(input, num_or_sections, dim=-1, name=None):
    """
    Split the input tensor into multiple sub-tensors.
@@ -4819,7 +4922,7 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None):
            the dimension to normalization is rank(X) + axis. -1 is the
            last dimension.
        epsilon(float): The epsilon value is used to avoid division by zero, \
-            the defalut value is 1e-10.
+            the defalut value is 1e-12.
        name(str|None): A name for this layer(optional). If set None, the layer \
            will be named automatically.

@@ -9237,6 +9340,32 @@ def shape(input):
    return out


+def rank(input):
+    """
+    **Rank Layer**
+
+    Returns the number of dimensions for a tensor, which is a 0-D int32 Tensor.
+
+    Args:
+        input (Variable): The input variable.
+
+    Returns:
+        Variable: The rank of the input variable.
+
+    Examples:
+        .. code-block:: python
+
+            input = layers.data(
+                name="input", shape=[3, 100, 100], dtype="float32")
+            rank = layers.rank(input) # 4
+    """
+
+    ndims = len(input.shape)
+    out = assign(np.array(ndims, 'int32'))
+
+    return out
+
+
 def _elementwise_op(helper):
    op_type = helper.layer_type
    x = helper.kwargs.get('x', None)
@@ -11002,7 +11131,7 @@ def pixel_shuffle(x, upscale_factor):

    Returns:

-        Out(Variable): the pixel shuffle result is a tensor variable with the same shape and the same type as the input.
+        Out(Variable): Reshaped tensor according to the new dimension.

    Raises:


--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -24,26 +24,11 @@ from .layer_function_generator import templatedoc
 import numpy

 __all__ = [
-    'create_tensor',
-    'create_parameter',
-    'create_global_var',
-    'cast',
-    'tensor_array_to_tensor',
-    'concat',
-    'sums',
-    'assign',
-    'fill_constant_batch_size_like',
-    'fill_constant',
-    'argmin',
-    'argmax',
-    'argsort',
-    'ones',
-    'zeros',
-    'reverse',
-    'has_inf',
-    'has_nan',
-    'isfinite',
-    'range',
+    'create_tensor', 'create_parameter', 'create_global_var', 'cast',
+    'tensor_array_to_tensor', 'concat', 'sums', 'assign',
+    'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax',
+    'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite',
+    'range', 'linspace'
 ]


@@ -826,3 +811,45 @@ def range(start, end, step, dtype):
                'Step': step},
        outputs={'Out': [out]})
    return out
+
+
+def linspace(start, stop, num, dtype):
+    """
+    Return fixed number of evenly spaced values within a given interval.
+
+    First entry is start, and last entry is stop. In the case when Num is 1, only Start is returned. Like linspace function of numpy.
+
+    Args:
+        start(float|Variable): First entry in the sequence. It is a float scalar, or a tensor of shape [1] with type 'float32'|'float64'.
+        stop(float|Variable): Last entry in the sequence. It is a float scalar, or a tensor of shape [1] with type 'float32'|'float64'.
+        num(int|Variable): Number of entry in the sequence. It is an int scalar, or a tensor of shape [1] with type int32.
+        dtype(string): 'float32'|'float64', the data type of the output tensor.
+
+    Returns:
+        Variable: The tensor variable storing a 1-D tensor. 
+
+    Examples:
+        .. code-block:: python
+
+             data = fluid.layers.linspace(0, 10, 5, 'float32') # [0.0,  2.5,  5.0,  7.5, 10.0]
+             data = fluid.layers.linspace(0, 10, 1, 'float32') # [0.0]
+
+    """
+    helper = LayerHelper("linspace", **locals())
+
+    if not isinstance(start, Variable):
+        start = fill_constant([1], dtype, start)
+    if not isinstance(stop, Variable):
+        stop = fill_constant([1], dtype, stop)
+    if not isinstance(num, Variable):
+        num = fill_constant([1], 'int32', num)
+
+    out = helper.create_variable_for_type_inference(dtype=start.dtype)
+
+    helper.append_op(
+        type='linspace',
+        inputs={'Start': start,
+                'Stop': stop,
+                'Num': num},
+        outputs={'Out': [out]})
+    return out
--- a/python/paddle/fluid/tests/unittests/test_layers.py
+++ b/python/paddle/fluid/tests/unittests/test_layers.py
@@ -1925,6 +1925,13 @@ class TestBook(LayerTest):
            out = layers.flatten(x, axis=1, name="flatten")
            return (out)

+    def test_linspace(self):
+        program = Program()
+        with program_guard(program):
+            out = layers.linspace(20, 10, 5, 'float64')
+            self.assertIsNotNone(out)
+        print(str(program))
+

 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_linspace.py
+++ b/python/paddle/fluid/tests/unittests/test_linspace.py
+#   Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from __future__ import print_function
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestLinspaceOpCommonCase(OpTest):
+    def setUp(self):
+        self.op_type = "linspace"
+        dtype = 'float32'
+        self.inputs = {
+            'Start': np.array([0]).astype(dtype),
+            'Stop': np.array([10]).astype(dtype),
+            'Num': np.array([11]).astype('int32')
+        }
+
+        self.outputs = {'Out': np.arange(0, 11).astype(dtype)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestLinspaceOpReverseCase(OpTest):
+    def setUp(self):
+        self.op_type = "linspace"
+        dtype = 'float32'
+        self.inputs = {
+            'Start': np.array([10]).astype(dtype),
+            'Stop': np.array([0]).astype(dtype),
+            'Num': np.array([11]).astype('int32')
+        }
+
+        self.outputs = {'Out': np.arange(10, -1, -1).astype(dtype)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestLinspaceOpNumOneCase(OpTest):
+    def setUp(self):
+        self.op_type = "linspace"
+        dtype = 'float32'
+        self.inputs = {
+            'Start': np.array([10]).astype(dtype),
+            'Stop': np.array([0]).astype(dtype),
+            'Num': np.array([1]).astype('int32')
+        }
+
+        self.outputs = {'Out': np.array(10, dtype=dtype)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+if __name__ == "__main__":
+    unittest.main()
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
@@ -91,6 +91,78 @@ class TestProdOp(OpTest):
        self.check_grad(['X'], 'Out')


+class TestAllOp(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_all"
+        self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
+        self.outputs = {'Out': self.inputs['X'].all()}
+        self.attrs = {'reduce_all': True}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAllOpWithDim(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_all"
+        self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
+        self.attrs = {'dim': [1]}
+        self.outputs = {'Out': self.inputs['X'].all(axis=1)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAllOpWithKeepDim(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_all"
+        self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
+        self.attrs = {'dim': [1], 'keep_dim': True}
+        self.outputs = {
+            'Out': np.expand_dims(
+                self.inputs['X'].all(axis=1), axis=1)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAnyOp(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_any"
+        self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
+        self.outputs = {'Out': self.inputs['X'].any()}
+        self.attrs = {'reduce_all': True}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAnyOpWithDim(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_any"
+        self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
+        self.attrs = {'dim': [1]}
+        self.outputs = {'Out': self.inputs['X'].any(axis=1)}
+
+    def test_check_output(self):
+        self.check_output()
+
+
+class TestAnyOpWithKeepDim(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_any"
+        self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")}
+        self.attrs = {'dim': [1], 'keep_dim': True}
+        self.outputs = {
+            'Out': np.expand_dims(
+                self.inputs['X'].any(axis=1), axis=1)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+
 class Test1DReduce(OpTest):
    def setUp(self):
        self.op_type = "reduce_sum"