diff --git a/CMakeLists.txt b/CMakeLists.txt index 9ad69738eb2ac21d6ff2624f11d17a38410d5c1f..26d94384a9150735aa8341fd8a18cb039895ff91 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -71,7 +71,8 @@ option(ANAKIN_BUILD_CROSS_PLANTFORM "Build anakin lib for any nvidia device plan option(WITH_GRPC "Use grpc as the default rpc framework" ${WITH_DISTRIBUTE}) option(WITH_BRPC_RDMA "Use brpc rdma as the rpc protocal" OFF) option(ON_INFER "Turn on inference optimization." OFF) -option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" OFF) +option(WITH_INFERENCE_API_TEST "Test fluid inference C++ high-level api interface" OFF) +option(WITH_HIGH_LEVEL_API_TEST "Test fluid python high-level api interface" OFF) option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON) diff --git a/cmake/external/protobuf.cmake b/cmake/external/protobuf.cmake index 69da9b98198de358348621ecdb444f2f81c7757f..09eb437aede4364f8aa285d5296f21cd8460fca1 100644 --- a/cmake/external/protobuf.cmake +++ b/cmake/external/protobuf.cmake @@ -221,6 +221,7 @@ FUNCTION(build_protobuf TARGET_NAME BUILD_FOR_HOST) -DCMAKE_BUILD_TYPE=${THIRD_PARTY_BUILD_TYPE} -DCMAKE_INSTALL_PREFIX=${PROTOBUF_INSTALL_DIR} -DCMAKE_INSTALL_LIBDIR=lib + -DBUILD_SHARED_LIBS=OFF CMAKE_CACHE_ARGS -DCMAKE_INSTALL_PREFIX:PATH=${PROTOBUF_INSTALL_DIR} -DCMAKE_BUILD_TYPE:STRING=${THIRD_PARTY_BUILD_TYPE} diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index c0df4c2dc6cd46a90d4205b26123fb6c44fcaf88..bf39325cc9bfb258051ec1a7fc7f5eb139c60133 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -118,6 +118,8 @@ paddle.fluid.layers.reduce_mean (ArgSpec(args=['input', 'dim', 'keep_dim', 'name paddle.fluid.layers.reduce_max (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '66a622db727551761ce4eb73eaa7f6a4')) paddle.fluid.layers.reduce_min (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'd50ac552b5d131468ed466d08bb2d38c')) paddle.fluid.layers.reduce_prod (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'fcd8301a0ce15f219c7a4bcd0c1e8eca')) +paddle.fluid.layers.reduce_all (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', '646ca4d4a2cc16084f59de44b6927eca')) +paddle.fluid.layers.reduce_any (ArgSpec(args=['input', 'dim', 'keep_dim', 'name'], varargs=None, keywords=None, defaults=(None, False, None)), ('document', 'f36661060aeeaf6c6b1331e41b3726fa')) paddle.fluid.layers.sequence_first_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '2b290d3d77882bfe9bb8d331cac8cdd3')) paddle.fluid.layers.sequence_last_step (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'c16a892f44f7fe71bfa5afc32d3f34ce')) paddle.fluid.layers.sequence_slice (ArgSpec(args=['input', 'offset', 'length', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'fdcea0e8b5bc7d8d4b1b072c521014e6')) @@ -125,7 +127,7 @@ paddle.fluid.layers.dropout (ArgSpec(args=['x', 'dropout_prob', 'is_test', 'seed paddle.fluid.layers.split (ArgSpec(args=['input', 'num_or_sections', 'dim', 'name'], varargs=None, keywords=None, defaults=(-1, None)), ('document', '652625345c2acb900029c78cc75f8aa6')) paddle.fluid.layers.ctc_greedy_decoder (ArgSpec(args=['input', 'blank', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'ebbf2adbd79683dc93db03454dfa18c2')) paddle.fluid.layers.edit_distance (ArgSpec(args=['input', 'label', 'normalized', 'ignored_tokens'], varargs=None, keywords=None, defaults=(True, None)), ('document', '97f0262f97602644c83142789d784571')) -paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', '6e428384ce6a77207fa2c70d9f011990')) +paddle.fluid.layers.l2_normalize (ArgSpec(args=['x', 'axis', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(1e-12, None)), ('document', '35c6a241bcc1a1fc89508860d82ad62b')) paddle.fluid.layers.matmul (ArgSpec(args=['x', 'y', 'transpose_x', 'transpose_y', 'alpha', 'name'], varargs=None, keywords=None, defaults=(False, False, 1.0, None)), ('document', 'b4cbe1ac451005df6dad12e9ffdccca9')) paddle.fluid.layers.topk (ArgSpec(args=['input', 'k', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', 'd3570c02f71bcd78e60b3f31dc8f5b32')) paddle.fluid.layers.warpctc (ArgSpec(args=['input', 'label', 'blank', 'norm_by_times', 'use_cudnn'], varargs=None, keywords=None, defaults=(0, False, False)), ('document', 'aaba49c038ba927f0a8e45c0c9a686ab')) @@ -204,6 +206,7 @@ paddle.fluid.layers.gaussian_random_batch_size_like (ArgSpec(args=['input', 'sha paddle.fluid.layers.sum (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', 'a418e3ccb5e2ac21bd60f5cc221d5860')) paddle.fluid.layers.slice (ArgSpec(args=['input', 'axes', 'starts', 'ends'], varargs=None, keywords=None, defaults=None), ('document', '01dbb91e7c74cb11336cd531013de51a')) paddle.fluid.layers.shape (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', '17db0f814eb7bb5a3fac1ca6e60e16d8')) +paddle.fluid.layers.rank (ArgSpec(args=['input'], varargs=None, keywords=None, defaults=None), ('document', 'ee1386c42ecc8f424fe3fb21862fefc2')) paddle.fluid.layers.logical_and (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'cdcf20c494c92060d10feb9374532f42')) paddle.fluid.layers.logical_or (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', '0eae3f726a4afe590757552fa3ced012')) paddle.fluid.layers.logical_xor (ArgSpec(args=['x', 'y', 'out', 'name'], varargs=None, keywords=None, defaults=(None, None)), ('document', 'b0daaa3fa4a0aa62f9b58c43d959eb25')) @@ -236,7 +239,7 @@ paddle.fluid.layers.huber_loss (ArgSpec(args=['input', 'label', 'delta'], vararg paddle.fluid.layers.kldiv_loss (ArgSpec(args=['x', 'target', 'reduction', 'name'], varargs=None, keywords=None, defaults=('mean', None)), ('document', '776d536cac47c89073abc7ee524d5aec')) paddle.fluid.layers.tree_conv (ArgSpec(args=['nodes_vector', 'edge_set', 'output_size', 'num_filters', 'max_depth', 'act', 'param_attr', 'bias_attr', 'name'], varargs=None, keywords=None, defaults=(1, 2, 'tanh', None, None, None)), ('document', '34ea12ac9f10a65dccbc50100d12e607')) paddle.fluid.layers.npair_loss (ArgSpec(args=['anchor', 'positive', 'labels', 'l2_reg'], varargs=None, keywords=None, defaults=(0.002,)), ('document', '46994d10276dd4cb803b4062b5d14329')) -paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', 'ad669cdf83e72a69ebc5ed79e36486de')) +paddle.fluid.layers.pixel_shuffle (ArgSpec(args=['x', 'upscale_factor'], varargs=None, keywords=None, defaults=None), ('document', '731b21c62a4add60a33bd76d802ffc5c')) paddle.fluid.layers.fsp_matrix (ArgSpec(args=['x', 'y'], varargs=None, keywords=None, defaults=None), ('document', 'b76ccca3735bea4a58a0dbf0d77c5393')) paddle.fluid.layers.data (ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)), ('document', '33bbd42027d872b3818b3d64ec52e139')) paddle.fluid.layers.open_files (ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)), ('document', 'b1ae2e1cc0750e58726374061ea90ecc')) @@ -272,6 +275,7 @@ paddle.fluid.layers.has_inf (ArgSpec(args=['x'], varargs=None, keywords=None, de paddle.fluid.layers.has_nan (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '2e53e83127dbfd86e7098bdfe9a549e8')) paddle.fluid.layers.isfinite (ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None), ('document', '0a437011c3906079fd8947ed3e52d292')) paddle.fluid.layers.range (ArgSpec(args=['start', 'end', 'step', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '2ec937ede953ded2fdff2675883900bb')) +paddle.fluid.layers.linspace (ArgSpec(args=['start', 'stop', 'num', 'dtype'], varargs=None, keywords=None, defaults=None), ('document', '495e21e9a848c2d075a102802fc67756')) paddle.fluid.layers.While.__init__ (ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.While.block (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.layers.Switch.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) @@ -297,12 +301,12 @@ paddle.fluid.layers.DynamicRNN.static_input (ArgSpec(args=['self', 'x'], varargs paddle.fluid.layers.DynamicRNN.step_input (ArgSpec(args=['self', 'x', 'level'], varargs=None, keywords=None, defaults=(0,)), ('document', '7568c5ac7622a10288d3307a94134655')) paddle.fluid.layers.DynamicRNN.update_memory (ArgSpec(args=['self', 'ex_mem', 'new_mem'], varargs=None, keywords=None, defaults=None), ('document', '5d83987da13b98363d6a807a52d8024f')) paddle.fluid.layers.StaticRNN.__init__ (ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', 'c24e368e23afac1ed91a78a639d7a9c7')) -paddle.fluid.layers.StaticRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.layers.StaticRNN.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) -paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) +paddle.fluid.layers.StaticRNN.memory (ArgSpec(args=['self', 'init', 'shape', 'batch_ref', 'init_value', 'init_batch_dim_idx', 'ref_batch_dim_idx'], varargs=None, keywords=None, defaults=(None, None, None, 0.0, 0, 1)), ('document', '72530f299d6451a567cf4a12dc3fb1ff')) +paddle.fluid.layers.StaticRNN.output (ArgSpec(args=['self'], varargs='outputs', keywords=None, defaults=None), ('document', 'df6ceab6e6c9bd31e97914d7e7538137')) +paddle.fluid.layers.StaticRNN.step (ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None), ('document', '6d3e0a5d9aa519a9773a36e1620ea9b7')) +paddle.fluid.layers.StaticRNN.step_input (ArgSpec(args=['self', 'x'], varargs=None, keywords=None, defaults=None), ('document', '903387ec11f3d0bf46821d31a68cffa5')) +paddle.fluid.layers.StaticRNN.step_output (ArgSpec(args=['self', 'o'], varargs=None, keywords=None, defaults=None), ('document', '252890d4c3199a7623ab8667e13fd837')) +paddle.fluid.layers.StaticRNN.update_memory (ArgSpec(args=['self', 'mem', 'var'], varargs=None, keywords=None, defaults=None), ('document', '7a0000520f179f35239956a5ba55119f')) paddle.fluid.layers.reorder_lod_tensor_by_rank (ArgSpec(args=['x', 'rank_table'], varargs=None, keywords=None, defaults=None), ('document', '3545f529ef04e8f6ecb76b47fa3df01a')) paddle.fluid.layers.Print (ArgSpec(args=['input', 'first_n', 'message', 'summarize', 'print_tensor_name', 'print_tensor_type', 'print_tensor_shape', 'print_tensor_lod', 'print_phase'], varargs=None, keywords=None, defaults=(-1, None, -1, True, True, True, True, 'both')), ('document', '5fef91b0e21c93610785f2b1f7161732')) paddle.fluid.layers.is_empty (ArgSpec(args=['x', 'cond'], varargs=None, keywords=None, defaults=(None,)), ('document', 'bbe578dbb49ad13e15b014e98c22b519')) @@ -361,8 +365,7 @@ paddle.fluid.layers.inverse_time_decay (ArgSpec(args=['learning_rate', 'decay_st paddle.fluid.layers.polynomial_decay (ArgSpec(args=['learning_rate', 'decay_steps', 'end_learning_rate', 'power', 'cycle'], varargs=None, keywords=None, defaults=(0.0001, 1.0, False)), ('document', '882634f420f626642f0874481263da40')) paddle.fluid.layers.piecewise_decay (ArgSpec(args=['boundaries', 'values'], varargs=None, keywords=None, defaults=None), ('document', 'c717d9d1d78a53c809d01b8bc56f3cae')) paddle.fluid.layers.noam_decay (ArgSpec(args=['d_model', 'warmup_steps'], varargs=None, keywords=None, defaults=None), ('document', 'd9a95746353fd574be36dc28d8726c28')) -paddle.fluid.layers.append_LARS (ArgSpec(args=['params_grads', 'learning_rate', 'weight_decay'], varargs=None, keywords=None, defaults=None), ('document', 'd24fa1e7d62ac8a534fc6a86002f84f8')) -paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', '9588c64c26ffaef3c466e404a6af9d9b')) +paddle.fluid.layers.cosine_decay (ArgSpec(args=['learning_rate', 'step_each_epoch', 'epochs'], varargs=None, keywords=None, defaults=None), ('document', 'f8b2727bccf0f368c997d7cf05847e49')) paddle.fluid.layers.linear_lr_warmup (ArgSpec(args=['learning_rate', 'warmup_steps', 'start_lr', 'end_lr'], varargs=None, keywords=None, defaults=None), ('document', '2ef3f5ca5cd71ea4217c418e5a7a0565')) paddle.fluid.contrib.InitState.__init__ (ArgSpec(args=['self', 'init', 'shape', 'value', 'init_boot', 'need_reorder', 'dtype'], varargs=None, keywords=None, defaults=(None, None, 0.0, None, False, 'float32')), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) paddle.fluid.contrib.StateCell.__init__ (ArgSpec(args=['self', 'inputs', 'states', 'out_state', 'name'], varargs=None, keywords=None, defaults=(None,)), ('document', '6adf97f83acf6453d4a6a4b1070f3754')) diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index b5f7e6c22405d6928f0e423458d6cd720f2d09a8..365c80da34eb287f50d2f0dcbf3844001ab43ec8 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -72,7 +72,6 @@ bool DataFeed::PickOneFile(std::string* filename) { } VLOG(3) << "file_idx_=" << *file_idx_; *filename = filelist_[(*file_idx_)++]; - // LOG(ERROR) << "pick file:" << *filename; return true; } @@ -466,6 +465,17 @@ void MultiSlotDataFeed::Init( if (slot.is_used()) { use_slots_.push_back(all_slots_[i]); use_slots_is_dense_.push_back(slot.is_dense()); + std::vector local_shape; + if (slot.is_dense()) { + // for batch size holder if is_dense + if (slot.shape(0) > 0) { + local_shape.push_back(0); + } + } + for (size_t i = 0; i < slot.shape_size(); ++i) { + local_shape.push_back(slot.shape(i)); + } + use_slots_shape_.push_back(local_shape); } } feed_vec_.resize(use_slots_.size()); @@ -752,8 +762,8 @@ void MultiSlotDataFeed::PutToFeedVec( LoD data_lod{offset}; feed_vec_[i]->set_lod(data_lod); if (use_slots_is_dense_[i]) { - int dim = total_instance / batch_size_; - feed_vec_[i]->Resize({batch_size_, dim}); + use_slots_shape_[i][0] = batch_size_; + feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i])); } } #endif @@ -785,6 +795,16 @@ void MultiSlotInMemoryDataFeed::Init( if (slot.is_used()) { use_slots_.push_back(all_slots_[i]); use_slots_is_dense_.push_back(slot.is_dense()); + std::vector local_shape; + if (slot.is_dense()) { + if (slot.shape(0) > 0) { + local_shape.push_back(0); + } + } + for (size_t i = 0; i < slot.shape_size(); ++i) { + local_shape.push_back(slot.shape(i)); + } + use_slots_shape_.push_back(local_shape); } } feed_vec_.resize(use_slots_.size()); @@ -940,8 +960,8 @@ void MultiSlotInMemoryDataFeed::PutToFeedVec( LoD data_lod{offset}; feed_vec_[i]->set_lod(data_lod); if (use_slots_is_dense_[i]) { - int dim = total_instance / batch_size_; - feed_vec_[i]->Resize({batch_size_, dim}); + use_slots_shape_[i][0] = batch_size_; + feed_vec_[i]->Resize(framework::make_ddim(use_slots_shape_[i])); } } #endif diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index 648c874a0b8763b18118e18adf3b3e93acfd104b..d098c7858a98c644bd3cad78d3cf1e3b35ca026b 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -142,6 +142,7 @@ class DataFeed { // object) std::vector all_slots_; std::vector all_slots_type_; + std::vector> use_slots_shape_; std::vector use_slots_index_; // -1: not used; >=0: the index of use_slots_ diff --git a/paddle/fluid/framework/data_feed.proto b/paddle/fluid/framework/data_feed.proto index 77911306299b77748a2ad9437d49680748885003..03996e0e20a1729ee300a5ad37abc325876930b7 100644 --- a/paddle/fluid/framework/data_feed.proto +++ b/paddle/fluid/framework/data_feed.proto @@ -19,6 +19,7 @@ message Slot { required string type = 2; optional bool is_dense = 3 [ default = false ]; optional bool is_used = 4 [ default = false ]; + repeated int32 shape = 5; // we can define N-D Tensor } message MultiSlotDesc { repeated Slot slots = 1; } diff --git a/paddle/fluid/framework/details/async_ssa_graph_executor.cc b/paddle/fluid/framework/details/async_ssa_graph_executor.cc index e9aad5d264d1745662848d1ba313b573d0974cb7..7f63c07b18f7c6147670656dfc567f8f2ae8429a 100644 --- a/paddle/fluid/framework/details/async_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/async_ssa_graph_executor.cc @@ -64,9 +64,12 @@ void ProcessGraph(std::vector graphs, Scope *scope) { node->Op()->GetNullableAttr("epmap")); auto height_section = boost::get>( node->Op()->GetNullableAttr("sections")); + auto trainer_id = + boost::get(node->Op()->GetNullableAttr("trainer_id")); send_varname_to_ctx[send_var_name] = operators::distributed::RpcContext(send_var_name, send_varnames, - epmap, height_section); + epmap, height_section, + trainer_id); VLOG(3) << "find and init an send op: " << send_varname_to_ctx[send_var_name]; } else if (node->Name() == "recv") { @@ -75,9 +78,11 @@ void ProcessGraph(std::vector graphs, Scope *scope) { node->Op()->GetNullableAttr("recv_varnames")); auto epmap = boost::get>( node->Op()->GetNullableAttr("epmap")); + auto trainer_id = + boost::get(node->Op()->GetNullableAttr("trainer_id")); recv_varname_to_ctx[recv_var_name] = operators::distributed::RpcContext(recv_var_name, recv_varnames, - epmap, {}); + epmap, {}, trainer_id); nodes_to_delete.push_back(node); VLOG(3) << "find and remove an recv op: " << recv_varname_to_ctx[recv_var_name]; diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index afe5078bf80d00b595789a5f45d91a5e7a8dfce6..196603bbff1db79e46ebbe8b18f1092fcbaac7f9 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -101,8 +101,6 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { "mode."; strategy_.fuse_all_optimizer_ops_ = false; } else { - VLOG(10) << "Add alloc_continuous_space_for_grad_pass"; - AppendPass("alloc_continuous_space_for_grad_pass"); // NOTE: fuse_all_xx_ops will count the number of xx operator first, // if the number is zero, fuse_all_reduce_ops will do nothing. // Currently, only one type of optimization algorithm can be fused. @@ -150,6 +148,11 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { AppendPass("runtime_context_cache_pass"); } + if (strategy_.cache_expected_kernel_) { + VLOG(10) << "Add expected_kernel_cache_pass"; + AppendPass("expected_kernel_cache_pass"); + } + AppendMultiDevPass(strategy_); if (strategy_.fuse_all_reduce_ops_) { @@ -337,3 +340,4 @@ USE_PASS(fuse_adam_op_pass); USE_PASS(fuse_sgd_op_pass); USE_PASS(fuse_all_reduce_op_pass); USE_PASS(runtime_context_cache_pass); +USE_PASS(expected_kernel_cache_pass); diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 8aa444a30c0f7f1f5c19d54cf248f86c3e3b3cf3..b1601cfbcd5e9c66f1bbecd1f6fe10bc279cea26 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -83,11 +83,11 @@ struct BuildStrategy { bool sync_batch_norm_{false}; - bool memory_optimize_{true}; - // TODO(dzhwinter): - // make enable_inplace, memory_optimize_ - // memory_early_delete_ true by default - bool enable_inplace_{true}; + // FIXME(liuwei1031) disable memory_optimzie and enable_inplace in 1.4 + // to open them by default, we need to solve the fetch variable issue + bool memory_optimize_{false}; + + bool enable_inplace_{false}; bool enable_sequential_execution_{false}; @@ -108,6 +108,7 @@ struct BuildStrategy { bool remove_unnecessary_lock_{true}; bool cache_runtime_context_{false}; + bool cache_expected_kernel_{true}; // NOTE: // Before you add new options, think if it's a general strategy that works diff --git a/paddle/fluid/framework/details/fuse_adam_op_pass.cc b/paddle/fluid/framework/details/fuse_adam_op_pass.cc index 0ef75e319244e2ccc63dfa3f93f0cd764cf67633..f95d93fd5575ae538274c4c0322cf661c631849a 100644 --- a/paddle/fluid/framework/details/fuse_adam_op_pass.cc +++ b/paddle/fluid/framework/details/fuse_adam_op_pass.cc @@ -24,7 +24,7 @@ namespace details { const std::string FuseAdamOpPass::GetOpType() const { return "adam"; } const std::vector FuseAdamOpPass::GetAuxiliaryVarNames() const { - return {"Param", "Moment1", "Moment2", "Beta1Pow", "Beta2Pow"}; + return {"Moment1", "Moment2", "Beta1Pow", "Beta2Pow"}; } void FuseAdamOpPass::FuseOptimizerOps( @@ -77,16 +77,16 @@ void FuseAdamOpPass::FuseAdamOps( VLOG(10) << "Insert adam to graph "; OpDesc adam_desc(adam_ops[0]->Op()->Block()); adam_desc.SetType("adam"); - adam_desc.SetInput("Param", {fused_vars_name.at("Param")}); - adam_desc.SetInput("Grad", {fused_vars_name.at("Grad")}); + adam_desc.SetInput(kParam, {fused_vars_name.at(kParam)}); + adam_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)}); adam_desc.SetInput("Moment1", {fused_vars_name.at("Moment1")}); adam_desc.SetInput("Moment2", {fused_vars_name.at("Moment2")}); // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal. - adam_desc.SetInput("LearningRate", adam_ops[0]->Op()->Input("LearningRate")); + adam_desc.SetInput(kLearningRate, adam_ops[0]->Op()->Input(kLearningRate)); adam_desc.SetInput("Beta1Pow", adam_ops[0]->Op()->Input("Beta1Pow")); adam_desc.SetInput("Beta2Pow", adam_ops[0]->Op()->Input("Beta2Pow")); - adam_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")}); + adam_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)}); adam_desc.SetOutput("Moment1Out", {fused_vars_name.at("Moment1")}); adam_desc.SetOutput("Moment2Out", {fused_vars_name.at("Moment2")}); adam_desc.SetAttr("beta1", beta1); diff --git a/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc b/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc index b49f095d428a017dd1a3bed2788a048af9afa6bb..25aa3019d102293725d836cf1f8e9fce8462408b 100644 --- a/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc +++ b/paddle/fluid/framework/details/fuse_optimizer_op_pass.cc @@ -29,7 +29,9 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { auto &local_scopes = Get>(kLocalScopes); const std::string fuse_op_type = GetOpType(); - const std::vector aux_var_names = GetAuxiliaryVarNames(); + std::vector aux_var_names = GetAuxiliaryVarNames(); + aux_var_names.emplace_back(kParam); + aux_var_names.emplace_back(kGrad); // Step 1: Get the specified op and auxiliary variables. std::vector topo_nodes = ir::TopologySortOperations(result); @@ -61,7 +63,7 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { result.Set(kFusedVars, new FusedVars); } std::unordered_map fused_vars_name; - fused_vars_name.reserve(aux_var_names.size() + 1); + fused_vars_name.reserve(aux_var_names.size()); auto &fused_var_set = result.Get(kFusedVars); const std::string prefix(kFusedVarNamePrefix); // NOTE: the fused_var_name should be unique. @@ -75,39 +77,103 @@ void FuseOptimizerOpPass::ApplyImpl(ir::Graph *graph) const { } // Step 3: Get the fused Gradient's name - auto ¶ms_grads = result.Get(kParamsAndGrads); - if (!result.Has(kFusedGrads)) { - PADDLE_THROW( - "The alloc_continuous_space_for_grad_pass should be called before this " - "pass."); - } - auto &fused_grad = result.Get(kFusedGrads); - auto &fused_vars = result.Get(kFusedVars); - auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad); - PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad."); - fused_vars_name.emplace("Grad", fused_grad); - - // Step 4: Sort the parameters and auxiliary variables according - // to parameters' name to make variables' name correspond correctly. - PADDLE_ENFORCE(result.Has(kParamsAndGrads), "Does't find kParamsAndGrads."); - PADDLE_ENFORCE_EQ(params_grads.size(), aux_var_set.begin()->second.size(), - "The size of params_grads and aux_var_set are not equal."); - SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops); - - // Step 5: Alloc continuous space for Parameters and AuxiliaryVar(e.g. + bool grad_fused = false; + if (result.Has(kParamsAndGrads)) { + auto ¶ms_grads = result.Get(kParamsAndGrads); + PADDLE_ENFORCE_EQ( + params_grads.size(), aux_var_set.at(kGrad).size(), + "The number of gradients and optimizer ops is not equal."); + std::unordered_set opt_grad_set(aux_var_set.at(kGrad).begin(), + aux_var_set.at(kGrad).end()); + size_t same_grad_num = 0; + for (auto &p_g : params_grads) { + if (opt_grad_set.count(p_g.second)) { + ++same_grad_num; + } + } + + // NOTE(zcd): the gradient of kParamsAndGrads may be different with the + // kGrad. + if (same_grad_num == aux_var_set.at(kGrad).size()) { + if (!result.Has(kFusedGrads)) { + PADDLE_THROW( + "The alloc_continuous_space_for_grad_pass should be called before " + "this pass."); + } + auto &fused_grad = result.Get(kFusedGrads); + auto &fused_vars = result.Get(kFusedVars); + auto iter = std::find(fused_vars.begin(), fused_vars.end(), fused_grad); + PADDLE_ENFORCE(iter != fused_vars.end(), "Not find the fused_grad."); + fused_vars_name[kGrad] = fused_grad; + + // Sort the parameters and auxiliary variables according + // to parameters' name to make variables' name correspond correctly. + SortParametersAndAuxVars(params_grads, &aux_var_set, &opt_ops); + grad_fused = true; + } + } + + // Step 4: Alloc continuous space for Parameters and AuxiliaryVar(e.g. // Moment1, Moment2, Beta1Pow, Beta2Pow) of all the optimizer ops separately. + aux_var_names.pop_back(); + if (!grad_fused) { + InitFusedGradsAndAllocSpaceForGrads( + places, local_scopes, aux_var_set.at(kParam), aux_var_set.at(kGrad), + fused_vars_name.at(kGrad), &result); + } InitFusedVarsAndAllocSpaceForVars(places, local_scopes, aux_var_names, aux_var_set, fused_vars_name); - // Step 6: Fuse optimizer Ops and Scale Ops + // Step 5: Fuse optimizer Ops and Scale Ops FuseOptimizerOps(aux_var_set, fused_vars_name, opt_ops, &result); - // Step 7: Remove optimizer Ops + // Step 6: Remove optimizer Ops for (auto &opt_op : opt_ops) { graph->RemoveNode(opt_op); } } +void FuseOptimizerOpPass::InitFusedGradsAndAllocSpaceForGrads( + const std::vector &places, + const std::vector &local_scopes, + const std::vector ¶ms, + const std::vector &grads, const std::string &fused_grad_name, + ir::Graph *result) const { + // Get Var Nodes + std::unordered_map vars; + for (ir::Node *node : result->Nodes()) { + if (node->IsVar() && node->Var()) { + // Note: The graph may have the same name node. For example, parameter + // is the input of operator and it also is the output of optimizer; + vars.emplace(node->Var()->Name(), node); + } + } + // Init Grads + for (auto it = local_scopes.rbegin(); it != local_scopes.rend(); ++it) { + auto &scope = *it; + VLOG(10) << "Init " << fused_grad_name; + PADDLE_ENFORCE(scope->FindVar(fused_grad_name) == nullptr, + "%s has existed in scope.", fused_grad_name); + scope->Var(fused_grad_name)->GetMutable(); + + for (auto &grad_var_name : grads) { + auto iter = vars.find(grad_var_name); + PADDLE_ENFORCE(iter != vars.end()); + PADDLE_ENFORCE_NOT_NULL(iter->second->Var()); + PADDLE_ENFORCE_EQ(iter->second->Var()->GetType(), + proto::VarType::LOD_TENSOR); + scope->Var(grad_var_name)->GetMutable(); + } + } + // Define Ops + ProgramDesc program_desc; + auto *global_block = program_desc.MutableBlock(0); + AppendAllocContinuousSpace(params, grads, fused_grad_name, global_block, + false, false); + // Run Ops + RunInitOps(places, local_scopes, *global_block); +} + void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars( const std::vector &places, const std::vector &local_scopes, @@ -115,37 +181,49 @@ void FuseOptimizerOpPass::InitFusedVarsAndAllocSpaceForVars( const std::unordered_map> &aux_var_set, const std::unordered_map &fused_vars_name) const { - VLOG(10) << "Init FusedVars."; - // Alloc parameters and auxiliary vars in the respective scope. - size_t idx = local_scopes.size(); - for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend(); - ++iter, --idx) { - auto &scope = *iter; - for (auto &var_name : aux_var_names) { - auto fused_var_name = fused_vars_name.at(var_name); - VLOG(10) << "Init " << fused_var_name; - PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr, - "%s has exist in scope[%d]", fused_var_name, idx); - scope->Var(fused_var_name)->GetMutable(); - } + // Init Vars + for (auto &var_name : aux_var_names) { + auto &fused_var_name = fused_vars_name.at(var_name); + InitVars(local_scopes, fused_var_name); } - + // Define Ops ProgramDesc program_desc; auto *global_block = program_desc.MutableBlock(0); for (auto &var_name : aux_var_names) { - AppendAllocContinuousSpace(aux_var_set.at(var_name), - fused_vars_name.at(var_name), true, - global_block); + AppendAllocContinuousSpace( + aux_var_set.at(var_name), aux_var_set.at(var_name), + fused_vars_name.at(var_name), global_block, true); } + // Run Ops + RunInitOps(places, local_scopes, *global_block); +} +void FuseOptimizerOpPass::RunInitOps(const std::vector &places, + const std::vector &local_scopes, + const BlockDesc &global_block) const { for (size_t i = 0; i < local_scopes.size(); ++i) { - for (auto &op_desc : global_block->AllOps()) { + for (auto &op_desc : global_block.AllOps()) { auto op = OpRegistry::CreateOp(*op_desc); op->Run(*local_scopes[i], places[i]); } } } +void FuseOptimizerOpPass::InitVars(const std::vector &local_scopes, + const std::string &fused_var_name) const { + VLOG(10) << "Init FusedVars."; + // Alloc parameters and auxiliary vars in the respective scope. + size_t idx = local_scopes.size(); + for (auto iter = local_scopes.rbegin(); iter != local_scopes.rend(); + ++iter, --idx) { + auto &scope = *iter; + VLOG(10) << "Init " << fused_var_name; + PADDLE_ENFORCE(scope->FindVar(fused_var_name) == nullptr, + "%s has exist in scope[%d]", fused_var_name, idx); + scope->Var(fused_var_name)->GetMutable(); + } +} + void FuseOptimizerOpPass::SortParametersAndAuxVars( const std::vector> ¶ms_grads, std::unordered_map> *aux_vars_set, @@ -203,15 +281,16 @@ void FuseOptimizerOpPass::GetSpecifiedOpsAndVars( } void FuseOptimizerOpPass::AppendAllocContinuousSpace( - const std::vector &args, const std::string &out_arg, - bool copy_data, BlockDesc *global_block) const { + const std::vector &in_args, + const std::vector &out_args, const std::string &fused_out_arg, + BlockDesc *global_block, bool copy_data, bool check_name) const { auto op_desc = global_block->AppendOp(); op_desc->SetType("alloc_continuous_space"); - op_desc->SetInput("Input", args); - op_desc->SetOutput("Output", args); - op_desc->SetOutput("FusedOutput", {out_arg}); + op_desc->SetInput("Input", in_args); + op_desc->SetOutput("Output", out_args); + op_desc->SetOutput("FusedOutput", {fused_out_arg}); op_desc->SetAttr("copy_data", copy_data); - op_desc->SetAttr("check_name", true); + op_desc->SetAttr("check_name", check_name); } void FuseOptimizerOpPass::InserInputAndOutputForOptOps( diff --git a/paddle/fluid/framework/details/fuse_optimizer_op_pass.h b/paddle/fluid/framework/details/fuse_optimizer_op_pass.h index 0240f1594d7ef9d855eb6e96e8e8a32ee1d957ba..47efc1693dd31ca88787da3a9d6d06aa7ef65786 100644 --- a/paddle/fluid/framework/details/fuse_optimizer_op_pass.h +++ b/paddle/fluid/framework/details/fuse_optimizer_op_pass.h @@ -27,6 +27,10 @@ namespace paddle { namespace framework { namespace details { +constexpr char kGrad[] = "Grad"; +constexpr char kParam[] = "Param"; +constexpr char kLearningRate[] = "LearningRate"; + class FuseOptimizerOpPass : public ir::Pass { protected: void ApplyImpl(ir::Graph *graph) const override; @@ -56,9 +60,18 @@ class FuseOptimizerOpPass : public ir::Pass { std::unordered_map> *aux_args_name) const; - void AppendAllocContinuousSpace(const std::vector &args, - const std::string &out_arg, bool copy_data, - BlockDesc *global_block) const; + void AppendAllocContinuousSpace(const std::vector &in_args, + const std::vector &out_args, + const std::string &fused_out_arg, + BlockDesc *global_block, bool copy_data, + bool check_name = true) const; + + void InitFusedGradsAndAllocSpaceForGrads( + const std::vector &places, + const std::vector &local_scopes, + const std::vector ¶ms, + const std::vector &grads, const std::string &fused_grad_name, + ir::Graph *result) const; void InitFusedVarsAndAllocSpaceForVars( const std::vector &places, @@ -68,6 +81,13 @@ class FuseOptimizerOpPass : public ir::Pass { &aux_var_set, const std::unordered_map &fused_vars_name) const; + + void RunInitOps(const std::vector &places, + const std::vector &local_scopes, + const BlockDesc &global_block) const; + + void InitVars(const std::vector &local_scopes, + const std::string &fused_var_name) const; }; } // namespace details diff --git a/paddle/fluid/framework/details/fuse_sgd_op_pass.cc b/paddle/fluid/framework/details/fuse_sgd_op_pass.cc index f91c21e3cc869de1a6d67146eb99f27a2ca5497c..2219f3209f77de5cb34abfb9edb8bdea6a8eebb0 100644 --- a/paddle/fluid/framework/details/fuse_sgd_op_pass.cc +++ b/paddle/fluid/framework/details/fuse_sgd_op_pass.cc @@ -24,7 +24,7 @@ namespace details { const std::string FuseSgdOpPass::GetOpType() const { return "sgd"; } const std::vector FuseSgdOpPass::GetAuxiliaryVarNames() const { - return {"Param"}; + return {}; } void FuseSgdOpPass::FuseOptimizerOps( @@ -50,12 +50,12 @@ void FuseSgdOpPass::FuseSgdOps( // Add fused scale OpDesc Sgd_desc(sgd_ops[0]->Op()->Block()); Sgd_desc.SetType("sgd"); - Sgd_desc.SetInput("Param", {fused_vars_name.at("Param")}); - Sgd_desc.SetInput("Grad", {fused_vars_name.at("Grad")}); - Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at("Param")}); + Sgd_desc.SetInput(kParam, {fused_vars_name.at(kParam)}); + Sgd_desc.SetInput(kGrad, {fused_vars_name.at(kGrad)}); + Sgd_desc.SetOutput("ParamOut", {fused_vars_name.at(kParam)}); // TODO(zcd): The LearningRate, Beta1Pow, Beta2Pow should be equal. - Sgd_desc.SetInput("LearningRate", sgd_ops[0]->Op()->Input("LearningRate")); + Sgd_desc.SetInput(kLearningRate, sgd_ops[0]->Op()->Input(kLearningRate)); // NOTE: multi_devices_pass requires that every op should have a role. Sgd_desc.SetAttr(OpProtoAndCheckerMaker::OpRoleAttrName(), op_role); diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc index 137e0dd7708dcc77c3a927979cfb357249f1fdc9..1bd27263f7dad5f733c553c202444ba7cacd2510 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.cc @@ -106,7 +106,7 @@ ParallelSSAGraphExecutor::ParallelSSAGraphExecutor( VLOG(1) << "set num_threads: " << strategy_.num_threads_ << " to run the operators of the graph on each device."; for (size_t i = 0; i < places.size(); ++i) { - executors_.emplace_back(new details::ThreadedSSAGraphExecutor( + executors_.emplace_back(new details::FastThreadedSSAGraphExecutor( strategy_, local_scopes_, {places_[i]}, graphs_.at(i).get())); } } diff --git a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h index 1e421f2a3a51363fe368859f7a34593c8c894077..faf071b05306a49c0049421bc72e4981c0bfc84c 100644 --- a/paddle/fluid/framework/details/parallel_ssa_graph_executor.h +++ b/paddle/fluid/framework/details/parallel_ssa_graph_executor.h @@ -14,12 +14,12 @@ #pragma once +#include #include #include - #include "ThreadPool.h" +#include "paddle/fluid/framework/details/fast_threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/details/multi_devices_helper.h" -#include "paddle/fluid/framework/details/threaded_ssa_graph_executor.h" #include "paddle/fluid/framework/ir/graph.h" namespace paddle { @@ -48,7 +48,8 @@ class ParallelSSAGraphExecutor : public SSAGraphExecutor { std::vector places_; std::vector> graphs_; - std::vector> executors_; + std::vector> + executors_; ExceptionHolder exception_holder_; }; diff --git a/paddle/fluid/framework/downpour_worker.cc b/paddle/fluid/framework/downpour_worker.cc index 4ca7842fa261a1b8178438d35ca5d626146663d4..386ffd84c57063e950cd8b0d57304c66190be4c4 100644 --- a/paddle/fluid/framework/downpour_worker.cc +++ b/paddle/fluid/framework/downpour_worker.cc @@ -21,40 +21,40 @@ namespace framework { void DownpourWorker::Initialize(const TrainerDesc& desc) { param_ = desc.downpour_param(); - for (size_t i = 0; i < param_.sparse_table_size(); ++i) { + for (int i = 0; i < param_.sparse_table_size(); ++i) { uint64_t table_id = static_cast(param_.sparse_table(i).table_id()); TableParameter table = param_.sparse_table(i); sparse_key_names_[table_id].resize(table.sparse_key_name_size()); - for (size_t j = 0; j < table.sparse_key_name_size(); ++j) { + for (int j = 0; j < table.sparse_key_name_size(); ++j) { sparse_key_names_[table_id][j] = table.sparse_key_name(j); } sparse_value_names_[table_id].resize(table.sparse_value_name_size()); - for (size_t j = 0; j < table.sparse_value_name_size(); ++j) { + for (int j = 0; j < table.sparse_value_name_size(); ++j) { sparse_value_names_[table_id][j] = table.sparse_value_name(j); } sparse_grad_names_[table_id].resize(table.sparse_grad_name_size()); - for (size_t j = 0; j < table.sparse_grad_name_size(); ++j) { + for (int j = 0; j < table.sparse_grad_name_size(); ++j) { sparse_grad_names_[table_id][j] = table.sparse_grad_name(j); } label_var_name_[table_id] = table.label_var_name(); } - for (size_t i = 0; i < param_.dense_table_size(); ++i) { + for (int i = 0; i < param_.dense_table_size(); ++i) { uint64_t table_id = static_cast(param_.dense_table(i).table_id()); auto table = param_.dense_table(i); dense_value_names_[table_id].resize(table.dense_value_name_size()); - for (size_t j = 0; j < table.dense_value_name_size(); ++j) { + for (int j = 0; j < table.dense_value_name_size(); ++j) { dense_value_names_[table_id][j] = table.dense_value_name(j); } dense_grad_names_[table_id].resize(table.dense_grad_name_size()); - for (size_t j = 0; j < table.dense_grad_name_size(); ++j) { + for (int j = 0; j < table.dense_grad_name_size(); ++j) { dense_grad_names_[table_id][j] = table.dense_grad_name(j); } } skip_ops_.resize(param_.skip_ops_size()); - for (size_t i = 0; i < param_.skip_ops_size(); ++i) { + for (int i = 0; i < param_.skip_ops_size(); ++i) { skip_ops_[i] = param_.skip_ops(i); } @@ -83,14 +83,14 @@ void DownpourWorker::CollectLabelInfo(size_t table_idx) { LoDTensor* tensor = var->GetMutable(); int64_t* label_ptr = tensor->data(); - int global_index = 0; + size_t global_index = 0; for (size_t i = 0; i < sparse_key_names_[table_id].size(); ++i) { VLOG(3) << "sparse_key_names_[" << i << "]: " << sparse_key_names_[table_id][i]; Variable* fea_var = thread_scope_->FindVar(sparse_key_names_[table_id][i]); LoDTensor* tensor = fea_var->GetMutable(); int64_t* ids = tensor->data(); - int fea_idx = 0; + size_t fea_idx = 0; // tensor->lod()[0].size() == batch_size + 1 for (auto lod_idx = 1u; lod_idx < tensor->lod()[0].size(); ++lod_idx) { for (; fea_idx < tensor->lod()[0][lod_idx]; ++fea_idx) { @@ -138,7 +138,7 @@ void DownpourWorker::FillSparseValue(size_t table_idx) { auto& tensor_lod = tensor->lod()[0]; LoD data_lod{tensor_lod}; tensor_emb->set_lod(data_lod); - for (auto index = 0u; index < len; ++index) { + for (int index = 0; index < len; ++index) { if (ids[index] == 0u) { memcpy(ptr + table.emb_dim() * index, init_value.data() + 2, sizeof(float) * table.emb_dim()); @@ -192,7 +192,7 @@ void DownpourWorker::TrainFilesWithProfiler() { read_time += timeline.ElapsedSec(); total_time += timeline.ElapsedSec(); VLOG(3) << "program config size: " << param_.program_config_size(); - for (size_t i = 0; i < param_.program_config(0).pull_sparse_table_id_size(); + for (int i = 0; i < param_.program_config(0).pull_sparse_table_id_size(); ++i) { uint64_t tid = static_cast( param_.program_config(0).pull_sparse_table_id(i)); @@ -244,8 +244,8 @@ void DownpourWorker::TrainFilesWithProfiler() { } if (need_to_push_sparse_) { - for (size_t i = 0; - i < param_.program_config(0).push_sparse_table_id_size(); ++i) { + for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size(); + ++i) { uint64_t tid = static_cast( param_.program_config(0).push_sparse_table_id(i)); TableParameter table; @@ -268,8 +268,8 @@ void DownpourWorker::TrainFilesWithProfiler() { if (need_to_push_dense_) { timeline.Start(); - for (size_t i = 0; - i < param_.program_config(0).push_dense_table_id_size(); ++i) { + for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); + ++i) { uint64_t tid = static_cast( param_.program_config(0).push_dense_table_id(i)); fleet_ptr_->PushDenseVarsAsync( @@ -315,8 +315,8 @@ void DownpourWorker::TrainFilesWithProfiler() { } if (need_to_push_dense_) { - for (size_t i = 0; - i < param_.program_config(0).push_dense_table_id_size(); ++i) { + for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); + ++i) { uint64_t tid = static_cast( param_.program_config(0).push_dense_table_id(i)); pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid); @@ -362,7 +362,7 @@ void DownpourWorker::TrainFiles() { int cur_batch; while ((cur_batch = device_reader_->Next()) > 0) { // pull sparse here - for (size_t i = 0; i < param_.program_config(0).pull_sparse_table_id_size(); + for (int i = 0; i < param_.program_config(0).pull_sparse_table_id_size(); ++i) { uint64_t tid = static_cast( param_.program_config(0).pull_sparse_table_id(i)); @@ -397,8 +397,8 @@ void DownpourWorker::TrainFiles() { if (need_to_push_sparse_) { // push gradients here - for (size_t i = 0; - i < param_.program_config(0).push_sparse_table_id_size(); ++i) { + for (int i = 0; i < param_.program_config(0).push_sparse_table_id_size(); + ++i) { uint64_t tid = static_cast( param_.program_config(0).push_sparse_table_id(i)); TableParameter table; @@ -416,8 +416,8 @@ void DownpourWorker::TrainFiles() { } if (need_to_push_dense_) { - for (size_t i = 0; - i < param_.program_config(0).push_dense_table_id_size(); ++i) { + for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); + ++i) { uint64_t tid = static_cast( param_.program_config(0).push_dense_table_id(i)); fleet_ptr_->PushDenseVarsAsync( @@ -461,8 +461,8 @@ void DownpourWorker::TrainFiles() { } if (need_to_push_dense_) { - for (size_t i = 0; - i < param_.program_config(0).push_dense_table_id_size(); ++i) { + for (int i = 0; i < param_.program_config(0).push_dense_table_id_size(); + ++i) { uint64_t tid = static_cast( param_.program_config(0).push_dense_table_id(i)); pull_dense_worker_->IncreaseThreadVersion(thread_id_, tid); diff --git a/paddle/fluid/framework/ir/expected_kernel_cache_pass.cc b/paddle/fluid/framework/ir/expected_kernel_cache_pass.cc index ee67af0aff5c90a9da0ece8f197d9a0c0a8e5b9c..4a99d4c1a9c0f0bd973097d281e380341fe88515 100644 --- a/paddle/fluid/framework/ir/expected_kernel_cache_pass.cc +++ b/paddle/fluid/framework/ir/expected_kernel_cache_pass.cc @@ -23,7 +23,7 @@ namespace ir { void ExpectedKernelCachePass::ApplyImpl(ir::Graph* graph) const { VLOG(3) << "Applies Expected Kernel Cache strategy."; for (const Node* n : graph->Nodes()) { - if (n->IsOp()) { + if (n->IsOp() && n->Op()) { n->Op()->SetAttr(kEnableCacheExpectedKernel, true); } } diff --git a/paddle/fluid/framework/ir/graph_helper.cc b/paddle/fluid/framework/ir/graph_helper.cc index 28a37f331c100695f0ffec7288db84f4493d68a0..12ce99c8788625e2aae6e07abdea565bb2c2ebb9 100644 --- a/paddle/fluid/framework/ir/graph_helper.cc +++ b/paddle/fluid/framework/ir/graph_helper.cc @@ -31,10 +31,10 @@ namespace paddle { namespace framework { namespace ir { namespace { -void SortHelper( - const std::map> &adj_list, - ir::Node *node, std::unordered_set *visited, - std::vector *ret) { +void SortHelper(const std::map, + ir::NodeComp> &adj_list, + ir::Node *node, std::unordered_set *visited, + std::vector *ret) { visited->insert(node); for (auto adj : adj_list.at(node)) { @@ -50,7 +50,8 @@ void SortHelper( bool HasCircleHelper( ir::Node *node, - const std::map> &adj_list, + const std::map, ir::NodeComp> + &adj_list, std::unordered_set *visited, std::unordered_set *in_trace, std::vector> *circles) { @@ -84,7 +85,8 @@ bool HasCircleHelper( } bool HasCircleInternal( - const std::map> &adj_list, + const std::map, ir::NodeComp> + &adj_list, std::vector> *circles) { std::unordered_set visited; std::unordered_set in_trace; @@ -107,8 +109,8 @@ bool FindCircleSubGraph(const Graph &graph, } std::vector TopologySortOperations(const Graph &graph) { - std::map> adj_list = - BuildOperationAdjList(graph); + std::map, ir::NodeComp> + adj_list = BuildOperationAdjList(graph); PADDLE_ENFORCE(!HasCircleInternal(adj_list, nullptr)); std::unordered_set visited; std::vector ret; @@ -117,34 +119,30 @@ std::vector TopologySortOperations(const Graph &graph) { SortHelper(adj_list, adj.first, &visited, &ret); } } + return ret; } // Build operator inlink edge table. -std::map> BuildOperationAdjList( - const Graph &graph) { - std::map> adj_list; +std::map, ir::NodeComp> +BuildOperationAdjList(const Graph &graph) { + std::map, ir::NodeComp> + adj_list; for (auto &n : graph.Nodes()) { if (!n->IsOp()) continue; if (adj_list.find(n) == adj_list.end()) { - adj_list[n] = std::unordered_set(); + adj_list[n] = std::set(); } - std::vector nodes; for (auto &var : n->inputs) { for (auto &adj_n : var->inputs) { PADDLE_ENFORCE(adj_n->NodeType() == ir::Node::Type::kOperation); VLOG(4) << "adj " << adj_n->Name() << reinterpret_cast(adj_n) << " -> " << n->Name() << reinterpret_cast(n) << " via " << var->Name() << reinterpret_cast(var); - nodes.push_back(adj_n); + adj_list[n].insert(adj_n); } } - std::sort(nodes.begin(), nodes.end(), [](ir::Node *node1, ir::Node *node2) { - return node1->id() > node2->id(); - }); - adj_list[n].insert(std::make_move_iterator(nodes.begin()), - std::make_move_iterator(nodes.end())); } return adj_list; } diff --git a/paddle/fluid/framework/ir/graph_helper.h b/paddle/fluid/framework/ir/graph_helper.h index 214de9ec7d85aee6021b18866295777e317aa79d..849a9c3be6904f3f9c3669d8fc9d750154863031 100644 --- a/paddle/fluid/framework/ir/graph_helper.h +++ b/paddle/fluid/framework/ir/graph_helper.h @@ -16,6 +16,7 @@ limitations under the License. */ #include #include +#include #include #include "paddle/fluid/framework/ir/graph.h" @@ -25,6 +26,13 @@ namespace paddle { namespace framework { namespace ir { +// Compare nodes via node id. +struct NodeComp { + bool operator()(ir::Node *const &node1, ir::Node *const &node2) const { + return node1->id() < node2->id(); + } +}; + // Test if the graph contains circle. bool HasCircle(const Graph &graph); @@ -57,8 +65,8 @@ std::vector TopologyVarientSort(const Graph &graph, SortKind sort_kind); void CleanIndividualNodes(Graph *graph); // Build an adjacency list of operations for the `graph`. -std::map> BuildOperationAdjList( - const Graph &graph); +std::map, ir::NodeComp> +BuildOperationAdjList(const Graph &graph); template std::vector FilterByNodeWrapper(const Graph &graph) { diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index e6f5b15af8cd440a9304235acfe62787c5f1b134..1ea93b7638a85e67bcc85a0c0e130d636938d6c5 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -241,6 +241,7 @@ OpDesc::OpDesc(const std::string &type, const VariableNameMap &inputs, outputs_ = outputs; attrs_ = attrs; need_update_ = true; + block_ = nullptr; } OpDesc::OpDesc(const OpDesc &other, BlockDesc *block) { diff --git a/paddle/fluid/framework/parallel_executor.cc b/paddle/fluid/framework/parallel_executor.cc index 4245caf1689c76d72b410c742488c55562c8b998..c4bf2b7e8c017b22f917c9f9bd40e75b8cde08b2 100644 --- a/paddle/fluid/framework/parallel_executor.cc +++ b/paddle/fluid/framework/parallel_executor.cc @@ -221,7 +221,7 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, PADDLE_ENFORCE(!member_->use_cuda_, "gpu mode does not support async_mode_ now!"); graphs.push_back(graph); - for (int i = 1; i < places.size(); ++i) { + for (size_t i = 1; i < places.size(); ++i) { auto *tmp_graph = new ir::Graph(graph->OriginProgram()); async_graphs_.emplace_back(tmp_graph); graphs.push_back(tmp_graph); @@ -315,7 +315,7 @@ ParallelExecutor::ParallelExecutor(const std::vector &places, graph = build_strategy.Apply(graph, {member_->places_[0]}, loss_var_name, {member_->local_scopes_[0]}, 1, member_->use_cuda_, member_->nccl_ctxs_.get()); - for (int i = 1; i < member_->places_.size(); ++i) { + for (size_t i = 1; i < member_->places_.size(); ++i) { graphs[i] = build_strategy.Apply(graphs[i], {member_->places_[i]}, loss_var_name, {member_->local_scopes_[i]}, 1, diff --git a/paddle/fluid/framework/trainer_desc.proto b/paddle/fluid/framework/trainer_desc.proto index 389c1a870fb54ad28806ad49632323b1c93676f4..4fc05ccf5c9be37e80b4ae7263166ad76eb6d6a7 100644 --- a/paddle/fluid/framework/trainer_desc.proto +++ b/paddle/fluid/framework/trainer_desc.proto @@ -76,7 +76,7 @@ message PullDenseWorkerParameter { message TableParameter { // dense table only - optional int64 table_id = 1; + optional uint64 table_id = 1; repeated string dense_value_name = 2; repeated string dense_grad_name = 3; repeated int32 push_dense_wait_times = 5; diff --git a/paddle/fluid/framework/var_type_inference.h b/paddle/fluid/framework/var_type_inference.h index 2e9c64d3e6854bf70c0aee06128b9f1b7c8c7439..66e6ac81623a1cd1c79981c1e4a97d974e9c2426 100644 --- a/paddle/fluid/framework/var_type_inference.h +++ b/paddle/fluid/framework/var_type_inference.h @@ -45,12 +45,16 @@ class InferVarTypeContext { virtual bool HasInput(const std::string& name) const { PADDLE_ENFORCE_NOT_NULL(op_); - return op_->Inputs().count(name) > 0; + auto& inputs = op_->Inputs(); + auto input = inputs.find(name); + return input != inputs.end() && !input->second.empty(); } virtual bool HasOutput(const std::string& name) const { PADDLE_ENFORCE_NOT_NULL(op_); - return op_->Outputs().count(name) > 0; + auto& outputs = op_->Outputs(); + auto output = outputs.find(name); + return output != outputs.end() && !output->second.empty(); } virtual const std::vector& Input(const std::string& name) const { diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 6942604b0723f8665f0e8b058d48a5356a1a01f4..0155609a029664da2c3d4c90a152ec556927c32d 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -259,6 +259,9 @@ bool AnalysisPredictor::SetFeed(const std::vector &inputs, return false; } + PADDLE_ENFORCE_NOT_NULL(input_ptr); + PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data()); + if (platform::is_cpu_place(place_)) { // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. std::memcpy(static_cast(input_ptr), inputs[i].data.data(), diff --git a/paddle/fluid/inference/api/api.cc b/paddle/fluid/inference/api/api.cc index 7d57b6ec74468dbdb0519f85140629a0ac01c18d..fc2d7b48c2a1f89232dcb96d1899667230e2ddda 100644 --- a/paddle/fluid/inference/api/api.cc +++ b/paddle/fluid/inference/api/api.cc @@ -54,6 +54,7 @@ PaddleBuf &PaddleBuf::operator=(const PaddleBuf &other) { memory_owned_ = other.memory_owned_; } else { Resize(other.length()); + PADDLE_ENFORCE(!(other.length() > 0 && other.data() == nullptr)); memcpy(data_, other.data(), other.length()); length_ = other.length(); memory_owned_ = true; diff --git a/paddle/fluid/inference/api/api_impl.cc b/paddle/fluid/inference/api/api_impl.cc index 54f40563c3662af24e794422be4d3262d86c76a7..56996c5cff88f5b4a9094291a09996f8b8d70a23 100644 --- a/paddle/fluid/inference/api/api_impl.cc +++ b/paddle/fluid/inference/api/api_impl.cc @@ -169,6 +169,7 @@ std::unique_ptr NativePaddlePredictor::Clone() { std::unique_ptr cls(new NativePaddlePredictor(config_)); // Hot fix the bug that result diff in multi-thread. // TODO(Superjomn) re-implement a real clone here. + PADDLE_ENFORCE_NOT_NULL(dynamic_cast(cls.get())); if (!dynamic_cast(cls.get())->Init(nullptr)) { LOG(ERROR) << "fail to call Init"; return nullptr; @@ -210,6 +211,8 @@ bool NativePaddlePredictor::SetFeed(const std::vector &inputs, return false; } + PADDLE_ENFORCE_NOT_NULL(input_ptr); + PADDLE_ENFORCE_NOT_NULL(inputs[i].data.data()); if (platform::is_cpu_place(place_)) { // TODO(panyx0718): Init LoDTensor from existing memcpy to save a copy. std::memcpy(static_cast(input_ptr), inputs[i].data.data(), @@ -316,6 +319,8 @@ std::unique_ptr CreatePaddlePredictor< } std::unique_ptr predictor(new NativePaddlePredictor(config)); + PADDLE_ENFORCE_NOT_NULL( + dynamic_cast(predictor.get())); if (!dynamic_cast(predictor.get())->Init(nullptr)) { return nullptr; } diff --git a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc index 9f23b9f037bcaeb758312d011067ae29c82e73cd..5ee848c3cfa2117b2adeab5e563c5d07ce1d76ca 100644 --- a/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_seq_conv1_tester.cc @@ -47,6 +47,7 @@ struct DataRecord { num_lines++; std::vector data; split(line, '\t', &data); + PADDLE_ENFORCE(data.size() >= 4); // load title1 data std::vector title1_data; split_to_int64(data[0], ' ', &title1_data); diff --git a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc index bd4f1b61973fb0de06dcc288e329c94756d5ed47..a23297f29cf65d891f530850ffd184aa58e10886 100644 --- a/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_transformer_tester.cc @@ -214,28 +214,23 @@ TEST(Analyzer_Transformer, fuse_statis) { } // Compare result of NativeConfig and AnalysisConfig -// void compare(bool use_mkldnn = false) { -// AnalysisConfig cfg; -// SetConfig(&cfg); -// if (use_mkldnn) { -// cfg.EnableMKLDNN(); -// } -// -// std::vector> input_slots_all; -// SetInput(&input_slots_all); -// CompareNativeAndAnalysis( -// reinterpret_cast(&cfg), -// input_slots_all); -// } - -// TODO(yihuaxu): -// Disable compare and compare_mkldnn temporary, see -// https://github.com/paddlePaddle/Paddle/issues/16316 for details. -// TEST(Analyzer_Transformer, compare) { compare(); } -// #ifdef PADDLE_WITH_MKLDNN -// TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); -// } -// #endif +void compare(bool use_mkldnn = false) { + AnalysisConfig cfg; + SetConfig(&cfg); + if (use_mkldnn) { + cfg.EnableMKLDNN(); + } + + std::vector> input_slots_all; + SetInput(&input_slots_all); + CompareNativeAndAnalysis( + reinterpret_cast(&cfg), input_slots_all); +} + +TEST(Analyzer_Transformer, compare) { compare(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_Transformer, compare_mkldnn) { compare(true /* use_mkldnn */); } +#endif } // namespace inference } // namespace paddle diff --git a/paddle/fluid/op_use_default_grad_op_maker.spec b/paddle/fluid/op_use_default_grad_op_maker.spec index 21a25ce7d5e2bad172cf50cee6138ef4b44b07c1..63eaa676a43fc784dce2437ca15bc85e2295dbb7 100644 --- a/paddle/fluid/op_use_default_grad_op_maker.spec +++ b/paddle/fluid/op_use_default_grad_op_maker.spec @@ -29,6 +29,8 @@ pool3d prelu quantize rank_loss +reduce_all +reduce_any reduce_max reduce_mean reduce_min diff --git a/paddle/fluid/operators/detection/gpc.cc b/paddle/fluid/operators/detection/gpc.cc index 7c0823c0487d39eece5be08322e7d182b931ba3c..f46aaf7d0a7b2d48f18ba6cccb555bbb691ad353 100644 --- a/paddle/fluid/operators/detection/gpc.cc +++ b/paddle/fluid/operators/detection/gpc.cc @@ -24,6 +24,7 @@ **/ #include "paddle/fluid/operators/detection/gpc.h" +#include "paddle/fluid/platform/enforce.h" namespace gpc { @@ -689,6 +690,7 @@ static bbox *create_contour_bboxes(gpc_polygon *p) { gpc_malloc(box, p->num_contours * sizeof(bbox), const_cast("Bounding box creation")); + PADDLE_ENFORCE_NOT_NULL(box); /* Construct contour bounding boxes */ for (c = 0; c < p->num_contours; c++) { @@ -852,6 +854,7 @@ void gpc_add_contour(gpc_polygon *p, gpc_vertex_list *new_contour, int hole) { /* Create an extended hole array */ gpc_malloc(extended_hole, (p->num_contours + 1) * sizeof(int), const_cast("contour hole addition")); + PADDLE_ENFORCE_NOT_NULL(extended_hole); /* Create an extended contour array */ gpc_malloc(extended_contour, @@ -969,6 +972,7 @@ void gpc_polygon_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, /* Build scanbeam table from scanbeam tree */ gpc_malloc(sbt, sbt_entries * sizeof(double), const_cast("sbt creation")); + PADDLE_ENFORCE_NOT_NULL(sbt); build_sbt(&scanbeam, sbt, sbtree); scanbeam = 0; free_sbtree(&sbtree); @@ -1604,6 +1608,7 @@ void gpc_tristrip_clip(gpc_op op, gpc_polygon *subj, gpc_polygon *clip, /* Build scanbeam table from scanbeam tree */ gpc_malloc(sbt, sbt_entries * sizeof(double), const_cast("sbt creation")); + PADDLE_ENFORCE_NOT_NULL(sbt); build_sbt(&scanbeam, sbt, sbtree); scanbeam = 0; free_sbtree(&sbtree); diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 972b4f67a8388ce68952fa90aaa224cd45c6d226..f6531ec9edca7b425d28853f542d5e46783ba699 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -9,6 +9,9 @@ else() endif() configure_file(send_recv.proto.in ${CMAKE_CURRENT_SOURCE_DIR}/send_recv.proto @ONLY) +cc_library(async_sparse_param_update_recorder SRCS async_sparse_param_update_recorder.cc DEPS enforce simple_threadpool) +cc_test(async_sparse_param_update_recorder_test SRCS async_sparse_param_update_recorder_test.cc DEPS async_sparse_param_update_recorder) + # FIXME(typhoonzero): use add_subdirectory once we clean the dependency of these files set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor") if(WITH_GRPC) @@ -20,7 +23,7 @@ if(WITH_GRPC) collective_client.cc collective_server.cc ${GRPC_SRCS} PROTO send_recv.proto - DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS}) + DEPS lod_tensor selected_rows_functor memory scope ${GRPC_DEPS} async_sparse_param_update_recorder) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) set(RPC_DEPS sendrecvop_rpc ${GRPC_DEPS}) diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc new file mode 100644 index 0000000000000000000000000000000000000000..3f3b6b959e30194c10b1a58d6fc3e7a61ad01313 --- /dev/null +++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.cc @@ -0,0 +1,27 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" + +namespace paddle { +namespace operators { +namespace distributed { + +std::once_flag AsyncSparseParamUpdateRecorder::init_flag_; +std::unique_ptr + AsyncSparseParamUpdateRecorder::recorder_(nullptr); + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h new file mode 100644 index 0000000000000000000000000000000000000000..eadd842c7f6ead56006fd0c34814b1b7bd9b62f4 --- /dev/null +++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h @@ -0,0 +1,183 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include // NOLINT +#include +#include +#include +#include +#include +#include + +#include + +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class ConcurrentSet { + public: + ConcurrentSet() : pool_(new ::ThreadPool(1)) {} + ~ConcurrentSet() {} + + std::future Update(const std::vector& rows) { + auto task = [this, rows] { + if (VLOG_IS_ON(3)) { + std::ostringstream sstream; + sstream << "["; + for (auto& id : rows) { + sstream << id << ", "; + } + sstream << "]"; + VLOG(3) << "update ids -> " << sstream.str(); + } + for (auto row : rows) { + set_.insert(row); + } + }; + return pool_->enqueue(std::move(task)); + } + + std::future GetAndClear(std::vector* result) { + auto task = [this, &result] { + result->clear(); + for (auto& id : set_) { + result->push_back(id); + } + if (VLOG_IS_ON(3)) { + std::ostringstream sstream; + sstream << "["; + for (auto& id : *result) { + sstream << id << ", "; + } + sstream << "]"; + VLOG(3) << "result ids size: " << result->size() << " " + << sstream.str(); + } + set_.clear(); + }; + return pool_->enqueue(std::move(task)); + } + + private: + std::unordered_set set_; + std::unique_ptr<::ThreadPool> pool_{nullptr}; +}; + +class AsyncSparseParamUpdateRecorder { + using TrainerToRows = std::vector>; + + public: + AsyncSparseParamUpdateRecorder( + int trainer_num, + const std::unordered_map& grad_to_param) + : trainer_num_(trainer_num), grad_to_param_(grad_to_param) { + if (VLOG_IS_ON(3)) { + std::ostringstream sstream; + sstream << "["; + for (auto& item : grad_to_param) { + sstream << item.first << ":" << item.second << ", "; + } + sstream << "]"; + VLOG(3) << "trainer_num: " << trainer_num + << " grad_to_param_: " << sstream.str(); + } + for (auto& iter : grad_to_param) { + param_to_grad_[iter.second] = iter.first; + auto& param_name = iter.second; + param_to_updated_rows_[param_name] = TrainerToRows(); + auto& trainer_to_rows = param_to_updated_rows_[param_name]; + for (auto i = 0; i < trainer_num; ++i) { + trainer_to_rows.emplace_back(new ConcurrentSet()); + } + } + } + + ~AsyncSparseParamUpdateRecorder() = default; + + void Update(const std::string& grad_name, + const std::vector& update_rows) { + VLOG(3) << "update grad: " << grad_name + << " row size: " << update_rows.size(); + auto& param_name = grad_to_param_.at(grad_name); + auto& trainer_to_rows = param_to_updated_rows_.at(param_name); + + std::vector> fs; + for (auto& set : trainer_to_rows) { + fs.push_back(set->Update(update_rows)); + } + for (auto& f : fs) { + f.wait(); + } + } + + void GetAndClear(const std::string& param_name, int trainer_id, + std::vector* result) { + VLOG(3) << "GetAndClear param: " << param_name + << " for trainer: " << trainer_id; + PADDLE_ENFORCE_LT(trainer_id, trainer_num_); + param_to_updated_rows_.at(param_name)[trainer_id] + ->GetAndClear(result) + .wait(); + } + + bool HasParam(const std::string& param_name) { + return param_to_grad_.find(param_name) != param_to_grad_.end(); + } + + bool HasGrad(const std::string& grad_name) { + return grad_to_param_.find(grad_name) != grad_to_param_.end(); + } + + private: + const int trainer_num_; + std::unordered_map grad_to_param_; + std::unordered_map param_to_grad_; + std::unordered_map param_to_updated_rows_; + + // init recorder + public: + static void Init( + int trainer_num, + const std::unordered_map& grad_to_param) { + InitImpl(trainer_num, grad_to_param); + } + + static AsyncSparseParamUpdateRecorder* GetInstance() { + return recorder_.get(); + } + + private: + // Init is called by GetInstance. + static void InitImpl( + int trainer_num, + const std::unordered_map& grad_to_param) { + if (recorder_ == nullptr) { + recorder_.reset( + new AsyncSparseParamUpdateRecorder(trainer_num, grad_to_param)); + } + } + + static std::once_flag init_flag_; + static std::unique_ptr recorder_; +}; + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..67e8fd8a0edc4510d0abe885c821e75b528254f8 --- /dev/null +++ b/paddle/fluid/operators/distributed/async_sparse_param_update_recorder_test.cc @@ -0,0 +1,99 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" + +#include + +#include "gtest/gtest.h" + +namespace paddle { +namespace operators { +namespace distributed { + +TEST(ConcurrentSet, All) { + ConcurrentSet concurrent_set; + std::vector in1 = {1, 2, 3, 4}; + std::vector in2 = {2, 3, 5, 6}; + + std::vector> futures; + futures.push_back(concurrent_set.Update(in1)); + futures.push_back(concurrent_set.Update(in2)); + + for (auto &f : futures) { + f.wait(); + } + + std::unordered_set in; + std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin())); + std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin())); + + std::vector ret; + concurrent_set.GetAndClear(&ret).wait(); + + std::unordered_set out; + std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin())); + + EXPECT_EQ(in, out); + + concurrent_set.GetAndClear(&ret).wait(); + EXPECT_EQ(ret.size(), 0); +} + +TEST(AsyncSparseParamUpdateRecorder, All) { + std::unordered_map grad_to_param; + grad_to_param["grad1"] = "param1"; + grad_to_param["grad2"] = "param2"; + + int trainer_num = 10; + + AsyncSparseParamUpdateRecorder recorder(trainer_num, grad_to_param); + std::vector in1 = {1, 2, 3, 4}; + std::vector in2 = {2, 3, 5, 6}; + + std::unordered_set in; + std::copy(in1.begin(), in1.end(), std::inserter(in, in.begin())); + std::copy(in2.begin(), in2.end(), std::inserter(in, in.begin())); + + recorder.Update("grad1", in1); + recorder.Update("grad1", in2); + + EXPECT_TRUE(recorder.HasParam("param1")); + EXPECT_TRUE(recorder.HasParam("param2")); + EXPECT_FALSE(recorder.HasParam("param3")); + + EXPECT_TRUE(recorder.HasGrad("grad1")); + EXPECT_TRUE(recorder.HasGrad("grad2")); + EXPECT_FALSE(recorder.HasGrad("grad3")); + + std::vector ret; + EXPECT_ANY_THROW(recorder.GetAndClear("param1", trainer_num, &ret)); + + for (int i = 0; i < trainer_num; ++i) { + std::vector ret; + std::unordered_set out; + + recorder.GetAndClear("param1", i, &ret); + std::copy(ret.begin(), ret.end(), std::inserter(out, out.begin())); + + EXPECT_EQ(in, out); + + recorder.GetAndClear("param1", i, &ret); + EXPECT_EQ(ret.size(), 0); + } +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.cc b/paddle/fluid/operators/distributed/brpc/brpc_client.cc index a1a3443348129b5cdf057592fced8fdff238ac09..4c22ad8eb4d4b2e23d8a6720e726eb9e2998314e 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.cc +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.cc @@ -234,6 +234,7 @@ VarHandlePtr BRPCClient::AsyncGetVar(const std::string& ep, const framework::Scope& scope, const std::string& var_name, const std::string& out_var_name, + const std::string& table_name, int64_t time_out) { return _AsyncGetVar(ep, ctx, scope, var_name, out_var_name, kGetRPC, time_out); diff --git a/paddle/fluid/operators/distributed/brpc/brpc_client.h b/paddle/fluid/operators/distributed/brpc/brpc_client.h index 501a593b11d35c160348e42ee47216a85647aac4..51864dfdca53eb4b1d9045188a6347781130e785 100644 --- a/paddle/fluid/operators/distributed/brpc/brpc_client.h +++ b/paddle/fluid/operators/distributed/brpc/brpc_client.h @@ -21,8 +21,10 @@ limitations under the License. */ #include #include #include +#include #include // NOLINT #include +#include #include #include "brpc/channel.h" @@ -66,6 +68,7 @@ class BRPCClient : public RPCClient { const framework::Scope& scope, const std::string& var_name, const std::string& out_var_name, + const std::string& table_name = "", int64_t time_out = FLAGS_rpc_deadline) override; VarHandlePtr AsyncGetMonomerBarrier( @@ -107,13 +110,11 @@ class BRPCClient : public RPCClient { void SendComplete() override; private: - VarHandlePtr _AsyncGetVar(const std::string& ep, - const platform::DeviceContext& ctx, - const framework::Scope& scope, - const std::string& var_name, - const std::string& out_var_name, - const std::string& method_name, - int64_t time_out = FLAGS_rpc_deadline); + VarHandlePtr _AsyncGetVar( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + const std::string& out_var_name, const std::string& method_name, + const std::string& table_name, int64_t time_out = FLAGS_rpc_deadline); void Proceed(); ChannelQueuePtr GetChannel(const std::string& ep); diff --git a/paddle/fluid/operators/distributed/communicator.cc b/paddle/fluid/operators/distributed/communicator.cc index eba18c67771fa26eed855b0f19591e06101f424d..b528bcdd32b11d686f44596d9a1bb663b21691f4 100644 --- a/paddle/fluid/operators/distributed/communicator.cc +++ b/paddle/fluid/operators/distributed/communicator.cc @@ -32,6 +32,9 @@ DEFINE_int32(communicator_send_queue_size, 20, DEFINE_int32(communicator_max_send_grad_num_before_recv, 20, "max grad num to send before recv parameters"); DEFINE_int32(communicator_thread_pool_size, 5, "thread num to do send or recv"); +DEFINE_int32(communicator_send_wait_times, 5, + "times that send thread will wait if merge num does not reach " + "max_merge_var_num"); DEFINE_int32(communicator_max_merge_var_num, 20, "max var num to merge and send"); DEFINE_bool(communicator_fake_rpc, false, @@ -65,6 +68,8 @@ Communicator::Communicator(const RpcCtxMap &send_varname_to_ctx, << FLAGS_communicator_max_send_grad_num_before_recv; VLOG(0) << "communicator_thread_pool_size: " << FLAGS_communicator_thread_pool_size; + VLOG(0) << "communicator_send_wait_times: " + << FLAGS_communicator_send_wait_times; VLOG(0) << "communicator_max_merge_var_num: " << FLAGS_communicator_max_merge_var_num; VLOG(0) << "communicator_fake_rpc: " << FLAGS_communicator_fake_rpc; @@ -101,20 +106,32 @@ void Communicator::SendThread() { VLOG(3) << var_name << " merge and send"; std::vector> vars; size_t merged_var_num = 0; - while (var_queue->Size() > 0 && - merged_var_num < FLAGS_communicator_max_merge_var_num) { - vars.push_back(var_queue->Pop()); - // only count the send number of the first var - if (var_name == send_varname_to_queue_.begin()->first) { - grad_num_.fetch_add(1, std::memory_order_relaxed); + size_t wait_times = 0; + while (merged_var_num < FLAGS_communicator_max_merge_var_num) { + if (var_queue->Size() == 0) { + VLOG(3) << "wait_times -> " << wait_times; + if (wait_times >= FLAGS_communicator_send_wait_times) { + break; + } + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + wait_times++; + continue; + } else { + wait_times = 0; + + vars.push_back(var_queue->Pop()); + // only count the send number of the first var + if (var_name == send_varname_to_queue_.begin()->first) { + grad_num_.fetch_add(1, std::memory_order_relaxed); + } + merged_var_num++; } - merged_var_num++; } auto before_merge = GetCurrentUS(); MergeVars(var_name, vars, send_scope_.get()); auto after_merge = GetCurrentUS(); - VLOG(3) << "merge " << var_name << " use time " - << after_merge - before_merge; + VLOG(3) << "merge " << merged_var_num << " " << var_name + << " use time " << after_merge - before_merge; auto send_functor = distributed::ParameterSend(); auto &ctx = send_varname_to_ctx_.at(var_name); if (!FLAGS_communicator_fake_rpc) { diff --git a/paddle/fluid/operators/distributed/communicator.h b/paddle/fluid/operators/distributed/communicator.h index 41155bfc31bb31520fdcf5bd50b203f2e1f2c516..37c39eb15112f745f6a25e95ce65d431d825182e 100644 --- a/paddle/fluid/operators/distributed/communicator.h +++ b/paddle/fluid/operators/distributed/communicator.h @@ -109,7 +109,7 @@ inline void MergeVars(const std::string& var_name, auto* out_var = scope->Var(var_name); if (var0->IsType()) { auto dims = var0->Get().dims(); - VLOG(3) << "merge " << var_name << " LoDTensor " << dims; + VLOG(3) << "merge " << var_name << " LoDTensor dims " << dims; // init output tensor auto* out_t = out_var->GetMutable(); diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.cc b/paddle/fluid/operators/distributed/grpc/grpc_client.cc index 61e94dae3c7a107e10fa5e5518651014cec078bc..8504110c6e9dbfe22b78063999ed4a9e36850e2c 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.cc @@ -128,9 +128,11 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, const framework::Scope& scope, const std::string& var_name, const std::string& out_varname, + const std::string& table_name, int64_t time_out) { return _AsyncGetVar(ep, ctx, scope, kGetRPC, var_name, out_varname, - "/sendrecv.SendRecvService/GetVariable", time_out); + "/sendrecv.SendRecvService/GetVariable", table_name, + time_out); } VarHandlePtr GRPCClient::AsyncGetVarNoBarrier( @@ -142,7 +144,7 @@ VarHandlePtr GRPCClient::AsyncGetVarNoBarrier( return _AsyncGetVar( ep, ctx, scope, kGetNoBarrierRPC, var_name_no_barrier, out_varname, - "/sendrecv.SendRecvService/GetVariableNoBarrier", time_out); + "/sendrecv.SendRecvService/GetVariableNoBarrier", "", time_out); } VarHandlePtr GRPCClient::AsyncGetMonomerVariable( @@ -150,18 +152,21 @@ VarHandlePtr GRPCClient::AsyncGetMonomerVariable( const framework::Scope& scope, const std::string& var_name, int64_t time_out) { return _AsyncGetVar(ep, ctx, scope, kGetMonomerRPC, var_name, var_name, - "/sendrecv.SendRecvService/GetMonomerVariable", time_out); + "/sendrecv.SendRecvService/GetMonomerVariable", "", + time_out); } VarHandlePtr GRPCClient::_AsyncGetVar( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& method, const std::string& var_name, const std::string& out_varname, - const std::string& rpc_path, int64_t time_out) { + const std::string& rpc_path, const std::string& table_name, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; const std::string out_varname_val = out_varname; + const std::string table_name_val = table_name; const framework::Scope* p_scope = &scope; const auto ch = GetChannel(ep_val); GetProcessor* s = new GetProcessor(ch); @@ -169,32 +174,33 @@ VarHandlePtr GRPCClient::_AsyncGetVar( VarHandlePtr h(new VarHandle(ep, method, out_varname_val, p_ctx, p_scope)); s->Prepare(h, time_out); - framework::AsyncIO( - [var_name_val, out_varname_val, s, method, p_ctx, h, rpc_path, this] { - // prepare input - sendrecv::VariableMessage req; - req.set_varname(var_name_val); - req.set_out_varname(out_varname_val); - req.set_trainer_id(trainer_id_); - ::grpc::ByteBuffer buf; - RequestToByteBuffer(req, &buf); + framework::AsyncIO([var_name_val, out_varname_val, table_name_val, s, method, + p_ctx, h, rpc_path, this] { + // prepare input + sendrecv::VariableMessage req; + req.set_varname(var_name_val); + req.set_out_varname(out_varname_val); + req.set_trainer_id(trainer_id_); + req.set_table_name(table_name_val); + ::grpc::ByteBuffer buf; + RequestToByteBuffer(req, &buf); - VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; + VLOG(3) << s->GetVarHandlePtr()->String() << " begin"; - // stub context - s->response_call_back_ = ProcGetResponse; + // stub context + s->response_call_back_ = ProcGetResponse; - platform::RecordRPCEvent record_event(method); + platform::RecordRPCEvent record_event(method); - auto call = - s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); - call->StartCall(); - call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + auto call = + s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); + call->StartCall(); + call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); - if (UNLIKELY(platform::IsProfileEnabled())) { - h->Wait(); - } - }); + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + }); req_count_++; diff --git a/paddle/fluid/operators/distributed/grpc/grpc_client.h b/paddle/fluid/operators/distributed/grpc/grpc_client.h index ce0d2152aa27c62b6e12881aaf2ae458597e67e6..ad2f04a6d1dda34e35b67b21dce8ac612ff697a0 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc/grpc_client.h @@ -23,9 +23,11 @@ limitations under the License. */ #include #include #include +#include #include // NOLINT #include #include // NOLINT +#include #include #include "grpc++/channel.h" @@ -187,6 +189,7 @@ class GRPCClient : public RPCClient { const framework::Scope& scope, const std::string& var_name, const std::string& out_varname, + const std::string& table_name = "", int64_t time_out = FLAGS_rpc_deadline) override; VarHandlePtr AsyncGetVarNoBarrier( @@ -239,7 +242,8 @@ class GRPCClient : public RPCClient { const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& method, const std::string& var_name, const std::string& out_varname, - const std::string& rpc_path, int64_t time_out = FLAGS_rpc_deadline); + const std::string& rpc_path, const std::string& table_name = "", + int64_t time_out = FLAGS_rpc_deadline); private: grpc::CompletionQueue cq_; diff --git a/paddle/fluid/operators/distributed/grpc/grpc_server.cc b/paddle/fluid/operators/distributed/grpc/grpc_server.cc index 0eb313f75dfa64f8722faa365128f3111f72bd0b..75526bed0f0eadada65279ec05757da7a469f984 100644 --- a/paddle/fluid/operators/distributed/grpc/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc/grpc_server.cc @@ -137,6 +137,7 @@ class RequestGet final : public RequestBase { // proc request. std::string varname = request_.varname(); std::string out_varname = request_.out_varname(); + std::string table_name = request_.table_name(); int trainer_id = request_.trainer_id(); VLOG(4) << "RequestGet " << out_varname << " from " << varname; @@ -145,19 +146,23 @@ class RequestGet final : public RequestBase { framework::Variable* invar = nullptr; framework::Variable* outvar = nullptr; - request_handler_->Handle(varname, scope, invar, &outvar, trainer_id, - out_varname); + tmp_scope_ = std::move(scope->NewTmpScope()); + request_handler_->Handle(varname, tmp_scope_.get(), invar, &outvar, + trainer_id, out_varname, table_name); + VLOG(1) << "before SerializeToByteBuffer"; if (outvar) { SerializeToByteBuffer(out_varname, outvar, *request_handler_->dev_ctx(), &reply_); } + VLOG(1) << "after SerializeToByteBuffer"; Finish(reply_, &responder_); } protected: sendrecv::VariableMessage request_; ::grpc::ByteBuffer reply_; + std::unique_ptr tmp_scope_; ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; }; diff --git a/paddle/fluid/operators/distributed/parameter_recv.cc b/paddle/fluid/operators/distributed/parameter_recv.cc index e7d4c262aa9fad10a23adc61b94ba0c38577c0e8..da73167ae603fb8c8ba9deabe118269891d1f52a 100644 --- a/paddle/fluid/operators/distributed/parameter_recv.cc +++ b/paddle/fluid/operators/distributed/parameter_recv.cc @@ -42,27 +42,23 @@ using DDim = framework::DDim; template void ParameterRecv::operator()(const RpcContext &rpc_ctx, const framework::Scope &scope) { - VLOG(3) << "ParameterRecv in"; + VLOG(3) << "ParameterRecv in " << rpc_ctx.var_name; std::unique_ptr local_scope = scope.NewTmpScope(); platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &cpu_ctx = *pool.Get(platform::CPUPlace()); distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(0); + distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); auto *recv_var = scope.FindVar(rpc_ctx.var_name); - std::vector recved_tensors; - // recv all vars to local scope if (recv_var->IsType()) { std::vector rets; for (size_t i = 0; i < rpc_ctx.splited_var_names.size(); i++) { auto &recv_var_name = rpc_ctx.splited_var_names[i]; - framework::Tensor *t = - local_scope->Var(recv_var_name)->GetMutable(); - recved_tensors.push_back(t); + local_scope->Var(recv_var_name); VLOG(3) << "recv " << recv_var_name << " from " << rpc_ctx.epmap[i]; rets.push_back(rpc_client->AsyncGetVar(rpc_ctx.epmap[i], cpu_ctx, *local_scope.get(), recv_var_name, @@ -78,23 +74,61 @@ void ParameterRecv::operator()(const RpcContext &rpc_ctx, // concat recved tensor into one var { size_t output_offset = 0; + size_t row_offset = 0; framework::Tensor *recv_tensor = recv_var->GetMutable(); auto dev_ctx = paddle::platform::CPUDeviceContext(); int64_t recv_numel = 0; - for (auto *in : recved_tensors) { - recv_numel += in->numel(); - auto in_stride = framework::stride_numel(in->dims()); - auto out_stride = framework::stride_numel(recv_tensor->dims()); - StridedNumelCopyWithAxis( - dev_ctx, 0, recv_tensor->data() + output_offset, out_stride, - in->data(), in_stride, in_stride[0]); - output_offset += in_stride[0]; + for (auto &recv_var_name : rpc_ctx.splited_var_names) { + auto *recv_var = local_scope->FindVar(recv_var_name); + if (recv_var->IsType()) { + auto &in = recv_var->Get(); + recv_numel += in.numel(); + auto in_stride = framework::stride_numel(in.dims()); + auto out_stride = framework::stride_numel(recv_tensor->dims()); + StridedNumelCopyWithAxis( + dev_ctx, 0, recv_tensor->data() + output_offset, out_stride, + in.data(), in_stride, in_stride[0]); + output_offset += in_stride[0]; + } else if (recv_var->IsType()) { + auto &recv_slr = recv_var->Get(); + auto &recv_dims = recv_tensor->dims(); + int64_t width = recv_dims[1]; + recv_numel += recv_slr.height() * width; + PADDLE_ENFORCE_EQ(recv_slr.value().dims()[1], width); + PADDLE_ENFORCE_EQ(recv_slr.value().dims()[0], recv_slr.rows().size()); + VLOG(3) << "recv slr " << recv_var_name << " dims " + << recv_slr.value().dims(); + if (VLOG_IS_ON(3)) { + std::ostringstream sstream; + sstream << "["; + for (auto &row_id : recv_slr.rows()) { + sstream << row_id << ", "; + } + sstream << "]"; + VLOG(3) << "recv_slr size: " << recv_slr.rows().size() << " " + << sstream.str(); + } + + for (auto i = 0; i < recv_slr.rows().size(); ++i) { + auto row_id = recv_slr.rows()[i] + row_offset; + PADDLE_ENFORCE_LT(row_id, recv_dims[0]); + memcpy(recv_tensor->data() + row_id * width, + recv_slr.value().data() + i * width, sizeof(T) * width); + } + row_offset += recv_slr.height(); + } else { + PADDLE_THROW("unsupported recieved var type"); + } + } + auto numel = recv_tensor->numel(); + if (recv_numel != numel) { + LOG(FATAL) << "recv_numel: " << recv_numel << " acture numel: " << numel; } - PADDLE_ENFORCE_EQ(recv_numel, recv_tensor->numel()); + PADDLE_ENFORCE_EQ(recv_numel, numel); } - VLOG(3) << "ParameterRecv out"; + VLOG(3) << "ParameterRecv out " << rpc_ctx.var_name; } template struct ParameterRecv; diff --git a/paddle/fluid/operators/distributed/parameter_send.cc b/paddle/fluid/operators/distributed/parameter_send.cc index 9ce424445229cde0a7e775c95f4af8839f4d4d68..dfabad567af590b65b9e777824d476fce2b17238 100644 --- a/paddle/fluid/operators/distributed/parameter_send.cc +++ b/paddle/fluid/operators/distributed/parameter_send.cc @@ -47,7 +47,7 @@ void ParameterSend::operator()(const RpcContext &rpc_ctx, auto &cpu_ctx = *pool.Get(platform::CPUPlace()); distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance(0); + distributed::RPCClient::GetInstance(rpc_ctx.trainer_id); auto *send_var = scope.FindVar(rpc_ctx.var_name); size_t out_num = rpc_ctx.splited_var_names.size(); diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 991158ac72007efc1233f852caed4f90f35fe1cd..de8f30184611aeb961e2ab69b05779c56371b976 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -18,7 +18,9 @@ #include // NOLINT #include +#include #include +#include #include #include @@ -180,6 +182,10 @@ class RequestHandler { grad_to_prepared_ctx_ = g; } + void SetSparseGradToParam(std::unordered_map* g) { + sparse_grad_to_param_ = g; + } + void SetRPCServer(RPCServer* rpc_server) { rpc_server_ = rpc_server; } // Get attributes. @@ -228,6 +234,7 @@ class RequestHandler { std::unordered_map>* grad_to_prepared_ctx_; + std::unordered_map* sparse_grad_to_param_; RPCServer* rpc_server_; }; diff --git a/paddle/fluid/operators/distributed/request_handler_impl.cc b/paddle/fluid/operators/distributed/request_handler_impl.cc index e289ec929dbd6643a2518b92c1a25b7d63e790a9..a41536368abc925531d1a54615546a100482a7eb 100644 --- a/paddle/fluid/operators/distributed/request_handler_impl.cc +++ b/paddle/fluid/operators/distributed/request_handler_impl.cc @@ -22,6 +22,7 @@ #include "paddle/fluid/framework/scope.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/variable_helper.h" +#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" #include "paddle/fluid/operators/distributed/rpc_server.h" #include "paddle/fluid/string/piece.h" #include "paddle/fluid/string/printf.h" @@ -59,6 +60,12 @@ bool RequestSendHandler::Handle(const std::string& varname, "async mode should not recv BATCH_BARRIER_MESSAGE or " "COMPLETE_MESSAGE"); } + if (AsyncSparseParamUpdateRecorder::GetInstance()->HasGrad(varname)) { + auto& grad_slr = + scope->FindVar(varname)->Get(); + AsyncSparseParamUpdateRecorder::GetInstance()->Update(varname, + grad_slr.rows()); + } executor_->RunPreparedContext((*grad_to_prepared_ctx_)[varname].get(), scope); return true; @@ -82,8 +89,9 @@ bool RequestGetHandler::Handle(const std::string& varname, const int trainer_id, const std::string& out_var_name, const std::string& table_name) { - VLOG(4) << "RequestGetHandler:" << varname - << " out_var_name: " << out_var_name; + VLOG(3) << "RequestGetHandler:" << varname + << " out_var_name: " << out_var_name << " trainer_id: " << trainer_id + << " table_name: " << table_name; if (sync_mode_) { if (varname == FETCH_BARRIER_MESSAGE) { @@ -108,7 +116,42 @@ bool RequestGetHandler::Handle(const std::string& varname, VLOG(3) << "copying " << varname << " to " << param_bak_name; framework::TensorCopy(t_orig, dev_ctx_->GetPlace(), t); } - *outvar = scope_->FindVar(varname); + if (AsyncSparseParamUpdateRecorder::GetInstance()->HasParam(varname) && + !table_name.empty()) { + std::vector updated_rows; + AsyncSparseParamUpdateRecorder::GetInstance()->GetAndClear( + varname, trainer_id, &updated_rows); + if (VLOG_IS_ON(3)) { + std::ostringstream sstream; + sstream << "["; + for (auto& row_id : updated_rows) { + sstream << row_id << ", "; + } + sstream << "]"; + VLOG(3) << "updated_rows size: " << updated_rows.size() << " " + << sstream.str(); + } + auto& origin_tensor = + scope_->FindVar(varname)->Get(); + auto* origin_tensor_data = origin_tensor.data(); + auto& dims = origin_tensor.dims(); + *outvar = scope->Var(); + auto* out_slr = (*outvar)->GetMutable(); + out_slr->set_rows(updated_rows); + out_slr->set_height(dims[0]); + auto out_dims = framework::make_ddim( + {static_cast(updated_rows.size()), dims[1]}); + auto* data = out_slr->mutable_value()->mutable_data( + out_dims, origin_tensor.place()); + auto width = dims[1]; + for (auto i = 0; i < updated_rows.size(); ++i) { + PADDLE_ENFORCE_LT(updated_rows[i], dims[0]); + memcpy(data + i * width, origin_tensor_data + updated_rows[i] * width, + sizeof(float) * width); + } + } else { + *outvar = scope_->FindVar(varname); + } } } return true; diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index ea54e0c2951253fc009672f4cd2e5233ed56944e..d4be2c28fdbaa4beef62402155de5b677ed67e9b 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -15,6 +15,7 @@ #pragma once #include // NOLINT +#include #include #include "gflags/gflags.h" @@ -44,6 +45,7 @@ class RPCClient { const framework::Scope& scope, const std::string& var_name, const std::string& out_varname, + const std::string& table_name = "", int64_t time_out = FLAGS_rpc_deadline) = 0; virtual VarHandlePtr AsyncGetVarNoBarrier( @@ -96,6 +98,7 @@ class RPCClient { // Init is called by GetInstance. template static void Init(int trainer_id) { + VLOG(0) << "init rpc client with trainer_id " << trainer_id; trainer_id_ = trainer_id; if (rpc_client_.get() == nullptr) { rpc_client_.reset(new T()); diff --git a/paddle/fluid/operators/distributed/rpc_common.h b/paddle/fluid/operators/distributed/rpc_common.h index 3de89c2ae89d29edc317ca123882d1c55038b6ca..eb127bf4ad5a5c9a28210e2fbcdb69b07543f4b9 100644 --- a/paddle/fluid/operators/distributed/rpc_common.h +++ b/paddle/fluid/operators/distributed/rpc_common.h @@ -27,23 +27,26 @@ struct RpcContext { RpcContext(const std::string &name, const std::vector &names, const std::vector &emap, - const std::vector §ions) + const std::vector §ions, int id) : var_name(name), splited_var_names(names), epmap(emap), - height_sections(sections) {} + height_sections(sections), + trainer_id(id) {} RpcContext(const RpcContext &ctx) { var_name = ctx.var_name; splited_var_names = ctx.splited_var_names; epmap = ctx.epmap; height_sections = ctx.height_sections; + trainer_id = ctx.trainer_id; } std::string var_name; std::vector splited_var_names; std::vector epmap; std::vector height_sections; + int trainer_id; }; inline std::ostream &operator<<(std::ostream &os, const RpcContext &rpc_ctx) { diff --git a/paddle/fluid/operators/distributed_ops/CMakeLists.txt b/paddle/fluid/operators/distributed_ops/CMakeLists.txt index a1ef1af39ff2ab1456706ebafbd3d7ce1acc0c07..1096f3773c6d44560d370502b1c550d67d40ca64 100644 --- a/paddle/fluid/operators/distributed_ops/CMakeLists.txt +++ b/paddle/fluid/operators/distributed_ops/CMakeLists.txt @@ -2,9 +2,9 @@ include(operators) set(DISTRIBUTE_DEPS "") if(WITH_GRPC) - set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) + set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder grpc++_unsecure grpc_unsecure gpr cares zlib protobuf node) else() - set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator brpc leveldb snappystream snappy protobuf ssl crypto zlib node) + set(DISTRIBUTE_DEPS sendrecvop_rpc parameter_send parameter_recv communicator async_sparse_param_update_recorder brpc leveldb snappystream snappy protobuf ssl crypto zlib node) if(WITH_BRPC_RDMA) find_library(IBVERBS_LIBRARY NAMES ibverbs) ADD_LIBRARY(ibverbs SHARED IMPORTED GLOBAL) diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc index 5b30ed472d51a37a0705d1717395da9e4ff7d743..a672fb2a9141a81383d947dcc961a112aee3f7ac 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.cc @@ -24,8 +24,10 @@ limitations under the License. */ #include "paddle/fluid/operators/distributed/distributed.h" #include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/operators/distributed/async_sparse_param_update_recorder.h" #include "paddle/fluid/operators/distributed/request_handler_impl.h" #include "paddle/fluid/operators/distributed_ops/listen_and_serv_op.h" + #include "paddle/fluid/platform/profiler.h" DEFINE_int32(rpc_send_thread_num, 12, "number of threads for rpc send"); @@ -292,6 +294,8 @@ static void FillRequestCtx( std::unordered_map> *prefetch_ctx, + std::unordered_map + *sparse_grad_name_to_param_name, std::shared_ptr checkpoint_ctx, distributed::RPCServer *rpc_server) { h->SetScope(scope); @@ -299,6 +303,7 @@ static void FillRequestCtx( h->SetExecutor(executor); h->SetProgram(program); h->SetPrefetchPreparedCtx(prefetch_ctx); + h->SetSparseGradToParam(sparse_grad_name_to_param_name); h->SetRPCServer(rpc_server); h->SetCheckpointNotifyPreparedCtx(checkpoint_ctx); } @@ -414,10 +419,24 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, prefetch_var_name_to_prepared_ctx[prefetch_var_name] = prefetch_prepared[i]; } - auto f = - std::bind(FillRequestCtx, std::placeholders::_1, &recv_scope, &dev_ctx, - &executor, program, &prefetch_var_name_to_prepared_ctx, - ckpt_pre_context, rpc_service_.get()); + // parse attr of kSparseGradToParam sparse_grad_name -> param_name + std::unordered_map sparse_grad_name_to_param_name; + auto sparse_grad_name_to_param_name_str = + Attr>(kSparseGradToParam); + for (const auto &sparse_grad_name_and_param_name : + sparse_grad_name_to_param_name_str) { + std::vector pieces; + split(sparse_grad_name_and_param_name, ':', &pieces); + PADDLE_ENFORCE_EQ(pieces.size(), 2); + VLOG(3) << "after split, sparse_grad_name = " << pieces[0] + << ", param_name = " << pieces[1]; + sparse_grad_name_to_param_name[pieces[0]] = pieces[1]; + } + + auto f = std::bind( + FillRequestCtx, std::placeholders::_1, &recv_scope, &dev_ctx, &executor, + program, &prefetch_var_name_to_prepared_ctx, + &sparse_grad_name_to_param_name, ckpt_pre_context, rpc_service_.get()); f(request_send_handler_.get()); f(request_get_handler_.get()); @@ -445,6 +464,8 @@ void ListenAndServOp::RunImpl(const framework::Scope &scope, RunSyncLoop(&executor, program, &recv_scope, &dev_ctx, prefetch_block_id_list, checkpoint_block_id); } else { + distributed::AsyncSparseParamUpdateRecorder::Init( + fan_in, sparse_grad_name_to_param_name); RunAsyncLoop(&executor, program, &recv_scope); } } @@ -475,6 +496,10 @@ class ListenAndServOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr>(kPrefetchVarNameToBlockId, "prefetch blocks to run on server side.") .SetDefault({}); + AddAttr>( + kSparseGradToParam, + "sparse grad name to param name. like: 'emb@Grad:emb'") + .SetDefault({}); AddAttr("Fanin", "How many clients send to this server.") .SetDefault(1); AddAttr(kCheckpointBlockId, diff --git a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h index f20442bad7c5bd96173b9d6efc4dceb13feacf5b..1cf2130d7a593077d1145b4f3be379c32557dd53 100644 --- a/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h +++ b/paddle/fluid/operators/distributed_ops/listen_and_serv_op.h @@ -16,8 +16,10 @@ limitations under the License. */ #include #include +#include #include #include +#include #include #include @@ -35,6 +37,7 @@ namespace operators { constexpr char kOptimizeBlocks[] = "optimize_blocks"; constexpr char kPrefetchVarNameToBlockId[] = "prefetch_var_name_to_block_id"; constexpr char kCheckpointBlockId[] = "checkpint_block_id"; +constexpr char kSparseGradToParam[] = "sparse_grad_to_param"; void RunServer(std::shared_ptr service); diff --git a/paddle/fluid/operators/distributed_ops/recv_op.cc b/paddle/fluid/operators/distributed_ops/recv_op.cc index 3fd0700a077321d931e87b1d94c3637d167c9eff..8e9846b1fc89953526149be3838103526d5c441b 100644 --- a/paddle/fluid/operators/distributed_ops/recv_op.cc +++ b/paddle/fluid/operators/distributed_ops/recv_op.cc @@ -50,17 +50,18 @@ class RecvOp : public framework::OperatorBase { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &ctx = *pool.Get(place); + auto trainer_id = Attr("trainer_id"); distributed::RPCClient *rpc_client = - distributed::RPCClient::GetInstance( - Attr("trainer_id")); + distributed::RPCClient::GetInstance(trainer_id); std::vector recv_varnames = Attr>("recv_varnames"); if (recv_varnames.size() > 0) { auto recv_functor = distributed::ParameterRecv(); - auto rpc_ctx = distributed::RpcContext(outs[0], recv_varnames, epmap, {}); + auto rpc_ctx = distributed::RpcContext(outs[0], recv_varnames, epmap, {}, + trainer_id); recv_functor(rpc_ctx, scope); } else { if (with_barrier) { diff --git a/paddle/fluid/operators/distributed_ops/send_op.cc b/paddle/fluid/operators/distributed_ops/send_op.cc index b08cd0942f8c89b60d722c931d0cec2063b96578..5731bcc15a07074b3d77873c5cdcbb70dc41aba8 100644 --- a/paddle/fluid/operators/distributed_ops/send_op.cc +++ b/paddle/fluid/operators/distributed_ops/send_op.cc @@ -42,6 +42,7 @@ class SendOp : public framework::OperatorBase { auto epmap = Attr>("epmap"); int sync_send = Attr("sync_mode"); + auto trainer_id = Attr("trainer_id"); auto send_varnames = Attr>("send_varnames"); auto height_sections = Attr>("sections"); @@ -51,7 +52,7 @@ class SendOp : public framework::OperatorBase { if (distributed::Communicator::GetInstance() == nullptr) { auto send_functor = distributed::ParameterSend(); auto rpc_ctx = distributed::RpcContext(ins[0], send_varnames, epmap, - height_sections); + height_sections, trainer_id); send_functor(rpc_ctx, scope, true); } else { distributed::Communicator::GetInstance()->Send(ins[0], scope); @@ -62,8 +63,7 @@ class SendOp : public framework::OperatorBase { auto& ctx = *pool.Get(place); distributed::RPCClient* rpc_client = - distributed::RPCClient::GetInstance( - Attr("trainer_id")); + distributed::RPCClient::GetInstance(trainer_id); std::vector rets; for (size_t i = 0; i < ins.size(); i++) { diff --git a/paddle/fluid/operators/linspace_op.cc b/paddle/fluid/operators/linspace_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..f4aeb062d8dfae31a72b8ebccb3d377276662da6 --- /dev/null +++ b/paddle/fluid/operators/linspace_op.cc @@ -0,0 +1,84 @@ +/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/linspace_op.h" + +namespace paddle { +namespace operators { + +class LinspaceOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("Start"), + "Input(Start) of LinspaceOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Stop"), + "Input(Stop) of LinspaceOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("Num"), + "Input(Num) of LinspaceOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(OUt) of LinspaceOp should not be null."); + + auto s_dims = ctx->GetInputDim("Start"); + PADDLE_ENFORCE((s_dims.size() == 1) && (s_dims[0] == 1), + "The shape of Input(Start) should be [1]."); + + auto e_dims = ctx->GetInputDim("Stop"); + PADDLE_ENFORCE((e_dims.size() == 1) && (e_dims[0] == 1), + "The shape of Input(Stop) should be [1]."); + + auto step_dims = ctx->GetInputDim("Num"); + PADDLE_ENFORCE((step_dims.size() == 1) && (step_dims[0] == 1), + "The shape of Input(Num) should be [1]."); + + ctx->SetOutputDim("Out", {-1}); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + framework::LibraryType library_{framework::LibraryType::kPlain}; + framework::DataLayout layout_ = framework::DataLayout::kAnyLayout; + return framework::OpKernelType( + ctx.Input("Start")->type(), ctx.device_context(), + layout_, library_); + } +}; + +class LinspaceOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Start", + "First entry in the sequence. It is a tensor of shape [1], should " + "be of type float32 or float64."); + AddInput("Stop", + "Last entry in the sequence. It is a tensor of shape [1], should " + "be of type float32 or float64."); + AddInput("Num", + "Number of entry in the sequence. It is a tensor of shape [1], " + "should be of type int32."); + AddOutput("Out", "A sequence of numbers."); + AddComment(R"DOC( + Return fixed number of evenly spaced values within a given interval. First entry is start, and last entry is stop. In the case when Num is 1, only Start is returned. Like linspace function of numpy. +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_WITHOUT_GRADIENT(linspace, ops::LinspaceOp, ops::LinspaceOpMaker); +REGISTER_OP_CPU_KERNEL(linspace, ops::CPULinspaceKernel, + ops::CPULinspaceKernel); diff --git a/paddle/fluid/operators/linspace_op.cu b/paddle/fluid/operators/linspace_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..90bd17cda0e0d1f78810233537bb502f9115fbd0 --- /dev/null +++ b/paddle/fluid/operators/linspace_op.cu @@ -0,0 +1,75 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/linspace_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +#define CUDA_1D_KERNEL_LOOP(i, n) \ + for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ + i += blockDim.x * gridDim.x) + +template +__global__ void LinspaceKernel(T start, T step, int64_t size, T* out) { + CUDA_1D_KERNEL_LOOP(index, size) { out[index] = start + step * index; } +} + +template +__global__ void LinspaceSpecialKernel(T start, T* out) { + out[0] = start; +} + +template +class CUDALinspaceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* start_t = context.Input("Start"); + auto* stop_t = context.Input("Stop"); + auto* num_t = context.Input("Num"); + auto* out = context.Output("Out"); + + framework::Tensor n; + framework::TensorCopy(*start_t, platform::CPUPlace(), &n); + T start = n.data()[0]; + framework::TensorCopy(*stop_t, platform::CPUPlace(), &n); + T stop = n.data()[0]; + framework::TensorCopy(*num_t, platform::CPUPlace(), &n); + int32_t num = n.data()[0]; + + PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0."); + + out->Resize(framework::make_ddim({num})); + T* out_data = out->mutable_data(context.GetPlace()); + + T step = 0; + if (num != 1) { + step = (stop - start) / (num - 1); + } + + auto stream = context.cuda_device_context().stream(); + int block = 512; + int grid = (num + block - 1) / block; + LinspaceKernel<<>>(start, step, num, out_data); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(linspace, ops::CUDALinspaceKernel, + ops::CUDALinspaceKernel); diff --git a/paddle/fluid/operators/linspace_op.h b/paddle/fluid/operators/linspace_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b1fcac73b0ad249aa19859bde770a8554cdb7408 --- /dev/null +++ b/paddle/fluid/operators/linspace_op.h @@ -0,0 +1,51 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace paddle { +namespace operators { + +template +class CPULinspaceKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + T start = context.Input("Start")->data()[0]; + T stop = context.Input("Stop")->data()[0]; + int32_t num = context.Input("Num")->data()[0]; + auto* out = context.Output("Out"); + PADDLE_ENFORCE(num > 0, "The num of linspace op should be larger than 0."); + + out->Resize(framework::make_ddim({num})); + + T* out_data = out->mutable_data(context.GetPlace()); + + if (num > 1) { + T step = (stop - start) / (num - 1); + T value = start; + for (int i = 0; i < num; ++i) { + out_data[i] = value; + value += step; + } + } else { + out_data[0] = start; + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc index e0ab02cd90cdee848250a6aba882b0cb0c17abd7..458037c5aca6af4c8c97b2da630c35929770c156 100644 --- a/paddle/fluid/operators/lod_reset_op.cc +++ b/paddle/fluid/operators/lod_reset_op.cc @@ -30,10 +30,10 @@ class LoDResetOp : public framework::OperatorWithKernel { if (!ctx->HasInput("Y")) { auto level0 = ctx->Attrs().Get>("target_lod"); - PADDLE_ENFORCE_GT(level0.size(), 1, + PADDLE_ENFORCE_GT(level0.size(), 0, "If Input(Y) not provided, the target lod should be " "specified by attribute `target_lod`."); - } else { + } else if (ctx->IsRuntime()) { ctx->ShareLoD("Y", "Out"); } @@ -48,6 +48,23 @@ class LoDResetOp : public framework::OperatorWithKernel { } }; +class LoDResetOpVarTypeInference : public framework::VarTypeInference { + public: + void operator()(framework::InferVarTypeContext *ctx) const override { + auto x_var_name = ctx->Input("X").front(); + auto out_var_name = ctx->Output("Out").front(); + if (ctx->HasInput("Y")) { + auto y_var_name = ctx->Input("Y").front(); + auto y_lod_level = std::max(ctx->GetLoDLevel(y_var_name), 1); + ctx->SetLoDLevel(out_var_name, y_lod_level); + } else { + ctx->SetLoDLevel(out_var_name, 1); + } + ctx->SetDataType(out_var_name, ctx->GetDataType(x_var_name)); + ctx->SetType(out_var_name, paddle::framework::proto::VarType::LOD_TENSOR); + } +}; + class LoDResetOpMaker : public framework::OpProtoAndCheckerMaker { public: void Make() override { @@ -177,9 +194,10 @@ DECLARE_NO_NEED_BUFFER_VARS_INFERENCE(LoDResetGradNoNeedBufferVarInference, namespace ops = paddle::operators; REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, - ops::LoDResetGradDescMaker); + ops::LoDResetGradDescMaker, ops::LoDResetOpVarTypeInference); REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp, ops::LoDResetGradNoNeedBufferVarInference); + REGISTER_OP_CPU_KERNEL( lod_reset, ops::LoDResetKernel, ops::LoDResetKernel, diff --git a/paddle/fluid/operators/lod_reset_op.h b/paddle/fluid/operators/lod_reset_op.h index d36aa0ce025a1c0f717913131fcc75040d16afac..1c2f0b0ac8ab4be35e4716acc7be3f05b9d63805 100644 --- a/paddle/fluid/operators/lod_reset_op.h +++ b/paddle/fluid/operators/lod_reset_op.h @@ -63,7 +63,7 @@ class LoDResetKernel : public framework::OpKernel { "Target LoD should be a vector end with the " "first dimension of Input(X)."); for (size_t i = 0; i < level0.size() - 1; ++i) { - PADDLE_ENFORCE(level0[i + 1] > level0[i], + PADDLE_ENFORCE(level0[i + 1] >= level0[i], "Target LoD should be an ascending vector."); } diff --git a/paddle/fluid/operators/recurrent_op.cc b/paddle/fluid/operators/recurrent_op.cc index 2898a62ddbac524ceb212cac5f34aeda3b1e01cb..1a2feee11c951cd4a55958df58f3756472f64769 100644 --- a/paddle/fluid/operators/recurrent_op.cc +++ b/paddle/fluid/operators/recurrent_op.cc @@ -23,6 +23,7 @@ constexpr char kInitialStates[] = "initial_states"; constexpr char kParameters[] = "parameters"; constexpr char kOutputs[] = "outputs"; constexpr char kStepScopes[] = "step_scopes"; +constexpr char kHasStates[] = "has_states"; constexpr char kExStates[] = "ex_states"; constexpr char kStates[] = "states"; constexpr char kStepBlock[] = "sub_block"; @@ -241,11 +242,16 @@ class RecurrentOp : public RecurrentBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { + bool has_state = Attr(kHasStates); auto seq_len = static_cast(this->GetSequenceLength(scope)); VLOG(3) << "Static RNN input sequence length = " << seq_len; StepScopes scopes = CreateStepScopes(scope, seq_len); auto reverse = Attr(kReverse); + // get device context from pool + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(place); + framework::Executor executor(place); auto *block = Attr(kStepBlock); @@ -269,15 +275,17 @@ class RecurrentOp : public RecurrentBase { inside->Resize(framework::make_ddim(dims)); }); - if (i == 0) { - // Link initial states --> ex_states - LinkTensor(scope, Inputs(kInitialStates), &cur_scope, - Attr>(kExStates)); - } else { - auto &ex_scope = scopes.ExScope(); - // Link ex_scope::state --> cur_scope::ex_state - LinkTensor(ex_scope, Attr>(kStates), - &cur_scope, Attr>(kExStates)); + if (has_state) { + if (i == 0) { + // Link initial states --> ex_states + LinkTensor(scope, Inputs(kInitialStates), &cur_scope, + Attr>(kExStates)); + } else { + auto &ex_scope = scopes.ExScope(); + // Link ex_scope::state --> cur_scope::ex_state + LinkTensor(ex_scope, Attr>(kStates), + &cur_scope, Attr>(kExStates)); + } } // Every inputs are linked now, execute! @@ -286,11 +294,6 @@ class RecurrentOp : public RecurrentBase { std::vector() /*skip_ref_cnt_vars*/, true /*force_disable_gc*/); - // get device context from pool - platform::DeviceContextPool &pool = - platform::DeviceContextPool::Instance(); - auto &dev_ctx = *pool.Get(place); - // Copy inside::output -> outside::output // outside::output[seq_offset: seq_offset + 1] = inside::output this->LinkTensorWithCallback( @@ -333,13 +336,13 @@ class RecurrentGradOp : public RecurrentBase { private: void RunImpl(const framework::Scope &scope, const platform::Place &place) const override { - auto seq_len = static_cast(GetSequenceLength(scope)); + bool has_state = Attr(kHasStates); + const size_t seq_len = static_cast(GetSequenceLength(scope)); StepScopes scopes = CreateStepScopes(scope, seq_len); auto reverse = Attr(kReverse); framework::Executor executor(place); auto *block = Attr(kStepBlock); - auto *program = block->Program(); // get device context from pool @@ -350,6 +353,7 @@ class RecurrentGradOp : public RecurrentBase { size_t seq_offset = reverse ? step_id : seq_len - step_id - 1; VLOG(3) << "Recurrent backward operate at the time step " << seq_offset; auto &cur_scope = scopes.CurScope(); + // Link outside::output_grads --> inside::output_grads // inside::output_grad = outside::output_grad[seq_offset:seq_offset+1] LinkTensorWithCallback( @@ -370,30 +374,32 @@ class RecurrentGradOp : public RecurrentBase { VLOG(10) << " RNN output gradients = [" << sout.str() << "]"; } - // Link states - // if cur_scope::cur_state_grad in out_grads: - // cur_scope::cur_state_grad += ex_scope::ex_state_grad - // else: - // ex_scope::ex_state_grad --> cur_scope::cur_state_grad - if (step_id != 0) { // not at beginning - auto &ex_scope = scopes.ExScope(); - auto ex_state_grads = - GradVarLists(Attr>(kExStates)); - auto cur_state_grads = - GradVarLists(Attr>(kStates)); - - PADDLE_ENFORCE_EQ(ex_state_grads.size(), cur_state_grads.size()); - for (size_t i = 0; i < ex_state_grads.size(); ++i) { - auto &cur_grad = cur_state_grads[i]; - auto &ex_grad = ex_state_grads[i]; - auto &ex_tensor = - ex_scope.FindVar(ex_grad)->Get(); - - VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad; - auto *cur_grad_var = cur_scope.Var(cur_grad); - auto cur_grad_tensor = - cur_grad_var->GetMutable(); - framework::TensorCopy(ex_tensor, place, dev_ctx, cur_grad_tensor); + if (has_state) { + // Link states + // if cur_scope::cur_state_grad in out_grads: + // cur_scope::cur_state_grad += ex_scope::ex_state_grad + // else: + // ex_scope::ex_state_grad --> cur_scope::cur_state_grad + if (step_id != 0) { // not at beginning + auto &ex_scope = scopes.ExScope(); + auto ex_state_grads = + GradVarLists(Attr>(kExStates)); + auto cur_state_grads = + GradVarLists(Attr>(kStates)); + + PADDLE_ENFORCE_EQ(ex_state_grads.size(), cur_state_grads.size()); + for (size_t i = 0; i < ex_state_grads.size(); ++i) { + auto &cur_grad = cur_state_grads[i]; + auto &ex_grad = ex_state_grads[i]; + auto &ex_tensor = + ex_scope.FindVar(ex_grad)->Get(); + + VLOG(10) << " RNN link " << cur_grad << " from " << ex_grad; + auto *cur_grad_var = cur_scope.Var(cur_grad); + auto cur_grad_tensor = + cur_grad_var->GetMutable(); + framework::TensorCopy(ex_tensor, place, dev_ctx, cur_grad_tensor); + } } } @@ -442,8 +448,8 @@ class RecurrentGradOp : public RecurrentBase { } auto new_inside_name = cur_scope.Rename(inside_grad_name); - // sum gradient + // sum gradient auto sum_op = framework::OpRegistry::CreateOp( "sum", {{"X", {pg_names[param_id], new_inside_name}}}, {{"Out", {pg_names[param_id]}}}, @@ -475,22 +481,33 @@ class RecurrentGradOp : public RecurrentBase { true /*is_backward*/); VLOG(5) << "Link outside gradient finished "; - if (step_id + 1 == seq_len) { // at_end - // copy initialize states gradient from inside to outside - LinkTensorWithCallback( - cur_scope, GradVarLists(Attr>(kExStates)), - scope, Outputs(kInitStateGrads), - [&](const framework::LoDTensor &inside, - framework::LoDTensor *outside) { - outside->Resize(inside.dims()); - outside->mutable_data(place, inside.type()); - framework::TensorCopy(inside, place, dev_ctx, outside); - }, - true /*is_backward*/); - VLOG(5) << "Link initialize state gradient finished "; + if (has_state) { + if (step_id + 1 == seq_len) { // at_end + // copy initialize states gradient from inside to outside + LinkTensorWithCallback( + cur_scope, + GradVarLists(Attr>(kExStates)), scope, + Outputs(kInitStateGrads), + [&](const framework::LoDTensor &inside, + framework::LoDTensor *outside) { + outside->Resize(inside.dims()); + outside->mutable_data(place, inside.type()); + framework::TensorCopy(inside, place, dev_ctx, outside); + }, + true /*is_backward*/); + VLOG(5) << "Link initialize state gradient finished "; + } } scopes.Next(); } + // Delete the scope of StepScopes + dev_ctx.Wait(); + auto *var = scope.FindVar(Input(kStepScopes)); + PADDLE_ENFORCE(var != nullptr); + auto step_scopes = var->GetMutable(); + for (auto *sub_scope : *step_scopes) { + const_cast(scope).DeleteScope(sub_scope); + } } private: @@ -541,6 +558,7 @@ class RecurrentOpProtoMaker : public framework::OpProtoAndCheckerMaker { .AsDuplicable(); AddOutput(kStepScopes, "StepScopes contain all local variables in each time step."); + AddAttr(kHasStates, "Whether has states.").SetDefault(false); AddAttr>(kExStates, string::Sprintf( R"DOC(The ex-state variable names. @@ -624,20 +642,44 @@ class RecurrentGradOpDescMaker : public framework::SingleGradOpDescMaker { class RecurrentGradOpShapeInference : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext *ctx) const override { - std::vector input{kInputs, kInitialStates}; std::vector output{kOutputs}; - for (auto &s : input) { - // NOTE(zcd): In some case, some of kInputs doesn't have gradient. - PADDLE_ENFORCE(ctx->HasInputs(s)); - } - for (auto &s : output) { - PADDLE_ENFORCE(ctx->HasInputs(s)); + + // In some case the kInitialStates is empty. + // If the kInitialStates is empty, all the states should be empty. + if (!ctx->HasInputs(kInitialStates)) { + PADDLE_ENFORCE_EQ( + ctx->Attrs().Get>(kExStates).size(), 0, + "The Attr(%s) should be empty.", kExStates); + PADDLE_ENFORCE_EQ( + ctx->Attrs().Get>(kStates).size(), 0, + "The Attr(%s) should be empty.", kStates); } - for (auto &s : input) { - ctx->SetOutputsDim(framework::GradVarName(s), ctx->GetInputsDim(s)); + + PADDLE_ENFORCE(ctx->HasInputs(kInputs), + "The input(%s) should not be empty.", kInputs); + PADDLE_ENFORCE(ctx->HasInputs(kOutputs), + "The input(%s) should not be empty.", kOutputs); + + // In some case the kInitialStates is empty. + if (ctx->HasInputs(kInitialStates)) { + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kInitialStates)), + "The output of(%s) should not be empty.", + framework::GradVarName(kInitialStates)); + ctx->SetOutputsDim(framework::GradVarName(kInitialStates), + ctx->GetInputsDim(kInitialStates)); } + + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kInputs)), + "The output of(%s) should not be empty.", + framework::GradVarName(kInputs)); + ctx->SetOutputsDim(framework::GradVarName(kInputs), + ctx->GetInputsDim(kInputs)); + + // In some case the kParameters is empty. if (ctx->HasInputs(kParameters)) { - PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters))); + PADDLE_ENFORCE(ctx->HasOutputs(framework::GradVarName(kParameters)), + "The output of(%s) should not be empty.", + framework::GradVarName(kParameters)); ctx->SetOutputsDim(framework::GradVarName(kParameters), ctx->GetInputsDim(kParameters)); } diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cc b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..b087fbbb94c7ba2f7449f6bda56010dee1c38ea6 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" + +REGISTER_REDUCE_OP(reduce_all); +REGISTER_OP_CPU_KERNEL(reduce_all, + ops::ReduceKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.cu b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..bd94ba263d957d0d65506ecd802bf43add6e2fb4 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.cu @@ -0,0 +1,19 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_ops/reduce_all_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_all, + ops::ReduceKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_all_op.h b/paddle/fluid/operators/reduce_ops/reduce_all_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ba159dd703c8904784546eda262bf7be77967d48 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_all_op.h @@ -0,0 +1,29 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" + +namespace paddle { +namespace operators { + +struct AllFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->all(dim); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cc b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..d865dcb3c935b76b8da25d723a5f780fb4de255b --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" + +REGISTER_REDUCE_OP(reduce_any); +REGISTER_OP_CPU_KERNEL(reduce_any, + ops::ReduceKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.cu b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..66f0c9997ea1e27cf172a6839a68d2eb23395c4d --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.cu @@ -0,0 +1,19 @@ +// Copyright (c) 2018 PaddlePaddle Authors. Any Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/operators/reduce_ops/reduce_any_op.h" + +REGISTER_OP_CUDA_KERNEL(reduce_any, + ops::ReduceKernel); diff --git a/paddle/fluid/operators/reduce_ops/reduce_any_op.h b/paddle/fluid/operators/reduce_ops/reduce_any_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b36bad9cada259932d2bd77c2426fbb46790de76 --- /dev/null +++ b/paddle/fluid/operators/reduce_ops/reduce_any_op.h @@ -0,0 +1,29 @@ +// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +#pragma once + +#include "paddle/fluid/operators/reduce_ops/reduce_op.h" + +namespace paddle { +namespace operators { + +struct AnyFunctor { + template + void operator()(const DeviceContext& place, X* x, Y* y, const Dim& dim) { + y->device(place) = x->any(dim); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc index 834dd1eabd68db6c8b571071f8043589c66f8671..b00cc07dea920a6d7caa8b70c99d84b72a785a99 100644 --- a/paddle/fluid/operators/rnn_memory_helper_op.cc +++ b/paddle/fluid/operators/rnn_memory_helper_op.cc @@ -40,9 +40,12 @@ class RNNMemoryHelperOp : public framework::OperatorBase { "Cannot find out_var in scope, out_var_name is %s", out_name); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + auto *out_tensor = out_var->GetMutable(); auto &mem_tensor = mem_var->Get(); - framework::TensorCopySync(mem_tensor, dev_place, out_tensor); + framework::TensorCopy(mem_tensor, dev_place, dev_ctx, out_tensor); out_tensor->set_lod(mem_tensor.lod()); } }; @@ -92,6 +95,9 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { "Cannot find in_grad_var in scope, name is %s", in_grad_var_name); + platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); + auto &dev_ctx = *pool.Get(dev_place); + if (out_grad_var == nullptr) { VLOG(5) << "Using fill constant 0 as starting gradient"; auto in_var_name = Input("X"); @@ -109,7 +115,8 @@ class RNNMemoryHelperGradOp : public framework::OperatorBase { } else { auto &out_grad_tensor = out_grad_var->Get(); auto *in_grad_tensor = in_grad_var->GetMutable(); - framework::TensorCopySync(out_grad_tensor, dev_place, in_grad_tensor); + framework::TensorCopy(out_grad_tensor, dev_place, dev_ctx, + in_grad_tensor); in_grad_tensor->set_lod(out_grad_tensor.lod()); } } diff --git a/paddle/fluid/operators/squared_l2_distance_op.h b/paddle/fluid/operators/squared_l2_distance_op.h index e0133d33e6a840d2d06832393a064df978cb9cbc..12a8f05b5a603417ead8ebd250ff7951f928f4a1 100644 --- a/paddle/fluid/operators/squared_l2_distance_op.h +++ b/paddle/fluid/operators/squared_l2_distance_op.h @@ -77,6 +77,9 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel { auto* x_g = context.Output(framework::GradVarName("X")); auto* y_g = context.Output(framework::GradVarName("Y")); + PADDLE_ENFORCE_NOT_NULL(x_g); + PADDLE_ENFORCE_NOT_NULL(y_g); + auto sub_result = EigenMatrix::From(*in0); auto out_grad = EigenMatrix::From(*in1); @@ -92,31 +95,28 @@ class SquaredL2DistanceGradKernel : public framework::OpKernel { // propagate back to input auto& eigen_place = *context.template device_context().eigen_device(); - if (x_g) { - x_g->mutable_data(context.GetPlace()); - // eigen matrix - auto x_grad = - EigenMatrix::From(*x_g, framework::make_ddim({x_dims[0], cols})); - // dimensions are same with subResult - x_grad.device(eigen_place) = grad_mat; - } - if (y_g) { - y_g->mutable_data(context.GetPlace()); - - PADDLE_ENFORCE_GE(sub_result.dimensions()[0], y_dims[0], - "First dimension of gradient must be greater or " - "equal than first dimension of target."); - - if (sub_result.dimensions()[0] == y_dims[0]) { - auto y_grad = - EigenMatrix::From(*y_g, framework::make_ddim({y_dims[0], cols})); - y_grad.device(eigen_place) = -1 * grad_mat; - } else { - auto col_sum_res = -1 * (grad_mat.sum(Eigen::array({{0}}))); - auto y_grad = EigenVector::Flatten(*y_g); - y_grad.device(eigen_place) = col_sum_res; - } + x_g->mutable_data(context.GetPlace()); + // eigen matrix + auto x_grad = + EigenMatrix::From(*x_g, framework::make_ddim({x_dims[0], cols})); + // dimensions are same with subResult + x_grad.device(eigen_place) = grad_mat; + + y_g->mutable_data(context.GetPlace()); + + PADDLE_ENFORCE_GE(sub_result.dimensions()[0], y_dims[0], + "First dimension of gradient must be greater or " + "equal than first dimension of target."); + + if (sub_result.dimensions()[0] == y_dims[0]) { + auto y_grad = + EigenMatrix::From(*y_g, framework::make_ddim({y_dims[0], cols})); + y_grad.device(eigen_place) = -1 * grad_mat; + } else { + auto col_sum_res = -1 * (grad_mat.sum(Eigen::array({{0}}))); + auto y_grad = EigenVector::Flatten(*y_g); + y_grad.device(eigen_place) = col_sum_res; } } }; diff --git a/paddle/fluid/platform/lodtensor_printer.cc b/paddle/fluid/platform/lodtensor_printer.cc index a5aa1a4148686b032c52f99497252fde4867438f..07eaf42d2d3bc20e7f7dc56bb0f4e0cc2fbac5e3 100644 --- a/paddle/fluid/platform/lodtensor_printer.cc +++ b/paddle/fluid/platform/lodtensor_printer.cc @@ -52,16 +52,26 @@ void PrintVar(framework::Scope* scope, const std::string& var_name, return; } -#define PrintLoDTensorCallback(cpp_type, proto_type) \ - do { \ - if (tensor->type() == proto_type) { \ - print_lod_tensor(var_name, *tensor, print_info); \ - return; \ - } \ + framework::LoDTensor printed_tensor; + printed_tensor.set_lod(tensor->lod()); + printed_tensor.Resize(tensor->dims()); + if (platform::is_cpu_place(tensor->place())) { + printed_tensor.ShareDataWith(*tensor); + } else { + platform::CPUPlace place; + framework::TensorCopy(*tensor, place, &printed_tensor); + } + +#define PrintLoDTensorCallback(cpp_type, proto_type) \ + do { \ + if (tensor->type() == proto_type) { \ + print_lod_tensor(var_name, printed_tensor, print_info); \ + return; \ + } \ } while (0) _ForEachDataType_(PrintLoDTensorCallback); - VLOG(1) << "PrintVar: unrecognized data type:" << tensor->type(); + VLOG(1) << "PrintVar: unrecognized data type:" << printed_tensor.type(); } } // end namespace platform diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index f0ea6d9b0a751c86e3911c35d9403a32604056d7..a8a2a94d473b18fdcd78771063ef4565c7fe0e42 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -1366,6 +1366,10 @@ All parameter, weight, gradient are variables in Paddle. "cache_runtime_context", [](const BuildStrategy &self) { return self.cache_runtime_context_; }, [](BuildStrategy &self, bool b) { self.cache_runtime_context_ = b; }) + .def_property( + "cache_expected_kernel", + [](const BuildStrategy &self) { return self.cache_expected_kernel_; }, + [](BuildStrategy &self, bool b) { self.cache_expected_kernel_ = b; }) .def("_finalize_strategy_and_create_passes", [](BuildStrategy &self) -> std::shared_ptr { return self.CreatePassesFromStrategy(true); diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index 43db744b5edeb48f87c4413961e6ded3bd45f86d..7bb713493182239b2fd17f7b7fb496afdc9b8e6c 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -202,6 +202,7 @@ function cmake_gen() { -DCMAKE_EXPORT_COMPILE_COMMANDS=ON -DWITH_CONTRIB=${WITH_CONTRIB:-ON} -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} + -DWITH_HIGH_LEVEL_API_TEST=${WITH_HIGH_LEVEL_API_TEST:-OFF} -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF} @@ -234,6 +235,7 @@ EOF -DCMAKE_EXPORT_COMPILE_COMMANDS=ON \ -DWITH_CONTRIB=${WITH_CONTRIB:-ON} \ -DWITH_INFERENCE_API_TEST=${WITH_INFERENCE_API_TEST:-ON} \ + -DWITH_HIGH_LEVEL_API_TEST=${WITH_HIGH_LEVEL_API_TEST:-OFF} \ -DINFERENCE_DEMO_INSTALL_DIR=${INFERENCE_DEMO_INSTALL_DIR} \ -DWITH_ANAKIN=${WITH_ANAKIN:-OFF} \ -DANAKIN_BUILD_FAT_BIN=${ANAKIN_BUILD_FAT_BIN:OFF}\ diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index 983d8243b1d8aa6c8d01855d6dbeab76c335f70c..3dc2b0c895116155f41df3ca66125fff3ede5ead 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -175,6 +175,7 @@ def __bootstrap__(): read_env_flags.append('communicator_thread_pool_size') read_env_flags.append('communicator_max_merge_var_num') read_env_flags.append('communicator_fake_rpc') + read_env_flags.append('communicator_send_wait_times') if core.is_compiled_with_brpc(): read_env_flags.append('max_body_size') #set brpc max body size diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index e655fd4a976a8a6fa2811ddc43de3d1f231229d5..1a023f61675ed62c141bb6e71fabbdf0086b0c64 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -136,6 +136,7 @@ class DatasetBase(object): slot_var.name = var.name if var.lod_level == 0: slot_var.is_dense = True + slot_var.shape.extend(var.shape) if var.dtype == core.VarDesc.VarType.FP32: slot_var.type = "float" elif var.dtype == core.VarDesc.VarType.INT64: diff --git a/python/paddle/fluid/device_worker.py b/python/paddle/fluid/device_worker.py index 7fc72191884020f4cc57c9269b636161635f06d0..0998f779acfea23f3a494a25b43a6fa824b985f1 100644 --- a/python/paddle/fluid/device_worker.py +++ b/python/paddle/fluid/device_worker.py @@ -26,8 +26,8 @@ class DeviceWorker(object): """ Init. """ - self.program_ = None - self.infer_ = None + self._program = None + self._infer = None def _set_infer(self, infer=False): """ @@ -36,7 +36,7 @@ class DeviceWorker(object): Args: infer(bool): whether to do inference """ - self.infer_ = infer + self._infer = infer def _set_fleet_desc(self, fleet_desc): """ @@ -45,7 +45,7 @@ class DeviceWorker(object): Args: fleet_desc(PSParameter): pslib.PSParameter object """ - self.fleet_desc_ = fleet_desc + self._fleet_desc = fleet_desc def _set_program(self, program): """ @@ -54,7 +54,7 @@ class DeviceWorker(object): Args: program(Program): a Program object """ - self.program_ = program + self._program = program def _gen_worker_desc(self, trainer_desc): """ @@ -88,7 +88,7 @@ class Hogwild(DeviceWorker): trainer_desc(TrainerDesc): a TrainerDesc object """ trainer_desc.device_worker_name = "HogwildWorker" - if self.infer_: + if self._infer: # just ignore feed op for inference model trainer_desc.hogwild_param.skip_ops.extend(["feed"]) @@ -113,11 +113,11 @@ class DownpourSGD(DeviceWorker): trainer_desc(TrainerDesc): a TrainerDesc object """ dense_table_set = set() - program_id = str(id(self.program_)) - if self.program_ == None: + program_id = str(id(self._program)) + if self._program == None: print("program of current device worker is not configured") exit(-1) - opt_info = self.program_._fleet_opt + opt_info = self._program._fleet_opt program_configs = opt_info["program_configs"] downpour = trainer_desc.downpour_param @@ -140,7 +140,7 @@ class DownpourSGD(DeviceWorker): trainer_desc.device_worker_name = "DownpourWorker" pull_thread = trainer_desc.pull_dense_param pull_thread.device_num = trainer_desc.thread_num - for i in self.fleet_desc_.trainer_param.dense_table: + for i in self._fleet_desc.trainer_param.dense_table: if i.table_id in dense_table_set: dense_table = pull_thread.dense_table.add() dense_table.dense_value_name.extend(i.dense_variable_name) @@ -148,29 +148,29 @@ class DownpourSGD(DeviceWorker): i.table_id sparse_table = downpour.sparse_table.add() sparse_table.table_id = \ - self.fleet_desc_.trainer_param.sparse_table[0].table_id + self._fleet_desc.trainer_param.sparse_table[0].table_id sparse_table.sparse_key_name.extend( - self.fleet_desc_.trainer_param.sparse_table[0].slot_key) + self._fleet_desc.trainer_param.sparse_table[0].slot_key) sparse_table.sparse_value_name.extend( - self.fleet_desc_.trainer_param.sparse_table[0].slot_value) + self._fleet_desc.trainer_param.sparse_table[0].slot_value) sparse_table.sparse_grad_name.extend( - self.fleet_desc_.trainer_param.sparse_table[0].slot_gradient) + self._fleet_desc.trainer_param.sparse_table[0].slot_gradient) sparse_table.emb_dim = \ - self.fleet_desc_.server_param.downpour_server_param.downpour_table_param[ + self._fleet_desc.server_param.downpour_server_param.downpour_table_param[ 0].accessor.fea_dim - 2 sparse_table.fea_dim = sparse_table.emb_dim + 2 # TODO(guru4elephant): hard code here, need to improve sparse_table.label_var_name = "click" - for i in self.fleet_desc_.trainer_param.dense_table: + for i in self._fleet_desc.trainer_param.dense_table: if i.table_id in dense_table_set: dense_table = downpour.dense_table.add() dense_table.table_id = i.table_id dense_table.dense_value_name.extend(i.dense_variable_name) dense_table.dense_grad_name.extend( i.dense_gradient_variable_name) - downpour.skip_ops.extend(self.fleet_desc_.trainer_param.skip_op) - if self.infer_: + downpour.skip_ops.extend(self._fleet_desc.trainer_param.skip_op) + if self._infer: downpour.push_dense = False downpour.push_sparse = False diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index e15197037e1d901855883919b02a1574b7bc9a29..fa8b49a021294e8555e979459615b1956d9b2b55 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -712,10 +712,6 @@ class Executor(object): if dataset == None: raise RuntimeError("dataset is needed and should be initialized") - if not isinstance(self.place, core.CPUPlace): - raise RuntimeError("infer_from_dataset is verified on CPUPlace" - "We will open CUDAPlace in the future") - scope, trainer = self._prepare_trainer( program=program, dataset=dataset, @@ -796,10 +792,6 @@ class Executor(object): if dataset == None: raise RuntimeError("dataset is need and should be initialized") - if not isinstance(self.place, core.CPUPlace): - raise RuntimeError("train_from_dataset is verified on CPUPlace" - "We will open CUDAPlace in the future") - scope, trainer = self._prepare_trainer( program=program, dataset=dataset, diff --git a/python/paddle/fluid/incubate/fleet/base/role_maker.py b/python/paddle/fluid/incubate/fleet/base/role_maker.py index 528f7b3269eb90435d88cffadfa185cc664e430a..bf50a5815dbe445a7a44fd9199ed51f632ff4997 100644 --- a/python/paddle/fluid/incubate/fleet/base/role_maker.py +++ b/python/paddle/fluid/incubate/fleet/base/role_maker.py @@ -23,10 +23,10 @@ class RoleMakerBase(object): """ def __init__(self): - self.role_maker_name_ = "" - self.trainer_endpoints_ = [] - self.pserver_endpoints_ = [] - self.role_is_generated_ = False + self._role_maker_name = "" + self._trainer_endpoints = [] + self._pserver_endpoints = [] + self._role_is_generated = False def _is_worker(self): """ @@ -45,20 +45,20 @@ class RoleMakerBase(object): return get local ip """ import socket - self.ip_ = socket.gethostbyname(socket.gethostname()) - return self.ip_ + self._ip = socket.gethostbyname(socket.gethostname()) + return self._ip def _get_trainer_endpoints(self): """ return trainer endpoints """ - return self.trainer_endpoints_ + return self._trainer_endpoints def _get_pserver_endpoints(self): """ return pserver endpoints """ - return self.pserver_endpoints_ + return self._pserver_endpoints def _generate_role(self): """ @@ -76,59 +76,59 @@ class MPIRoleMaker(RoleMakerBase): def __init__(self): super(MPIRoleMaker, self).__init__() from mpi4py import MPI - self.comm_ = MPI.COMM_WORLD + self._comm = MPI.COMM_WORLD self.MPI = MPI - self.ips_ = None + self._ips = None def _get_rank(self): """ return rank """ - self.rank_ = self.comm_.Get_rank() - return self.rank_ + self._rank = self._comm.Get_rank() + return self._rank def _get_size(self): """ return size """ - self.size_ = self.comm_.Get_size() - return self.size_ + self._size = self._comm.Get_size() + return self._size def _all_gather(self, obj): """ all_gather(obj) will call MPI's allgather function """ self._barrier_all() - return self.comm_.allgather(obj) + return self._comm.allgather(obj) def _worker_gather(self, obj): """ worker_gather(obj) will call MPI's allgather function """ if self._is_worker(): - self.node_type_comm_.barrier() - return self.node_type_comm_.allgather(obj) + self._node_type_comm.barrier() + return self._node_type_comm.allgather(obj) return None def _barrier_all(self): """ barrier_all() will call MPI's barrier_all function """ - self.comm_.barrier() + self._comm.barrier() def _get_ips(self): """ collect current distributed job's ip list """ - if self.ips_ == None: - self.ips_ = self.comm_.allgather(self._get_local_ip()) - return self.ips_ + if self._ips == None: + self._ips = self._comm.allgather(self._get_local_ip()) + return self._ips def _finalize(self): """ finalize the current MPI instance. """ - self.comm_.finalize() + self._comm.finalize() class MPISymetricRoleMaker(MPIRoleMaker): @@ -140,11 +140,11 @@ class MPISymetricRoleMaker(MPIRoleMaker): def __init__(self): super(MPISymetricRoleMaker, self).__init__() - self.node_type_ = None - self.proc_per_node_ = 2 + self._node_type = None + self._proc_per_node = 2 def _check_role_generation(self): - if not self.role_is_generated_: + if not self._role_is_generated: sys.stderr.write("generate_role() should be called first") sys.exit(-1) return False @@ -163,7 +163,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): return whether current process is worker assigned by role maker """ if self._check_role_generation(): - return self.node_type_ == 1 + return self._node_type == 1 return False def _is_server(self): @@ -171,7 +171,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): return whether current process is server assigned by role maker """ if self._check_role_generation(): - return self.node_type_ == 0 + return self._node_type == 0 return False def _worker_num(self): @@ -197,7 +197,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): return the index of worker """ if self._check_role_generation(): - return self.rank_ / self.proc_per_node_ + return self._rank / self._proc_per_node return 0 def _server_index(self): @@ -205,7 +205,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): return the index of server """ if self._check_role_generation(): - return self.rank_ / self.proc_per_node_ + return self._rank / self._proc_per_node return 0 def _barrier_worker(self): @@ -214,7 +214,7 @@ class MPISymetricRoleMaker(MPIRoleMaker): """ if self._check_role_generation(): if self._is_worker(): - self.node_type_comm_.barrier() + self._node_type_comm.barrier() def _barrier_server(self): """ @@ -222,20 +222,20 @@ class MPISymetricRoleMaker(MPIRoleMaker): """ if self._check_role_generation(): if self._is_server(): - self.node_type_comm_.barrier() + self._node_type_comm.barrier() def _generate_role(self): """ generate currently process's role """ - if not self.role_is_generated_: + if not self._role_is_generated: # TODO(guru4elephant): only allow to be called once - self.trainer_endpoints_ = self._get_ips() - self.pserver_endpoints_ = self._get_ips() + self._trainer_endpoints = self._get_ips() + self._pserver_endpoints = self._get_ips() - if 0 == self._get_rank() % self.proc_per_node_ % 2: - self.node_type_ = 0 + if 0 == self._get_rank() % self._proc_per_node % 2: + self._node_type = 0 else: - self.node_type_ = 1 - self.node_type_comm_ = self.comm_.Split(self.node_type_) - self.role_is_generated_ = True + self._node_type = 1 + self._node_type_comm = self._comm.Split(self._node_type) + self._role_is_generated = True diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py b/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py index 9b1ec412c731a4b59d0da8847e91e30d8e1d864a..58f17f95323236cfc559a5cb05a3b09bef3b0a5e 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/__init__.py @@ -64,9 +64,9 @@ class Fleet(object): def __init__(self): self._opt_info = None # for fleet only - self.role_maker_ = None - self.local_ip_ = 0 - self.is_initialized_ = False + self._role_maker = None + self._local_ip = 0 + self._is_initialized = False def init(self): # TODO(guru4elephant) @@ -78,22 +78,22 @@ class Fleet(object): current node's role, e.g. worker, server, etc. """ if not self.is_initialized_: - self.role_maker_ = MPISymetricRoleMaker() - self.role_maker_._generate_role() + self._role_maker = MPISymetricRoleMaker() + self._role_maker._generate_role() self._fleet_ptr = fluid.core.Fleet() - self.is_initialized_ = True + self._is_initialized = True def stop(self): """ stop(): will be called after a user finishes his/her training task. Fleet instance will be destroyed when stop() is called. """ - self.role_maker_._barrier_worker() - if self.role_maker_._is_first_worker(): + self._role_maker._barrier_worker() + if self._role_maker._is_first_worker(): self._fleet_ptr.stop_server() - self.role_maker_._barrier_worker() - self.role_maker_._barrier_all() - self.role_maker_._finalize() + self._role_maker._barrier_worker() + self._role_maker._barrier_all() + self._role_maker._finalize() def init_pserver(self): """ @@ -110,15 +110,15 @@ class Fleet(object): sys.exit(-1) self._fleet_ptr.init_server(self._dist_desc_str, self.role_maker_._get_rank()) - self.local_ip_ = self._fleet_ptr.run_server() + self._local_ip = self._fleet_ptr.run_server() # barrier_all for init_server - self.role_maker_._barrier_all() - self.all_ips_ = self.role_maker_._all_gather(self.local_ip_) + self._role_maker._barrier_all() + self._all_ips = self._role_maker._all_gather(self.local_ip_) - self._fleet_ptr.gather_servers(self.all_ips_, - self.role_maker_._get_size()) + self._fleet_ptr.gather_servers(self._all_ips, + self._role_maker._get_size()) # barrier_all for init_worker, wait all workers start - self.role_maker_._barrier_all() + self._role_maker._barrier_all() else: print("You should run DistributedOptimizer.minimize() first") sys.exit(-1) @@ -151,21 +151,21 @@ class Fleet(object): print("You should run DistributedOptimizer.minimize() first") sys.exit(-1) # barrier_all for init_server, wait for server starts - self.role_maker_._barrier_all() - self.all_ips_ = self.role_maker_._all_gather(self.local_ip_) - self._fleet_ptr.init_worker(self._dist_desc_str, self.all_ips_, - self.role_maker_._get_size(), - self.role_maker_._get_rank()) + self._role_maker._barrier_all() + self._all_ips = self._role_maker._all_gather(self.local_ip_) + self._fleet_ptr.init_worker(self._dist_desc_str, self._all_ips, + self._role_maker._get_size(), + self._role_maker._get_rank()) # barrier_all for init_worker - self.role_maker_._barrier_all() + self._role_maker._barrier_all() # prepare for client to client communication info = self._fleet_ptr.get_clients_info() - all_info = self.role_maker_._worker_gather(info[0]) + all_info = self._role_maker._worker_gather(info[0]) self._fleet_ptr.gather_clients(all_info) self._fleet_ptr.create_client2client_connection() # barrier for init model - self.role_maker_._barrier_worker() - if self.role_maker_._is_first_worker(): + self._role_maker._barrier_worker() + if self._role_maker._is_first_worker(): tables = self._dist_desc.trainer_param.dense_table for prog, scope in zip(programs, scopes): prog_id = str(id(prog)) @@ -192,7 +192,7 @@ class Fleet(object): int(table.table_id), var_name_list) # barrier for init model done - self.role_maker_._barrier_worker() + self._role_maker._barrier_worker() else: print("You should run DistributedOptimizer.minimize() first") sys.exit(-1) @@ -201,39 +201,39 @@ class Fleet(object): """ return the number of current job's worker num """ - return self.role_maker_._worker_num() + return self._role_maker._worker_num() def get_server_num(self): """ return the number of current job's server num """ - return self.role_maker_._server_num() + return self._role_maker._server_num() def get_worker_index(self): """ return the mpi rank of current worker """ - return self.role_maker_._worker_index() + return self._role_maker._worker_index() def is_worker(self): """ return whether current node is a worker """ - return self.role_maker_._is_worker() + return self._role_maker._is_worker() def is_server(self): """ return whether current node is pserver """ - return self.role_maker_._is_server() + return self._role_maker._is_server() def init_pserver_model(self): """ init pserver model called from pserver """ - if self.role_maker_._is_first_worker(): + if self._role_maker._is_first_worker(): self._fleet_ptr.init_model() - self.role_maker_._barrier_worker() + self._role_maker._barrier_worker() def save_pserver_model(self, save_path): """ diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/node.py b/python/paddle/fluid/incubate/fleet/parameter_server/node.py index 60035b6e8da3e40158f8be0bafdd911f6bd6f543..641c294c4a6edeb3d9823b4152b0ea158c8faa80 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/node.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/node.py @@ -42,13 +42,13 @@ class DownpourServer(Server): """ def __init__(self): - self.server_ = pslib.ServerParameter() - self.server_.downpour_server_param.service_param.start_server_port = 0 - self.server_.downpour_server_param.service_param.server_class = "DownpourBrpcPsServer" - self.server_.downpour_server_param.service_param.client_class = "DownpourBrpcPsClient" - self.server_.downpour_server_param.service_param.service_class = "DownpourPsService" - self.server_.downpour_server_param.service_param.start_server_port = 0 - self.server_.downpour_server_param.service_param.server_thread_num = 12 + self._server = pslib.ServerParameter() + self._server.downpour_server_param.service_param.start_server_port = 0 + self._server.downpour_server_param.service_param.server_class = "DownpourBrpcPsServer" + self._server.downpour_server_param.service_param.client_class = "DownpourBrpcPsClient" + self._server.downpour_server_param.service_param.service_class = "DownpourPsService" + self._server.downpour_server_param.service_param.start_server_port = 0 + self._server.downpour_server_param.service_param.server_thread_num = 12 def add_sparse_table(self, table_id, learning_rate, slot_key_vars, slot_value_var): @@ -62,7 +62,7 @@ class DownpourServer(Server): Returns: return None """ - table = self.server_.downpour_server_param.downpour_table_param.add() + table = self._server.downpour_server_param.downpour_table_param.add() table.table_id = table_id table.table_class = "DownpourSparseTable" table.type = pslib.PS_SPARSE_TABLE @@ -123,7 +123,7 @@ class DownpourServer(Server): Returns: return None """ - table = self.server_.downpour_server_param.downpour_table_param.add() + table = self._server.downpour_server_param.downpour_table_param.add() table.table_id = table_id table.table_class = "DownpourDenseTable" table.type = pslib.PS_DENSE_TABLE @@ -140,7 +140,7 @@ class DownpourServer(Server): """ Return downpour server program_desc """ - return self.server_ + return self._server class DownpourWorker(Worker): @@ -155,7 +155,7 @@ class DownpourWorker(Worker): def __init__(self, window): self.window = window - self.worker_ = pslib.DownpourTrainerParameter() + self._worker = pslib.DownpourTrainerParameter() def add_sparse_table(self, table_id, learning_rate, slot_key_vars, slot_value_vars): @@ -187,7 +187,7 @@ class DownpourWorker(Worker): Returns: return None """ - table = self.worker_.dense_table.add() + table = self._worker.dense_table.add() table.table_id = table_id table.dense_variable_name.extend( filter(lambda x: x.find("embedding") == -1, @@ -200,4 +200,4 @@ class DownpourWorker(Worker): """ Return downpour worker program_desc """ - return self.worker_ + return self._worker diff --git a/python/paddle/fluid/incubate/fleet/parameter_server/optimizer_factory.py b/python/paddle/fluid/incubate/fleet/parameter_server/optimizer_factory.py index 94f79e77e72bfa2d0a09502722ef36d474b610b2..ba1f2c8f6ba43bcdb8d4240e33210370e5a454f6 100644 --- a/python/paddle/fluid/incubate/fleet/parameter_server/optimizer_factory.py +++ b/python/paddle/fluid/incubate/fleet/parameter_server/optimizer_factory.py @@ -24,9 +24,9 @@ from .node import DownpourWorker, DownpourServer class DistributedOptimizerImplBase(object): def __init__(self, optimizer): - self.optimizer_ = optimizer - self.learning_rate_ = optimizer._learning_rate - self.regularization_ = optimizer.regularization + self._optimizer = optimizer + self._learning_rate = optimizer._learning_rate + self._regularization = optimizer.regularization def minimize(self, losses, @@ -41,7 +41,7 @@ class DistributedAdam(DistributedOptimizerImplBase): # todo(guru4elephant): add more optimizers here as argument # todo(guru4elephant): make learning_rate as a variable super(DistributedAdam, self).__init__(optimizer) - self.window_ = 1 + self._window = 1 self.type = "downpour" self.data_norm_name = [ ".batch_size", ".batch_square_sum", ".batch_sum", @@ -79,9 +79,9 @@ class DistributedAdam(DistributedOptimizerImplBase): server = DownpourServer() worker = DownpourWorker(self.window_) sparse_table_index = 0 - server.add_sparse_table(sparse_table_index, self.learning_rate_, + server.add_sparse_table(sparse_table_index, self._learning_rate, prefetch_slots, prefetch_slots_emb) - worker.add_sparse_table(sparse_table_index, self.learning_rate_, + worker.add_sparse_table(sparse_table_index, self._learning_rate, prefetch_slots, prefetch_slots_emb) dense_table_index = 1 program_configs = {} @@ -124,9 +124,9 @@ class DistributedAdam(DistributedOptimizerImplBase): data_norm_grads.append(i[1]) if not is_data_norm_data: grads.append(i[1]) - server.add_dense_table(dense_table_index, self.learning_rate_, + server.add_dense_table(dense_table_index, self._learning_rate, params, grads) - worker.add_dense_table(dense_table_index, self.learning_rate_, + worker.add_dense_table(dense_table_index, self._learning_rate, params, grads) program_configs[program_id]["pull_dense"] = [dense_table_index] program_configs[program_id]["push_dense"] = [dense_table_index] @@ -135,9 +135,9 @@ class DistributedAdam(DistributedOptimizerImplBase): if len(data_norm_params) != 0 and len(data_norm_grads) != 0: dense_table_index += 1 server.add_data_norm_table(dense_table_index, - self.learning_rate_, + self._learning_rate, data_norm_params, data_norm_grads) - worker.add_dense_table(dense_table_index, self.learning_rate_, + worker.add_dense_table(dense_table_index, self._learning_rate, data_norm_params, data_norm_grads) #program_config.pull_dense_table_id.extend([dense_table_index]) #program_config.push_dense_table_id.extend([dense_table_index]) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index a5e513ed5e35d530dd07c49339995461da8454a1..f8f461853f34a09eb2317f6ac93ad385cca3609f 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -267,8 +267,44 @@ class StaticRNN(object): """ StaticRNN class. - StaticRNN class is used to create a StaticRNN. The RNN will have its - own parameters like inputs, outputs, memories, status and length. + The StaticRNN can process a batch of sequence data. The length of each + sample sequence must be equal. The StaticRNN will have its own parameters + like inputs, outputs, memories. **Note that the first dimension of inputs + represents sequence length, and all the sequence length of inputs must be + the same. And the meaning of each axis of input and output are the same.** + + Examples: + >>> import paddle.fluid as fluid + >>> import paddle.fluid.layers as layers + >>> + >>> vocab_size, hidden_size=10000, 200 + >>> x = layers.data(name="x", shape=[-1, 1, 1], dtype='int64') + >>> x_emb = layers.embedding( + >>> input=x, + >>> size=[vocab_size, hidden_size], + >>> dtype='float32', + >>> is_sparse=False) + >>> x_emb = layers.transpose(x_emb, perm=[1, 0, 2]) + >>> + >>> rnn = fluid.layers.StaticRNN() + >>> with rnn.step(): + >>> word = rnn.step_input(x_emb) + >>> prev = rnn.memory(shape=[-1, hidden_size], batch_ref = word) + >>> hidden = fluid.layers.fc(input=[word, prev], size=hidden_size, act='relu') + >>> rnn.update_memory(prev, hidden) # set prev to hidden + >>> rnn.step_output(hidden) + >>> + >>> result = rnn() + + The StaticRNN will unfold sequence into time steps. Users need to define + how to process each time step during the :code:`with` step. + + The :code:`memory` is used as a staging data cross time step. The initial + value of memory can be a variable that is filled with a constant value or + a specified variable. + + The StaticRNN can mark multiple variables as its output. Use `rnn()` to + get the output sequence. """ BEFORE_RNN_BLOCK = 0 IN_RNN_BLOCK = 1 @@ -284,6 +320,9 @@ class StaticRNN(object): self.seq_len = None def step(self): + """ + The block for user to define operators in RNN. + """ return BlockGuardWithCompletion(self) def _assert_in_rnn_block_(self, method): @@ -298,13 +337,28 @@ class StaticRNN(object): init_batch_dim_idx=0, ref_batch_dim_idx=1): """ + Create a memory variable for static rnn. + + If the :code:`init` is not None, :code:`memory` will be initialized by + this Variable. If the :code:`init` is None, :code:`shape` and :code:`batch_ref` + must be set, and this function will initialize a :code:`init` Variable. + Args: - init: boot memory, if not set, a shape, batch_ref must be provided - shape: shape of the boot memory - batch_ref: batch size reference variable - init_value: the init value of boot memory - init_batch_dim_idx: the index of batch size in init's dimension - ref_batch_dim_idx: the index of batch size in batch_ref's dimension + init(Variable|None): The initialized variable. If it is not set, + :code:`shape` and :code:`batch_ref` must be provided. + Default: None. + shape(list|tuple): The shape of the boot memory. NOTE the shape + does not contain batch_size. Default: None. + batch_ref(Variable|None): The batch size reference Variable. + Default: None. + init_value(float): the init value of boot memory. Default: 0.0. + init_batch_dim_idx(int): the batch_size axis of the + :code:`init` Variable. Default: 0. + ref_batch_dim_idx(int): the batch_size axis of the + :code:`batch_ref` Variable. Default: 1. + + Returns: + The memory variable. """ self._assert_in_rnn_block_('memory') if init is None: @@ -343,6 +397,16 @@ class StaticRNN(object): return pre_mem def step_input(self, x): + """ + Mark a sequence as a StaticRNN input. + + Args: + x(Variable): The input sequence, the shape of x + should be [seq_len, ...]. + + Returns: + The current time step in the input sequence. + """ self._assert_in_rnn_block_('step_input') if not isinstance(x, Variable): raise TypeError("step input takes a Variable") @@ -357,6 +421,15 @@ class StaticRNN(object): return ipt def step_output(self, o): + """ + Mark a sequence as a StaticRNN output. + + Args: + o(Variable): The output sequence. + + Returns: + None. + """ self._assert_in_rnn_block_('step_output') if not isinstance(o, Variable): raise TypeError("step output takes a Variable") @@ -376,10 +449,30 @@ class StaticRNN(object): self.outputs.append(out_var) def output(self, *outputs): + """ + Mark the StaticRNN output variables. + + Args: + outputs: The output Variables. + + Returns: + None + """ for each in outputs: self.step_output(each) def update_memory(self, mem, var): + """ + Update the memory from ex_mem to new_mem. NOTE that the shape and data + type of :code:`ex_mem` and :code:`new_mem` must be same. + + Args: + mem(Variable): the memory variable. + var(Variable): the plain variable generated in RNN block. + + Returns: + None + """ if not isinstance(mem, Variable) or not isinstance(var, Variable): raise TypeError("update memory should take variables") self.memories[mem.name].mem = var @@ -419,6 +512,9 @@ class StaticRNN(object): for m in self.memories: local_inputs.add(m) + # NOTE(zcd): the params have two categories of variables. + # - the variables that are the out of StaticRnn. + # - the variables that are the parameters of some layers, for example, conv2d. params = list() for op in rnn_block.ops: assert isinstance(op, Operator) @@ -435,17 +531,19 @@ class StaticRNN(object): inlinks = [parent_block.var(i.name) for i in self.inputs] outlinks = self.outputs + # NOTE(zcd): the states maybe empty in some case. boot_memories = [] pre_memories = [] memories = [] for _, mem in six.iteritems(self.memories): boot_memories.append(mem.init) pre_memories.append(mem.pre_mem.name) + assert mem.mem is not None, "%s should be updated in every step." % ( + mem.init.name) mem_var = rnn_block.var(mem.mem.name) assert isinstance(mem_var, Variable) new_mem = self.helper.create_variable_for_type_inference( dtype=mem_var.dtype) - rnn_block.append_op( type='rnn_memory_helper', inputs={'X': [mem_var]}, @@ -464,6 +562,7 @@ class StaticRNN(object): outputs={'outputs': outlinks, 'step_scopes': [step_scope]}, attrs={ + 'has_states': len(pre_memories) > 0, 'ex_states': pre_memories, 'states': memories, 'sub_block': rnn_block diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index b7d1eeba80d93d549a019455087bb7cc1d2a1083..a67c8058f2c42713738420e81316452e15acb697 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -35,8 +35,8 @@ from ..dygraph import learning_rate_scheduler as imperate_lr __all__ = [ 'exponential_decay', 'natural_exp_decay', 'inverse_time_decay', - 'polynomial_decay', 'piecewise_decay', 'noam_decay', 'append_LARS', - 'cosine_decay', 'linear_lr_warmup' + 'polynomial_decay', 'piecewise_decay', 'noam_decay', 'cosine_decay', + 'linear_lr_warmup' ] @@ -349,24 +349,26 @@ def cosine_decay(learning_rate, step_each_epoch, epochs): training progresses. By using this function, the learning rate will be decayed by following cosine decay strategy. - decayed_lr = learning_rate * 0.5 * (math.cos(epoch * math.pi / epochs) + 1) + .. math:: + + decayed\_lr = learning\_rate * 0.5 * (math.cos * (epoch * \\frac{math.pi}{epochs} ) + 1) Args: learning_rate(Variable|float): The initial learning rate. step_each_epoch(int): the number of steps in an epoch. epochs(int): the number of epochs. - Returns: - Variable: The decayed learning rate. - - Examples: + Returns: + Variable: The decayed learning rate. - ..code-block:: python + Examples: + .. code-block:: python - base_lr = 0.1 - lr = fluid.layers.cosine_decay( - learning_rate = base_lr, step_each_epoch=10000, epochs=120) + base_lr = 0.1 + lr = fluid.layers.cosine_decay( + learning_rate = base_lr, step_each_epoch=10000, epochs=120) """ + with default_main_program()._lr_schedule_guard(): if imperative_base.enabled(): decay = imperate_lr.CosineDecay(learning_rate, step_each_epoch, @@ -381,50 +383,6 @@ def cosine_decay(learning_rate, step_each_epoch, epochs): return decayed_lr -def append_LARS(params_grads, learning_rate, weight_decay): - """ - Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for - each layer. - - Args: - learning_rate: A learning rate Variable. This - is the global learning rate for LARS. - weight_decay: A Python `float` number. - - Returns: - The decayed learning rate - Examples: - .. code-block:: python - - learning_rate *= local_gw_ratio * sqrt(sumsq(param)) - / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param))) - """ - - assert not imperative_base.enabled( - ), "append_LARS is NOT supported in dygraph mode now" - - def _balanced_weight(param_norm, grad_norm): - if weight_decay == 1.0: - return grad_norm + param_norm - else: - return grad_norm + weight_decay * param_norm - - for param, grad in params_grads: - with param.block.program.optimized_guard( - [param, grad]), name_scope("optimizer"): - param_lr = param.optimize_attr['learning_rate'] - param_norm = ops.sqrt(nn.reduce_sum(input=ops.square(param))) - grad_norm = ops.sqrt(nn.reduce_sum(input=ops.square(grad))) - if type(param_lr) == float and param_lr == 1.0: - decayed_lr = learning_rate * param_norm \ - / _balanced_weight(param_norm, grad_norm) - else: - decayed_lr = learning_rate * param_lr * param_norm \ - / _balanced_weight(param_norm, grad_norm) - # set back param local learning rate - param.optimize_attr['learning_rate'] = decayed_lr - - def linear_lr_warmup(learning_rate, warmup_steps, start_lr, end_lr): """ Applies linear learning rate warmup before the normal learning rate diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index e70ea9da38f7640fa09a39e7f8cbfeb91ddc9832..93e46eef16fb177169db679a8437d9a33ed38e99 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -73,6 +73,8 @@ __all__ = [ 'reduce_max', 'reduce_min', 'reduce_prod', + 'reduce_all', + 'reduce_any', 'sequence_first_step', 'sequence_last_step', 'sequence_slice', @@ -159,6 +161,7 @@ __all__ = [ 'sum', 'slice', 'shape', + 'rank', 'logical_and', 'logical_or', 'logical_xor', @@ -4738,6 +4741,106 @@ def reduce_prod(input, dim=None, keep_dim=False, name=None): return out +def reduce_all(input, dim=None, keep_dim=False, name=None): + """ + Computes the ``logical and`` of tensor elements over the given dimension. + + Args: + input (Variable): The input variable which is a Tensor or LoDTensor. + dim (list|int|None): The dimension along which the logical and is computed. + If :attr:`None`, compute the logical and over all elements of + :attr:`input` and return a Tensor variable with a single element, + otherwise must be in the range :math:`[-rank(input), rank(input))`. + If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`. + keep_dim (bool): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension + than the :attr:`input` unless :attr:`keep_dim` is true. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The reduced Tensor variable. + + Examples: + .. code-block:: python + + # x is a bool Tensor variable with following elements: + # [[True, False] + # [True, True]] + # Each example is followed by the correspending output tensor. + fluid.layers.reduce_all(x) # False + fluid.layers.reduce_all(x, dim=0) # [True, False] + fluid.layers.reduce_all(x, dim=-1) # [False, True] + fluid.layers.reduce_all(x, dim=1, + keep_dim=True) # [[False], [True]] + + """ + helper = LayerHelper('reduce_all', **locals()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + if dim is not None and not isinstance(dim, list): + dim = [dim] + helper.append_op( + type='reduce_all', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None else [0], + 'keep_dim': keep_dim, + 'reduce_all': True if dim == None else False + }) + return out + + +def reduce_any(input, dim=None, keep_dim=False, name=None): + """ + Computes the ``logical or`` of tensor elements over the given dimension. + + Args: + input (Variable): The input variable which is a Tensor or LoDTensor. + dim (list|int|None): The dimension along which the logical or is computed. + If :attr:`None`, compute the logical or over all elements of + :attr:`input` and return a Tensor variable with a single element, + otherwise must be in the range :math:`[-rank(input), rank(input))`. + If :math:`dim[i] < 0`, the dimension to reduce is :math:`rank + dim[i]`. + keep_dim (bool): Whether to reserve the reduced dimension in the + output Tensor. The result tensor will have one fewer dimension + than the :attr:`input` unless :attr:`keep_dim` is true. + name(str|None): A name for this layer(optional). If set None, the layer + will be named automatically. + + Returns: + Variable: The reduced Tensor variable. + + Examples: + .. code-block:: python + + # x is a bool Tensor variable with following elements: + # [[True, False] + # [False, False]] + # Each example is followed by the correspending output tensor. + fluid.layers.reduce_any(x) # True + fluid.layers.reduce_any(x, dim=0) # [True, False] + fluid.layers.reduce_any(x, dim=-1) # [True, False] + fluid.layers.reduce_any(x, dim=1, + keep_dim=True) # [[True], [False]] + + """ + helper = LayerHelper('reduce_any', **locals()) + out = helper.create_variable_for_type_inference(dtype=helper.input_dtype()) + if dim is not None and not isinstance(dim, list): + dim = [dim] + helper.append_op( + type='reduce_any', + inputs={'X': input}, + outputs={'Out': out}, + attrs={ + 'dim': dim if dim != None else [0], + 'keep_dim': keep_dim, + 'reduce_all': True if dim == None else False + }) + return out + + def split(input, num_or_sections, dim=-1, name=None): """ Split the input tensor into multiple sub-tensors. @@ -4819,7 +4922,7 @@ def l2_normalize(x, axis, epsilon=1e-12, name=None): the dimension to normalization is rank(X) + axis. -1 is the last dimension. epsilon(float): The epsilon value is used to avoid division by zero, \ - the defalut value is 1e-10. + the defalut value is 1e-12. name(str|None): A name for this layer(optional). If set None, the layer \ will be named automatically. @@ -9237,6 +9340,32 @@ def shape(input): return out +def rank(input): + """ + **Rank Layer** + + Returns the number of dimensions for a tensor, which is a 0-D int32 Tensor. + + Args: + input (Variable): The input variable. + + Returns: + Variable: The rank of the input variable. + + Examples: + .. code-block:: python + + input = layers.data( + name="input", shape=[3, 100, 100], dtype="float32") + rank = layers.rank(input) # 4 + """ + + ndims = len(input.shape) + out = assign(np.array(ndims, 'int32')) + + return out + + def _elementwise_op(helper): op_type = helper.layer_type x = helper.kwargs.get('x', None) @@ -11002,7 +11131,7 @@ def pixel_shuffle(x, upscale_factor): Returns: - Out(Variable): the pixel shuffle result is a tensor variable with the same shape and the same type as the input. + Out(Variable): Reshaped tensor according to the new dimension. Raises: diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index 80450119f44e93aae4b483983484ea18be5b2035..03ebd41fa00c69bfce66d325e32fc9aeb25a2486 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -24,26 +24,11 @@ from .layer_function_generator import templatedoc import numpy __all__ = [ - 'create_tensor', - 'create_parameter', - 'create_global_var', - 'cast', - 'tensor_array_to_tensor', - 'concat', - 'sums', - 'assign', - 'fill_constant_batch_size_like', - 'fill_constant', - 'argmin', - 'argmax', - 'argsort', - 'ones', - 'zeros', - 'reverse', - 'has_inf', - 'has_nan', - 'isfinite', - 'range', + 'create_tensor', 'create_parameter', 'create_global_var', 'cast', + 'tensor_array_to_tensor', 'concat', 'sums', 'assign', + 'fill_constant_batch_size_like', 'fill_constant', 'argmin', 'argmax', + 'argsort', 'ones', 'zeros', 'reverse', 'has_inf', 'has_nan', 'isfinite', + 'range', 'linspace' ] @@ -826,3 +811,45 @@ def range(start, end, step, dtype): 'Step': step}, outputs={'Out': [out]}) return out + + +def linspace(start, stop, num, dtype): + """ + Return fixed number of evenly spaced values within a given interval. + + First entry is start, and last entry is stop. In the case when Num is 1, only Start is returned. Like linspace function of numpy. + + Args: + start(float|Variable): First entry in the sequence. It is a float scalar, or a tensor of shape [1] with type 'float32'|'float64'. + stop(float|Variable): Last entry in the sequence. It is a float scalar, or a tensor of shape [1] with type 'float32'|'float64'. + num(int|Variable): Number of entry in the sequence. It is an int scalar, or a tensor of shape [1] with type int32. + dtype(string): 'float32'|'float64', the data type of the output tensor. + + Returns: + Variable: The tensor variable storing a 1-D tensor. + + Examples: + .. code-block:: python + + data = fluid.layers.linspace(0, 10, 5, 'float32') # [0.0, 2.5, 5.0, 7.5, 10.0] + data = fluid.layers.linspace(0, 10, 1, 'float32') # [0.0] + + """ + helper = LayerHelper("linspace", **locals()) + + if not isinstance(start, Variable): + start = fill_constant([1], dtype, start) + if not isinstance(stop, Variable): + stop = fill_constant([1], dtype, stop) + if not isinstance(num, Variable): + num = fill_constant([1], 'int32', num) + + out = helper.create_variable_for_type_inference(dtype=start.dtype) + + helper.append_op( + type='linspace', + inputs={'Start': start, + 'Stop': stop, + 'Num': num}, + outputs={'Out': [out]}) + return out diff --git a/python/paddle/fluid/tests/book/CMakeLists.txt b/python/paddle/fluid/tests/book/CMakeLists.txt index ee734f3c782adb5196a03aca5718377009a5b4e7..999a765b6dc32323a24f9069f11134360dbadcb8 100644 --- a/python/paddle/fluid/tests/book/CMakeLists.txt +++ b/python/paddle/fluid/tests/book/CMakeLists.txt @@ -6,4 +6,6 @@ foreach(src ${TEST_OPS}) py_test(${src} SRCS ${src}.py) endforeach() -add_subdirectory(high-level-api) +if(WITH_HIGH_LEVEL_API_TEST) + add_subdirectory(high-level-api) +endif() diff --git a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt index efa5ee2d06af3d31e7d84122dd7eea37d6dcf3a3..c034709fbdc2aa315ca995a42c278b261e6283a4 100644 --- a/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt +++ b/python/paddle/fluid/tests/book/high-level-api/CMakeLists.txt @@ -1,16 +1,28 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") +file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*_new_api.py") string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") -# default test -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() +# This test is buggy +# py_test(test_understand_sentiment_dynamic_rnn SRCS +# test_understand_sentiment_dynamic_rnn.py SERIAL) +LIST(REMOVE_ITEM TEST_OPS test_understand_sentiment_dynamic_rnn_new_api) -add_subdirectory(fit_a_line) -add_subdirectory(recognize_digits) -add_subdirectory(image_classification) -add_subdirectory(understand_sentiment) -add_subdirectory(label_semantic_roles) -add_subdirectory(word2vec) -add_subdirectory(recommender_system) -add_subdirectory(machine_translation) +if(NOT APPLE) + # default test + foreach(src ${TEST_OPS}) + py_test(${src} SRCS ${src}.py) + endforeach() +else() + foreach(src ${TEST_OPS}) + if(${src} STREQUAL "test_image_classification_vgg_new_api") + message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src}) + elseif(${src} STREQUAL "test_image_classification_resnet_new_api") + message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src}) + elseif(${src} STREQUAL "test_recognize_digits_conv_new_api") + message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src}) + elseif(${src} STREQUAL "test_recognize_digits_mlp_new_api") + message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src}) + elseif() + py_test(${src} SRCS ${src}.py) + endif() + endforeach() +endif() diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py b/python/paddle/fluid/tests/book/high-level-api/cifar10_small_test_set.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/image_classification/cifar10_small_test_set.py rename to python/paddle/fluid/tests/book/high-level-api/cifar10_small_test_set.py diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/fit_a_line/CMakeLists.txt deleted file mode 100644 index 673c965b662a022739f8d489c331f4de9455a926..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -# default test -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt deleted file mode 100644 index 91c1d17eb5391ea37a41a886594cc71c6e6c56bd..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book/high-level-api/image_classification/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -if(NOT APPLE) - # default test - foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) - endforeach() -else() - foreach(src ${TEST_OPS}) - if(${src} STREQUAL "test_image_classification_vgg") - message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src}) - elseif(${src} STREQUAL "test_image_classification_resnet") - message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src}) - elseif() - py_test(${src} SRCS ${src}.py) - endif() - endforeach() -endif() diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt deleted file mode 100644 index 673c965b662a022739f8d489c331f4de9455a926..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -# default test -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt deleted file mode 100644 index 673c965b662a022739f8d489c331f4de9455a926..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book/high-level-api/machine_translation/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -# default test -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt deleted file mode 100644 index f9c6d60540fcb6f8a73fdc4e68471448e16cbdc2..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/CMakeLists.txt +++ /dev/null @@ -1,19 +0,0 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -# default test -if(NOT APPLE) - foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) - endforeach() -else() - foreach(src ${TEST_OPS}) - if(${src} STREQUAL "test_recognize_digits_conv") - message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src}) - elseif(${src} STREQUAL "test_recognize_digits_mlp") - message(WARNING "These tests has been disabled in OSX for random fail: \n" ${src}) - else() - py_test(${src} SRCS ${src}.py) - endif() - endforeach() -endif() diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt deleted file mode 100644 index 673c965b662a022739f8d489c331f4de9455a926..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book/high-level-api/recommender_system/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -# default test -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() diff --git a/python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py b/python/paddle/fluid/tests/book/high-level-api/test_fit_a_line_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/fit_a_line/test_fit_a_line.py rename to python/paddle/fluid/tests/book/high-level-api/test_fit_a_line_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py b/python/paddle/fluid/tests/book/high-level-api/test_image_classification_resnet_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_resnet.py rename to python/paddle/fluid/tests/book/high-level-api/test_image_classification_resnet_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py b/python/paddle/fluid/tests/book/high-level-api/test_image_classification_vgg_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/image_classification/test_image_classification_vgg.py rename to python/paddle/fluid/tests/book/high-level-api/test_image_classification_vgg_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py b/python/paddle/fluid/tests/book/high-level-api/test_label_semantic_roles_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/label_semantic_roles/test_label_semantic_roles_newapi.py rename to python/paddle/fluid/tests/book/high-level-api/test_label_semantic_roles_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py b/python/paddle/fluid/tests/book/high-level-api/test_machine_translation_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/machine_translation/test_machine_translation.py rename to python/paddle/fluid/tests/book/high-level-api/test_machine_translation_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py b/python/paddle/fluid/tests/book/high-level-api/test_recognize_digits_conv_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_conv.py rename to python/paddle/fluid/tests/book/high-level-api/test_recognize_digits_conv_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py b/python/paddle/fluid/tests/book/high-level-api/test_recognize_digits_mlp_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/recognize_digits/test_recognize_digits_mlp.py rename to python/paddle/fluid/tests/book/high-level-api/test_recognize_digits_mlp_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py b/python/paddle/fluid/tests/book/high-level-api/test_recommender_system_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/recommender_system/test_recommender_system_newapi.py rename to python/paddle/fluid/tests/book/high-level-api/test_recommender_system_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py b/python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_conv_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_conv.py rename to python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_conv_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py b/python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_dynamic_rnn_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_dynamic_rnn.py rename to python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_dynamic_rnn_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py b/python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_stacked_lstm_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/understand_sentiment/test_understand_sentiment_stacked_lstm.py rename to python/paddle/fluid/tests/book/high-level-api/test_understand_sentiment_stacked_lstm_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py b/python/paddle/fluid/tests/book/high-level-api/test_word2vec_new_api.py similarity index 100% rename from python/paddle/fluid/tests/book/high-level-api/word2vec/test_word2vec_new_api.py rename to python/paddle/fluid/tests/book/high-level-api/test_word2vec_new_api.py diff --git a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/CMakeLists.txt deleted file mode 100644 index d71147a85e77ea6dc5b6391aa169abd9b02a0aa1..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book/high-level-api/understand_sentiment/CMakeLists.txt +++ /dev/null @@ -1,12 +0,0 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -# This test is buggy -# py_test(test_understand_sentiment_dynamic_rnn SRCS -# test_understand_sentiment_dynamic_rnn.py SERIAL) -LIST(REMOVE_ITEM TEST_OPS test_understand_sentiment_dynamic_rnn) - -# default test -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() diff --git a/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt b/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt deleted file mode 100644 index 673c965b662a022739f8d489c331f4de9455a926..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/book/high-level-api/word2vec/CMakeLists.txt +++ /dev/null @@ -1,7 +0,0 @@ -file(GLOB TEST_OPS RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "test_*.py") -string(REPLACE ".py" "" TEST_OPS "${TEST_OPS}") - -# default test -foreach(src ${TEST_OPS}) - py_test(${src} SRCS ${src}.py) -endforeach() diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index cbe9afce035ea9918a41fafe3c2d4a3eb3f4dcb0..65045a4ab20338817908e481e5f990706cc88e71 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -25,7 +25,6 @@ endif() list(REMOVE_ITEM TEST_OPS test_seq_concat_op) # FIXME(helin): https://github.com/PaddlePaddle/Paddle/issues/8290 list(REMOVE_ITEM TEST_OPS test_modified_huber_loss_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5184 list(REMOVE_ITEM TEST_OPS test_lstm_unit_op) # # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/5185 -list(REMOVE_ITEM TEST_OPS test_recurrent_op) # FIXME(qijun) https://github.com/PaddlePaddle/Paddle/issues/6152 list(REMOVE_ITEM TEST_OPS test_cond_op) # FIXME(qijun): https://github.com/PaddlePaddle/Paddle/issues/5101#issuecomment-339814957 list(REMOVE_ITEM TEST_OPS op_test) # op_test is a helper python file, not a test @@ -74,7 +73,6 @@ list(REMOVE_ITEM TEST_OPS test_dgc_op) list(REMOVE_ITEM TEST_OPS test_dist_se_resnext_nccl) list(REMOVE_ITEM TEST_OPS test_dist_transformer) list(REMOVE_ITEM TEST_OPS test_parallel_executor_transformer) -list(REMOVE_ITEM TEST_OPS test_image_classification_resnet) list(REMOVE_ITEM TEST_OPS test_bilinear_interp_op) list(REMOVE_ITEM TEST_OPS test_nearest_interp_op) list(REMOVE_ITEM TEST_OPS test_imperative_resnet) @@ -125,10 +123,6 @@ if(NOT WIN32) py_test_modules(test_ir_memory_optimize_transformer MODULES test_ir_memory_optimize_transformer SERIAL) endif() -if(NOT APPLE) - py_test_modules(test_image_classification_resnet MODULES test_image_classification_resnet SERIAL) -endif() - if(CMAKE_BUILD_TYPE STREQUAL "Debug") # change the timeout from 600 to 2200, because in debug mode, this test need more time. set_tests_properties(test_parallel_executor_seresnext PROPERTIES TIMEOUT 2200) diff --git a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py index 18ed02a72275437fa6106e57c0383e17647d9700..723aafb171271ed248c93665a21089029a30a836 100644 --- a/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py +++ b/python/paddle/fluid/tests/unittests/parallel_executor_test_base.py @@ -29,7 +29,8 @@ __all__ = ['TestParallelExecutorBase'] class TestParallelExecutorBase(unittest.TestCase): - def check_network_convergence(self, + @classmethod + def check_network_convergence(cls, method, use_cuda=True, memory_opt=True, diff --git a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py index 48fb93ec529bee32b9652a89ba7da3dc77f7853a..4b0195d307dc83f77ff04e89544d7bc751b8c011 100644 --- a/python/paddle/fluid/tests/unittests/test_dyn_rnn.py +++ b/python/paddle/fluid/tests/unittests/test_dyn_rnn.py @@ -24,15 +24,15 @@ from paddle.fluid.layers.control_flow import max_sequence_len from paddle.fluid.layers.control_flow import lod_tensor_to_array from paddle.fluid.layers.control_flow import array_to_lod_tensor from paddle.fluid.layers.control_flow import shrink_memory +from fake_reader import fake_imdb_reader class TestDynRNN(unittest.TestCase): def setUp(self): - self.word_dict = paddle.dataset.imdb.word_dict() + self.word_dict_len = 5147 self.BATCH_SIZE = 2 - self.train_data = paddle.batch( - paddle.dataset.imdb.train(self.word_dict), - batch_size=self.BATCH_SIZE) + reader = fake_imdb_reader(self.word_dict_len, self.BATCH_SIZE * 100) + self.train_data = paddle.batch(reader, batch_size=self.BATCH_SIZE) def test_plain_while_op(self): main_program = fluid.Program() @@ -42,7 +42,7 @@ class TestDynRNN(unittest.TestCase): sentence = fluid.layers.data( name='word', shape=[1], dtype='int64', lod_level=1) sent_emb = fluid.layers.embedding( - input=sentence, size=[len(self.word_dict), 32], dtype='float32') + input=sentence, size=[self.word_dict_len, 32], dtype='float32') label = fluid.layers.data(name='label', shape=[1], dtype='float32') @@ -109,7 +109,7 @@ class TestDynRNN(unittest.TestCase): sentence = fluid.layers.data( name='word', shape=[1], dtype='int64', lod_level=1) sent_emb = fluid.layers.embedding( - input=sentence, size=[len(self.word_dict), 32], dtype='float32') + input=sentence, size=[self.word_dict_len, 32], dtype='float32') rnn = fluid.layers.DynamicRNN() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index 38d0533a7ec820241c6a08f2180a7426984068f2..91f8bc5fd0a510dcc05cb7ba2397cad52be16af5 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -1759,10 +1759,20 @@ class TestBook(LayerTest): def test_lod_reset(self): # TODO(minqiyang): dygraph do not support lod now with self.static_graph(): + # case 1 x = layers.data(name='x', shape=[10], dtype='float32') y = layers.data( name='y', shape=[10, 20], dtype='float32', lod_level=2) - return (layers.lod_reset(x=x, y=y)) + z = layers.lod_reset(x=x, y=y) + self.assertTrue(z.lod_level == 2) + # case 2 + lod_tensor_in = layers.data(name='lod_in', shape=[1], dtype='int64') + z = layers.lod_reset(x=x, y=lod_tensor_in) + self.assertTrue(z.lod_level == 1) + # case 3 + z = layers.lod_reset(x=x, target_lod=[1, 2, 3]) + self.assertTrue(z.lod_level == 1) + return z def test_affine_grid(self): with self.static_graph(): @@ -1925,6 +1935,13 @@ class TestBook(LayerTest): out = layers.flatten(x, axis=1, name="flatten") return (out) + def test_linspace(self): + program = Program() + with program_guard(program): + out = layers.linspace(20, 10, 5, 'float64') + self.assertIsNotNone(out) + print(str(program)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_linspace.py b/python/paddle/fluid/tests/unittests/test_linspace.py new file mode 100644 index 0000000000000000000000000000000000000000..eeecf178320327cc251f32bfe46c1622200339f4 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_linspace.py @@ -0,0 +1,71 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest + + +class TestLinspaceOpCommonCase(OpTest): + def setUp(self): + self.op_type = "linspace" + dtype = 'float32' + self.inputs = { + 'Start': np.array([0]).astype(dtype), + 'Stop': np.array([10]).astype(dtype), + 'Num': np.array([11]).astype('int32') + } + + self.outputs = {'Out': np.arange(0, 11).astype(dtype)} + + def test_check_output(self): + self.check_output() + + +class TestLinspaceOpReverseCase(OpTest): + def setUp(self): + self.op_type = "linspace" + dtype = 'float32' + self.inputs = { + 'Start': np.array([10]).astype(dtype), + 'Stop': np.array([0]).astype(dtype), + 'Num': np.array([11]).astype('int32') + } + + self.outputs = {'Out': np.arange(10, -1, -1).astype(dtype)} + + def test_check_output(self): + self.check_output() + + +class TestLinspaceOpNumOneCase(OpTest): + def setUp(self): + self.op_type = "linspace" + dtype = 'float32' + self.inputs = { + 'Start': np.array([10]).astype(dtype), + 'Stop': np.array([0]).astype(dtype), + 'Num': np.array([1]).astype('int32') + } + + self.outputs = {'Out': np.array(10, dtype=dtype)} + + def test_check_output(self): + self.check_output() + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py index 12d854fb54ac30ff2eeed97c16a78198d92387fd..92a5c58c11773e97ca0bb5ff2c21cbc8df612d58 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_seresnext.py @@ -29,7 +29,7 @@ import unittest import math import numpy as np from functools import partial - +os.environ['CPU_NUM'] = str(4) # FIXME(zcd): If the neural net has dropout_op, the output of ParallelExecutor # and Executor is different. Because, for ParallelExecutor, the dropout_op of # the neural net will be copied N copies(N is the number of device). This will @@ -113,7 +113,6 @@ def bottleneck_block(input, num_filters, stride, cardinality, reduction_ratio): return fluid.layers.elementwise_add(x=short, y=scale, act='relu') -batch_size = 12 img_shape = [3, 224, 224] @@ -181,43 +180,84 @@ def optimizer(learning_rate=0.01): return optimizer +def _batch_size(): + return 12 + + +def _iter(use_cuda): + if use_cuda: + return 10 + return 2 + + +gpu_img, gpu_label = init_data( + batch_size=_batch_size(), img_shape=img_shape, label_range=999) +cpu_img, cpu_label = init_data( + batch_size=_batch_size(), img_shape=img_shape, label_range=999) +feed_dict_gpu = {"image": gpu_img, "label": gpu_label} +feed_dict_cpu = {"image": cpu_img, "label": cpu_label} +model = SE_ResNeXt50Small + + +def _feed_dict(use_cuda): + if use_cuda: + return feed_dict_gpu + return feed_dict_cpu + + +def _get_result_of_origin_model(use_cuda): + global remove_bn + global remove_dropout + remove_bn = True + remove_dropout = True + first_loss, last_loss = TestParallelExecutorBase.check_network_convergence( + model, + feed_dict=_feed_dict(use_cuda), + iter=_iter(use_cuda), + batch_size=_batch_size(), + use_cuda=use_cuda, + use_reduce=False, + optimizer=optimizer) + + return first_loss, last_loss + + +origin_cpu_first_loss, origin_cpu_last_loss = _get_result_of_origin_model(False) +if core.is_compiled_with_cuda(): + origin_gpu_first_loss, origin_gpu_last_loss = _get_result_of_origin_model( + True) + + +def _get_origin_result(use_cuda): + if use_cuda: + assert core.is_compiled_with_cuda(), "Doesn't compiled with CUDA." + return origin_gpu_first_loss, origin_gpu_last_loss + return origin_cpu_first_loss, origin_cpu_last_loss + + class TestResnet(TestParallelExecutorBase): - @classmethod - def setUpClass(cls): - os.environ['CPU_NUM'] = str(4) - global remove_dropout - global remove_bn - remove_dropout = False - remove_bn = False - - def _compare_reduce_and_allreduce(self, - model, - use_cuda, - iter=20, - delta2=1e-5): + def _compare_reduce_and_allreduce(self, use_cuda, delta2=1e-5): if use_cuda and not core.is_compiled_with_cuda(): return global remove_bn + global remove_dropout remove_bn = True + remove_dropout = True - img, label = init_data( - batch_size=batch_size, img_shape=img_shape, label_range=999) all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( model, - feed_dict={"image": img, - "label": label}, - iter=iter, - batch_size=batch_size, + feed_dict=_feed_dict(use_cuda), + iter=_iter(use_cuda), + batch_size=_batch_size(), use_cuda=use_cuda, use_reduce=False, optimizer=optimizer) reduce_first_loss, reduce_last_loss = self.check_network_convergence( model, - feed_dict={"image": img, - "label": label}, - iter=iter, - batch_size=batch_size, + feed_dict=_feed_dict(use_cuda), + iter=_iter(use_cuda), + batch_size=_batch_size(), use_cuda=use_cuda, use_reduce=True, optimizer=optimizer) @@ -232,10 +272,9 @@ class TestResnet(TestParallelExecutorBase): all_reduce_first_loss_seq, all_reduce_last_loss_seq = self.check_network_convergence( model, - feed_dict={"image": img, - "label": label}, - iter=iter, - batch_size=batch_size, + feed_dict=_feed_dict(use_cuda), + iter=_iter(use_cuda), + batch_size=_batch_size(), use_cuda=use_cuda, use_reduce=False, optimizer=optimizer, @@ -243,10 +282,9 @@ class TestResnet(TestParallelExecutorBase): reduce_first_loss_seq, reduce_last_loss_seq = self.check_network_convergence( model, - feed_dict={"image": img, - "label": label}, - iter=iter, - batch_size=batch_size, + feed_dict=_feed_dict(use_cuda), + iter=_iter(use_cuda), + batch_size=_batch_size(), use_cuda=use_cuda, use_reduce=True, optimizer=optimizer, @@ -267,37 +305,28 @@ class TestResnet(TestParallelExecutorBase): for loss in zip(all_reduce_last_loss_seq, reduce_last_loss_seq): self.assertAlmostEquals(loss[0], loss[1], delta=delta2) - def _check_resnet_convergence(self, - model, - check_func_1, - check_func_2, - use_cuda, - iter=20, - delta2=1e-5, - compare_seperately=True): + def _compare_result_with_origin_model(self, + get_origin_result, + check_func_2, + use_cuda, + delta2=1e-5, + compare_seperately=True, + rm_drop_out=False, + rm_bn=False): if use_cuda and not core.is_compiled_with_cuda(): return - global remove_dropout global remove_bn - remove_dropout = True - remove_bn = True + global remove_dropout + remove_bn = rm_bn or use_cuda + remove_dropout = rm_drop_out - img, label = init_data( - batch_size=batch_size, img_shape=img_shape, label_range=999) - func_1_first_loss, func_1_last_loss = check_func_1( - model, - feed_dict={"image": img, - "label": label}, - iter=iter, - batch_size=batch_size, - use_cuda=use_cuda) + func_1_first_loss, func_1_last_loss = get_origin_result(use_cuda) func_2_first_loss, func_2_last_loss = check_func_2( model, - feed_dict={"image": img, - "label": label}, - iter=iter, - batch_size=batch_size, + feed_dict=_feed_dict(use_cuda), + iter=_iter(use_cuda), + batch_size=_batch_size(), use_cuda=use_cuda) if compare_seperately: @@ -311,97 +340,55 @@ class TestResnet(TestParallelExecutorBase): self.assertAlmostEquals( np.mean(func_1_last_loss), func_2_last_loss[0], delta=delta2) - def _compare_with_fused_all_reduce(self, - model, - use_cuda, - iter=20, - delta2=1e-5): - if use_cuda and not core.is_compiled_with_cuda(): - return - - global remove_bn - remove_bn = True - - img, label = init_data( - batch_size=batch_size, img_shape=img_shape, label_range=999) - all_reduce_first_loss, all_reduce_last_loss = self.check_network_convergence( - model, - feed_dict={"image": img, - "label": label}, - iter=iter, - batch_size=batch_size, - use_cuda=use_cuda, - fuse_all_reduce_ops=False, - optimizer=optimizer) - reduce_first_loss, reduce_last_loss = self.check_network_convergence( - model, - feed_dict={"image": img, - "label": label}, - iter=iter, - batch_size=batch_size, - use_cuda=use_cuda, - fuse_all_reduce_ops=True, - optimizer=optimizer) - - for loss in zip(all_reduce_first_loss, reduce_first_loss): - self.assertAlmostEquals(loss[0], loss[1], delta=1e-5) - for loss in zip(all_reduce_last_loss, reduce_last_loss): - self.assertAlmostEquals(loss[0], loss[1], delta=delta2) - def test_seresnext_with_reduce(self): - self._compare_reduce_and_allreduce( - model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-2) - self._compare_reduce_and_allreduce( - model=SE_ResNeXt50Small, use_cuda=False, iter=5) - - def test_seresnext_with_fused_all_reduce(self): - self._compare_with_fused_all_reduce( - model=SE_ResNeXt50Small, use_cuda=True, delta2=1e-3) - self._compare_with_fused_all_reduce( - model=SE_ResNeXt50Small, use_cuda=False, iter=2, delta2=1e-3) + self._compare_reduce_and_allreduce(use_cuda=False, delta2=1e-3) + self._compare_reduce_and_allreduce(use_cuda=True, delta2=1e-2) def test_seresnext_with_learning_rate_decay(self): - check_func_1 = partial( - self.check_network_convergence, - optimizer=optimizer, - use_parallel_executor=True) + # NOTE(zcd): This test is compare the result of use parallel_executor and executor, + # and the result of drop_out op and batch_norm op in this two executor + # have diff, so the two ops should be removed from the model. + check_func_1 = _get_origin_result check_func_2 = partial( self.check_network_convergence, optimizer=optimizer, use_parallel_executor=False) - self._check_resnet_convergence( - SE_ResNeXt50Small, - check_func_1, - check_func_2, - use_cuda=True, - compare_seperately=False) - self._check_resnet_convergence( - SE_ResNeXt50Small, + self._compare_result_with_origin_model( check_func_1, check_func_2, use_cuda=False, + rm_drop_out=True, + rm_bn=True, compare_seperately=False, - iter=2, delta2=1e-3) + self._compare_result_with_origin_model( + check_func_1, + check_func_2, + use_cuda=True, + rm_drop_out=True, + rm_bn=True, + compare_seperately=False) - def test_seresnext_with_fused_optimizer_ops(self): - check_func_1 = partial( - self.check_network_convergence, fuse_all_optimizer_ops=False) + def test_seresnext_with_fused_all_reduce(self): + # NOTE(zcd): In order to make the program faster, + # this unit test remove drop_out and batch_norm. + check_func_1 = _get_origin_result check_func_2 = partial( - self.check_network_convergence, fuse_all_optimizer_ops=True) - # TODO(zcd): this test failed random, I will fix it in next PR. - # self._check_resnet_convergence( - # SE_ResNeXt50Small, - # check_func_1, - # check_func_2, - # use_cuda=True, - # delta2=1e-3) - self._check_resnet_convergence( - SE_ResNeXt50Small, + self.check_network_convergence, + optimizer=optimizer, + fuse_all_reduce_ops=True) + self._compare_result_with_origin_model( check_func_1, check_func_2, use_cuda=False, - iter=2, + rm_drop_out=True, + rm_bn=True) + self._compare_result_with_origin_model( + check_func_1, + check_func_2, + use_cuda=True, + rm_drop_out=True, + rm_bn=True, delta2=1e-3) diff --git a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py index aacc1c3ecda8c25dec9f08827a856d38c37b1b2f..8960cbcdd2f574a647229894c44c2b6ea188b7d4 100644 --- a/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py +++ b/python/paddle/fluid/tests/unittests/test_parallel_executor_transformer.py @@ -175,7 +175,7 @@ class TestTransformer(TestParallelExecutorBase): self.check_network_convergence(transformer, use_cuda=True) self.check_network_convergence( transformer, use_cuda=True, enable_sequential_execution=True) - self.check_network_convergence(transformer, use_cuda=False, iter=5) + self.check_network_convergence(transformer, use_cuda=False, iter=2) if __name__ == '__main__': diff --git a/python/paddle/fluid/tests/unittests/test_recurrent_op.py b/python/paddle/fluid/tests/unittests/test_recurrent_op.py index 6dfc85e301a2eda66bade09a8b6dd0004155f385..cf86ebf0a81c5c6cd36a5edb5d61a11cdd98ae11 100644 --- a/python/paddle/fluid/tests/unittests/test_recurrent_op.py +++ b/python/paddle/fluid/tests/unittests/test_recurrent_op.py @@ -15,7 +15,7 @@ from __future__ import print_function import unittest - +import paddle.fluid as fluid import paddle.fluid.layers as layers from paddle.fluid.framework import Program, grad_var_name from paddle.fluid.executor import Executor @@ -115,10 +115,6 @@ class RecurrentOpTest1(unittest.TestCase): def setup_program(self): self.main_program = Program() self.startup_program = Program() - self.p_info = { - "main_program": self.main_program, - "startup_program": self.startup_program - } self.place = core.CPUPlace() def setUp(self): @@ -129,33 +125,29 @@ class RecurrentOpTest1(unittest.TestCase): self.output_shape = (self.sent_len, self.batch_size, self.input_dim) self.py_rnn = PySimpleRNN1(self.input_shape, self.output_shape) - self.output = layers.mean(self.create_rnn_op(), **self.p_info) + with fluid.program_guard(self.main_program, self.startup_program): + self.output = layers.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data( shape=[self.sent_len, self.batch_size, self.input_dim], dtype='float32', name='x', - append_batch_size=False, - **self.p_info) + append_batch_size=False) x.stop_gradient = False h_boot = layers.data( - shape=[self.input_dim], - dtype='float32', - name='h_boot', - **self.p_info) + shape=[self.input_dim], dtype='float32', name='h_boot') h_boot.stop_gradient = False - rnn = layers.StaticRNN(main_program=self.main_program) + rnn = layers.StaticRNN() with rnn.step(): h_pre = rnn.memory(init=h_boot) x_t = rnn.step_input(x) h = layers.scale( x=layers.elementwise_add( - x=h_pre, y=x_t, **self.p_info), - scale=self.py_rnn.scale, - **self.p_info) + x=h_pre, y=x_t), + scale=self.py_rnn.scale) rnn.update_memory(h_pre, h) rnn.output(h) @@ -193,7 +185,8 @@ class RecurrentOpTest1(unittest.TestCase): def test_backward(self): self.check_forward() - append_backward(self.output) + with fluid.program_guard(self.main_program, self.startup_program): + append_backward(self.output) ana_grad = [np.array(x) for x in self.backward()] @@ -205,12 +198,8 @@ class RecurrentOpTest1(unittest.TestCase): num_grad[idx], ana_grad[idx], rtol=0.1).all()) def check_forward(self): - print('test recurrent op forward') pd_output = self.forward() py_output = self.py_rnn.forward() - print('pd_output', pd_output) - print - print('py_output', py_output) self.assertEqual(pd_output.shape, py_output.shape) self.assertTrue(np.isclose(pd_output, py_output, rtol=0.1).all()) @@ -263,24 +252,21 @@ class RecurrentOpTest2(RecurrentOpTest1): self.output_shape = (self.sent_len, self.batch_size, self.input_dim) self.py_rnn = PySimpleRNN2(self.input_shape, self.output_shape) - self.output = layers.mean(self.create_rnn_op(), **self.p_info) + with fluid.program_guard(self.main_program, self.startup_program): + self.output = layers.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data( shape=[self.sent_len, self.batch_size, self.input_dim], dtype='float32', name='x', - append_batch_size=False, - **self.p_info) + append_batch_size=False) x.stop_gradient = False h_boot = layers.data( - shape=[self.input_dim], - dtype='float32', - name='h_boot', - **self.p_info) + shape=[self.input_dim], dtype='float32', name='h_boot') h_boot.stop_gradient = False - rnn = layers.StaticRNN(main_program=self.main_program) + rnn = layers.StaticRNN() with rnn.step(): h_pre = rnn.memory(init=h_boot) x_t = rnn.step_input(x) @@ -288,18 +274,13 @@ class RecurrentOpTest2(RecurrentOpTest1): temp_l = layers.fc(input=x_t, size=self.input_dim, param_attr='W', - bias_attr=False, - **self.p_info) + bias_attr=False) temp_r = layers.fc(input=h_pre, size=self.input_dim, param_attr='U', - bias_attr=False, - **self.p_info) + bias_attr=False) - h = layers.sigmoid( - x=layers.elementwise_add( - x=temp_l, y=temp_r, **self.p_info), - **self.p_info) + h = layers.sigmoid(x=layers.elementwise_add(x=temp_l, y=temp_r)) rnn.update_memory(h_pre, h) rnn.output(h) @@ -362,40 +343,38 @@ class RecurrentOpMultipleMemoryTest(RecurrentOpTest1): self.py_rnn = RecurrentOpMultipleMemoryTest.PySimpleRNN3( self.input_shape, self.output_shape) - self.output = layers.mean(self.create_rnn_op(), **self.p_info) + with fluid.program_guard(self.main_program, self.startup_program): + self.output = layers.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data( shape=[self.sent_len, self.batch_size, self.input_dim], dtype='float32', name='x', - append_batch_size=False, - **self.p_info) + append_batch_size=False) x.stop_gradient = False h_boot1 = layers.data( shape=[self.batch_size, self.input_dim], dtype='float32', name='h_boot1', - append_batch_size=False, - **self.p_info) + append_batch_size=False) h_boot1.stop_gradient = False h_boot2 = layers.data( shape=[self.batch_size, self.input_dim], dtype='float32', name='h_boot2', - append_batch_size=False, - **self.p_info) + append_batch_size=False) h_boot2.stop_gradient = False - rnn = layers.StaticRNN(main_program=self.main_program) + rnn = layers.StaticRNN() with rnn.step(): h_pre1 = rnn.memory(init=h_boot1) h_pre2 = rnn.memory(init=h_boot2) x_t = rnn.step_input(x) - mem1 = layers.scale(x=h_pre1, scale=1.0, **self.p_info) - mem2 = layers.scale(x=h_pre2, scale=1.0, **self.p_info) - out = layers.sums(input=[mem1, x_t, mem2], **self.p_info) + mem1 = layers.scale(x=h_pre1, scale=1.0) + mem2 = layers.scale(x=h_pre2, scale=1.0) + out = layers.sums(input=[mem1, x_t, mem2]) rnn.update_memory(h_pre1, mem1) rnn.update_memory(h_pre2, mem2) @@ -446,23 +425,23 @@ class RecurrentOpNoMemBootTest(RecurrentOpTest1): self.output_shape = (self.sent_len, self.batch_size, self.input_dim) self.py_rnn = RecurrentOpNoMemBootTest.PySimpleRNN4(self.input_shape, self.output_shape) - self.output = layers.mean(self.create_rnn_op(), **self.p_info) - print(self.main_program) + + with fluid.program_guard(self.main_program, self.startup_program): + self.output = layers.mean(self.create_rnn_op()) def create_rnn_op(self): x = layers.data( shape=[self.sent_len, self.batch_size, self.input_dim], dtype='float32', name='x', - append_batch_size=False, - **self.p_info) + append_batch_size=False) x.stop_gradient = False - rnn = layers.StaticRNN(main_program=self.main_program) + rnn = layers.StaticRNN() with rnn.step(): mem_pre = rnn.memory(shape=[-1, self.input_dim], batch_ref=x) x_t = rnn.step_input(x) - mem = layers.elementwise_add(x=mem_pre, y=x_t, **self.p_info) + mem = layers.elementwise_add(x=mem_pre, y=x_t) rnn.update_memory(mem_pre, mem) rnn.output(mem) diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py index 8fc8125a773543eea768783155ad152c475535b5..65fc1453d8db13ad9c85746c3bf148f898e8f788 100644 --- a/python/paddle/fluid/tests/unittests/test_reduce_op.py +++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py @@ -91,6 +91,78 @@ class TestProdOp(OpTest): self.check_grad(['X'], 'Out') +class TestAllOp(OpTest): + def setUp(self): + self.op_type = "reduce_all" + self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")} + self.outputs = {'Out': self.inputs['X'].all()} + self.attrs = {'reduce_all': True} + + def test_check_output(self): + self.check_output() + + +class TestAllOpWithDim(OpTest): + def setUp(self): + self.op_type = "reduce_all" + self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")} + self.attrs = {'dim': [1]} + self.outputs = {'Out': self.inputs['X'].all(axis=1)} + + def test_check_output(self): + self.check_output() + + +class TestAllOpWithKeepDim(OpTest): + def setUp(self): + self.op_type = "reduce_all" + self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")} + self.attrs = {'dim': [1], 'keep_dim': True} + self.outputs = { + 'Out': np.expand_dims( + self.inputs['X'].all(axis=1), axis=1) + } + + def test_check_output(self): + self.check_output() + + +class TestAnyOp(OpTest): + def setUp(self): + self.op_type = "reduce_any" + self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")} + self.outputs = {'Out': self.inputs['X'].any()} + self.attrs = {'reduce_all': True} + + def test_check_output(self): + self.check_output() + + +class TestAnyOpWithDim(OpTest): + def setUp(self): + self.op_type = "reduce_any" + self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")} + self.attrs = {'dim': [1]} + self.outputs = {'Out': self.inputs['X'].any(axis=1)} + + def test_check_output(self): + self.check_output() + + +class TestAnyOpWithKeepDim(OpTest): + def setUp(self): + self.op_type = "reduce_any" + self.inputs = {'X': np.random.randint(0, 2, (5, 6, 10)).astype("bool")} + self.attrs = {'dim': [1], 'keep_dim': True} + self.outputs = { + 'Out': np.expand_dims( + self.inputs['X'].any(axis=1), axis=1) + } + + def test_check_output(self): + self.check_output() + + class Test1DReduce(OpTest): def setUp(self): self.op_type = "reduce_sum" diff --git a/python/paddle/fluid/trainer_desc.py b/python/paddle/fluid/trainer_desc.py index 380c404fb2d6a36bf3732ebbff4b6cef22f71362..c742ee002aa6c470c41d46978a4e08fc774c3152 100644 --- a/python/paddle/fluid/trainer_desc.py +++ b/python/paddle/fluid/trainer_desc.py @@ -28,10 +28,10 @@ class TrainerDesc(object): import multiprocessing as mp # set default thread num == cpu count self.proto_desc.thread_num = mp.cpu_count() - self.fleet_desc_ = None - self.device_worker_ = None - self.program_ = None - self.infer_ = False + self._fleet_desc = None + self._device_worker = None + self._program = None + self._infer = False def _set_fetch_var_and_info(self, fetch_vars, fetch_info, print_period): for i, v in enumerate(fetch_vars): @@ -47,19 +47,19 @@ class TrainerDesc(object): self.proto_desc.thread_num = thread_num def _set_device_worker(self, device_worker): - self.device_worker_ = device_worker + self._device_worker = device_worker def _set_infer(self, infer): - self.infer_ = infer + self._infer = infer def _set_fleet_desc(self, fleet_desc): - self.fleet_desc_ = fleet_desc + self._fleet_desc = fleet_desc def _gen_trainer_desc(self): pass def _set_program(self, program): - self.program_ = program + self._program = program def _desc(self): from google.protobuf import text_format @@ -73,13 +73,13 @@ class MultiTrainer(TrainerDesc): def _set_program(self, program): super(MultiTrainer, self)._set_program(program) - self.program_ = program + self._program = program def _gen_trainer_desc(self): super(MultiTrainer, self)._gen_trainer_desc() self.proto_desc.class_name = "MultiTrainer" - self.device_worker_._set_infer(self.infer_) - self.device_worker_._gen_worker_desc(self.proto_desc) + self._device_worker._set_infer(self._infer) + self._device_worker._gen_worker_desc(self.proto_desc) class DistMultiTrainer(TrainerDesc): @@ -89,13 +89,13 @@ class DistMultiTrainer(TrainerDesc): def _set_program(self, program): super(DistMultiTrainer, self)._set_program(program) - self.program_ = program + self._program = program def _gen_trainer_desc(self): super(DistMultiTrainer, self)._gen_trainer_desc() self.proto_desc.class_name = "DistMultiTrainer" - if self.program_ == None: + if self._program == None: raise RuntimeError("None Program") - self.device_worker_._set_infer(self.infer_) - self.device_worker_._set_program(self.program_) - self.device_worker_._gen_worker_desc(self.proto_desc) + self._device_worker._set_infer(self._infer) + self._device_worker._set_program(self._program) + self._device_worker._gen_worker_desc(self.proto_desc) diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 41e5f47976c566306ad141f655a0f6516831d690..19a1f8bf74060905ecb4b81b44f7080db79c45e4 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -658,6 +658,7 @@ class DistributeTranspiler(object): outputs={"Out": splited_var}, attrs={ "epmap": eps, + "trainer_id": self.trainer_id, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) @@ -669,6 +670,7 @@ class DistributeTranspiler(object): outputs={"Out": fetch_barrier_out}, attrs={ "endpoints": self.pserver_endpoints, + "trainer_id": self.trainer_id, RPC_OP_ROLE_ATTR_NAME: RPC_OP_ROLE_ATTR_VALUE }) @@ -791,11 +793,15 @@ class DistributeTranspiler(object): global_ops = [] + # sparse grad name to param name + sparse_grad_to_param = [] + def __append_optimize_op__(op, block, grad_to_block_id, merged_var, lr_ops): if self._is_optimizer_op(op): self._append_pserver_ops(block, op, endpoint, grad_to_block_id, - self.origin_program, merged_var) + self.origin_program, merged_var, + sparse_grad_to_param) elif op not in lr_ops: self._append_pserver_non_opt_ops(block, op) @@ -911,6 +917,7 @@ class DistributeTranspiler(object): "Fanin": self.trainer_num, "sync_mode": self.sync_mode, "grad_to_block_id": grad_to_block_id, + "sparse_grad_to_param": sparse_grad_to_param, } if self.has_distributed_lookup_table: @@ -1779,7 +1786,8 @@ class DistributeTranspiler(object): return o4 def _append_pserver_ops(self, optimize_block, opt_op, endpoint, - grad_to_block_id, origin_program, merged_var): + grad_to_block_id, origin_program, merged_var, + sparse_grad_to_param): program = optimize_block.program pserver_block = program.global_block() new_inputs = collections.OrderedDict() @@ -1863,6 +1871,12 @@ class DistributeTranspiler(object): outputs=outputs, attrs=opt_op.all_attrs()) + # record sparse grad to param name + if new_inputs["Grad"].type == core.VarDesc.VarType.SELECTED_ROWS: + sparse_grad_to_param.append( + str(new_inputs["Grad"].name) + ":" + str(new_inputs["Param"] + .name)) + def _get_pserver_grad_param_var(self, var, var_dict): """ Return pserver side grad/param variable, return None