diff --git a/README.md b/README.md index 56d6c10c642787836abb55cb2974bda0b8d22da4..c535e9514e1cac9aff51edfcd9bcdc5d34ccd9fd 100644 --- a/README.md +++ b/README.md @@ -2,8 +2,8 @@ [![Build Status](https://travis-ci.org/PaddlePaddle/Paddle.svg?branch=develop)](https://travis-ci.org/PaddlePaddle/Paddle) -[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) -[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) +[![Documentation Status](https://img.shields.io/badge/docs-latest-brightgreen.svg?style=flat)](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) +[![Documentation Status](https://img.shields.io/badge/中文文档-最新-brightgreen.svg)](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) [![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle.svg)](https://github.com/PaddlePaddle/Paddle/releases) [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE) @@ -19,7 +19,7 @@ Our vision is to enable deep learning for everyone via PaddlePaddle. Please refer to our [release announcement](https://github.com/PaddlePaddle/Paddle/releases) to track the latest feature of PaddlePaddle. 
-### Latest PaddlePaddle Release: [Fluid 1.1.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.1) +### Latest PaddlePaddle Release: [Fluid 1.2.0](https://github.com/PaddlePaddle/Paddle/tree/release/1.2) ### Install Latest Stable Release: ``` # Linux CPU @@ -27,9 +27,9 @@ pip install paddlepaddle # Linux GPU cuda9cudnn7 pip install paddlepaddle-gpu # Linux GPU cuda8cudnn7 -pip install paddlepaddle-gpu==1.1.0.post87 +pip install paddlepaddle-gpu==1.2.0.post87 # Linux GPU cuda8cudnn5 -pip install paddlepaddle-gpu==1.1.0.post85 +pip install paddlepaddle-gpu==1.2.0.post85 # For installation on other platform, refer to http://paddlepaddle.org/ ``` @@ -76,26 +76,26 @@ pip install paddlepaddle-gpu==1.1.0.post85 ## Installation -It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) on our website. +It is recommended to read [this doc](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/install/index_cn.html) on our website. ## Documentation -We provide [English](http://paddlepaddle.org/documentation/docs/en/1.1/getstarted/index_en.html) and -[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.1/beginners_guide/index.html) documentation. +We provide [English](http://paddlepaddle.org/documentation/docs/en/1.2/getstarted/index_en.html) and +[Chinese](http://paddlepaddle.org/documentation/docs/zh/1.2/beginners_guide/index.html) documentation. - [Deep Learning 101](https://github.com/PaddlePaddle/book) You might want to start from this online interactive book that can run in a Jupyter Notebook. -- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.1/user_guides/howto/training/cluster_howto.html) +- [Distributed Training](http://paddlepaddle.org/documentation/docs/zh/1.2/user_guides/howto/training/cluster_howto.html) You can run distributed training jobs on MPI clusters. 
-- [Python API](http://paddlepaddle.org/documentation/api/zh/1.1/fluid.html) +- [Python API](http://paddlepaddle.org/documentation/docs/zh/1.2/api_cn/index_cn.html) Our new API enables much shorter programs. -- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.1/advanced_usage/development/contribute_to_paddle.html) +- [How to Contribute](http://paddlepaddle.org/documentation/docs/zh/1.2/advanced_usage/development/contribute_to_paddle/index_cn.html) We appreciate your contributions! diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 89726bf9859e71ee04c2f9380554090845fd44e5..2ced43f9e6c60da642f7a6252f889d9c9ab9748f 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -166,6 +166,8 @@ function(op_library TARGET) # Append first implemented MKLDNN activation operator if (${MKLDNN_FILE} STREQUAL "activation_mkldnn_op") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(relu, MKLDNN);\n") + elseif(${MKLDNN_FILE} STREQUAL "conv_mkldnn_op") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n") else() file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") endif() diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec index 4f286049d40af988e83b0d0bc24a7b61ee90444f..fd4cf92d85d5daa891d602d4365122c870920bba 100644 --- a/paddle/fluid/API.spec +++ b/paddle/fluid/API.spec @@ -66,6 +66,7 @@ paddle.fluid.layers.linear_chain_crf ArgSpec(args=['input', 'label', 'param_attr paddle.fluid.layers.crf_decoding ArgSpec(args=['input', 'param_attr', 'label'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.cos_sim ArgSpec(args=['X', 'Y'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.cross_entropy ArgSpec(args=['input', 'label', 'soft_label', 'ignore_index'], varargs=None, keywords=None, defaults=(False, -100)) +paddle.fluid.layers.bpr_loss ArgSpec(args=['input', 'label', 'name'], varargs=None, keywords=None, defaults=(None,)) 
paddle.fluid.layers.square_error_cost ArgSpec(args=['input', 'label'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.chunk_eval ArgSpec(args=['input', 'label', 'chunk_scheme', 'num_chunk_types', 'excluded_chunk_types'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.sequence_conv ArgSpec(args=['input', 'num_filters', 'filter_size', 'filter_stride', 'padding', 'bias_attr', 'param_attr', 'act', 'name'], varargs=None, keywords=None, defaults=(3, 1, None, None, None, None, None)) @@ -194,6 +195,8 @@ paddle.fluid.layers.grid_sampler ArgSpec(args=['x', 'grid', 'name'], varargs=Non paddle.fluid.layers.log_loss ArgSpec(args=['input', 'label', 'epsilon', 'name'], varargs=None, keywords=None, defaults=(0.0001, None)) paddle.fluid.layers.add_position_encoding ArgSpec(args=['input', 'alpha', 'beta', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.bilinear_tensor_product ArgSpec(args=['x', 'y', 'size', 'act', 'name', 'param_attr', 'bias_attr'], varargs=None, keywords=None, defaults=(None, None, None, None)) +paddle.fluid.layers.merge_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.get_tensor_from_selected_rows ArgSpec(args=['x', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.lstm ArgSpec(args=['input', 'init_h', 'init_c', 'max_len', 'hidden_size', 'num_layers', 'dropout_prob', 'is_bidirec', 'is_test', 'name', 'default_initializer', 'seed'], varargs=None, keywords=None, defaults=(0.0, False, False, None, None, -1)) paddle.fluid.layers.data ArgSpec(args=['name', 'shape', 'append_batch_size', 'dtype', 'lod_level', 'type', 'stop_gradient'], varargs=None, keywords=None, defaults=(True, 'float32', 0, VarType.LOD_TENSOR, True)) paddle.fluid.layers.open_files ArgSpec(args=['filenames', 'shapes', 'lod_levels', 'dtypes', 'thread_num', 'buffer_size', 'pass_num', 'is_test'], varargs=None, keywords=None, defaults=(None, None, 1, None)) 
@@ -299,6 +302,7 @@ paddle.fluid.layers.generate_proposals ArgSpec(args=['scores', 'bbox_deltas', 'i paddle.fluid.layers.iou_similarity ArgSpec(args=['x', 'y', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.box_coder ArgSpec(args=['prior_box', 'prior_box_var', 'target_box', 'code_type', 'box_normalized', 'name'], varargs=None, keywords=None, defaults=('encode_center_size', True, None)) paddle.fluid.layers.polygon_box_transform ArgSpec(args=['input', 'name'], varargs=None, keywords=None, defaults=(None,)) +paddle.fluid.layers.yolov3_loss ArgSpec(args=['x', 'gtbox', 'gtlabel', 'anchors', 'class_num', 'ignore_thresh', 'loss_weight_xy', 'loss_weight_wh', 'loss_weight_conf_target', 'loss_weight_conf_notarget', 'loss_weight_class', 'name'], varargs=None, keywords=None, defaults=(None, None, None, None, None, None)) paddle.fluid.layers.accuracy ArgSpec(args=['input', 'label', 'k', 'correct', 'total'], varargs=None, keywords=None, defaults=(1, None, None)) paddle.fluid.layers.auc ArgSpec(args=['input', 'label', 'curve', 'num_thresholds', 'topk', 'slide_steps'], varargs=None, keywords=None, defaults=('ROC', 4095, 1, 1)) paddle.fluid.layers.exponential_decay ArgSpec(args=['learning_rate', 'decay_steps', 'decay_rate', 'staircase'], varargs=None, keywords=None, defaults=(False,)) @@ -419,3 +423,17 @@ paddle.fluid.Scope.drop_kids drop_kids(self: paddle.fluid.core.Scope) -> None paddle.fluid.Scope.find_var find_var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable paddle.fluid.Scope.new_scope new_scope(self: paddle.fluid.core.Scope) -> paddle.fluid.core.Scope paddle.fluid.Scope.var var(self: paddle.fluid.core.Scope, arg0: unicode) -> paddle.fluid.core.Variable +paddle.reader.map_readers ArgSpec(args=['func'], varargs='readers', keywords=None, defaults=None) +paddle.reader.buffered ArgSpec(args=['reader', 'size'], varargs=None, keywords=None, defaults=None) +paddle.reader.compose ArgSpec(args=[], varargs='readers', 
keywords='kwargs', defaults=None) +paddle.reader.chain ArgSpec(args=[], varargs='readers', keywords=None, defaults=None) +paddle.reader.shuffle ArgSpec(args=['reader', 'buf_size'], varargs=None, keywords=None, defaults=None) +paddle.reader.firstn ArgSpec(args=['reader', 'n'], varargs=None, keywords=None, defaults=None) +paddle.reader.xmap_readers ArgSpec(args=['mapper', 'reader', 'process_num', 'buffer_size', 'order'], varargs=None, keywords=None, defaults=(False,)) +paddle.reader.PipeReader.__init__ ArgSpec(args=['self', 'command', 'bufsize', 'file_type'], varargs=None, keywords=None, defaults=(8192, 'plain')) +paddle.reader.PipeReader.get_line ArgSpec(args=['self', 'cut_lines', 'line_break'], varargs=None, keywords=None, defaults=(True, '\n')) +paddle.reader.multiprocess_reader ArgSpec(args=['readers', 'use_pipe', 'queue_size'], varargs=None, keywords=None, defaults=(True, 1000)) +paddle.reader.Fake.__init__ ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) +paddle.reader.creator.np_array ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None) +paddle.reader.creator.text_file ArgSpec(args=['path'], varargs=None, keywords=None, defaults=None) +paddle.reader.creator.recordio ArgSpec(args=['paths', 'buf_size'], varargs=None, keywords=None, defaults=(100,)) diff --git a/paddle/fluid/CMakeLists.txt b/paddle/fluid/CMakeLists.txt index 6b526f0103ad3c530c06a68757cf89293f4fb84b..595454e90b9cd713fd2baed24538cf5fbc93934a 100644 --- a/paddle/fluid/CMakeLists.txt +++ b/paddle/fluid/CMakeLists.txt @@ -1,6 +1,7 @@ add_subdirectory(memory) add_subdirectory(platform) add_subdirectory(framework) +add_subdirectory(imperative) add_subdirectory(operators) add_subdirectory(string) add_subdirectory(recordio) diff --git a/paddle/fluid/framework/CMakeLists.txt b/paddle/fluid/framework/CMakeLists.txt index b236eef3cee36a61bc18ad2eff1713ab39cbcc78..aa8b4628c5f39d49a89ac18b8b11154ae19dfecc 100644 --- a/paddle/fluid/framework/CMakeLists.txt +++ 
b/paddle/fluid/framework/CMakeLists.txt @@ -120,8 +120,9 @@ cc_library(op_info SRCS op_info.cc DEPS attribute framework_proto) cc_library(shape_inference SRCS shape_inference.cc DEPS ddim attribute device_context) cc_library(transfer_scope_cache SRCS transfer_scope_cache.cc DEPS scope framework_proto device_context) +cc_library(op_kernel_type SRCS op_kernel_type.cc DEPS device_context place) cc_library(operator SRCS operator.cc DEPS op_info device_context tensor scope glog - shape_inference data_transform lod_tensor profiler transfer_scope_cache) + shape_inference data_transform lod_tensor profiler transfer_scope_cache op_kernel_type) cc_test(operator_test SRCS operator_test.cc DEPS operator op_registry device_context) @@ -193,7 +194,7 @@ cc_test(var_type_inference_test SRCS var_type_inference_test.cc DEPS op_registry cc_library(selected_rows SRCS selected_rows.cc DEPS tensor) cc_test(selected_rows_test SRCS selected_rows_test.cc DEPS selected_rows) -cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto) +cc_test(op_kernel_type_test SRCS op_kernel_type_test.cc DEPS place device_context framework_proto op_kernel_type) cc_test(cow_ptr_tests SRCS details/cow_ptr_test.cc) cc_test(tuple_test SRCS tuple_test.cc ) diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc index 291d8ffc3c3334c2836e1651a8997984bba084e1..a99cf53b410433c6e4b8a19821779f28c25e678f 100644 --- a/paddle/fluid/framework/data_feed.cc +++ b/paddle/fluid/framework/data_feed.cc @@ -33,11 +33,7 @@ void DataFeed::AddFeedVar(Variable* var, const std::string& name) { CheckInit(); for (size_t i = 0; i < use_slots_.size(); ++i) { if (name == use_slots_[i]) { - if (use_slots_is_dense_[i]) { - feed_vec_[i] = MixTensor(var->GetMutable()); - } else { - feed_vec_[i] = MixTensor(var->GetMutable()); - } + feed_vec_[i] = var->GetMutable(); } } } @@ -301,6 +297,7 @@ bool MultiSlotDataFeed::ParseOneInstance(std::vector* instance) { "the data, 
please check if the data contains unresolvable " "characters.\nplease check this error line: %s", str); + if (idx != -1) { (*instance)[idx].Init(all_slots_type_[i]); if ((*instance)[idx].GetType()[0] == 'f') { // float @@ -337,6 +334,7 @@ void MultiSlotDataFeed::AddInstanceToInsVec( (*ins_vec)[i].InitOffset(); } } + for (size_t i = 0; i < instance.size(); ++i) { (*ins_vec)[i].AddIns(instance[i]); } @@ -348,36 +346,25 @@ void MultiSlotDataFeed::PutToFeedVec( const auto& type = ins_vec[i].GetType(); const auto& offset = ins_vec[i].GetOffset(); int total_instance = static_cast(offset.back()); + if (type[0] == 'f') { // float const auto& feasign = ins_vec[i].GetFloatData(); - if (feed_vec_[i].IsDense()) { - int size_in_each_batch = total_instance / batch_size_; - float* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data( - {batch_size_, size_in_each_batch}, platform::CPUPlace()); - memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); - } else { - float* tensor_ptr = feed_vec_[i].GetLoDTensor()->mutable_data( - {total_instance, 1}, platform::CPUPlace()); - memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); - LoD data_lod{offset}; - feed_vec_[i].GetLoDTensor()->set_lod(data_lod); - } + float* tensor_ptr = feed_vec_[i]->mutable_data( + {total_instance, 1}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(float)); } else if (type[0] == 'u') { // uint64 // no uint64_t type in paddlepaddle const auto& feasign = ins_vec[i].GetUint64Data(); - if (feed_vec_[i].IsDense()) { - int size_in_each_batch = total_instance / batch_size_; - int64_t* tensor_ptr = feed_vec_[i].GetTensor()->mutable_data( - {batch_size_, size_in_each_batch}, platform::CPUPlace()); - memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); - } else { - int64_t* tensor_ptr = - feed_vec_[i].GetLoDTensor()->mutable_data( - {total_instance, 1}, platform::CPUPlace()); - memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); - LoD 
data_lod{offset}; - feed_vec_[i].GetLoDTensor()->set_lod(data_lod); - } + int64_t* tensor_ptr = feed_vec_[i]->mutable_data( + {total_instance, 1}, platform::CPUPlace()); + memcpy(tensor_ptr, &feasign[0], total_instance * sizeof(int64_t)); + } + + LoD data_lod{offset}; + feed_vec_[i]->set_lod(data_lod); + if (use_slots_is_dense_[i]) { + int dim = total_instance / batch_size_; + feed_vec_[i]->Resize({batch_size_, dim}); } } } diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h index a7f8d1d31752af200145bc7934e7880910338e9d..7cc6919703680c359b89075777e97676f5253c57 100644 --- a/paddle/fluid/framework/data_feed.h +++ b/paddle/fluid/framework/data_feed.h @@ -30,35 +30,6 @@ limitations under the License. */ namespace paddle { namespace framework { -// Pack Tensor type and LoDTensor type into MixTensor type, in order -// to record either Tensor or LoDTensor information at the same time. -class MixTensor { - public: - MixTensor() {} - explicit MixTensor(LoDTensor* lodtensor) { - is_dense_ = false; - lodtensor_ = lodtensor; - } - explicit MixTensor(Tensor* tensor) { - is_dense_ = true; - tensor_ = tensor; - } - bool IsDense() { return is_dense_; } - LoDTensor* GetLoDTensor() { - PADDLE_ENFORCE(!is_dense_, "Let a dense var return a LoDTensor ptr."); - return lodtensor_; - } - Tensor* GetTensor() { - PADDLE_ENFORCE(is_dense_, "Let a sparse var return a Tensor ptr."); - return tensor_; - } - - private: - bool is_dense_; - LoDTensor* lodtensor_; - Tensor* tensor_; -}; - // DataFeed is the base virtual class for all ohther DataFeeds. // It is used to read files and parse the data for subsequent trainer. 
// Example: @@ -133,7 +104,7 @@ class DataFeed { use_slots_index_; // -1: not used; >=0: the index of use_slots_ // The data read by DataFeed will be stored here - std::vector feed_vec_; + std::vector feed_vec_; // the batch size defined by user int default_batch_size_; diff --git a/paddle/fluid/framework/data_feed_test.cc b/paddle/fluid/framework/data_feed_test.cc index 3974f8dbadf332801a822618d77f140db440b29d..b3e969871592394a7ac2fdeab8495677e7bba070 100644 --- a/paddle/fluid/framework/data_feed_test.cc +++ b/paddle/fluid/framework/data_feed_test.cc @@ -152,19 +152,13 @@ void GetElemSetFromReader(std::vector* reader_elem_set, const auto& multi_slot_desc = data_feed_desc.multi_slot_desc(); std::map lodtensor_targets; - std::map tensor_targets; for (int i = 0; i < multi_slot_desc.slots_size(); ++i) { const auto& slot = multi_slot_desc.slots(i); if (slot.is_used()) { const auto& name = slot.name(); readers[idx]->AddFeedVar(scope->Var(name), name); - if (slot.is_dense()) { - tensor_targets[name] = - &scope->FindVar(name)->Get(); - } else { - lodtensor_targets[name] = - &scope->FindVar(name)->Get(); - } + lodtensor_targets[name] = + &scope->FindVar(name)->Get(); } } readers[idx]->Start(); @@ -175,8 +169,9 @@ void GetElemSetFromReader(std::vector* reader_elem_set, if (!slot.is_used()) { continue; } + const paddle::framework::LoDTensor* tens = + lodtensor_targets[slot.name()]; if (slot.is_dense()) { // dense branch - const paddle::framework::Tensor* tens = tensor_targets[slot.name()]; if (slot.type() == "uint64") { const int64_t* data = tens->data(); int batch_size = tens->dims()[0]; @@ -202,8 +197,6 @@ void GetElemSetFromReader(std::vector* reader_elem_set, PADDLE_THROW("Error type in proto file."); } } else { // sparse branch - const paddle::framework::LoDTensor* tens = - lodtensor_targets[slot.name()]; if (slot.type() == "uint64") { const int64_t* data = tens->data(); for (size_t i = 0; i < tens->NumElements(); ++i) { diff --git 
a/paddle/fluid/framework/data_layout_transform.cc b/paddle/fluid/framework/data_layout_transform.cc index c9e3a8ac1d1e5228725bff49ecc6d91e640dfe57..5467f6d1b23c0058f06387e3da97c4193dd5ca6c 100644 --- a/paddle/fluid/framework/data_layout_transform.cc +++ b/paddle/fluid/framework/data_layout_transform.cc @@ -151,19 +151,22 @@ void TransDataLayoutFromMKLDNN(const OpKernelType& kernel_type_for_var, auto out_format = platform::MKLDNNFormatForSize(in_tz.size(), ToMKLDNNFormat(out_layout)); - void* in_data = GetDataFromTensor(in, in_type); - // output tensor has the same dims as input. Reorder don't change dims out->Resize(in.dims()); - auto out_data = out->mutable_data(expected_kernel_type.place_, in.type()); - - auto in_memory = memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data); - auto out_memory = - memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data); + if (in_format != out_format) { + void* in_data = GetDataFromTensor(in, in_type); + auto out_data = out->mutable_data(expected_kernel_type.place_, in.type()); - platform::Reorder(in_memory, out_memory); + auto in_memory = + memory({{{in_tz}, in_type, in_format}, cpu_engine}, in_data); + auto out_memory = + memory({{{out_tz}, out_type, out_format}, cpu_engine}, out_data); + platform::Reorder(in_memory, out_memory); + } else { + out->ShareDataWith(in); + } out->set_layout(out_layout); // reset format since the out tensor will be feed to non-MKLDNN OPkernel out->set_format(memory::format::format_undef); diff --git a/paddle/fluid/framework/details/CMakeLists.txt b/paddle/fluid/framework/details/CMakeLists.txt index a6c8ef408a8b40e5104bd6c4ace233e51a96b862..a927a3afcddb52f571543462e485b682aac163ae 100644 --- a/paddle/fluid/framework/details/CMakeLists.txt +++ b/paddle/fluid/framework/details/CMakeLists.txt @@ -15,14 +15,26 @@ cc_library(variable_visitor SRCS variable_visitor.cc DEPS lod_tensor selected_ro if(WITH_GPU) nv_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base 
scope lod_tensor ddim memory dynload_cuda variable_visitor) - nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim dynload_cuda) + if(WITH_DISTRIBUTE) + nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope + ddim dynload_cuda selected_rows_functor sendrecvop_grpc) + else() + nv_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope + ddim dynload_cuda selected_rows_functor) + endif() nv_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor dynload_cuda) nv_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) else() cc_library(all_reduce_op_handle SRCS all_reduce_op_handle.cc DEPS op_handle_base scope lod_tensor ddim memory variable_visitor) - cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope ddim) + if(WITH_DISTRIBUTE) + cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope + ddim selected_rows_functor sendrecvop_grpc) + else() + cc_library(reduce_op_handle SRCS reduce_op_handle.cc DEPS op_handle_base variable_visitor scope + ddim selected_rows_functor) + endif() cc_library(broadcast_op_handle SRCS broadcast_op_handle.cc DEPS op_handle_base scope ddim memory variable_visitor) cc_library(fused_broadcast_op_handle SRCS fused_broadcast_op_handle.cc DEPS broadcast_op_handle) endif() diff --git a/paddle/fluid/framework/details/all_reduce_op_handle.cc b/paddle/fluid/framework/details/all_reduce_op_handle.cc index a003995ae3f8e111881b4681554aa8eb17b60cc1..e8bf53e160e7382122c3c2f92a152fea058a032e 100644 --- a/paddle/fluid/framework/details/all_reduce_op_handle.cc +++ b/paddle/fluid/framework/details/all_reduce_op_handle.cc @@ -48,7 +48,14 @@ AllReduceOpHandle::AllReduceOpHandle(ir::Node *node, void AllReduceOpHandle::RunImpl() { platform::RecordEvent 
record_event(Name(), dev_ctxes_.cbegin()->second); +// FIXME(typhoonzero): If scope0(global scope) have NCCL_ID_VAR, +// this is a distributed or inter-process call, find a better way. +#ifdef PADDLE_WITH_CUDA + if (NoDummyInputSize() == 1 && + local_scopes_[0]->FindLocalVar(NCCL_ID_VARNAME) == nullptr) { +#else if (NoDummyInputSize() == 1) { +#endif return; // No need to all reduce when GPU count = 1; } else { // Wait input done diff --git a/paddle/fluid/framework/details/build_strategy.cc b/paddle/fluid/framework/details/build_strategy.cc index 523f9eadf2d7e2e08504c5920372fb7cdb0d7aba..d8526b3f2492992c5c0f6f5e0a85cffca7398700 100644 --- a/paddle/fluid/framework/details/build_strategy.cc +++ b/paddle/fluid/framework/details/build_strategy.cc @@ -58,10 +58,23 @@ class ParallelExecutorPassBuilder : public ir::PassBuilder { } } + CollectiveContext *context = CollectiveContext::GetInstance(); + context->endpoints_ = strategy_.trainers_endpoints_; + context->trainer_id_ = strategy_.trainer_id_; + PADDLE_ENFORCE(strategy_.trainer_id_ >= 0, "trainer_id_ >= 0"); + if (strategy_.trainer_id_ > 0) { + PADDLE_ENFORCE((unsigned)(strategy_.trainer_id_) < + strategy_.trainers_endpoints_.size(), + "trainer_id_ < endpoints_ size"); + } + VLOG(1) << "CollectiveContext:" << context->String(); + // Convert graph to run on multi-devices. auto multi_devices_pass = AppendPass("multi_devices_pass"); multi_devices_pass->SetNotOwned("strategy", &strategy_); + multi_devices_pass->Set("num_trainers", + new int(strategy_.num_trainers_)); // Add a graph print pass to record a graph with device info. 
if (!strategy_.debug_graphviz_path_.empty()) { @@ -133,16 +146,16 @@ std::unique_ptr BuildStrategy::Apply( pass->SetNotOwned("nccl_ctxs", nctx); #endif } else if (pass->Type() == "sequential_execution_pass") { - VLOG(1) << "set enable_sequential_execution:" - << enable_sequential_execution_; + LOG(INFO) << "set enable_sequential_execution:" + << enable_sequential_execution_; pass->Erase(kAllOpDescs); pass->Set>( kAllOpDescs, new std::vector(main_program.Block(0).AllOps())); } else if (pass->Type() == "all_reduce_deps_pass") { - VLOG(1) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this) - << ", num_trainers:" << num_trainers_; + LOG(INFO) << "SeqOnlyAllReduceOps:" << SeqOnlyAllReduceOps(*this) + << ", num_trainers:" << num_trainers_; pass->Erase(kAllOpDescs); pass->Set>( diff --git a/paddle/fluid/framework/details/build_strategy.h b/paddle/fluid/framework/details/build_strategy.h index 9f0a25912886cea7a1f287125cfe8612e4b336eb..c97be169575f578dfd18a6290230d1b3f3bd7596 100644 --- a/paddle/fluid/framework/details/build_strategy.h +++ b/paddle/fluid/framework/details/build_strategy.h @@ -74,6 +74,8 @@ struct BuildStrategy { bool fuse_broadcast_op_{false}; int num_trainers_{1}; + int trainer_id_{0}; + std::vector trainers_endpoints_; bool remove_unnecessary_lock_{false}; // NOTE: diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc index ba4c8dd9bbac0f76037d144fa7b7ff3c4be23df0..8af1d62dea89343ff2d41dd7c6ac837459df7685 100644 --- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc +++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc @@ -133,6 +133,7 @@ static const char kPlaces[] = "places"; static const char kParams[] = "params"; static const char kLocalScopes[] = "local_scopes"; static const char kStrategy[] = "strategy"; +static const char kNumTrainers[] = "num_trainers"; void MultiDevSSAGraphBuilder::Init() const { all_vars_.clear(); @@ -299,6 +300,8 @@ 
std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( auto nodes = graph->ReleaseNodes(); ir::Graph &result = *graph; + int num_trainers = Get(kNumTrainers); + for (auto &node : nodes) { if (node->IsVar() && node->Var()) { all_vars_.emplace(node->Name(), node->Var()); @@ -383,7 +386,7 @@ std::unique_ptr MultiDevSSAGraphBuilder::ApplyImpl( CreateComputationalOps(&result, node, places_.size()); } - if (!is_forwarding && places_.size() > 1) { + if (!is_forwarding && (places_.size() > 1 || num_trainers > 1)) { // Currently, we assume that once gradient is generated, it can be // broadcast, and each gradient is only broadcast once. if (static_cast(boost::get(node->Op()->GetAttr( @@ -895,4 +898,5 @@ REGISTER_PASS(multi_devices_pass, .RequirePassAttr(paddle::framework::details::kPlaces) .RequirePassAttr(paddle::framework::details::kParams) .RequirePassAttr(paddle::framework::details::kLocalScopes) - .RequirePassAttr(paddle::framework::details::kStrategy); + .RequirePassAttr(paddle::framework::details::kStrategy) + .RequirePassAttr(paddle::framework::details::kNumTrainers); diff --git a/paddle/fluid/framework/details/op_registry.h b/paddle/fluid/framework/details/op_registry.h index 1ce18c3d6b26c541beed668a113b7a4de7f0e79e..eea7e712f8f6e187cdceedce77cc76d1d4ca2101 100644 --- a/paddle/fluid/framework/details/op_registry.h +++ b/paddle/fluid/framework/details/op_registry.h @@ -32,9 +32,7 @@ enum OpInfoFillType { kOpProtoAndCheckerMaker = 1, kGradOpDescMaker = 2, kVarTypeInference = 3, - kShapeInference = 4, - kEstimateFlops = 5, - kUnknown = -1 + kShapeInference = 4 }; template @@ -50,10 +48,8 @@ struct OpInfoFillTypeID { ? kVarTypeInference : (std::is_base_of::value ? kShapeInference - : (std::is_base_of::value - ? 
kEstimateFlops - : kUnknown))))); + : static_cast( + -1))))); } }; @@ -143,16 +139,6 @@ struct OpInfoFiller { } }; -template -struct OpInfoFiller { - void operator()(const char* op_tpe, OpInfo* info) const { - info->estimate_flops_ = [](InferShapeContext* ctx) { - T estimate_flops; - return estimate_flops(ctx); - }; - } -}; - } // namespace details } // namespace framework diff --git a/paddle/fluid/framework/details/reduce_and_gather.h b/paddle/fluid/framework/details/reduce_and_gather.h index bd6153c0c736f6e32378eebcbf6c4d7e402c9b42..2e5256fbd49a3f8c72840cd55dada4301cb04eb9 100644 --- a/paddle/fluid/framework/details/reduce_and_gather.h +++ b/paddle/fluid/framework/details/reduce_and_gather.h @@ -53,7 +53,7 @@ struct ReduceLoDTensor { } }; -inline void GatherSelectedRows( +inline void GatherLocalSelectedRows( const std::vector &src_selecte_rows_, const std::vector &in_places, const std::map &dev_ctxes, diff --git a/paddle/fluid/framework/details/reduce_op_handle.cc b/paddle/fluid/framework/details/reduce_op_handle.cc index c9f1107aeab5a21d46e828308cfcb2dde827cba6..cb864848b938e249ecd9d09e2a02f683959ce413 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.cc +++ b/paddle/fluid/framework/details/reduce_op_handle.cc @@ -16,6 +16,12 @@ #include "paddle/fluid/framework/details/container_cast.h" #include "paddle/fluid/framework/details/reduce_and_gather.h" #include "paddle/fluid/framework/details/variable_visitor.h" +#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE +#include "paddle/fluid/operators/distributed/collective_client.h" +#include "paddle/fluid/operators/distributed/collective_server.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#endif +#include "paddle/fluid/operators/math/selected_rows_functor.h" #include "paddle/fluid/platform/profiler.h" DEFINE_bool( @@ -26,6 +32,112 @@ namespace paddle { namespace framework { namespace details { +std::once_flag CollectiveContext::init_flag_; +std::unique_ptr 
CollectiveContext::context_; + +static inline std::string GetRemoteVarName(const std::string &var_name, + int trainer_id) { + return string::Sprintf("%s_merged_tmp@trainer_%d", var_name, trainer_id); +} + +void ReduceOpHandle::Wait( + const std::map &dev_ctxes) { + // TODO(gongwb): use event wait? + for (auto &dev_ctx : dev_ctxes) { + dev_ctx.second->Wait(); + } +} + +#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE +template +void ReduceOpHandle::GatherSelectedRows( + const std::vector &src_selected_rows, + const std::vector &in_places, + const std::map &dev_ctxes, + VarHandle *out_var_handle, const platform::Place &out_place, + SelectedRows *dst_selected_rows) { + const CollectiveContext &collective_context = + *CollectiveContext::GetInstance(); + + // 1. gather local selected rows, merge them + std::string gathered_var_name = out_var_handle->name_ + "_gathered_tmp"; + auto scope = local_scopes_.at(out_var_handle->scope_idx_); + auto gathered_var_mid = scope->Var(gathered_var_name); + auto gathered_select_rows = + gathered_var_mid->GetMutable(); + GatherLocalSelectedRows(src_selected_rows, in_places, dev_ctxes, out_place, + gathered_select_rows); + // FIXME(gongwb): remove this Wait. + Wait(dev_ctxes); + + // merge them + auto merged_dev_ctx = dynamic_cast(dev_ctxes.at(out_place)); + std::string merged_var_name = + GetRemoteVarName(out_var_handle->name_, collective_context.trainer_id_); + auto merged_select_rows = + scope->Var(merged_var_name)->GetMutable(); + operators::math::scatter::MergeAdd merge_func; + merge_func(*merged_dev_ctx, *gathered_select_rows, merged_select_rows); + + // 2. 
start collective server if it doesn't exist + operators::distributed::CollectiveServer *server = + operators::distributed::CollectiveServer::GetInstance( + collective_context.endpoints_[collective_context.trainer_id_], + collective_context.endpoints_.size() - 1); + + auto rpc_server = server->GetRPCServer(); + rpc_server->RegisterVar(merged_var_name, + operators::distributed::kRequestGetMonomerVariable, + scope, merged_dev_ctx); + + // 3. gather them from all remote nodes. + std::vector remote; + operators::distributed::CollectiveClient *client = + operators::distributed::CollectiveClient::GetInstance(); + + std::vector vars; + for (unsigned int i = 0; i < collective_context.endpoints_.size(); i++) { + if (i == (unsigned)collective_context.trainer_id_) continue; + + operators::distributed::RemoteVar var; + var.trainer_id_ = i; + var.var_name_ = GetRemoteVarName(out_var_handle->name_, i); + var.ep_ = collective_context.endpoints_[i]; + + vars.push_back(var); + VLOG(4) << "gather from:" << var.String(); + } + + // erase gathered vars + merged_dev_ctx->Wait(); + scope->EraseVars(std::vector{gathered_var_name}); + + PADDLE_ENFORCE(client->Gather(vars, &remote, *merged_dev_ctx, scope)); + PADDLE_ENFORCE(remote.size() == vars.size()); + + // 4. merged local selected rows. + std::vector all; + all.resize(collective_context.endpoints_.size()); + for (auto v : vars) { + all[v.trainer_id_] = + scope->FindVar(v.var_name_)->GetMutable(); + } + all[collective_context.trainer_id_] = merged_select_rows; + + merge_func(*merged_dev_ctx, all, dst_selected_rows); + + rpc_server->WaitVarBarrier(merged_var_name); + rpc_server->ClearVar(merged_var_name); + + // 5. 
clear mid vars + std::vector tmp_vars{merged_var_name}; + for (auto r : vars) { + tmp_vars.push_back(r.var_name_); + } + scope->EraseVars(tmp_vars); +} +#endif + void ReduceOpHandle::RunImpl() { platform::RecordEvent record_event(Name(), dev_ctxes_.cbegin()->second); @@ -90,8 +202,36 @@ void ReduceOpHandle::RunImpl() { this->RunAndRecordEvent([&] { std::vector in_selected_rows = GetInputValues(in_var_handles, var_scopes); - GatherSelectedRows(in_selected_rows, in_places, dev_ctxes_, t_out_p, - out_var->GetMutable()); + + const CollectiveContext &collective_context = + *CollectiveContext::GetInstance(); + VLOG(10) << "GatherSelectedRows CollectiveContext:" + << collective_context.String(); + + // TODO(gongwb): add cpu support + if (collective_context.endpoints_.size() <= 1 || + is_cpu_place(in_places[0]) || is_cpu_place(t_out_p)) { + GatherLocalSelectedRows(in_selected_rows, in_places, dev_ctxes_, + t_out_p, + out_var->GetMutable()); + return; + } + +#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE + if (framework::IsType(in_selected_rows[0]->value().type())) { + GatherSelectedRows( + in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p, + out_var->GetMutable()); + } else if (framework::IsType( + in_selected_rows[0]->value().type())) { + GatherSelectedRows( + in_selected_rows, in_places, dev_ctxes_, out_var_handle, t_out_p, + out_var->GetMutable()); + } else { + PADDLE_ENFORCE(false, + "only support double or float when gahter SelectedRows"); + } +#endif }); } else { std::vector lod_tensors = diff --git a/paddle/fluid/framework/details/reduce_op_handle.h b/paddle/fluid/framework/details/reduce_op_handle.h index 846839029ca65be1bdeac2f6ea497db07a01b6cf..5491f00f45e9d48c5eb7455396ac51801f2c40ab 100644 --- a/paddle/fluid/framework/details/reduce_op_handle.h +++ b/paddle/fluid/framework/details/reduce_op_handle.h @@ -30,6 +30,32 @@ namespace paddle { namespace framework { namespace details { +struct CollectiveContext { + std::vector endpoints_; 
+ int trainer_id_{0}; + + std::string String() const { + std::stringstream ss; + ss << "endpoints_:"; + for (auto e : endpoints_) { + ss << e << ","; + } + + ss << "trainer_id_:" << trainer_id_; + + return ss.str(); + } + + static CollectiveContext *GetInstance() { + std::call_once(init_flag_, + [&]() { context_.reset(new CollectiveContext()); }); + return context_.get(); + } + + private: + static std::once_flag init_flag_; + static std::unique_ptr context_; +}; struct ReduceOpHandle : public OpHandleBase { std::vector local_scopes_; @@ -64,6 +90,19 @@ struct ReduceOpHandle : public OpHandleBase { protected: void RunImpl() override; +#if defined PADDLE_WITH_CUDA && defined PADDLE_WITH_DISTRIBUTE + template + void GatherSelectedRows( + const std::vector &src_selecte_rows_, + const std::vector &in_places, + const std::map &dev_ctxes, + VarHandle *out_var_handle, const platform::Place &out_place, + SelectedRows *dst_selecte_rows); +#endif + + void Wait( + const std::map &dev_ctxes); + template std::vector GetInputValues( const std::vector &in_var_handles, diff --git a/paddle/fluid/framework/executor_thread_worker.cc b/paddle/fluid/framework/executor_thread_worker.cc index 4e4001e979fdd0774779fa288402c7847af90637..3d5351161554f46539c54ce3efbfc2d6a36fc419 100644 --- a/paddle/fluid/framework/executor_thread_worker.cc +++ b/paddle/fluid/framework/executor_thread_worker.cc @@ -97,7 +97,7 @@ void ExecutorThreadWorker::SetDevice() { static unsigned concurrency_cap = std::thread::hardware_concurrency(); int thread_id = this->thread_id_; - if (thread_id < concurrency_cap) { + if (static_cast(thread_id) < concurrency_cap) { unsigned proc = thread_id; cpu_set_t mask; diff --git a/paddle/fluid/framework/feed_fetch_method.cc b/paddle/fluid/framework/feed_fetch_method.cc index 3e9353f5cf67d8de62c5551f12ea786e49190549..6338be75a4b1d3c4caf7a6f7add4d05fec690340 100644 --- a/paddle/fluid/framework/feed_fetch_method.cc +++ b/paddle/fluid/framework/feed_fetch_method.cc @@ -16,7 +16,9 @@ 
limitations under the License. */ #include #include #include "glog/logging.h" +#include "paddle/fluid/framework/var_type.h" #include "paddle/fluid/framework/variable.h" +#include "paddle/fluid/platform/place.h" namespace paddle { namespace framework { @@ -53,5 +55,12 @@ LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, return tensor; } +LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name) { + Variable* var = scope.FindVar(var_name); + PADDLE_ENFORCE(var, "%s no in scope", var_name); + PADDLE_ENFORCE(var->IsType(), "Only support lod tensor now."); + return *var->GetMutable(); +} + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/feed_fetch_method.h b/paddle/fluid/framework/feed_fetch_method.h index 7f504bfd232862c014cb59b6e8301eec74e0351f..031f8e01aa6128b803dcbfb990778e87d4fafc13 100644 --- a/paddle/fluid/framework/feed_fetch_method.h +++ b/paddle/fluid/framework/feed_fetch_method.h @@ -27,5 +27,7 @@ void SetFeedVariable(Scope* scope, const LoDTensor& input, LoDTensor& GetFetchVariable(const Scope& scope, const std::string& var_name, size_t index); +LoDTensor& GetVariableTensor(const Scope& scope, const std::string& var_name); + } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc index 449cc78be15bcd2575ce2e6846b41e475f8921f6..d4a701e0b173a96d8605dff308fee7007a0ecc0c 100644 --- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc +++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.cc @@ -46,14 +46,16 @@ std::unique_ptr ConvBiasFusePass::ApplyImpl( auto* scope = param_scope(); PADDLE_ENFORCE(scope); + std::string type = is_conv3d() ? 
"conv3d" : "conv2d"; + GraphPatternDetector gpd; auto* conv_input = gpd.mutable_pattern() ->NewNode(patterns::PDNodeName(name_scope_, "conv_input")) ->AsInput() - ->assert_is_op_input("conv2d", "Input"); + ->assert_is_op_input(type, "Input"); patterns::ConvBias conv_bias_pattern(gpd.mutable_pattern(), name_scope_); - conv_bias_pattern(conv_input); + conv_bias_pattern(conv_input, is_conv3d()); int found_conv_bias_count = 0; auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) { @@ -109,7 +111,7 @@ std::unique_ptr ConvBiasFusePass::ApplyImpl( desc.SetInput("Filter", std::vector({conv_weight->Name()})); desc.SetInput("Bias", std::vector({eltwise_bias->Name()})); desc.SetOutput("Output", std::vector({eltwise_out->Name()})); - desc.SetType("conv2d"); + desc.SetType(type); for (auto& attr : conv->Op()->GetAttrMap()) { desc.SetAttr(attr.first, attr.second); @@ -135,3 +137,5 @@ std::unique_ptr ConvBiasFusePass::ApplyImpl( } // namespace paddle REGISTER_PASS(conv_bias_mkldnn_fuse_pass, paddle::framework::ir::ConvBiasFusePass); +REGISTER_PASS(conv3d_bias_mkldnn_fuse_pass, + paddle::framework::ir::Conv3DBiasFusePass); diff --git a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h index 5775b83b88730ec298c421a15f5c0b83c27b0750..f3ad9f1c2bf14db418629e0c607e2510f01908b8 100644 --- a/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h +++ b/paddle/fluid/framework/ir/conv_bias_mkldnn_fuse_pass.h @@ -26,11 +26,19 @@ namespace ir { class ConvBiasFusePass : public FusePassBase { public: virtual ~ConvBiasFusePass() {} + virtual bool is_conv3d() const { return false; } protected: std::unique_ptr ApplyImpl(std::unique_ptr graph) const; const std::string name_scope_{"conv_bias_mkldnn_fuse"}; }; +/* +* Fuse the Conv3D and Elementwise_add to a Conv3DBiasOp. 
+*/ +class Conv3DBiasFusePass : public ConvBiasFusePass { + public: + bool is_conv3d() const override { return true; } +}; } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/graph.cc b/paddle/fluid/framework/ir/graph.cc index fc91564bbaecf7b1725908fc1eb8b1e4d2e20d32..8679118fe28b1c68aea30caf711441823b5255c0 100644 --- a/paddle/fluid/framework/ir/graph.cc +++ b/paddle/fluid/framework/ir/graph.cc @@ -38,9 +38,8 @@ void CheckProgram(const ProgramDesc &program) { switch (role_id) { case _INT(OpRole::kForward): if (visit.find(_INT(OpRole::kBackward)) != visit.end()) { - LOG(ERROR) - << "Cannot add backward operator before forward operator %s." - << op->Type(); + LOG(ERROR) << "Cannot add backward operator before forward operator " + << op->Type(); } break; case _INT(OpRole::kBackward): diff --git a/paddle/fluid/framework/ir/graph.h b/paddle/fluid/framework/ir/graph.h index 7a2560c14df2a26832be9866d5862670ff3745b6..47fcf96a3f92b1f915e5254fff36feb8b2870730 100644 --- a/paddle/fluid/framework/ir/graph.h +++ b/paddle/fluid/framework/ir/graph.h @@ -184,14 +184,13 @@ class Graph { return nullptr; } - const ProgramDesc &program() const { return program_; } - std::map> InitFromProgram( - const ProgramDesc &program); - void ResolveHazard( const std::map> &var_nodes); private: + std::map> InitFromProgram( + const ProgramDesc &program); + // This method takes ownership of `node`. 
ir::Node *AddNode(ir::Node *node) { PADDLE_ENFORCE(node_set_.find(node) == node_set_.end()); diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 258182b25a16d9135f55cfc300e2602d14f26d73..0118019df2f779a6409365555b530ae3b6d3971f 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -1030,10 +1030,11 @@ PDNode *patterns::ElewiseAddActInplaceGrad::operator()( } PDNode *patterns::ConvBias::operator()( - paddle::framework::ir::PDNode *conv_input) { + paddle::framework::ir::PDNode *conv_input, bool is_conv3d) { + std::string type = is_conv3d ? "conv3d" : "conv2d"; // Create Operators - conv_input->assert_is_op_input("conv2d", "Input"); - auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op("conv2d"); + conv_input->assert_is_op_input(type, "Input"); + auto *conv_op = pattern->NewNode(conv_repr())->assert_is_op(type); auto *eltiwse_op = pattern->NewNode(eltwise_repr())->assert_is_op("elementwise_add"); // Create variables @@ -1041,11 +1042,11 @@ PDNode *patterns::ConvBias::operator()( auto *conv_weight_var = pattern->NewNode(conv_weight_repr()) ->AsInput() ->assert_is_persistable_var() - ->assert_is_op_input("conv2d", "Filter"); + ->assert_is_op_input(type, "Filter"); // intermediate variable, will be removed in the IR after fuse. 
auto *conv_out_var = pattern->NewNode(conv_out_repr()) ->AsIntermediate() - ->assert_is_only_output_of_op("conv2d") + ->assert_is_only_output_of_op(type) ->assert_is_op_input("elementwise_add"); // Bias stored in elementwise_add auto *eltwise_bias_var = pattern->NewNode(eltwise_bias_repr()) diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index c12b9503fd817757ec8d1e988be3e449fc63c6ff..d044802f22d02372e0ddb72c6fd702aebf2f82c3 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -623,7 +623,7 @@ struct ElewiseAddActInplaceGrad : public PatternBase { struct ConvBias : public PatternBase { ConvBias(PDPattern* pattern, const std::string& name_scope) : PatternBase(pattern, name_scope, "conv_bias") {} - PDNode* operator()(PDNode* conv_input); + PDNode* operator()(PDNode* conv_input, bool is_conv3d = false); // declare operator node's name PATTERN_DECL_NODE(conv); PATTERN_DECL_NODE(eltwise); diff --git a/paddle/fluid/framework/ir/is_test_pass.cc b/paddle/fluid/framework/ir/is_test_pass.cc index 6d8f020918d4e56fa7f125a659f7f8511ca067ca..57cc98e2ca0175848aa62c62c8ad3b20594b3bde 100644 --- a/paddle/fluid/framework/ir/is_test_pass.cc +++ b/paddle/fluid/framework/ir/is_test_pass.cc @@ -38,7 +38,7 @@ std::unique_ptr IsTestPass::ApplyImpl( for (const Node* n : graph->Nodes()) { if (n->IsOp()) { auto* op = n->Op(); - if (n->RuntimeHasAttr("is_test")) { + if (op->HasAttr("is_test") || op->HasProtoAttr("is_test")) { op->SetAttr("is_test", true); } else if (std::find(begin(op_list), end(op_list), op->Type()) != end(op_list)) { diff --git a/paddle/fluid/framework/ir/is_test_pass_tester.cc b/paddle/fluid/framework/ir/is_test_pass_tester.cc index d9a68c7f1dd2a0dca5204719c4ce6cd9d68292a2..9696441a21661db89146c448742a992d1f7df022 100644 --- a/paddle/fluid/framework/ir/is_test_pass_tester.cc +++ b/paddle/fluid/framework/ir/is_test_pass_tester.cc @@ 
-104,9 +104,9 @@ TEST(IsTestPass, basic) { auto* op = node->Op(); auto op_name = boost::get(op->GetAttr("name")); if (op_name == "conv3") { - ASSERT_FALSE(node->RuntimeHasAttr("is_test")); + ASSERT_FALSE(op->HasAttr("is_test")); } else { - ASSERT_TRUE(node->RuntimeHasAttr("is_test")); + ASSERT_TRUE(op->HasAttr("is_test")); EXPECT_TRUE(boost::get(op->GetAttr("is_test"))); } } diff --git a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc index 1cf1315d3d3059261d84d0e8795a75df4deae005..951fcb066ce759ebfec0182e1e9dca887e343170 100644 --- a/paddle/fluid/framework/ir/mkldnn_placement_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn_placement_pass.cc @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "paddle/fluid/framework/ir/mkldnn_placement_pass.h" +#include namespace paddle { namespace framework { @@ -21,9 +22,19 @@ namespace ir { std::unique_ptr MKLDNNPlacementPass::ApplyImpl( std::unique_ptr graph) const { VLOG(3) << "Aplies MKL-DNN placement strategy."; + const auto& op_types_list = + Get>("mkldnn_enabled_op_types"); for (const Node* n : graph->Nodes()) { - if (n->IsOp() && n->RuntimeHasAttr("use_mkldnn")) { - n->Op()->SetAttr("use_mkldnn", true); + if (n->IsOp()) { + auto* op = n->Op(); + if (op->HasAttr("use_mkldnn") || op->HasProtoAttr("use_mkldnn")) { + if (op_types_list.empty()) { + op->SetAttr("use_mkldnn", true); + } else if (std::find(op_types_list.begin(), op_types_list.end(), + n->Name()) != op_types_list.end()) { + op->SetAttr("use_mkldnn", true); + } + } } } return graph; @@ -33,5 +44,5 @@ std::unique_ptr MKLDNNPlacementPass::ApplyImpl( } // namespace framework } // namespace paddle -REGISTER_PASS(mkldnn_placement_pass, - paddle::framework::ir::MKLDNNPlacementPass); +REGISTER_PASS(mkldnn_placement_pass, paddle::framework::ir::MKLDNNPlacementPass) + .RequirePassAttr("mkldnn_enabled_op_types"); diff --git 
a/paddle/fluid/framework/ir/node.cc b/paddle/fluid/framework/ir/node.cc index 7a88cb2b681c1aa5e1b75481b1aba66a125a1d9c..eac67108e2106e986cbe1255a64c956153bc5560 100644 --- a/paddle/fluid/framework/ir/node.cc +++ b/paddle/fluid/framework/ir/node.cc @@ -30,28 +30,6 @@ std::unique_ptr CreateNodeForTest(const std::string &name, return std::unique_ptr(new Node(name, type)); } -bool Node::RuntimeHasAttr(const std::string &name) const { - if (Op()->HasAttr(name)) { - return true; - } else { - auto &op_info = OpInfoMap::Instance(); - auto op_type = Op()->Type(); - if (op_info.Has(op_type)) { - auto op_info_ptr = op_info.Get(op_type); - if (op_info_ptr.HasOpProtoAndChecker()) { - const proto::OpProto &proto = op_info_ptr.Proto(); - for (int i = 0; i != proto.attrs_size(); ++i) { - const proto::OpProto::Attr &attr = proto.attrs(i); - if (attr.name() == name) { - return true; - } - } - } - } - } - return false; -} - } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/node.h b/paddle/fluid/framework/ir/node.h index 1044a96430f060b750580ea0b225787ba6ebd2a0..d2a393b3f19e9aab79098757dae663d030b0fa2b 100644 --- a/paddle/fluid/framework/ir/node.h +++ b/paddle/fluid/framework/ir/node.h @@ -108,18 +108,6 @@ class Node { Name().find(ir::Node::kControlDepVarName) != std::string::npos; } - // RuntimeHasAttr is different with HasAttr now. - // 1. For Op()->HasAttr(), it judges whether a stored program_desc_ has attr, - // thus, if stored program_desc_ are old which don't have an attr, a new - // library which adds the attr already will fail on this function. - // Details: - // https://github.com/PaddlePaddle/Paddle/pull/14608#issuecomment-442309087 - // 2. For Op()->RuntimeHasAttr, it judges the attr in runtime to avoid above - // problem. - // TODO(luotao): Maybe we should enhance HasAttr later, instead of adding - // RuntimeHasAttr. 
- bool RuntimeHasAttr(const std::string& name) const; - std::vector inputs; std::vector outputs; diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc index e8ecd90502933a049cc8f886212579fc061d44ff..dde642764fa5dfce11edcef51ad1be11be331fbc 100644 --- a/paddle/fluid/framework/op_desc.cc +++ b/paddle/fluid/framework/op_desc.cc @@ -237,6 +237,23 @@ void OpDesc::SetOutput(const std::string ¶m_name, this->outputs_[param_name] = args; } +bool OpDesc::HasProtoAttr(const std::string &name) const { + auto &op_info = OpInfoMap::Instance(); + if (op_info.Has(desc_.type())) { + auto op_info_ptr = op_info.Get(desc_.type()); + if (op_info_ptr.HasOpProtoAndChecker()) { + const proto::OpProto &proto = op_info_ptr.Proto(); + for (int i = 0; i != proto.attrs_size(); ++i) { + const proto::OpProto::Attr &attr = proto.attrs(i); + if (attr.name() == name) { + return true; + } + } + } + } + return false; +} + proto::AttrType OpDesc::GetAttrType(const std::string &name) const { auto it = attrs_.find(name); PADDLE_ENFORCE(it != attrs_.end(), "Attribute %s is not found", name); diff --git a/paddle/fluid/framework/op_desc.h b/paddle/fluid/framework/op_desc.h index 30c8a26c3d2f0068674aa70b4ff875a2f73c1dca..e8debec7f13706b7fc5a4882d237ee2257e53b7e 100644 --- a/paddle/fluid/framework/op_desc.h +++ b/paddle/fluid/framework/op_desc.h @@ -65,6 +65,8 @@ class OpDesc { return attrs_.find(name) != attrs_.end(); } + bool HasProtoAttr(const std::string &name) const; + proto::AttrType GetAttrType(const std::string &name) const; std::vector AttrNames() const; diff --git a/paddle/fluid/framework/op_info.h b/paddle/fluid/framework/op_info.h index e0bf5ed999f580f217af285bf97d0bc0232f1ded..19e5c2c73eac74dee030a4f7820531800f737e4e 100644 --- a/paddle/fluid/framework/op_info.h +++ b/paddle/fluid/framework/op_info.h @@ -31,12 +31,6 @@ class InferShapeBase { virtual void operator()(InferShapeContext*) const = 0; }; -class EstimateFlopsBase { - public: - virtual ~EstimateFlopsBase() = 
default; - virtual size_t operator()(InferShapeContext*) const = 0; -}; - struct OpInfo { OpCreator creator_; GradOpMakerFN grad_op_maker_; @@ -44,7 +38,6 @@ struct OpInfo { OpAttrChecker* checker_{nullptr}; InferVarTypeFN infer_var_type_; InferShapeFN infer_shape_; - EstimateFlopsFN estimate_flops_; bool HasOpProtoAndChecker() const { return proto_ != nullptr && checker_ != nullptr; diff --git a/paddle/fluid/framework/op_kernel_type.cc b/paddle/fluid/framework/op_kernel_type.cc new file mode 100644 index 0000000000000000000000000000000000000000..6d4801e4a0eed7083e671e1d49b8628dfb280cf9 --- /dev/null +++ b/paddle/fluid/framework/op_kernel_type.cc @@ -0,0 +1,54 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_kernel_type.h" + +namespace paddle { +namespace framework { + +size_t OpKernelType::Hash::operator()(const OpKernelType& key) const { + int cur_loc = 0; + + int place = key.place_.which(); + cur_loc += OpKernelType::kPlaceBits; + + int data_type = static_cast(key.data_type_) << cur_loc; + cur_loc += OpKernelType::kPrimaryDTypeBits; + + int data_layout = static_cast(key.data_layout_) << cur_loc; + cur_loc += OpKernelType::kLayoutBits; + + int library_type = static_cast(key.library_type_) << cur_loc; + cur_loc += OpKernelType::kLibBits; + + int customized_value = key.customized_type_value_; + PADDLE_ENFORCE(customized_value < (1 << OpKernelType::kCustomizeBits)); + customized_value = customized_value << cur_loc; + cur_loc += OpKernelType::kCustomizeBits; + PADDLE_ENFORCE(cur_loc < 64); + + std::hash hasher; + return hasher(place + data_type + data_layout + library_type + + customized_value); +} + +bool OpKernelType::operator==(const OpKernelType& o) const { + return platform::places_are_same_class(place_, o.place_) && + data_type_ == o.data_type_ && data_layout_ == o.data_layout_ && + library_type_ == o.library_type_ && + customized_type_value_ == o.customized_type_value_; +} + +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/op_kernel_type.h b/paddle/fluid/framework/op_kernel_type.h index ac0330218973123771367ed5ba9477c90143a043..9edc1a3e150027b5a3dbd8483dc8b58d1d4ab918 100644 --- a/paddle/fluid/framework/op_kernel_type.h +++ b/paddle/fluid/framework/op_kernel_type.h @@ -24,54 +24,55 @@ limitations under the License. 
*/ namespace paddle { namespace framework { -struct OpKernelType { - struct Hash { - size_t operator()(const OpKernelType& key) const { - int place = key.place_.which(); - int data_type = static_cast(key.data_type_) << LEFT_SHIFT; - int data_layout = static_cast(key.data_layout_) << (LEFT_SHIFT * 2); - int library_type = static_cast(key.library_type_) - << (LEFT_SHIFT * 3); - - std::hash hasher; - return hasher(place + data_type + data_layout + library_type); - } - }; +class OpKernelType { + public: + constexpr static int kDefaultCustomizedTypeValue = 0; - // place, data_type, library_type kinds less than 2^8 - constexpr static int LEFT_SHIFT = 8; - - proto::VarType::Type data_type_; - DataLayout data_layout_; - platform::Place place_; - LibraryType library_type_; + // In total should be smaller than 64. + constexpr static int kPlaceBits = 4; + constexpr static int kPrimaryDTypeBits = 8; + constexpr static int kLayoutBits = 4; + constexpr static int kLibBits = 4; + constexpr static int kCustomizeBits = 4; OpKernelType(proto::VarType::Type data_type, platform::Place place, DataLayout data_layout = DataLayout::kAnyLayout, - LibraryType library_type = LibraryType::kPlain) + LibraryType library_type = LibraryType::kPlain, + int customized_type_value = kDefaultCustomizedTypeValue) : data_type_(data_type), data_layout_(data_layout), place_(place), - library_type_(library_type) {} + library_type_(library_type), + customized_type_value_(customized_type_value) {} OpKernelType(proto::VarType::Type data_type, const platform::DeviceContext& dev_ctx, DataLayout data_layout = DataLayout::kAnyLayout, - LibraryType library_type = LibraryType::kPlain) + LibraryType library_type = LibraryType::kPlain, + int customized_type_value = kDefaultCustomizedTypeValue) : data_type_(data_type), data_layout_(data_layout), place_(dev_ctx.GetPlace()), - library_type_(library_type) {} + library_type_(library_type), + customized_type_value_(customized_type_value) {} + + virtual ~OpKernelType() {} + 
+ struct Hash { + size_t operator()(const OpKernelType& key) const; + }; size_t hash_key() const { return Hash()(*this); } - bool operator==(const OpKernelType& o) const { - return platform::places_are_same_class(place_, o.place_) && - data_type_ == o.data_type_ && data_layout_ == o.data_layout_ && - library_type_ == o.library_type_; - } + bool operator==(const OpKernelType& o) const; bool operator!=(const OpKernelType& o) const { return !(*this == o); } + + proto::VarType::Type data_type_; + DataLayout data_layout_; + platform::Place place_; + LibraryType library_type_; + int customized_type_value_; }; inline std::ostream& operator<<(std::ostream& os, diff --git a/paddle/fluid/framework/op_registry.h b/paddle/fluid/framework/op_registry.h index 0e6e74293c30d5f8caa58fe6bfa63657d2669b46..36673e48c2047bca54f604b082dfec123f1e2c82 100644 --- a/paddle/fluid/framework/op_registry.h +++ b/paddle/fluid/framework/op_registry.h @@ -35,6 +35,7 @@ limitations under the License. */ namespace paddle { namespace framework { + class Registrar { public: // In our design, various kinds of classes, e.g., operators and kernels, @@ -78,7 +79,7 @@ struct OpKernelRegistrarFunctor; template inline void RegisterKernelClass(const char* op_type, const char* library_type, - Func func) { + int customized_type_value, Func func) { std::string library(library_type); std::string data_layout = "ANYLAYOUT"; if (library == "MKLDNN") { @@ -86,7 +87,7 @@ inline void RegisterKernelClass(const char* op_type, const char* library_type, } OpKernelType key(ToDataType(std::type_index(typeid(T))), PlaceType(), StringToDataLayout(data_layout), - StringToLibraryType(library_type)); + StringToLibraryType(library_type), customized_type_value); OperatorWithKernel::AllOpKernels()[op_type][key] = func; } @@ -95,22 +96,26 @@ struct OpKernelRegistrarFunctor { using KERNEL_TYPE = typename std::tuple_element>::type; - void operator()(const char* op_type, const char* library_type) const { + void operator()(const char* 
op_type, const char* library_type, + int customized_type_value) const { using T = typename KERNEL_TYPE::ELEMENT_TYPE; RegisterKernelClass( - op_type, library_type, [](const framework::ExecutionContext& ctx) { + op_type, library_type, customized_type_value, + + [](const framework::ExecutionContext& ctx) { KERNEL_TYPE().Compute(ctx); }); constexpr auto size = std::tuple_size>::value; OpKernelRegistrarFunctor func; - func(op_type, library_type); + func(op_type, library_type, customized_type_value); } }; template struct OpKernelRegistrarFunctor { - void operator()(const char* op_type, const char* library_type) const {} + void operator()(const char* op_type, const char* library_type, + int customized_type_value) const {} }; // User can register many kernel in one place. The data type could be @@ -118,9 +123,10 @@ struct OpKernelRegistrarFunctor { template class OpKernelRegistrar : public Registrar { public: - explicit OpKernelRegistrar(const char* op_type, const char* library_type) { + explicit OpKernelRegistrar(const char* op_type, const char* library_type, + int customized_type_value) { OpKernelRegistrarFunctor func; - func(op_type, library_type); + func(op_type, library_type, customized_type_value); } }; @@ -130,17 +136,19 @@ struct OpKernelRegistrarFunctorEx; template class OpKernelRegistrarEx : public Registrar { public: - explicit OpKernelRegistrarEx(const char* op_type, const char* library_type) { + explicit OpKernelRegistrarEx(const char* op_type, const char* library_type, + int customized_type_value) { OpKernelRegistrarFunctorEx func; - func(op_type, library_type); + func(op_type, library_type, customized_type_value); } }; template struct OpKernelRegistrarFunctorEx { - void operator()(const char* op_type, const char* library_type) const {} + void operator()(const char* op_type, const char* library_type, + int customized_type_value) const {} }; template @@ -153,18 +161,21 @@ struct OpKernelRegistrarFunctorEx>::type; - void operator()(const char* op_type, const 
char* library_type) const { - RegisterKernelClass(op_type, library_type, Functor()); + void operator()(const char* op_type, const char* library_type, + int customized_type_value) const { + RegisterKernelClass(op_type, library_type, + customized_type_value, Functor()); constexpr auto size = std::tuple_size>::value; OpKernelRegistrarFunctorEx= size, I + 2, DataTypeAndKernelType...> func; - func(op_type, library_type); + func(op_type, library_type, customized_type_value); } }; +// clang-format off /** * check if MACRO is used in GLOBAL NAMESPACE. */ @@ -199,42 +210,64 @@ struct OpKernelRegistrarFunctorEx \ - __op_kernel_registrar_##op_type##_##library_type##__(#op_type, \ - #library_type); \ - int TouchOpKernelRegistrar_##op_type##_##library_type() { \ - __op_kernel_registrar_##op_type##_##library_type##__.Touch(); \ - return 0; \ +#define REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(op_type, library_type, \ + place_class, customized_name, \ + customized_type_value, ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op_kernel_##op_type##_##library_type##_##customized_name##__, \ + "REGISTER_OP_KERNEL must be called in " \ + "global namespace"); \ + static ::paddle::framework::OpKernelRegistrar \ + __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__(\ + #op_type, #library_type, customized_type_value); \ + int TouchOpKernelRegistrar_##op_type##_##library_type##_##customized_name() {\ + __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__ \ + .Touch(); \ + return 0; \ } +#define REGISTER_OP_KERNEL(op_type, library_type, place_class, ...) \ + REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( \ + op_type, library_type, place_class, DEFAULT_TYPE, \ + ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \ + __VA_ARGS__) + #define REGISTER_OP_CUDA_KERNEL(op_type, ...) \ REGISTER_OP_KERNEL(op_type, CUDA, ::paddle::platform::CUDAPlace, __VA_ARGS__) #define REGISTER_OP_CPU_KERNEL(op_type, ...) 
\ REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) -#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, ...) \ - STATIC_ASSERT_GLOBAL_NAMESPACE( \ - __reg_op_kernel_##op_type##_##library_type##__, \ - "REGISTER_OP_KERNEL_EX must be called in global namespace"); \ - static ::paddle::framework::OpKernelRegistrarEx \ - __op_kernel_registrar_##op_type##_##library_type##__(#op_type, \ - #library_type); \ - int TouchOpKernelRegistrar_##op_type##_##library_type() { \ - __op_kernel_registrar_##op_type##_##library_type##__.Touch(); \ - return 0; \ +#define REGISTER_OP_KERNEL_EX(op_type, library_type, place_class, \ + customized_name, \ + customized_type_value, \ + ...) \ + STATIC_ASSERT_GLOBAL_NAMESPACE( \ + __reg_op_kernel_##op_type##_##library_type##_##customized_name##__, \ + "REGISTER_OP_KERNEL_EX must be called in " \ + "global namespace"); \ + static ::paddle::framework::OpKernelRegistrarEx \ + __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__(\ + #op_type, #library_type, customized_type_value); \ + int TouchOpKernelRegistrar_##op_type##_##library_type##_##customized_name() {\ + __op_kernel_registrar_##op_type##_##library_type##_##customized_name##__ \ + .Touch(); \ + return 0; \ } #define REGISTER_OP_CUDA_KERNEL_FUNCTOR(op_type, ...) \ - REGISTER_OP_KERNEL_EX(op_type, CUDA, ::paddle::platform::CUDAPlace, \ - __VA_ARGS__) + REGISTER_OP_KERNEL_EX( \ + op_type, CUDA, ::paddle::platform::CUDAPlace, DEFAULT_TYPE, \ + ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \ + __VA_ARGS__) -#define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...) \ - REGISTER_OP_KERNEL_EX(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__) +#define REGISTER_OP_CPU_KERNEL_FUNCTOR(op_type, ...) 
\ + REGISTER_OP_KERNEL_EX( \ + op_type, CPU, ::paddle::platform::CPUPlace, DEFAULT_TYPE, \ + ::paddle::framework::OpKernelType::kDefaultCustomizedTypeValue, \ + __VA_ARGS__) /** * Macro to mark what Operator and Kernel @@ -248,13 +281,19 @@ struct OpKernelRegistrarFunctorEx("scale", "scale of cosine op"); + AddAttr("kernel_sub_type", "kernels with different implementations.") + .SetDefault(0); AddComment("This is test op"); } }; @@ -95,6 +97,8 @@ TEST(OperatorBase, all) { namespace paddle { namespace framework { +static int special_type_value = 1; + class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { public: void Make() { @@ -103,11 +107,14 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker { AddAttr("scale", "scale of cosine op") .SetDefault(1.0) .GreaterThan(0.0); + AddAttr("kernel_sub_type", "kernels with different implementations.") + .SetDefault(0); AddComment("This is test op"); } }; static int cpu_kernel_run_num = 0; +static int cpu_kernel2_run_num = 0; class OpWithKernelTest : public OperatorWithKernel { public: @@ -117,7 +124,10 @@ class OpWithKernelTest : public OperatorWithKernel { void InferShape(framework::InferShapeContext* ctx) const override {} OpKernelType GetExpectedKernelType( const ExecutionContext& ctx) const override { - return OpKernelType(proto::VarType::FP32, ctx.GetPlace()); + int sub_type = ctx.Attr("kernel_sub_type"); + return OpKernelType(proto::VarType::FP32, ctx.GetPlace(), + framework::DataLayout::kAnyLayout, + framework::LibraryType::kPlain, sub_type); } }; @@ -132,6 +142,17 @@ class CPUKernelTest : public OpKernel { } }; +template +class CPUKernel2Test : public OpKernel { + public: + void Compute(const ExecutionContext& ctx) const { + std::cout << ctx.op().DebugString() << std::endl; + cpu_kernel2_run_num++; + ASSERT_EQ(ctx.op().Input("x"), "IN1"); + ASSERT_EQ(ctx.op().Output("y"), "OUT1"); + } +}; + class OpKernelTestMultiInputsProtoAndCheckerMaker : public OpProtoAndCheckerMaker { 
public: @@ -142,6 +163,8 @@ class OpKernelTestMultiInputsProtoAndCheckerMaker AddAttr("scale", "scale of cosine op") .SetDefault(1.0) .GreaterThan(0.0); + AddAttr("kernel_sub_type", "kernels with different implementations.") + .SetDefault(0); AddComment("This is test op"); } }; @@ -189,9 +212,15 @@ class CPUKernalMultiInputsTest : public OpKernel { REGISTER_OP_WITHOUT_GRADIENT( op_with_kernel, paddle::framework::OpWithKernelTest, paddle::framework::OpKernelTestProtoAndCheckerMaker); + REGISTER_OP_CPU_KERNEL(op_with_kernel, paddle::framework::CPUKernelTest); +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE( + op_with_kernel, CPU, paddle::platform::CPUPlace, MY_SPECIAL_NAME, + paddle::framework::special_type_value, + paddle::framework::CPUKernel2Test); + // test with single input TEST(OpKernel, all) { paddle::framework::InitDevices(true); @@ -211,7 +240,19 @@ TEST(OpKernel, all) { auto op = paddle::framework::OpRegistry::CreateOp(op_desc); ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 0); op->Run(scope, cpu_place); + // kernel_sub_type = 0, hence cpu_kernel is called, cpu_kernel2 is not called. + ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); + ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 0); + + attr = op_desc.mutable_attrs()->Add(); + attr->set_name("kernel_sub_type"); + attr->set_type(paddle::framework::proto::AttrType::INT); + attr->set_i(1); + auto op2 = paddle::framework::OpRegistry::CreateOp(op_desc); + op2->Run(scope, cpu_place); + // kernel_sub_type = 1, hence cpu_kernel2 is called, cpu_kernel is not called. 
ASSERT_EQ(paddle::framework::cpu_kernel_run_num, 1); + ASSERT_EQ(paddle::framework::cpu_kernel2_run_num, 1); } REGISTER_OP_WITHOUT_GRADIENT( diff --git a/paddle/fluid/framework/selected_rows.h b/paddle/fluid/framework/selected_rows.h index 44384082dbaf7a8d654e8461da87009bde33a3d5..e1bdba9b46a4cbdb664b70c7419f567ef95bdf31 100644 --- a/paddle/fluid/framework/selected_rows.h +++ b/paddle/fluid/framework/selected_rows.h @@ -32,8 +32,7 @@ namespace framework { class SelectedRows { /* * @brief We can use the SelectedRows structure to reproduce a sparse table. - * A sparse table is a key-value structure that the key is an `int64_t` - * number, + * A sparse table is a key-value structure that the key is an `int64_t`, * and the value is a Tensor which the first dimension is 0. * You can use the following interface to operate the sparse table, and you * can find diff --git a/paddle/fluid/framework/type_defs.h b/paddle/fluid/framework/type_defs.h index cdc5fa6862e3b2a2784151302f15540a0e9db8ff..2de6233a9e0d320ec9a06d547db3575eb61925c0 100644 --- a/paddle/fluid/framework/type_defs.h +++ b/paddle/fluid/framework/type_defs.h @@ -54,7 +54,5 @@ using InferVarTypeFN = using InferShapeFN = std::function; -using EstimateFlopsFN = std::function; - } // namespace framework } // namespace paddle diff --git a/paddle/fluid/imperative/CMakeLists.txt b/paddle/fluid/imperative/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..373d292b443b7651b785a52a6986b0a0be58ad61 --- /dev/null +++ b/paddle/fluid/imperative/CMakeLists.txt @@ -0,0 +1,3 @@ +cc_library(layer SRCS layer.cc DEPS proto_desc operator) +cc_library(tracer SRCS tracer.cc DEPS proto_desc) +cc_library(engine SRCS engine.cc) diff --git a/paddle/fluid/imperative/engine.cc b/paddle/fluid/imperative/engine.cc new file mode 100644 index 0000000000000000000000000000000000000000..de7ab0e5918281579728ef48d1517be2cd530af7 --- /dev/null +++ b/paddle/fluid/imperative/engine.cc @@ -0,0 +1,53 @@ +// Copyright (c) 
2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/engine.h" + +#include // NOLINT +#include + +#include "glog/logging.h" + +namespace paddle { +namespace imperative { + +static std::once_flag init_engine; +static Engine* engine; + +class DummyEngine : public Engine { + public: + void Enqueue(Runnable* runnable) override { + queued_runnables_.push_back(runnable); + } + + size_t Size() const override { return queued_runnables_.size(); } + + void Sync() override { + for (Runnable* l : queued_runnables_) { + LOG(INFO) << "running " << reinterpret_cast(l); + } + queued_runnables_.clear(); + } + + private: + std::vector queued_runnables_; +}; + +Engine* GetEngine() { + std::call_once(init_engine, []() { engine = new DummyEngine(); }); + return engine; +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/engine.h b/paddle/fluid/imperative/engine.h new file mode 100644 index 0000000000000000000000000000000000000000..a1dfa5bda38d0c419aa4ccbea77b32eb7e0d5b23 --- /dev/null +++ b/paddle/fluid/imperative/engine.h @@ -0,0 +1,39 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include + +namespace paddle { +namespace imperative { + +struct Runnable {}; + +class Engine { + public: + virtual ~Engine() {} + + virtual void Enqueue(Runnable* runnable) = 0; + + virtual size_t Size() const = 0; + + virtual void Sync() = 0; +}; + +Engine* GetEngine(); + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/layer.cc b/paddle/fluid/imperative/layer.cc new file mode 100644 index 0000000000000000000000000000000000000000..612503768079472ba233ee3fcd43a47fdba9a0cc --- /dev/null +++ b/paddle/fluid/imperative/layer.cc @@ -0,0 +1,221 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/imperative/layer.h" +#include +#include +#include +#include +#include + +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/string/printf.h" + +namespace paddle { +namespace imperative { + +using framework::Variable; + +void AddTo(Variable* src, Variable* dst) { + framework::LoDTensor* dst_tensor = dst->GetMutable(); + framework::LoDTensor* src_tensor = src->GetMutable(); + PADDLE_ENFORCE(dst_tensor->numel() == src_tensor->numel(), "%lld vs %lld", + dst_tensor->numel(), src_tensor->numel()); + float* dst_data = dst_tensor->mutable_data(platform::CPUPlace()); + const float* src_data = src_tensor->data(); + for (size_t i = 0; i < src_tensor->numel(); ++i) { + dst_data[i] += src_data[i]; + } +} + +class Autograd { + public: + explicit Autograd(framework::Scope* scope) : scope_(scope) {} + + void RunBackward(VarBase* var) { + PADDLE_ENFORCE(var->pre_op_->op_desc_); + // TODO(panyx0718): Only create for vars that "require_grad" + (*var->pre_op_->output_vars_)[var->pre_op_out_idx_]->grads_ = var->grads_; + + std::deque ready; + ready.push_back(var->pre_op_); + + std::map dep_counts = ComputeDepCounts(var->pre_op_); + + while (!ready.empty()) { + OpBase* ready_op = ready.front(); + ready.pop_front(); + std::vector input_grads = ready_op->ApplyGrad(scope_); + + for (size_t i = 0; i < input_grads.size(); ++i) { + if (!input_grads[i]) continue; + OpBase* pre_op = ready_op->pre_ops_->at(i); + if (!pre_op) continue; + + dep_counts[pre_op] -= 1; + PADDLE_ENFORCE(dep_counts[pre_op] >= 0); + bool pre_op_ready = dep_counts[pre_op] == 0; + if (pre_op_ready) { + ready.push_back(pre_op); + } + } + } + } + + private: + std::map ComputeDepCounts(OpBase* op) { + std::map ret; + + std::deque queue; + queue.push_back(op); + std::unordered_set visited; + visited.insert(op); + while (!queue.empty()) { + OpBase* candidate = queue.front(); + queue.pop_front(); + for (OpBase* pre_op : 
*(candidate->pre_ops_)) { + if (!pre_op) continue; + if (visited.find(pre_op) == visited.end()) { + visited.insert(pre_op); + queue.push_back(pre_op); + } + ret[pre_op] += 1; + } + } + + return ret; + } + + framework::Scope* scope_; +}; + +framework::Variable* CreateVariable(const std::string& name, + const framework::DDim& dim, float val, + framework::Scope* scope, + bool random_name = true) { + std::string varname = name; + if (random_name) { + std::mt19937 rng; + rng.seed(std::random_device()()); + std::uniform_int_distribution dist6( + 1, std::numeric_limits::max()); + int id = dist6(rng); + varname = string::Sprintf("%s@%d", varname, id); + } + + VLOG(3) << "creating var " << varname; + framework::Variable* var = scope->Var(varname); + framework::LoDTensor* tensor = var->GetMutable(); + + float* data = tensor->mutable_data(dim, platform::CPUPlace()); + std::fill(data, data + tensor->numel(), val); + return var; +} + +framework::LoDTensor& VarBase::Grad() { + VLOG(3) << "get var grad " << var_desc_->Name(); + return *grads_->GetMutable(); +} + +void VarBase::ApplyGrad(framework::Scope* scope, Variable* grad) { + VLOG(3) << "apply var grad " << var_desc_->Name() << " " + << grad->Get().data()[0]; + if (!grads_) { + grads_ = + CreateVariable(string::Sprintf("%s@IGrad", var_desc_->Name()), + var_->Get().dims(), 0.0, scope); + } + AddTo(grad, grads_); + VLOG(3) << "grad_ after apply var grad " << var_desc_->Name() << " " + << grads_->Get().data()[0]; +} + +std::vector OpBase::ApplyGrad(framework::Scope* scope) { + VLOG(3) << "op grad " << grad_op_desc_->Type(); + + for (const std::string& grad_invar : grad_op_desc_->InputArgumentNames()) { + if (grad_to_var_->find(grad_invar) == grad_to_var_->end()) { + // grad op inputs can be forward inputs, so not in grad_to_var. 
+ continue; + } + VLOG(3) << "op grad in var " << grad_invar; + block_->FindRecursiveOrCreateVar(grad_invar); + framework::Variable* var = scope->Var(grad_invar); + const std::string& invar = grad_to_var_->at(grad_invar); + for (VarBase* varbase : *output_vars_) { + // Use the accumulated grads_ by sharing the input with grads_. + if (varbase->var_desc_->Name() == invar) { + var->GetMutable()->ShareDataWith( + varbase->grads_->Get()); + break; + } + } + } + + for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { + VLOG(3) << "grad outvar " << outvar; + block_->FindRecursiveOrCreateVar(outvar); + framework::Variable* var = scope->Var(outvar); + if (!var->IsInitialized()) { + framework::VarDesc* var_desc = block_->FindVar(outvar); + if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else { + LOG(ERROR) << "tracer doesn't support yet"; + } + } + } + grad_op_desc_->InferShape(*block_); + grad_op_desc_->InferVarType(block_); + std::unique_ptr opbase = + framework::OpRegistry::CreateOp(*grad_op_desc_); + + opbase->Run(*scope, platform::CPUPlace()); + + // `ret` matches exactly with `input_vars_` of forward op. + std::vector ret; + for (size_t i = 0; i < input_vars_->size(); ++i) { + bool found = false; + for (const std::string& outvar : grad_op_desc_->OutputArgumentNames()) { + Variable* var = scope->FindVar(outvar); + VarBase* origin_var = (*input_vars_)[i]; + std::string orig_var = grad_to_var_->at(outvar); + PADDLE_ENFORCE(origin_var->var_desc_->Name() == orig_var); + VLOG(3) << "apply grad " << outvar << " with origin " << orig_var; + origin_var->ApplyGrad(scope, var); + found = true; + ret.push_back(var); + // TODO(panyx0718): There might be another outvar with the same name. + // In that case, it doesn't matter the first one or the second one is + // used. 
+ break; + } + if (!found) { + ret.push_back(nullptr); + } + } + return ret; +} + +void VarBase::RunBackward(framework::Scope* scope) { + grads_ = CreateVariable(framework::GradVarName(var_desc_->Name()), + var_->Get().dims(), 1.0, scope, + false); + if (!pre_op_) return; + Autograd(scope).RunBackward(this); +} + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/layer.h b/paddle/fluid/imperative/layer.h new file mode 100644 index 0000000000000000000000000000000000000000..85a71ca83d21ed2595ddbe684300a46c05fed3af --- /dev/null +++ b/paddle/fluid/imperative/layer.h @@ -0,0 +1,102 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/framework/var_desc.h" +#include "paddle/fluid/platform/enforce.h" + +namespace paddle { +namespace imperative { + +class OpBase; + +class VarBase { + public: + VarBase() + : pre_op_(nullptr), + pre_op_out_idx_(-1), + var_desc_(nullptr), + var_(nullptr), + grads_(nullptr) {} + + virtual ~VarBase() {} + + void ApplyGrad(framework::Scope* scope, framework::Variable* grad); + + void RunBackward(framework::Scope* scope); + + framework::LoDTensor& Grad(); + + OpBase* pre_op_; + int pre_op_out_idx_; + + framework::VarDesc* var_desc_; + framework::Variable* var_; + framework::Variable* grads_; +}; + +class OpBase { + public: + OpBase() + : input_vars_(new std::vector()), + output_vars_(new std::vector()), + pre_ops_(new std::vector()), + pre_ops_out_idx_(new std::vector()), + op_desc_(nullptr), + grad_op_desc_(nullptr) {} + + virtual ~OpBase() { + delete input_vars_; + delete output_vars_; + + delete pre_ops_; + delete pre_ops_out_idx_; + + if (grad_op_desc_) delete grad_op_desc_; + if (grad_to_var_) delete grad_to_var_; + } + + std::vector ApplyGrad(framework::Scope* scope); + + std::vector* input_vars_; + std::vector* output_vars_; + std::vector* pre_ops_; + std::vector* pre_ops_out_idx_; + framework::OpDesc* op_desc_; + + framework::OpDesc* grad_op_desc_; + std::unordered_map* grad_to_var_; + framework::BlockDesc* block_; +}; + +class Layer { + public: + virtual ~Layer() {} + + virtual std::vector Forward(const std::vector& inputs) { + std::vector vars; + return vars; + } + + virtual void Backward() { LOG(ERROR) << "To support customize"; } +}; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/tracer.cc b/paddle/fluid/imperative/tracer.cc new file mode 100644 index 
0000000000000000000000000000000000000000..f64f9e72c4a23528948183b909d65e90783a4463 --- /dev/null +++ b/paddle/fluid/imperative/tracer.cc @@ -0,0 +1,19 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/imperative/tracer.h" + +namespace paddle { +namespace imperative {} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/imperative/tracer.h b/paddle/fluid/imperative/tracer.h new file mode 100644 index 0000000000000000000000000000000000000000..433d07c0e5aa0986ab1e9fe349ef865d2851c0c0 --- /dev/null +++ b/paddle/fluid/imperative/tracer.h @@ -0,0 +1,128 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include +#include +#include + +#include "paddle/fluid/framework/op_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/imperative/engine.h" +#include "paddle/fluid/imperative/layer.h" + +namespace paddle { +namespace imperative { + +void CreateGradOp(const framework::OpDesc& op_desc, + const std::unordered_set& no_grad_set, + const std::vector& grad_sub_block, + framework::OpDesc** grad_op_desc, + std::unordered_map* grad_to_var) { + std::vector> grad_op_descs = + framework::OpInfoMap::Instance() + .Get(op_desc.Type()) + .GradOpMaker()(op_desc, no_grad_set, grad_to_var, grad_sub_block); + PADDLE_ENFORCE(grad_op_descs.size() == 1, "Only support 1 grad op now."); + // TODO(panyx0718): Leak? + *grad_op_desc = grad_op_descs[0].release(); +} + +class Tracer { + public: + explicit Tracer(framework::BlockDesc* root_block) : root_block_(root_block) { + root_scope_ = new framework::Scope(); + scopes_[root_block_] = root_scope_; + } + + virtual ~Tracer() { delete root_scope_; } + + void Trace(OpBase* op, const std::vector& inputs, + const std::vector& outputs, + framework::BlockDesc* block) { + framework::Scope* scope = GetScope(block); + framework::OpDesc* op_desc = op->op_desc_; + VLOG(3) << "tracer tracing " << op_desc->Type(); + op_desc->InferShape(*block); + op_desc->InferVarType(block); + std::unique_ptr op_base = + framework::OpRegistry::CreateOp(*op_desc); + + *op->input_vars_ = inputs; + for (VarBase* input : inputs) { + const std::string vname = input->var_desc_->Name(); + framework::Variable* var = scope->Var(vname); + input->var_ = var; + if (!var->IsInitialized()) { + framework::VarDesc* var_desc = block->FindVar(vname); + if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else { + LOG(ERROR) << "tracer doesn't support yet"; + } + } + if (input->pre_op_) { + op->pre_ops_->push_back(input->pre_op_); + 
op->pre_ops_out_idx_->push_back(input->pre_op_out_idx_); + } else { + op->pre_ops_->push_back(nullptr); + } + } + + *op->output_vars_ = outputs; + for (size_t i = 0; i < outputs.size(); ++i) { + const std::string vname = outputs[i]->var_desc_->Name(); + framework::Variable* var = scope->Var(vname); + if (!var->IsInitialized()) { + framework::VarDesc* var_desc = block->FindVar(vname); + if (var_desc->GetType() == framework::proto::VarType::LOD_TENSOR) { + var->GetMutable(); + } else { + LOG(ERROR) << "tracer doesn't support yet"; + } + } + outputs[i]->var_ = var; + outputs[i]->pre_op_ = op; + outputs[i]->pre_op_out_idx_ = i; + } + op_base->Run(*scope, platform::CPUPlace()); + framework::OpDesc* grad_op_desc; + auto grad_to_var = new std::unordered_map(); + CreateGradOp(*op_desc, {}, {block}, &grad_op_desc, grad_to_var); + op->grad_op_desc_ = grad_op_desc; + op->grad_to_var_ = grad_to_var; + op->block_ = block; + } + + framework::Scope* GetScope(framework::BlockDesc* block) { + if (scopes_.find(block) != scopes_.end()) { + return scopes_.at(block); + } + framework::BlockDesc* parent_block = block->ParentBlock(); + PADDLE_ENFORCE(scopes_.find(parent_block) != scopes_.end()); + framework::Scope* scope = &scopes_[parent_block]->NewScope(); + scopes_[block] = scope; + return scope; + } + + private: + std::map scopes_; + framework::BlockDesc* root_block_; + framework::Scope* root_scope_; +}; + +} // namespace imperative +} // namespace paddle diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h index 21203e2d9f4e4cd22ea49ea7b6808aff07e70eff..83d411eecf6d706615243fd78cb7e4330d904fc1 100644 --- a/paddle/fluid/inference/analysis/argument.h +++ b/paddle/fluid/inference/analysis/argument.h @@ -103,6 +103,7 @@ struct Argument { // Model specified with program and parameters files. 
DECL_ARGUMENT_FIELD(model_program_path, ModelProgramPath, std::string); DECL_ARGUMENT_FIELD(model_params_path, ModelParamsPath, std::string); + DECL_ARGUMENT_FIELD(model_from_memory, ModelFromMemory, bool); // The overall graph to work on. DECL_ARGUMENT_UNIQUE_FIELD(main_graph, MainGraph, framework::ir::Graph); @@ -115,6 +116,10 @@ struct Argument { DECL_ARGUMENT_FIELD(ir_analysis_passes, IrAnalysisPasses, std::vector); + // Pass a set of op types to enable its mkldnn kernel + DECL_ARGUMENT_FIELD(mkldnn_enabled_op_types, MKLDNNEnabledOpTypes, + std::unordered_set); + DECL_ARGUMENT_FIELD(use_gpu, UseGPU, bool); DECL_ARGUMENT_FIELD(gpu_device_id, GPUDeviceId, int); DECL_ARGUMENT_FIELD(use_tensorrt, UseTensorRT, bool); diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc index fce5e1cac92064a320179243380ea02b2c5d7838..51bca8039d4531536cd7a3c39ef8a27f1a5412a1 100644 --- a/paddle/fluid/inference/analysis/ir_pass_manager.cc +++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc @@ -63,6 +63,11 @@ void IRPassManager::CreatePasses(Argument *argument, pass->Set("graph_viz_path", new std::string(std::move(dot_file_path))); pass_num++; } + if (pass_name == "mkldnn_placement_pass") { + pass->Set("mkldnn_enabled_op_types", + new std::unordered_set( + argument->mkldnn_enabled_op_types())); + } if (pass_name == "tensorrt_subgraph_pass") { PADDLE_ENFORCE(argument->tensorrt_node_teller_valid()); diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc index c6b7c05f784b7c44fe30dd69529fe48405538ab6..4ffe5f575c232ccfc0089cb86e28737e56b32f94 100644 --- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc +++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc @@ -178,11 +178,12 @@ void TensorRtSubgraphPass::CreateTensorRTOp(framework::ir::Node *node, 
output_mapping.push_back(output_name_map[name]); } - *block_desc.Proto()->mutable_vars() = - const_cast(&graph->program()) - ->Proto() - ->blocks(0) - .vars(); + auto *vars = block_desc.Proto()->mutable_vars(); + for (framework::ir::Node *node : graph->Nodes()) { + if (node->IsVar() && node->Var()) { + *vars->Add() = *node->Var()->Proto(); + } + } PADDLE_ENFORCE(!block_desc.Proto()->vars().empty(), "the block has no var-desc"); PADDLE_ENFORCE(!output_mapping.empty()); diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc index 740030c3a80e4d7e2ac47998a304be97758b95cb..b8a045c18fab54581b4d2b902be373f55ad09e8a 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.cc @@ -46,7 +46,7 @@ void IrGraphBuildPass::RunImpl(Argument *argument) { argument->model_params_path_valid()) { auto program = LoadModel(argument->model_program_path(), argument->model_params_path(), - argument->scope_ptr(), place); + argument->scope_ptr(), place, argument->model_from_memory()); argument->SetMainProgram(program.release()); } else { PADDLE_THROW( @@ -68,9 +68,14 @@ std::unique_ptr IrGraphBuildPass::LoadModel( std::unique_ptr IrGraphBuildPass::LoadModel( const std::string &program_path, const std::string &params_path, - framework::Scope *scope, const platform::Place &place) { + framework::Scope *scope, const platform::Place &place, + bool model_from_memory) { framework::Executor exe(place); - return Load(&exe, scope, program_path, params_path); + if (!model_from_memory) { + return Load(&exe, scope, program_path, params_path); + } else { + return LoadFromMemory(&exe, scope, program_path, params_path); + } } std::string IrGraphBuildPass::repr() const { return "ir-graph-build-pass"; } diff --git a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h index 
271e64fce579bc9001b1dd632576571cec949752..adbde0433fad28b006b18b47c8fd0a8946d21a98 100644 --- a/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h +++ b/paddle/fluid/inference/analysis/passes/ir_graph_build_pass.h @@ -24,7 +24,7 @@ namespace inference { namespace analysis { /* - * Load program and parameter to memory from the disk. + * Load program and parameter to memory from the disk or directly from memory. */ class IrGraphBuildPass : public AnalysisPass { public: @@ -38,7 +38,8 @@ class IrGraphBuildPass : public AnalysisPass { const platform::Place &place); std::unique_ptr LoadModel( const std::string &program_path, const std::string &params_path, - framework::Scope *scope, const platform::Place &place); + framework::Scope *scope, const platform::Place &place, + bool model_from_memory); std::string model_binary_str_; }; diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc index dd75f0d9a65404908667d873786160ddaa73fa57..dcefdd92f5157dce7426f2f3e4a2bc053ce24775 100644 --- a/paddle/fluid/inference/api/analysis_config.cc +++ b/paddle/fluid/inference/api/analysis_config.cc @@ -49,10 +49,15 @@ contrib::AnalysisConfig::AnalysisConfig(const contrib::AnalysisConfig &other) { cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; // fields from this. enable_ir_optim = other.enable_ir_optim; + // For mkldnn + use_mkldnn_ = other.use_mkldnn_; + mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_; + use_feed_fetch_ops = other.use_feed_fetch_ops; use_tensorrt_ = other.use_tensorrt_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; tensorrt_workspace_size_ = other.tensorrt_workspace_size_; + model_from_memory_ = other.model_from_memory_; if (use_gpu) { pass_builder_.reset(new GpuPassStrategy( @@ -76,10 +81,16 @@ contrib::AnalysisConfig::AnalysisConfig(contrib::AnalysisConfig &&other) { cpu_math_library_num_threads_ = other.cpu_math_library_num_threads_; // fields from this. 
enable_ir_optim = other.enable_ir_optim; + // For mkldnn + use_mkldnn_ = other.use_mkldnn_; + mkldnn_enabled_op_types_ = other.mkldnn_enabled_op_types_; + use_feed_fetch_ops = other.use_feed_fetch_ops; use_tensorrt_ = other.use_tensorrt_; tensorrt_max_batchsize_ = other.tensorrt_max_batchsize_; tensorrt_workspace_size_ = other.tensorrt_workspace_size_; + model_from_memory_ = other.model_from_memory_; + pass_builder_ = std::move(other.pass_builder_); } @@ -102,4 +113,13 @@ void contrib::AnalysisConfig::EnableTensorRtEngine(int workspace_size, pass_builder()->InsertPass(1, "tensorrt_subgraph_pass"); } +void contrib::AnalysisConfig::SetModelBuffer(const char *prog_buffer, + size_t prog_buffer_size, + const char *param_buffer, + size_t param_buffer_size) { + prog_file = std::string(prog_buffer, prog_buffer + prog_buffer_size); + param_file = std::string(param_buffer, param_buffer + param_buffer_size); + model_from_memory_ = true; +} + } // namespace paddle diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 391330a7c0f2dda731fe8455fdab81b276e3f272..be51e7fc1f01c5fc4a48c7f32db15bb82a5ddc07 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -308,6 +308,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetUseGPU(config_.use_gpu); argument_.SetGPUDeviceId(config_.device); + argument_.SetModelFromMemory(config_.model_from_memory_); // Analyze inference_program if (!config_.model_dir.empty()) { argument_.SetModelDir(config_.model_dir); @@ -326,6 +327,10 @@ void AnalysisPredictor::OptimizeInferenceProgram() { argument_.SetTensorRtMaxBatchSize(config_.tensorrt_max_batchsize_); } + if (config_.use_mkldnn_) { + argument_.SetMKLDNNEnabledOpTypes(config_.mkldnn_enabled_op_types_); + } + auto passes = config_.pass_builder()->AllPasses(); if (!config_.enable_ir_optim) passes.clear(); argument_.SetIrAnalysisPasses(passes); @@ 
-448,20 +453,24 @@ bool AnalysisPredictor::LoadProgramDesc() { return false; } - std::string pb_content; - // Read binary - std::ifstream fin(filename, std::ios::in | std::ios::binary); - PADDLE_ENFORCE(static_cast(fin), "Cannot open file %s", filename); - fin.seekg(0, std::ios::end); - - pb_content.resize(fin.tellg()); - fin.seekg(0, std::ios::beg); - fin.read(&(pb_content.at(0)), pb_content.size()); - fin.close(); - // Create ProgramDesc framework::proto::ProgramDesc proto; - proto.ParseFromString(pb_content); + if (!config_.model_from_memory()) { + std::string pb_content; + // Read binary + std::ifstream fin(filename, std::ios::in | std::ios::binary); + PADDLE_ENFORCE(static_cast(fin.is_open()), "Cannot open file %s", + filename); + fin.seekg(0, std::ios::end); + pb_content.resize(fin.tellg()); + fin.seekg(0, std::ios::beg); + fin.read(&(pb_content.at(0)), pb_content.size()); + fin.close(); + + proto.ParseFromString(pb_content); + } else { + proto.ParseFromString(config_.prog_file); + } inference_program_.reset(new framework::ProgramDesc(proto)); return true; } @@ -469,6 +478,7 @@ bool AnalysisPredictor::LoadProgramDesc() { bool AnalysisPredictor::LoadParameters() { PADDLE_ENFORCE_NOT_NULL(inference_program_.get(), "The inference program should be loaded first."); + const auto &global_block = inference_program_->MutableBlock(0); // create a temporary program to load parameters. 
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h index a09bd1cac2aa31b2ecee9b0f77d2b777104f1161..f05b9832da55f10b34eb2df914e443a478e5a4a4 100644 --- a/paddle/fluid/inference/api/paddle_analysis_config.h +++ b/paddle/fluid/inference/api/paddle_analysis_config.h @@ -16,6 +16,7 @@ #include #include #include +#include #include // Here we include some header files with relative paths, for that in deploy, @@ -52,18 +53,26 @@ struct AnalysisConfig : public NativeConfig { bool use_tensorrt() const { return use_tensorrt_; } void EnableMKLDNN(); - // NOTE this is just for internal development, please not use it. - // NOT stable yet. bool use_mkldnn() const { return use_mkldnn_; } + void SetMKLDNNOp(std::unordered_set op_list) { + mkldnn_enabled_op_types_ = op_list; + } + + // Specify the memory buffer of program and parameter + void SetModelBuffer(const char* prog_buffer, size_t prog_buffer_size, + const char* program_buffer, size_t program_buffer_size); + bool model_from_memory() const { return model_from_memory_; } friend class ::paddle::AnalysisPredictor; protected: bool use_tensorrt_{false}; bool use_mkldnn_{false}; + std::unordered_set mkldnn_enabled_op_types_; int tensorrt_workspace_size_; int tensorrt_max_batchsize_; std::unique_ptr pass_builder_; + bool model_from_memory_{false}; }; // Configurations for Anakin engine. 
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.h b/paddle/fluid/inference/api/paddle_pass_builder.h index 825bee833bf918067497f56adebbbcaf55f892a2..bc5139a7e54eaf7133ea96ae3b36915a236a2c5e 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.h +++ b/paddle/fluid/inference/api/paddle_pass_builder.h @@ -98,9 +98,10 @@ class CpuPassStrategy : public PassStrategy { passes_.insert(passes_.begin(), "mkldnn_placement_pass"); for (auto &pass : - std::vector({"depthwise_conv_mkldnn_pass", // - "conv_bias_mkldnn_fuse_pass", // - "conv_relu_mkldnn_fuse_pass", // + std::vector({"depthwise_conv_mkldnn_pass", // + "conv_bias_mkldnn_fuse_pass", // + "conv3d_bias_mkldnn_fuse_pass", // + "conv_relu_mkldnn_fuse_pass", // "conv_elementwise_add_mkldnn_fuse_pass"})) { passes_.push_back(pass); } diff --git a/paddle/fluid/inference/io.cc b/paddle/fluid/inference/io.cc index 31f43bfdcaafb18c611d86ef26fd9de118562799..24d15f12f9cd4a9280cd316bd727fdbccb831b9b 100644 --- a/paddle/fluid/inference/io.cc +++ b/paddle/fluid/inference/io.cc @@ -69,7 +69,8 @@ bool IsPersistable(const framework::VarDesc* var) { void LoadPersistables(framework::Executor* executor, framework::Scope* scope, const framework::ProgramDesc& main_program, const std::string& dirname, - const std::string& param_filename) { + const std::string& param_filename, + bool model_from_memory = false) { const framework::BlockDesc& global_block = main_program.Block(0); framework::ProgramDesc* load_program = new framework::ProgramDesc(); @@ -108,6 +109,7 @@ void LoadPersistables(framework::Executor* executor, framework::Scope* scope, op->SetType("load_combine"); op->SetOutput("Out", paramlist); op->SetAttr("file_path", {param_filename}); + op->SetAttr("model_from_memory", {model_from_memory}); op->CheckAttrs(); } @@ -130,16 +132,17 @@ std::unique_ptr Load(framework::Executor* executor, "model version %ld is not supported.", main_program->Version()); - LoadPersistables(executor, scope, *main_program, dirname, ""); + 
// model_from_memory is false in separate parameters. + LoadPersistables(executor, scope, *main_program, dirname, "", + false /* model_from_memory */); return main_program; } std::unique_ptr Load( framework::Executor* executor, framework::Scope* scope, const std::string& prog_filename, const std::string& param_filename) { - std::string model_filename = prog_filename; std::string program_desc_str; - ReadBinaryFile(model_filename, &program_desc_str); + ReadBinaryFile(prog_filename, &program_desc_str); std::unique_ptr main_program( new framework::ProgramDesc(program_desc_str)); @@ -147,7 +150,22 @@ std::unique_ptr Load( "model version %ld is not supported.", main_program->Version()); - LoadPersistables(executor, scope, *main_program, "", param_filename); + LoadPersistables(executor, scope, *main_program, "", param_filename, + false /* model_from_memory */); + return main_program; +} + +std::unique_ptr LoadFromMemory( + framework::Executor* executor, framework::Scope* scope, + const std::string& prog_buffer, const std::string& param_buffer) { + std::unique_ptr main_program( + new framework::ProgramDesc(prog_buffer)); + PADDLE_ENFORCE(framework::IsProgramVersionSupported(main_program->Version()), + "model version %ld is not supported.", + main_program->Version()); + + LoadPersistables(executor, scope, *main_program, "", param_buffer, + true /* model_from_memory */); return main_program; } diff --git a/paddle/fluid/inference/io.h b/paddle/fluid/inference/io.h index ab492577c1476abee30d6dd1c740394391e5a93a..317ef9d93acf3af174cb44da6099425fff1418eb 100644 --- a/paddle/fluid/inference/io.h +++ b/paddle/fluid/inference/io.h @@ -30,7 +30,8 @@ void Init(const std::vector argv); void LoadPersistables(framework::Executor* executor, framework::Scope* scope, const framework::ProgramDesc& main_program, const std::string& dirname, - const std::string& param_filename); + const std::string& param_filename, + bool model_from_memory); std::unique_ptr Load(framework::Executor* executor, 
framework::Scope* scope, @@ -41,6 +42,10 @@ std::unique_ptr Load(framework::Executor* executor, const std::string& prog_filename, const std::string& param_filename); +std::unique_ptr LoadFromMemory( + framework::Executor* executor, framework::Scope* scope, + const std::string& prog_buffer, const std::string& param_buffer); + // Save the variables from a scope to disk. void SaveVars(const framework::Scope& scope, const std::vector& vars, const std::string& dirname, diff --git a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc index 343fd3f7c5aed6931fc215445c17d3ed7074368e..1d0d83d1f368f879878a4df8b2eefae0bc89423d 100644 --- a/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/pool2d_op.cc @@ -109,8 +109,12 @@ class Pool2dOpConverter : public OpConverter { } if (pool_type == "max") { - nvinfer1::DimsHW pre_pad(paddings[0], paddings[1]); - nvinfer1::DimsHW post_pad(paddings[0], paddings[1]); + // Under ceil mode, the pre_pad and post_pad are used to + // record the padding size. In some ceil mode cases, + // we do not need padding, so we initialize the two vars to 0. + + nvinfer1::DimsHW pre_pad(0, 0); + nvinfer1::DimsHW post_pad(0, 0); if (ceil_mode) { // If ceil mode is true, we will pad the appropriate size to the input. 
DealCeilMode(input_shape, ksize, strides, paddings, &pre_pad, &post_pad, diff --git a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc index 453f222f1f1e3f3b9ee8fa7bd49f4cab2286e7ea..b086c910d38a243d98315f2d6eb82ecc0ec5c06d 100644 --- a/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc +++ b/paddle/fluid/inference/tensorrt/convert/test_prelu_op.cc @@ -90,5 +90,4 @@ TEST(prelu_op, test_scalar) { } // namespace inference } // namespace paddle -// USE_OP(prelu); -USE_CPU_ONLY_OP(prelu); +USE_OP(prelu); diff --git a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt index e822785ad6f4f6f67b72141f3e7b04aefa72e58b..95443e813327c1247ac530c4d2e68b3607ff0e73 100644 --- a/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt +++ b/paddle/fluid/inference/tensorrt/plugin/CMakeLists.txt @@ -1,4 +1,4 @@ nv_library(tensorrt_plugin SRCS trt_plugin.cc split_op_plugin.cu elementwise_op_plugin.cu prelu_op_plugin.cu avg_pool_op_plugin.cu - DEPS enforce tensorrt_engine) + DEPS enforce tensorrt_engine prelu) diff --git a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu index e8f4254402a5d8a5e6c5a2384bf9fbe48341956e..3075e87ea6d719a3f49d14c8c4b8015f7d688a50 100644 --- a/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu +++ b/paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.cu @@ -14,92 +14,16 @@ #include #include +#include #include "glog/logging.h" #include "paddle/fluid/inference/tensorrt/plugin/prelu_op_plugin.h" +#include "paddle/fluid/operators/math/prelu.h" namespace paddle { namespace inference { namespace tensorrt { namespace plugin { -static const int CUDA_NUM_THREADS = 1024; -static const int CUDA_MAX_NUM_BLOCKS = 65535; -inline static int GET_NUM_BLOCKS(const int N) { - return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; -} - -__global__ void 
PReluChannelWiseKernel(const float *input, const float *alpha, - float *output, int channel, - size_t spatial_size) { - size_t offset = blockIdx.x * spatial_size; - const float *in = input + offset; - float *out = output + offset; - float scale = alpha[blockIdx.x % channel]; - - for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) { - float x = in[i]; - out[i] = (x > 0) ? x : scale * x; - } -} - -__global__ void PReluElementWiseKernel(const float *input, const float *alpha, - float *output, size_t spatial_size) { - size_t offset = blockIdx.x * spatial_size; - const float *in = input + offset; - const float *scale = alpha + offset; - float *out = output + offset; - - for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) { - float x = in[i]; - out[i] = (x > 0) ? x : scale[i] * x; - } -} - -__global__ void PReluScalarKernel(const float *input, const float *alpha, - float *output, size_t spatial_size) { - size_t offset = blockIdx.x * spatial_size; - const float *in = input + offset; - float scale = *alpha; - float *out = output + offset; - - for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) { - float x = in[i]; - out[i] = (x > 0) ? 
x : scale * x; - } -} - -static inline void PReluChannelWise(cudaStream_t stream, const float *input, - const float *alpha, float *output, - int batch_size, - const nvinfer1::Dims &dims) { - size_t unroll = batch_size * dims.d[0]; - size_t spatial_size = dims.d[1] * dims.d[2]; - CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); - PReluChannelWiseKernel<<>>( - input, alpha, output, dims.d[0], spatial_size); -} - -static inline void PReluElementWise(cudaStream_t stream, const float *input, - const float *alpha, float *output, - int batch_size, - const nvinfer1::Dims &dims) { - size_t unroll = batch_size * dims.d[0]; - size_t spatial_size = dims.d[1] * dims.d[2]; - CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); - PReluElementWiseKernel<<>>( - input, alpha, output, spatial_size); -} - -static inline void PReluScalar(cudaStream_t stream, const float *input, - const float *alpha, float *output, - int batch_size, const nvinfer1::Dims &dims) { - size_t unroll = batch_size * dims.d[0]; - size_t spatial_size = dims.d[1] * dims.d[2]; - CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); - PReluScalarKernel<<>>( - input, alpha, output, spatial_size); -} - nvinfer1::Dims PReluPlugin::getOutputDimensions(int index, const nvinfer1::Dims *inputDims, int nbInputs) { @@ -110,19 +34,31 @@ nvinfer1::Dims PReluPlugin::getOutputDimensions(int index, return output_dims; } -int PReluPlugin::enqueue(int batchSize, const void *const *inputs, +int PReluPlugin::enqueue(int batch_size, const void *const *inputs, void **outputs, void *workspace, cudaStream_t stream) { // input dims is CHW. 
const auto &input_dims = this->getInputDims(0); const float *input = reinterpret_cast(inputs[0]); const float *alpha = reinterpret_cast(alpha_.get().values); float *output = reinterpret_cast(outputs)[0]; + + std::vector input_shape; + input_shape.push_back(batch_size); + for (int i = 0; i < input_dims.nbDims; i++) { + input_shape.push_back(input_dims.d[i]); + } + if (mode_ == "channel") { - PReluChannelWise(stream, input, alpha, output, batchSize, input_dims); + operators::math::PreluChannelWiseDirectCUDAFunctor + prelu_channel_wise; + prelu_channel_wise(stream, input, alpha, output, input_shape); } else if (mode_ == "element") { - PReluElementWise(stream, input, alpha, output, batchSize, input_dims); + operators::math::PreluElementWiseDirectCUDAFunctor + prelu_element_wise; + prelu_element_wise(stream, input, alpha, output, input_shape); } else { - PReluScalar(stream, input, alpha, output, batchSize, input_dims); + operators::math::PreluScalarDirectCUDAFunctor prelu_scalar; + prelu_scalar(stream, input, alpha, output, input_shape); } return cudaGetLastError() != cudaSuccess; } diff --git a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc index a3a6130db7cfe75ef558dc901883c29a20088b3f..227e2ff45873fded45899146b97a7bee0c8ad763 100644 --- a/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_dam_tester.cc @@ -188,10 +188,16 @@ void SetInput(std::vector> *inputs) { } // Easy for profiling independently. 
-TEST(Analyzer_dam, profile) { +void profile(bool use_mkldnn = false) { contrib::AnalysisConfig cfg; SetConfig(&cfg); + if (use_mkldnn) { + cfg.EnableMKLDNN(); + std::unordered_set op_list = {"conv3d"}; + cfg.SetMKLDNNOp(op_list); + } + std::vector outputs; std::vector> input_slots_all; SetInput(&input_slots_all); @@ -209,6 +215,11 @@ TEST(Analyzer_dam, profile) { } } +TEST(Analyzer_dam, profile) { profile(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_dam, profile_mkldnn) { profile(true /* use_mkldnn */); } +#endif + // Check the fuse status TEST(Analyzer_dam, fuse_statis) { contrib::AnalysisConfig cfg; @@ -222,9 +233,14 @@ TEST(Analyzer_dam, fuse_statis) { } // Compare result of NativeConfig and AnalysisConfig -TEST(Analyzer_dam, compare) { - contrib::AnalysisConfig cfg; +void compare(bool use_mkldnn = false) { + AnalysisConfig cfg; SetConfig(&cfg); + if (use_mkldnn) { + cfg.EnableMKLDNN(); + std::unordered_set op_list = {"conv3d"}; + cfg.SetMKLDNNOp(op_list); + } std::vector> input_slots_all; SetInput(&input_slots_all); @@ -233,5 +249,10 @@ TEST(Analyzer_dam, compare) { reinterpret_cast(&cfg), input_slots_all); } +TEST(Analyzer_dam, compare) { compare(); } +#ifdef PADDLE_WITH_MKLDNN +TEST(Analyzer_dam, compare_mkldnn) { compare(true /* use_mkldnn */); } +#endif + } // namespace inference } // namespace paddle diff --git a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc index 3a5f844de3cae7eb9b6e3555c5219c6cf8ee1919..66d85420c5701b1bf308b6850465beb6d8a0b703 100644 --- a/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc +++ b/paddle/fluid/inference/tests/api/analyzer_ner_tester.cc @@ -93,9 +93,17 @@ void PrepareInputs(std::vector *input_slots, DataRecord *data, } } -void SetConfig(contrib::AnalysisConfig *cfg) { - cfg->prog_file = FLAGS_infer_model + "/__model__"; - cfg->param_file = FLAGS_infer_model + "/param"; +void SetConfig(contrib::AnalysisConfig *cfg, bool memory_load = false) { + if 
(memory_load) { + std::string buffer_prog, buffer_param; + ReadBinaryFile(FLAGS_infer_model + "/__model__", &buffer_prog); + ReadBinaryFile(FLAGS_infer_model + "/param", &buffer_param); + cfg->SetModelBuffer(&buffer_prog[0], buffer_prog.size(), &buffer_param[0], + buffer_param.size()); + } else { + cfg->prog_file = FLAGS_infer_model + "/__model__"; + cfg->param_file = FLAGS_infer_model + "/param"; + } cfg->use_gpu = false; cfg->device = 0; cfg->specify_input_name = true; @@ -114,9 +122,9 @@ void SetInput(std::vector> *inputs) { } // Easy for profiling independently. -TEST(Analyzer_Chinese_ner, profile) { +void profile(bool memory_load = false) { contrib::AnalysisConfig cfg; - SetConfig(&cfg); + SetConfig(&cfg, memory_load); std::vector outputs; std::vector> input_slots_all; @@ -138,6 +146,12 @@ TEST(Analyzer_Chinese_ner, profile) { } } +TEST(Analyzer_Chinese_ner, profile) { profile(); } + +TEST(Analyzer_Chinese_ner, profile_memory_load) { + profile(true /* memory_load */); +} + // Check the fuse status TEST(Analyzer_Chinese_ner, fuse_statis) { contrib::AnalysisConfig cfg; diff --git a/paddle/fluid/inference/tests/api/config_printer.h b/paddle/fluid/inference/tests/api/config_printer.h index 4231eef7220735d0b80eb1adc951c55ff7378f1b..7046bce303e2bd46197ab512ae273500b9af88bf 100644 --- a/paddle/fluid/inference/tests/api/config_printer.h +++ b/paddle/fluid/inference/tests/api/config_printer.h @@ -49,8 +49,6 @@ std::ostream &operator<<(std::ostream &os, const NativeConfig &config) { os << GenSpaces(num_spaces) << "device: " << config.device << "\n"; os << GenSpaces(num_spaces) << "fraction_of_gpu_memory: " << config.fraction_of_gpu_memory << "\n"; - os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n"; - os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n"; os << GenSpaces(num_spaces) << "specify_input_name: " << config.specify_input_name << "\n"; os << GenSpaces(num_spaces) @@ -65,6 +63,13 @@ std::ostream 
&operator<<(std::ostream &os, os << GenSpaces(num_spaces) << "contrib::AnalysisConfig {\n"; num_spaces++; os << *reinterpret_cast(&config); + if (!config.model_from_memory()) { + os << GenSpaces(num_spaces) << "prog_file: " << config.prog_file << "\n"; + os << GenSpaces(num_spaces) << "param_file: " << config.param_file << "\n"; + } else { + os << GenSpaces(num_spaces) + << "prog_file and param_file: load from memory \n"; + } os << GenSpaces(num_spaces) << "enable_ir_optim: " << config.enable_ir_optim << "\n"; os << GenSpaces(num_spaces) diff --git a/paddle/fluid/inference/utils/CMakeLists.txt b/paddle/fluid/inference/utils/CMakeLists.txt index 2104e4ac7222258ee025bd5acd60b1db251df654..cfb80fe6ec11a55a887c7552ec4e6a8a0c6a2fce 100644 --- a/paddle/fluid/inference/utils/CMakeLists.txt +++ b/paddle/fluid/inference/utils/CMakeLists.txt @@ -1,2 +1,7 @@ cc_library(benchmark SRCS benchmark.cc DEPS enforce) cc_test(test_benchmark SRCS benchmark_tester.cc DEPS benchmark) +cc_binary(visualizer SRCS visualizer.cc DEPS analysis + paddle_pass_builder ir_pass_manager pass graph_viz_pass analysis_passes) +if(WIN32) + target_link_libraries(visualizer shlwapi) +endif(WIN32) diff --git a/paddle/fluid/inference/utils/visualizer.cc b/paddle/fluid/inference/utils/visualizer.cc new file mode 100644 index 0000000000000000000000000000000000000000..040b6476fb4febc5ca1912c8db72dc63c3bced08 --- /dev/null +++ b/paddle/fluid/inference/utils/visualizer.cc @@ -0,0 +1,92 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/inference/utils/visualizer.h" +#include +#include +#include +#include +#include "paddle/fluid/framework/ir/graph_viz_pass.h" +#include "paddle/fluid/inference/analysis/analyzer.h" +#include "paddle/fluid/inference/analysis/passes/ir_analysis_pass.h" +#include "paddle/fluid/platform/init.h" + +DEFINE_string(model_dir, "", "model directory"); +DEFINE_string(model_program_path, "", "model program path"); +DEFINE_string(model_params_path, "", "model params path"); + +USE_PASS(graph_viz_pass); +USE_PASS(graph_to_program_pass); + +using paddle::inference::analysis::Argument; + +namespace paddle { +namespace inference { +namespace utils { + +void Visualizer::SetArgument(Argument *argument) { argument_ = argument; } + +bool Visualizer::Run() { + paddle::framework::InitDevices(false); + paddle::inference::analysis::Analyzer().Run(argument_); + + return true; +} + +} // namespace utils +} // namespace inference +} // namespace paddle + +// Generate a dot file describing the structure of graph. +// To use this tool, run command: ./visualizer [options...] 
+// Options: +// --model_dir: the directory of model +// --model_program_path: the path of program +// --model_params_path: the path of params +int main(int argc, char *argv[]) { + gflags::ParseCommandLineFlags(&argc, &argv, true); + google::InitGoogleLogging(argv[0]); + + paddle::inference::analysis::Argument argument; + argument.SetUseGPU(false); + argument.SetUseTensorRT(false); + + if (FLAGS_model_dir.empty()) { + if (FLAGS_model_program_path.empty() || FLAGS_model_params_path.empty()) { + LOG(ERROR) << "Please set model_dir" + " or model_program_path and model_params_path"; + return -1; + } else { + argument.SetModelProgramPath(FLAGS_model_program_path); + argument.SetModelParamsPath(FLAGS_model_params_path); + } + } else { + argument.SetModelDir(FLAGS_model_dir); + } + + // Only 1 pass, default filename is 0_ir_origin.dot + // For more details, see paddle::inference::analysis::IRPassManager + argument.SetIrAnalysisPasses({"graph_viz_pass"}); + + std::unique_ptr scope{ + new paddle::framework::Scope()}; + argument.SetScopeNotOwned( + const_cast(scope.get())); + + paddle::inference::utils::Visualizer visualizer; + visualizer.SetArgument(&argument); + visualizer.Run(); + + return 0; +} diff --git a/paddle/fluid/inference/utils/visualizer.h b/paddle/fluid/inference/utils/visualizer.h new file mode 100644 index 0000000000000000000000000000000000000000..be532f92cf60e06094bfcf8cc2be85085795fcf4 --- /dev/null +++ b/paddle/fluid/inference/utils/visualizer.h @@ -0,0 +1,42 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include +#include "paddle/fluid/inference/analysis/argument.h" + +namespace paddle { +namespace inference { +namespace utils { + +using paddle::inference::analysis::Argument; + +class Visualizer final { + public: + Visualizer() = default; + ~Visualizer() = default; + Visualizer(const Visualizer &) = delete; + Visualizer &operator=(const Visualizer &) = delete; + + void SetArgument(Argument *); + bool Run(); + + private: + Argument *argument_; +}; + +} // namespace utils +} // namespace inference +} // namespace paddle diff --git a/paddle/fluid/memory/allocation/legacy_allocator.cc b/paddle/fluid/memory/allocation/legacy_allocator.cc index 26e2038a534c18d2b7ab77adf33846803dcffcf5..64aa63ffe9705d75e70c8d9d9cbc433dd6358596 100644 --- a/paddle/fluid/memory/allocation/legacy_allocator.cc +++ b/paddle/fluid/memory/allocation/legacy_allocator.cc @@ -14,11 +14,13 @@ #include "paddle/fluid/memory/allocation/legacy_allocator.h" #include +#include #include "glog/logging.h" #include "paddle/fluid/memory/detail/buddy_allocator.h" #include "paddle/fluid/memory/detail/system_allocator.h" #include "paddle/fluid/platform/gpu_info.h" #include "paddle/fluid/string/printf.h" +#include "paddle/fluid/string/split.h" DEFINE_bool(init_allocated_mem, false, "It is a mistake that the values of the memory allocated by " @@ -86,7 +88,7 @@ struct NaiveAllocator { template <> void *Alloc(const platform::CPUPlace &place, size_t size) { - VLOG(1) << "Allocate " << size << " bytes on " << platform::Place(place); + VLOG(10) << "Allocate " << size << " bytes on " 
<< platform::Place(place); void *p = GetCPUBuddyAllocator()->Alloc(size); if (FLAGS_init_allocated_mem) { memset(p, 0xEF, size); @@ -97,7 +99,7 @@ void *Alloc(const platform::CPUPlace &place, size_t size) { template <> void Free(const platform::CPUPlace &place, void *p) { - VLOG(1) << "Free pointer=" << p << " on " << platform::Place(place); + VLOG(10) << "Free pointer=" << p << " on " << platform::Place(place); GetCPUBuddyAllocator()->Free(p); } @@ -110,19 +112,21 @@ size_t Used(const platform::CPUPlace &place) { BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { static std::once_flag init_flag; static detail::BuddyAllocator **a_arr = nullptr; + static std::vector devices; std::call_once(init_flag, [gpu_id]() { - int gpu_num = platform::GetCUDADeviceCount(); - PADDLE_ENFORCE(gpu_id < gpu_num, "gpu_id:%d should < gpu_num:%d", gpu_id, - gpu_num); + devices = platform::GetSelectedDevices(); + int gpu_num = devices.size(); a_arr = new BuddyAllocator *[gpu_num]; - for (int i = 0; i < gpu_num; i++) { + for (size_t i = 0; i < devices.size(); ++i) { + int dev_id = devices[i]; a_arr[i] = nullptr; - platform::SetDeviceId(i); - a_arr[i] = new BuddyAllocator( - std::unique_ptr(new detail::GPUAllocator(i)), - platform::GpuMinChunkSize(), platform::GpuMaxChunkSize()); + platform::SetDeviceId(dev_id); + a_arr[i] = new BuddyAllocator(std::unique_ptr( + new detail::GPUAllocator(dev_id)), + platform::GpuMinChunkSize(), + platform::GpuMaxChunkSize()); VLOG(10) << "\n\nNOTE: each GPU device use " << FLAGS_fraction_of_gpu_memory_to_use * 100 @@ -134,7 +138,9 @@ BuddyAllocator *GetGPUBuddyAllocator(int gpu_id) { }); platform::SetDeviceId(gpu_id); - return a_arr[gpu_id]; + auto pos = std::distance(devices.begin(), + std::find(devices.begin(), devices.end(), gpu_id)); + return a_arr[pos]; } #endif diff --git a/paddle/fluid/operators/CMakeLists.txt b/paddle/fluid/operators/CMakeLists.txt index 8c8dc7026e1b3c1bb1899ebcf151f52711ea5bc1..257bfc0a3f926d20abc4647b27e8e9cc2c49e014 100644 --- 
a/paddle/fluid/operators/CMakeLists.txt +++ b/paddle/fluid/operators/CMakeLists.txt @@ -70,7 +70,7 @@ endif() set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence_padding sequence_scale cos_sim_functor memory jit_kernel concat_and_split cross_entropy softmax vol2col im2col sampler) set(COMMON_OP_DEPS ${COMMON_OP_DEPS} sequence2batch lstm_compute matrix_bit_code gru_compute activation_functions) if (WITH_GPU) - set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv) + set(COMMON_OP_DEPS ${COMMON_OP_DEPS} depthwise_conv prelu) endif() # FIXME(typhoonzero): operator deps may not needed. diff --git a/paddle/fluid/operators/activation_mkldnn_op.cc b/paddle/fluid/operators/activation_mkldnn_op.cc index 64649b1a5e471a30f435e2b1c1a9db03d35dbd8a..e16b6f78d16ce29cc493c4c795c7fe97a4bf2550 100644 --- a/paddle/fluid/operators/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/activation_mkldnn_op.cc @@ -100,8 +100,9 @@ void eltwise_forward(const framework::ExecutionContext &ctx, const T *x_data = x->data(); T *y_data = y->mutable_data(ctx.GetPlace()); - PADDLE_ENFORCE(x->dims().size() == 2 || x->dims().size() == 4, - "Input dim must be with 2 or 4"); + PADDLE_ENFORCE( + x->dims().size() == 2 || x->dims().size() == 3 || x->dims().size() == 4, + "Input dim must be with 2, 3 or 4"); std::vector src_tz = framework::vectorize2int(x->dims()); diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc index 832245371e0b1966000ec0252a58ca02193332a7..9c5b8604f40ae56c463b54c71623feb61bd8d297 100644 --- a/paddle/fluid/operators/activation_op.cc +++ b/paddle/fluid/operators/activation_op.cc @@ -76,8 +76,8 @@ framework::OpKernelType GetKernelType(const framework::ExecutionContext& ctx, } #endif return framework::OpKernelType( - framework::ToDataType(ctx.Input(name)->type()), - ctx.GetPlace(), layout, library); + framework::GetDataTypeOfVar(ctx.InputVar(name)), ctx.GetPlace(), layout, + library); } class ActivationOp : public framework::OperatorWithKernel { diff 
--git a/paddle/fluid/operators/activation_op.h b/paddle/fluid/operators/activation_op.h index a0f8c5c14c48cb1e2be60b53a2198e30b050b33d..87d549678a0e6c183aac89539cf1f6331729de2c 100644 --- a/paddle/fluid/operators/activation_op.h +++ b/paddle/fluid/operators/activation_op.h @@ -41,6 +41,12 @@ static std::unordered_set InplaceOpSet = { "floor", "reciprocal", "relu6", "soft_relu", "hard_sigmoid", }; +/* The following operators can be used to process SelectedRows, because the + * output of those operators for zero is zero too. + */ +static std::unordered_set CanBeUsedBySelectedRows = { + "abs", "abs_grad", "square", "square_grad", "sqrt", "sqrt_grad"}; + static bool IsInplace(std::string op) { return InplaceOpSet.count(op); } template @@ -50,16 +56,38 @@ class ActivationKernel using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - auto& X = detail::Ref(context.Input("X"), - "Cannot get input tensor X, variable name = %s", - context.op().Input("X")); - - auto& Out = detail::Ref(context.Output("Out"), - "Cannot get output tensor Out, variable name = %s", - context.op().Output("Out")); - Out.mutable_data(context.GetPlace()); + auto x_var = context.InputVar("X"); + auto out_var = context.OutputVar("Out"); + PADDLE_ENFORCE(x_var != nullptr, + "Cannot get input Variable X, variable name = %s", + context.op().Input("X")); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot get output Variable Out, variable name = %s", + context.op().Output("Out")); + + framework::Tensor X, *Out; + + if (CanBeUsedBySelectedRows.count(context.op().Type())) { + X = detail::Ref( + paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var), + "Cannot get input Tensor X, variable name = %s", + context.op().Input("X")); + Out = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( + out_var); + } else { + X = detail::Ref(context.Input("X"), + "Cannot get input Tensor X, variable name = %s", + context.op().Input("X")); + Out = 
context.Output("Out"); + } + + PADDLE_ENFORCE(Out != nullptr, + "Cannot get output tensor Out, variable name = %s", + context.op().Output("Out")); + + Out->mutable_data(context.GetPlace()); auto x = framework::EigenVector::Flatten(X); - auto out = framework::EigenVector::Flatten(Out); + auto out = framework::EigenVector::Flatten(*Out); auto* place = context.template device_context().eigen_device(); Functor functor; @@ -78,14 +106,54 @@ class ActivationGradKernel public: using T = typename Functor::ELEMENT_TYPE; void Compute(const framework::ExecutionContext& context) const override { - auto* Out = context.Input("Out"); - auto* dOut = - context.Input(framework::GradVarName("Out")); - auto* dX = context.Output(framework::GradVarName("X")); + auto out_var = context.InputVar("Out"); + auto out_grad_var = context.InputVar(framework::GradVarName("Out")); + auto x_grad_var = context.OutputVar(framework::GradVarName("X")); + PADDLE_ENFORCE(out_var != nullptr, + "Cannot get input Variable Out, variable name = %s", + context.op().Input("Out")); + PADDLE_ENFORCE(out_grad_var != nullptr, + "Cannot get input Variable %s, variable name = %s", + framework::GradVarName("Out"), + context.op().Input(framework::GradVarName("Out"))); + PADDLE_ENFORCE(x_grad_var != nullptr, + "Cannot get output Variable %s, variable name = %s", + framework::GradVarName("X"), + context.op().Output(framework::GradVarName("X"))); + + framework::Tensor Out, dOut, *dX; + if (CanBeUsedBySelectedRows.count(context.op().Type())) { + Out = detail::Ref( + paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*out_var), + "Cannot get input Tensor Out, variable name = %s", + context.op().Input("Out")); + dOut = + detail::Ref(paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar( + *out_grad_var), + "Cannot get input Tensor %s, variable name = %s", + framework::GradVarName("Out"), + context.op().Input(framework::GradVarName("Out"))); + dX = paddle::framework::GetMutableLoDTensorOrSelectedRowsValueFromVar( + 
x_grad_var); + } else { + Out = detail::Ref(context.Input("Out"), + "Cannot get input Tensor Out, variable name = %s", + context.op().Input("Out")); + dOut = detail::Ref( + context.Input(framework::GradVarName("Out")), + "Cannot get input Tensor %s, variable name = %s", + framework::GradVarName("Out"), + context.op().Input(framework::GradVarName("Out"))); + dX = context.Output(framework::GradVarName("X")); + } + PADDLE_ENFORCE(dX != nullptr, + "Cannot get output tensor %s, variable name = %s", + framework::GradVarName("X"), + context.op().Output(framework::GradVarName("X"))); dX->mutable_data(context.GetPlace()); - auto dout = framework::EigenVector::Flatten(*dOut); - auto out = framework::EigenVector::Flatten(*Out); + auto dout = framework::EigenVector::Flatten(dOut); + auto out = framework::EigenVector::Flatten(Out); auto dx = framework::EigenVector::Flatten(*dX); auto* place = context.template device_context().eigen_device(); @@ -96,8 +164,19 @@ class ActivationGradKernel } bool inplace = functor.Inplace(); if (!inplace) { - auto* X = context.Input("X"); - auto x = framework::EigenVector::Flatten(*X); + auto x_var = context.InputVar("X"); + PADDLE_ENFORCE(x_var != nullptr, + "Cannot get input tensor X, variable name = %s", + context.op().Input("X")); + framework::Tensor X; + if (CanBeUsedBySelectedRows.count(context.op().Type())) { + X = detail::Ref( + paddle::framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_var)); + } else { + X = detail::Ref(context.Input("X")); + } + + auto x = framework::EigenVector::Flatten(X); functor(*place, x, out, dout, dx); } else { VLOG(10) << " Inplace activation "; diff --git a/paddle/fluid/operators/attention_lstm_op.cc b/paddle/fluid/operators/attention_lstm_op.cc index 9b943440a869e213db4ed761cfe7c508bc5e94ae..75fc59125f21901b6781315eb3d7dba36b7f11f2 100644 --- a/paddle/fluid/operators/attention_lstm_op.cc +++ b/paddle/fluid/operators/attention_lstm_op.cc @@ -231,10 +231,10 @@ use lstm_x_t as input and compute as standard 
LSTM. template inline void bias_relu(const int n, const T* x, const T* bias, T* y) { if (bias) { - math::vec_add_bias(n, *bias, x, y); - math::vec_relu(n, y, y); + math::vec_add_bias(n, *bias, x, y); + math::vec_relu(n, y, y); } else { - math::vec_relu(n, x, y); + math::vec_relu(n, x, y); } } @@ -245,8 +245,8 @@ inline void vec_softmax(const int n, const T* x, T* y) { for (int i = 1; i < n; ++i) { scalar = scalar < x[i] ? x[i] : scalar; } - math::vec_add_bias(n, -scalar, x, y); // sub - math::vec_exp(n, y, y); // exp + math::vec_add_bias(n, -scalar, x, y); // sub + math::vec_exp(n, y, y); // exp // sum scalar = T(0); for (int i = 0; i < n; ++i) { @@ -302,13 +302,13 @@ class AttentionLSTMKernel : public framework::OpKernel { auto& act_gate_str = ctx.Attr("gate_activation"); auto& act_cell_str = ctx.Attr("cell_activation"); auto& act_cand_str = ctx.Attr("candidate_activation"); - if (platform::jit::MayIUse(platform::jit::avx)) { - math::VecActivations act_functor; + if (platform::MayIUse(platform::avx)) { + math::VecActivations act_functor; act_gate = act_functor(act_gate_str); act_cell = act_functor(act_cell_str); act_cand = act_functor(act_cand_str); } else { - math::VecActivations act_functor; + math::VecActivations act_functor; act_gate = act_functor(act_gate_str); act_cell = act_functor(act_cell_str); act_cand = act_functor(act_cand_str); diff --git a/paddle/fluid/operators/bpr_loss_op.cc b/paddle/fluid/operators/bpr_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..9258d7c7e83122149c7cbc42e4a4bdd84903ce67 --- /dev/null +++ b/paddle/fluid/operators/bpr_loss_op.cc @@ -0,0 +1,145 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/bpr_loss_op.h" + +namespace paddle { +namespace operators { + +class BprLossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto label_dims = ctx->GetInputDim("Label"); + int rank = x_dims.size(); + PADDLE_ENFORCE_EQ(rank, label_dims.size(), + "Input(X) and Input(Label) shall have the same rank."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(label_dims, 0, rank - 1), + "Input(X) and Input(Label) shall have the same shape " + "except the last dimension."); + + auto y_dims = x_dims; + y_dims[rank - 1] = 1; + ctx->SetOutputDim("Y", y_dims); + ctx->ShareLoD("X", /*->*/ "Y"); + } + + protected: + // Explicitly set that the data type of computation kernel of Seq-bpr + // is determined by its input "X". 
+ framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); + } +}; + +class BprLossGradientOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should be not null."); + PADDLE_ENFORCE(ctx->HasInput("Label"), "Input(Label) should be not null."); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Y")), + "Input(Y@GRAD) shoudl be not null."); + PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), + "Output(X@GRAD) should be not null."); + + auto x_dims = ctx->GetInputDim("X"); + auto label_dims = ctx->GetInputDim("Label"); + auto dy_dims = ctx->GetInputDim(framework::GradVarName("Y")); + int rank = x_dims.size(); + PADDLE_ENFORCE_EQ(dy_dims.size(), rank, + "Input(Y@Grad) and Input(X) should have the same rank."); + PADDLE_ENFORCE_EQ(label_dims.size(), rank, + "Input(Label) and Input(X) should have the same rank."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(label_dims, 0, rank - 1), + "The Input(X) and Input(Label) should have the same " + "shape except the last dimension."); + PADDLE_ENFORCE_EQ(framework::slice_ddim(x_dims, 0, rank - 1), + framework::slice_ddim(dy_dims, 0, rank - 1), + "The Input(X) and Input(Y@Grad) should have the same " + "shape except the last dimension."); + PADDLE_ENFORCE_EQ(dy_dims[rank - 1], 1, + "The last dimension of Input(Y@Grad) should be 1."); + PADDLE_ENFORCE_EQ(label_dims[rank - 1], 1, + " the last dimension of Input(Label) should be 1."); + ctx->SetOutputDim(framework::GradVarName("X"), x_dims); + ctx->ShareLoD("X", framework::GradVarName("X")); + } + + protected: + // Explicitly set that the data type of computation kernel of cross_entropy + // is 
determined by its input "X". + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); + } +}; + +class BprLossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "(Tensor, default Tensor), a tensor whose last dimension " + "size is equal to the number of classes. This input is a " + "real number."); + AddInput( + "Label", + "(Tensor), the tensor which represents the ground truth. It has the " + "same shape with 'X' except the last dimension. the last dimension " + "size is 1."); + AddOutput("Y", + "(Tensor, default Tensor), a tensor whose shape is same " + "with 'X' except that the last dimension size is 1. It " + "represents the sequence bpr loss."); + AddComment(R"DOC( +Bayesian Personalized Ranking Loss Operator. + +This operator belongs to pairwise ranking loss. Label is the desired item. 
+The loss at a given point in one session is defined as: +$Y[i] = -\frac{1}{N_{i}} * \sum_{j=0}^{N_{i}}\log(\sigma(X[i, Label[i]]-X[i, j]))$ + +Learn more details by reading paper (https://arxiv.org/abs/1511.06939) + +)DOC"); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +using CPUCtx = paddle::platform::CPUDeviceContext; + +REGISTER_OPERATOR(bpr_loss, ops::BprLossOp, ops::BprLossOpMaker, + paddle::framework::DefaultGradOpDescMaker); +REGISTER_OPERATOR(bpr_loss_grad, ops::BprLossGradientOp); +REGISTER_OP_CPU_KERNEL(bpr_loss, ops::BprLossOpKernel, + ops::BprLossOpKernel); +REGISTER_OP_CPU_KERNEL(bpr_loss_grad, + ops::BprLossGradientOpKernel, + ops::BprLossGradientOpKernel); diff --git a/paddle/fluid/operators/bpr_loss_op.h b/paddle/fluid/operators/bpr_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..e223be7af82146e7c69c7c5aab8f08d0fe0d1710 --- /dev/null +++ b/paddle/fluid/operators/bpr_loss_op.h @@ -0,0 +1,118 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/for_range.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +/*Todo: + *Find a way to adapt TolerableValue, using blas or eigen. 
+ */ +template +struct TolerableValue { + HOSTDEVICE T operator()(const T& x) const { + PADDLE_ASSERT(std::is_floating_point::value); + const T kApproInf = 1e20; + if (x == INFINITY) return kApproInf; + if (x == -INFINITY) return -kApproInf; + return x; + } +}; + +template +class BprLossOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* label = ctx.Input("Label"); + auto* y = ctx.Output("Y"); + y->mutable_data(ctx.GetPlace()); + int rank = x->dims().size(); + + Tensor x_2d = framework::ReshapeToMatrix(*x, rank - 1); + Tensor labels_2d = framework::ReshapeToMatrix(*label, rank - 1); + Tensor y_2d = framework::ReshapeToMatrix(*y, rank - 1); + + const framework::Tensor* logits = &x_2d; + const framework::Tensor* labels = &labels_2d; + framework::Tensor* out = &y_2d; + + const int step_size = logits->dims()[0]; + const int class_num = logits->dims()[1]; + const T* logits_data = logits->data(); + T* loss_data = out->data(); + + const int64_t* label_data = labels->data(); + for (int i = 0; i < step_size; ++i) { + int lbl_pos = label_data[i]; + PADDLE_ENFORCE_GE(lbl_pos, 0); + PADDLE_ENFORCE_LT(lbl_pos, class_num); + int index_pos = i * class_num + lbl_pos; + T sum = static_cast(0); + for (int j = 0; j < class_num; j++) { + if (j == lbl_pos) continue; + int index_neg = i * class_num + j; + sum += TolerableValue()(-std::log( + 1.0f + TolerableValue()(std::exp(logits_data[index_neg] - + logits_data[index_pos])))); + } + loss_data[i] = -sum / (class_num - 1); + } + } +}; + +template +class BprLossGradientOpKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* x = ctx.Input("X"); + auto* dy = ctx.Input(framework::GradVarName("Y")); + auto* label = ctx.Input("Label"); + auto* dx = ctx.Output(framework::GradVarName("X")); + + const int step_size = x->dims()[0]; + const int num_classes = x->dims()[1]; 
+ T* dx_data = dx->mutable_data(ctx.GetPlace()); + const T* dy_data = dy->data(); + const T* x_data = x->data(); + const int64_t* label_data = label->data(); + + for (size_t sample_id = 0; sample_id < step_size; sample_id++) { + for (size_t x_offset = sample_id * num_classes; + x_offset < (sample_id + 1) * num_classes; x_offset++) { + dx_data[x_offset] = static_cast(0); + } + auto p_index = sample_id * num_classes + label_data[sample_id]; + for (size_t ni = 0; ni < num_classes; ni++) { + if (label_data[sample_id] == ni) continue; + auto n_index = sample_id * num_classes + ni; + auto grad_ = -dy_data[sample_id] / + ((num_classes - 1) * + (1.0f + TolerableValue()(std::exp(x_data[p_index] - + x_data[n_index])))); + dx_data[p_index] += grad_; + dx_data[n_index] -= grad_; + } + } + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/conv_fusion_op.cu.cc b/paddle/fluid/operators/conv_fusion_op.cu.cc index 2c09ee7394ad605f7a324d021ce0468a79bb71ca..3235ad52b999e1ca3f992034781edaab9921a300 100644 --- a/paddle/fluid/operators/conv_fusion_op.cu.cc +++ b/paddle/fluid/operators/conv_fusion_op.cu.cc @@ -110,11 +110,7 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { auto x_dims = framework::vectorize(input->dims()); auto f_dims = framework::vectorize(filter->dims()); - if (activation == "identity") { - // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is - // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. 
- algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; - } else if (!exhaustive_search) { + if (!exhaustive_search) { CUDNN_ENFORCE(platform::dynload::cudnnGetConvolutionForwardAlgorithm( handle, cudnn_input_desc, cudnn_filter_desc, cudnn_conv_desc, cudnn_output_desc, CUDNN_CONVOLUTION_FWD_SPECIFY_WORKSPACE_LIMIT, @@ -165,18 +161,42 @@ class CUDNNConvFusionOpKernel : public framework::OpKernel { PADDLE_ENFORCE_LE(workspace_size_in_bytes, workspace_size_limit, "workspace_size to be allocated exceeds the limit"); - // ------------------- cudnn conv+bias+act forward -------------------- - ScalingParamType alpha1 = 1.0f; - ScalingParamType alpha2 = residual ? 1.0f : 0.0f; - auto cudnn_func = [&](void* cudnn_workspace) { - CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( - handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, - filter_data, cudnn_conv_desc, algo, cudnn_workspace, - workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, - cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, + if ((activation == "identity") && + (algo != CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM) && + (!residual)) { + // Only the CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM algo is + // enabled with CUDNN_ACTIVATION_IDENTITY in cuDNN lib. 
+ // But test in some case, the speed is slower, change to use + // cudnnConvolutionForward and cudnnAddTensor + // ------------- cudnn conv forward and bias add --------------------- + ScalingParamType alpha = 1.0f, beta = 0.0f; + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionForward( + handle, &alpha, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &beta, cudnn_output_desc, output_data)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + CUDNN_ENFORCE(platform::dynload::cudnnAddTensor( + handle, &alpha, cudnn_bias_desc, bias_data, &alpha, cudnn_output_desc, output_data)); - }; - workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + } else { + if (activation == "identity") { + algo = CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM; + } + // ------------------- cudnn conv+bias+act forward -------------------- + ScalingParamType alpha1 = 1.0f; + ScalingParamType alpha2 = residual ? 
1.0f : 0.0f; + auto cudnn_func = [&](void* cudnn_workspace) { + CUDNN_ENFORCE(platform::dynload::cudnnConvolutionBiasActivationForward( + handle, &alpha1, cudnn_input_desc, input_data, cudnn_filter_desc, + filter_data, cudnn_conv_desc, algo, cudnn_workspace, + workspace_size_in_bytes, &alpha2, cudnn_output_desc, residual_data, + cudnn_bias_desc, bias_data, cudnn_act_desc, cudnn_output_desc, + output_data)); + }; + workspace_handle.RunFunc(cudnn_func, workspace_size_in_bytes); + } } }; #endif diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 05e268bf6a8d9a2562a4c278d317f75dac28e52c..154ff2bb209bb8f932c06caa319223ccf3314767 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -28,6 +28,46 @@ using mkldnn::stream; using platform::to_void_cast; using platform::GetMKLDNNFormat; +inline void GetWeightsTz(std::vector& weights_tz, int groups, // NOLINT + bool is_conv3d) { + if (groups > 1) { + if (is_conv3d) { + int output = weights_tz[0]; + int input = weights_tz[1]; + int dimension = weights_tz[2]; + int height = weights_tz[3]; + int width = weights_tz[4]; + weights_tz.resize(6); + weights_tz[0] = groups; + weights_tz[1] = output / groups; + weights_tz[2] = input; + weights_tz[3] = dimension; + weights_tz[4] = height; + weights_tz[5] = width; + } else { + int output = weights_tz[0]; + int input = weights_tz[1]; + int height = weights_tz[2]; + int width = weights_tz[3]; + weights_tz.resize(5); + weights_tz[0] = groups; + weights_tz[1] = output / groups; + weights_tz[2] = input; + weights_tz[3] = height; + weights_tz[4] = width; + } + } +} + +inline mkldnn::memory::format GetWeightsFormat(mkldnn::memory::format format, + int groups, bool is_conv3d) { + if (is_conv3d) { + return (groups == 1) ? format : mkldnn::memory::format::goidhw; + } else { + return (groups == 1) ? 
format : mkldnn::memory::format::goihw; + } +} + template class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { public: @@ -52,10 +92,10 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && filter->format() != memory::format::format_undef, "Wrong layout/format set for Filter tensor"); - PADDLE_ENFORCE(input->dims().size() == 4, - "Input must be with 4 dimensions, i.e. NCHW"); - PADDLE_ENFORCE(filter->dims().size() == 4, - "Filter must be with 4 dimensions, i.e. OIHW"); + PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5, + "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW"); + PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5, + "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW"); if (bias) { PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN && bias->format() != memory::format::format_undef, @@ -71,9 +111,13 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { bool fuse_residual_conn = ctx.Attr("fuse_residual_connection"); int groups = ctx.Attr("groups"); + bool is_conv3d = strides.size() == 3U; // TODO(tpatejko): add support for dilation PADDLE_ENFORCE( - dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, + is_conv3d + ? 
dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 && + dilations[2] == 1 + : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, "dilation in convolution is not implemented yet"); const T* input_data = input->data(); @@ -83,18 +127,7 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector weights_tz = paddle::framework::vectorize2int(filter->dims()); int g = std::max(groups, 1); - if (g > 1) { - int o = weights_tz[0]; - int i = weights_tz[1]; - int h = weights_tz[2]; - int w = weights_tz[3]; - weights_tz.resize(5); - weights_tz[0] = g; - weights_tz[1] = o / g; - weights_tz[2] = i; - weights_tz[3] = h; - weights_tz[4] = w; - } + GetWeightsTz(weights_tz, g, is_conv3d); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); // Get unique name for storing MKLDNN primitives @@ -105,11 +138,14 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { std::vector pipeline; + auto src_format = input->format(); + mkldnn::memory::format weights_format = + GetWeightsFormat(filter->format(), g, is_conv3d); + auto user_src_md = platform::MKLDNNMemDesc( - {src_tz}, platform::MKLDNNGetDataType(), input->format()); + {src_tz}, platform::MKLDNNGetDataType(), src_format); auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), - (g == 1) ? 
filter->format() : mkldnn::memory::format::goihw); + {weights_tz}, platform::MKLDNNGetDataType(), weights_format); /* create memory descriptor for convolution without specified format * ('any') which lets a primitive (convolution in this case) choose @@ -119,10 +155,16 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { auto chosen_memory_format = platform::data_format_to_memory_format(data_format); + if (is_conv3d) { + chosen_memory_format = + platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format); + } + weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d); + auto src_md = platform::MKLDNNMemDesc( src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto weights_md = platform::MKLDNNMemDesc( - weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + weights_tz, platform::MKLDNNGetDataType(), weights_format); std::vector bias_tz; // TODO(mgallus): avoid empty vector creation. // Currently used whenever bias is != nullptr. auto dst_md = platform::MKLDNNMemDesc( @@ -263,8 +305,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const mkldnn::engine& engine, const bool fuse_relu, const bool fuse_residual_conn, mkldnn::prop_kind fwd_prop_kind) const { - memory::dims stride_dims = {strides[0], strides[1]}; - memory::dims padding_dims = {paddings[0], paddings[1]}; + memory::dims stride_dims = strides; + memory::dims padding_dims = paddings; auto conv_desc = mkldnn::convolution_forward::desc( fwd_prop_kind, mkldnn::convolution_direct, src, weights, dst, @@ -288,8 +330,8 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { const mkldnn::engine& engine, const bool fuse_relu, const bool fuse_residual_conn, mkldnn::prop_kind fwd_prop_kind) const { - memory::dims stride_dims = {strides[0], strides[1]}; - memory::dims padding_dims = {paddings[0], paddings[1]}; + memory::dims stride_dims = strides; + memory::dims padding_dims = paddings; auto conv_desc = 
mkldnn::convolution_forward::desc( fwd_prop_kind, mkldnn::convolution_direct, src, weights, bias, dst, @@ -349,6 +391,7 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { std::vector dilations = ctx.Attr>("dilations"); int groups = ctx.Attr("groups"); + bool is_conv3d = strides.size() == 3U; const T* input_data = input->data(); const T* filter_data = filter->data(); const T* output_grad_data = output_grad->data(); @@ -358,8 +401,14 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { std::vector src_tz = paddle::framework::vectorize2int(input->dims()); std::vector weights_tz = paddle::framework::vectorize2int(filter->dims()); + int g = std::max(groups, 1); + GetWeightsTz(weights_tz, g, is_conv3d); std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + auto src_format = input->format(); + mkldnn::memory::format weights_format = + GetWeightsFormat(filter->format(), g, is_conv3d); + // Get an unique name from "argument" name of "Output" variable // as well as attributes of primitive to be created // This name will be used as key when saving info into device context @@ -372,9 +421,9 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { // Create user memory descriptors auto user_src_md = platform::MKLDNNMemDesc( - {src_tz}, platform::MKLDNNGetDataType(), input->format()); + {src_tz}, platform::MKLDNNGetDataType(), src_format); auto user_weights_md = platform::MKLDNNMemDesc( - {weights_tz}, platform::MKLDNNGetDataType(), filter->format()); + {weights_tz}, platform::MKLDNNGetDataType(), weights_format); auto user_diff_dst_md = platform::MKLDNNMemDesc( {dst_tz}, platform::MKLDNNGetDataType(), output_grad->format()); @@ -386,14 +435,20 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto chosen_memory_format = platform::data_format_to_memory_format(data_format); + if (is_conv3d) { + chosen_memory_format = + platform::MKLDNNFormatForSize(src_tz.size(), chosen_memory_format); + } 
+ weights_format = GetWeightsFormat(chosen_memory_format, g, is_conv3d); + auto src_md = platform::MKLDNNMemDesc( src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto diff_src_md = platform::MKLDNNMemDesc( src_tz, platform::MKLDNNGetDataType(), chosen_memory_format); auto weights_md = platform::MKLDNNMemDesc( - weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + weights_tz, platform::MKLDNNGetDataType(), weights_format); auto diff_weights_md = platform::MKLDNNMemDesc( - weights_tz, platform::MKLDNNGetDataType(), chosen_memory_format); + weights_tz, platform::MKLDNNGetDataType(), weights_format); auto diff_dst_md = platform::MKLDNNMemDesc( dst_tz, platform::MKLDNNGetDataType(), chosen_memory_format); @@ -491,8 +546,22 @@ class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { namespace ops = paddle::operators; -REGISTER_OP_KERNEL(conv2d, MKLDNN, ::paddle::platform::CPUPlace, - ops::ConvMKLDNNOpKernel); - -REGISTER_OP_KERNEL(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, - ops::ConvMKLDNNGradOpKernel); +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, + ::paddle::platform::CPUPlace, FP32, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN, + ::paddle::platform::CPUPlace, FP32, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNGradOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN, + ::paddle::platform::CPUPlace, FP32, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d_grad, MKLDNN, + ::paddle::platform::CPUPlace, FP32, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNGradOpKernel); diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 342525be49e28f1785e25d4daad38c3c81b4774f..d7b876628855b8b76b340cd1e6115896ead4aa6c 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -74,6 +74,8 @@ void ConvOp::InferShape(framework::InferShapeContext* ctx) const 
{ framework::OpKernelType ConvOp::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { + int customized_type_value = + framework::OpKernelType::kDefaultCustomizedTypeValue; framework::LibraryType library{framework::LibraryType::kPlain}; // TODO(pzelazko-intel): enable MKLDNN layout when it's ready std::string data_format = ctx.Attr("data_format"); @@ -89,6 +91,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( platform::CanMKLDNNBeUsed(ctx)) { library = framework::LibraryType::kMKLDNN; layout = framework::DataLayout::kMKLDNN; + customized_type_value = kConvMKLDNNFP32; } #endif @@ -105,7 +108,7 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( } return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, - library); + library, customized_type_value); } void Conv2DOpMaker::Make() { @@ -131,14 +134,14 @@ void Conv2DOpMaker::Make() { "The format of output tensor is X (one-dimensional) of size equal" "to the number of output channels. Only used with MKL-DNN.") .AsDispensable(); - AddOutput("Output", - "(Tensor) The output tensor of convolution operator. " - "The format of output tensor is also NCHW."); AddInput("ResidualData", "(Tensor) Tensor with residual data " "to which convolution output will be added." "Used with fuse_residual_connection fusion.") .AsDispensable(); + AddOutput("Output", + "(Tensor) The output tensor of convolution operator. " + "The format of output tensor is also NCHW."); AddAttr>("strides", "(vector default:{1, 1}), the " "strides(h_stride, w_stride) of " @@ -229,6 +232,10 @@ $$ } void Conv3DOpMaker::Make() { + AddAttr("is_test", + "(bool, default false) Set to true for inference only, false " + "for training. Some layers may run faster when this is true.") + .SetDefault(false); AddInput( "Input", "(Tensor) The input tensor of convolution operator. " @@ -244,6 +251,11 @@ void Conv3DOpMaker::Make() { "is the width of the filter." 
"If the groups attribute is greater than 1, C equals the number of " "input image channels divided by the groups."); + AddInput("ResidualData", + "(Tensor) Tensor with residual data " + "to which convolution output will be added." + "Used with fuse_residual_connection fusion.") + .AsDispensable(); AddOutput("Output", "(Tensor) The output tensor of convolution operator." "The format of output tensor is also NCDHW."); @@ -277,6 +289,13 @@ void Conv3DOpMaker::Make() { AddAttr("use_mkldnn", "(bool, default false) Only used in mkldnn kernel") .SetDefault(false); + AddAttr("fuse_relu", "(bool, default false) Only used in mkldnn kernel") + .SetDefault(false); + AddAttr("fuse_residual_connection", + "(bool, default false) Only used in mkldnn kernel. Used " + "whenever convolution output is as an input to residual " + "connection.") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " @@ -342,6 +361,8 @@ void ConvOpGrad::InferShape(framework::InferShapeContext* ctx) const { framework::OpKernelType ConvOpGrad::GetExpectedKernelType( const framework::ExecutionContext& ctx) const { + int customized_type_value = + framework::OpKernelType::kDefaultCustomizedTypeValue; framework::LibraryType library_{framework::LibraryType::kPlain}; // TODO(pzelazko-intel): enable MKLDNN layout when it's ready std::string data_format = ctx.Attr("data_format"); @@ -357,12 +378,13 @@ framework::OpKernelType ConvOpGrad::GetExpectedKernelType( platform::CanMKLDNNBeUsed(ctx)) { library_ = framework::LibraryType::kMKLDNN; layout_ = framework::DataLayout::kMKLDNN; + customized_type_value = kConvMKLDNNFP32; } #endif return framework::OpKernelType( framework::ToDataType(ctx.Input("Input")->type()), ctx.GetPlace(), - layout_, library_); + layout_, library_, customized_type_value); } } // namespace operators diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index e69814001e4da5d10e51ee57c1dbe291338b8b49..249f308c13ff5636fbaa6747b28cab7886b7e736 
100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -27,6 +27,8 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; +constexpr int kConvMKLDNNFP32 = 1; +constexpr int kConvMKLDNNINT8 = 2; // Base convolution operator definations for other conv // like operators to reuse the implementation. diff --git a/paddle/fluid/operators/cudnn_lstm_op.cu.cc b/paddle/fluid/operators/cudnn_lstm_op.cu.cc index e01070c7b8ed4374cf8a61cfde4de940b4ea38b2..dd64cc327fc383937bc9a9d6e7daa0cec488e4cc 100644 --- a/paddle/fluid/operators/cudnn_lstm_op.cu.cc +++ b/paddle/fluid/operators/cudnn_lstm_op.cu.cc @@ -177,11 +177,19 @@ struct CudnnRNNCache { seed_)); CUDNN_ENFORCE(platform::dynload::cudnnCreateRNNDescriptor(&rnn_desc_)); + +#if CUDNN_VERSION >= 6000 CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor_v6( handle, rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, is_bidirec_ ? CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, CUDNN_RNN_ALGO_STANDARD, CUDNN_DATA_FLOAT)); +#else + CUDNN_ENFORCE(platform::dynload::cudnnSetRNNDescriptor( + rnn_desc_, hidden_size_, num_layers_, dropout_desc_, CUDNN_LINEAR_INPUT, + is_bidirec_ ? 
CUDNN_BIDIRECTIONAL : CUDNN_UNIDIRECTIONAL, CUDNN_LSTM, + CUDNN_DATA_FLOAT)); +#endif CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&w_desc_)); CUDNN_ENFORCE(platform::dynload::cudnnCreateFilterDescriptor(&dw_desc_)); diff --git a/paddle/fluid/operators/distributed/CMakeLists.txt b/paddle/fluid/operators/distributed/CMakeLists.txt index 36979de68f3abfdedfcc4a49cc312c1f849f5676..101dbe9c89616b7025337261469e2b1aa3e8bc76 100644 --- a/paddle/fluid/operators/distributed/CMakeLists.txt +++ b/paddle/fluid/operators/distributed/CMakeLists.txt @@ -13,16 +13,26 @@ set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor if(WITH_GRPC) grpc_library(sendrecvop_grpc SRCS grpc_bytebuffer_stream.cc sendrecvop_utils.cc grpc_client.cc - request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc + request_handler_impl.cc rpc_client.cc rpc_server.cc grpc_server.cc variable_response.cc grpc_variable_response.cc grpc_serde.cc collective_client.cc collective_server.cc PROTO send_recv.proto - DEPS lod_tensor selected_rows memory) + DEPS lod_tensor selected_rows_functor memory) set_source_files_properties(grpc_serde_test.cc rpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS}) + cc_test(grpc_serde_test SRCS grpc_serde_test.cc DEPS grpc++_unsecure grpc_unsecure gpr cares zlib protobuf sendrecvop_grpc scope profiler math_function SERIAL) + cc_test(rpc_server_test SRCS rpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_sparse_table_op SERIAL) + cc_test(varhandle_test SRCS varhandle_test.cc DEPS profiler) + + if(WITH_GPU) + cc_test(collective_server_test SRCS collective_server_test.cc + DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor + selected_rows_functor scope math_function SERIAL) + endif() + cc_library(parameter_prefetch SRCS parameter_prefetch.cc DEPS 
sendrecvop_grpc memory) else() set_source_files_properties(brpc_server.cc brpc_client.cc rpc_server_test.cc brpc_serde_test.cc diff --git a/paddle/fluid/operators/distributed/collective_client.cc b/paddle/fluid/operators/distributed/collective_client.cc new file mode 100644 index 0000000000000000000000000000000000000000..6d3f53431113621fc859eda8e7448383772d20a3 --- /dev/null +++ b/paddle/fluid/operators/distributed/collective_client.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include // NOLINT +#include +#include "gflags/gflags.h" + +#include "paddle/fluid/operators/distributed/collective_client.h" + +DECLARE_int32(rpc_deadline); + +namespace paddle { +namespace operators { +namespace distributed { +std::once_flag CollectiveClient::init_flag_; +std::unique_ptr CollectiveClient::client_(nullptr); + +bool CollectiveClient::Gather(const std::vector& remote_vars, + std::vector* dst, + const platform::DeviceContext& ctx, + framework::Scope* scope, int64_t time_out) { + for (auto r : remote_vars) { + VLOG(50) << "begin gather from ep:" << r.String(); + scope->Var(r.var_name_)->GetMutable(); + VarHandlePtr ptr = rpc_client_->AsyncGetMonomerVariable( + r.ep_, ctx, *scope, r.var_name_, time_out); + } + + rpc_client_->Wait(); + + for (auto r : remote_vars) { + auto select_rows = + scope->FindVar(r.var_name_)->GetMutable(); + dst->push_back(select_rows); + + VLOG(4) << "gather from ep:" << r.String() + << ", select_rows:" << GetSelectedRowsInfo(*select_rows); + + rpc_client_->AsyncGetMonomerBarrier(r.ep_, r.var_name_); + } + + rpc_client_->Wait(); + return true; +} + +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_client.h b/paddle/fluid/operators/distributed/collective_client.h new file mode 100644 index 0000000000000000000000000000000000000000..53b03c531a2b8859e6d7c904e9ab4d1b7a5c8b9b --- /dev/null +++ b/paddle/fluid/operators/distributed/collective_client.h @@ -0,0 +1,93 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include // NOLINT +#include +#include +#include "gflags/gflags.h" + +#include "paddle/fluid/framework/data_type.h" +#include "paddle/fluid/framework/lod_tensor.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/request_handler.h" + +DECLARE_int32(rpc_deadline); + +namespace paddle { +namespace operators { +namespace distributed { + +inline std::string GetSelectedRowsInfo(const framework::SelectedRows& slr) { + std::stringstream ss; + ss << ", height:" << slr.height() << ", rows:["; + for (unsigned int i = 0; i < slr.rows().size(); i++) { + if (i != slr.rows().size() - 1) { + ss << slr.rows()[i] << ","; + } else { + ss << slr.rows()[i]; + } + } + ss << "], dims:" << slr.value().dims(); + return ss.str(); +} + +struct RemoteVar { + std::string ep_; + std::string var_name_; + int trainer_id_{0}; + + std::string String() { + std::stringstream ss; + ss << "ep:" << ep_ << ", var_name:" << var_name_ + << ", trainer_id:" << trainer_id_; + + return ss.str(); + } +}; + +class CollectiveClient { + public: + CollectiveClient() { + rpc_client_.reset(new RPCCLIENT_T()); + rpc_client_->InitImpl(); + } + virtual ~CollectiveClient() {} + + // note this function will retain the rank order. 
+ bool Gather(const std::vector& remote_vars, + std::vector* dst, + const platform::DeviceContext& ctx, framework::Scope* scope, + int64_t time_out = FLAGS_rpc_deadline); + + static CollectiveClient* GetInstance() { + std::call_once(init_flag_, [&]() { + if (client_.get() == nullptr) { + client_.reset(new CollectiveClient()); + } + }); + return client_.get(); + } + + private: + std::unique_ptr rpc_client_; + + static std::once_flag init_flag_; + static std::unique_ptr client_; +}; +} // namespace distributed +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server.cc b/paddle/fluid/operators/distributed/collective_server.cc new file mode 100644 index 0000000000000000000000000000000000000000..c95652400c27acd406ca3f70a0dfa8d329e94358 --- /dev/null +++ b/paddle/fluid/operators/distributed/collective_server.cc @@ -0,0 +1,74 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include // for removing the port file +#include +#include +#include +#include // NOLINT +#include + +#include "paddle/fluid/operators/distributed/collective_server.h" + +DEFINE_int32(collective_get_thread_num, 5, "number of threads for rpc get"); + +namespace paddle { +namespace operators { +namespace distributed { + +std::once_flag CollectiveServer::init_flag_; +std::shared_ptr CollectiveServer::collective_server_(nullptr); + +CollectiveServer::CollectiveServer(const std::string& end_point, int fan_in) { + VLOG(1) << "Create colllective server:" << end_point << ", fan_in:" << fan_in; + rpc_server_.reset(new RPCSERVER_T(end_point, fan_in)); +} + +void CollectiveServer::Stop() { + rpc_server_->ShutDown(); + server_thread_->join(); + loop_thread_->join(); +} + +void CollectiveServer::StartServer() { + get_monomer_handler_.reset(new GetMonomerHandler()); + get_monomer_handler_->SetRPCServer(rpc_server_.get()); + + get_barrier_handler_.reset(new GetMonomerBarrierHandler()); + get_barrier_handler_->SetRPCServer(rpc_server_.get()); + + rpc_server_->RegisterRPC(distributed::kRequestGetMonomerVariable, + get_monomer_handler_.get(), + FLAGS_collective_get_thread_num); + rpc_server_->RegisterRPC(distributed::kRequestGetMonomerBarrier, + get_barrier_handler_.get(), 1); + + server_thread_.reset(new std::thread([&]() { rpc_server_->StartServer(); })); + rpc_server_->WaitServerReady(); + + loop_thread_.reset(new std::thread([&]() { + while (true) { + if (rpc_server_->IsExit()) { + LOG(WARNING) << "get exit!rpc_processor break!"; + break; + } + sleep(1); + } + VLOG(1) << "CollectiveServer loop_thread end"; + })); +} + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server.h b/paddle/fluid/operators/distributed/collective_server.h new file mode 100644 index 0000000000000000000000000000000000000000..a23dc18b4de86421a0995b9951e0ae6f4dc76150 --- /dev/null +++ 
b/paddle/fluid/operators/distributed/collective_server.h @@ -0,0 +1,110 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include +#include +#include +#include // NOLINT +#include +#include + +#include "gflags/gflags.h" + +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" +#include "paddle/fluid/operators/distributed/rpc_server.h" + +namespace paddle { +namespace operators { +namespace distributed { + +class CollectiveServer; + +class GetMonomerHandler final : public RequestHandler { + public: + GetMonomerHandler() : RequestHandler(true) {} + virtual ~GetMonomerHandler() {} + bool Handle(const std::string& var_name, framework::Scope* scope, + framework::Variable* var, framework::Variable** outvar, + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override { + VLOG(50) << "GetMonomerHandler recv " << var_name; + + *outvar = scope->FindVar(var_name); + PADDLE_ENFORCE(outvar != nullptr, "%s not found", var_name); + + return true; + } +}; + +class GetMonomerBarrierHandler final : public RequestHandler { + public: + GetMonomerBarrierHandler() : RequestHandler(true) {} + virtual ~GetMonomerBarrierHandler() {} + bool Handle(const std::string& var_name, framework::Scope* scope, + framework::Variable* var, framework::Variable** 
outvar, + const int trainer_id, const std::string& out_var_name = "", + const std::string& table_name = "") override { + VLOG(50) << "GetMonomerHandler recv " << var_name; + + rpc_server_->IncreaseVarBarrier(var_name); + + return true; + } +}; + +class CollectiveServer final { + public: + explicit CollectiveServer(const std::string& end_point, int fan_in); + + virtual ~CollectiveServer() {} + + void StartServer(); + + static CollectiveServer* GetInstance(const std::string& end_point, + int fan_in) { + std::call_once(init_flag_, [&]() { + if (collective_server_.get() == nullptr) { + collective_server_.reset(new CollectiveServer(end_point, fan_in)); + collective_server_->StartServer(); + } + }); + + return collective_server_.get(); + } + + std::shared_ptr GetRPCServer() { return rpc_server_; } + + void Stop(); + + private: + std::unique_ptr get_monomer_handler_; + std::unique_ptr get_barrier_handler_; + + std::shared_ptr rpc_server_; + std::shared_ptr server_thread_; + std::shared_ptr loop_thread_; + + bool ready_{false}; + + static std::once_flag init_flag_; + static std::shared_ptr collective_server_; +}; + +}; // namespace distributed +}; // namespace operators +}; // namespace paddle diff --git a/paddle/fluid/operators/distributed/collective_server_test.cc b/paddle/fluid/operators/distributed/collective_server_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..0a9c69e393257068371e88253b82a500f58ed837 --- /dev/null +++ b/paddle/fluid/operators/distributed/collective_server_test.cc @@ -0,0 +1,115 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include +#include // NOLINT + +#include "gtest/gtest.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +#include "paddle/fluid/operators/detail/macros.h" +#include "paddle/fluid/operators/distributed/collective_client.h" +#include "paddle/fluid/operators/distributed/collective_server.h" +#include "paddle/fluid/operators/distributed/request_handler_impl.h" +#include "paddle/fluid/operators/math/math_function.h" + +namespace framework = paddle::framework; +namespace platform = paddle::platform; +namespace distributed = paddle::operators::distributed; + +std::unique_ptr StartServer( + const std::string& ep, int fan_in, framework::Scope* scope, + platform::DeviceContext* dev_ctx) { + distributed::CollectiveServer* server = + distributed::CollectiveServer::GetInstance(ep, fan_in); + + auto rpc_server = server->GetRPCServer(); + rpc_server->RegisterVar("var1", distributed::kRequestGetMonomerVariable, + scope, dev_ctx); + + std::cout << "StartServer return" << std::endl; + return std::unique_ptr(server); +} + +std::unique_ptr GenerateVars(platform::Place place) { + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + framework::Scope* scope = new framework::Scope(); + framework::Variable* var = scope->Var("var1"); + auto* slr = var->GetMutable(); + slr->set_height(1000); + + auto* tensor = slr->mutable_value(); + auto* rows = slr->mutable_rows(); + + tensor->Resize(framework::make_ddim({3, 5})); + 
tensor->mutable_data(place); + + paddle::operators::math::set_constant(ctx, tensor, 32.7); + for (int i = 0; i < 3; ++i) rows->push_back(i); + + std::cout << "src:" << distributed::GetSelectedRowsInfo(*slr); + + return std::unique_ptr(scope); +} + +void Gather(const std::vector& vars, + platform::DeviceContext* dev_ctx) { + distributed::CollectiveClient* client = + distributed::CollectiveClient::GetInstance(); + + framework::Scope* scope = new framework::Scope(); + framework::Variable* var = scope->Var("var1"); + var->GetMutable(); + + std::vector dst; + client->Gather(vars, &dst, *dev_ctx, scope); + std::cout << "dst:" << distributed::GetSelectedRowsInfo(*dst[0]); +} + +TEST(PREFETCH, GPU) { + platform::CUDAPlace place; + platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); + auto& ctx = *pool.Get(place); + + std::string ep = "127.0.0.1:7164"; + auto scope = GenerateVars(place); + + auto* v1 = scope->FindVar("var1"); + std::cout << "var1:" << v1 << std::endl; + + auto server = StartServer(ep, 2, scope.get(), &ctx); + auto rpc_server = server->GetRPCServer(); + + distributed::RemoteVar var; + var.ep_ = ep; + var.var_name_ = "var1"; + var.trainer_id_ = 0; + + std::vector vars{var}; + Gather(vars, &ctx); + Gather(vars, &ctx); + + std::cout << "begin WaitVarBarrier" << std::endl; + rpc_server->WaitVarBarrier("var1"); + rpc_server->ClearRegisteredVars(); + server->Stop(); + + scope.release(); + server.release(); +} diff --git a/paddle/fluid/operators/distributed/grpc_client.cc b/paddle/fluid/operators/distributed/grpc_client.cc index d7f3ea86aff9e7df3cd9ff3dca573a1ec6ccc27a..857214aa211aee0251571e46049c66c084b470f1 100644 --- a/paddle/fluid/operators/distributed/grpc_client.cc +++ b/paddle/fluid/operators/distributed/grpc_client.cc @@ -28,11 +28,11 @@ namespace paddle { namespace operators { namespace distributed { -void GRPCClient::InitImpl() { InitEventLoop(); } - -void GRPCClient::InitEventLoop() { +void GRPCClient::InitImpl() { // start the 
client process thread // TODO(wuyi): can make this in a threadpool + PADDLE_ENFORCE(client_thread_ == nullptr, + "please not re init proceed thread"); client_thread_.reset(new std::thread(std::bind(&GRPCClient::Proceed, this))); } @@ -106,6 +106,7 @@ VarHandlePtr GRPCClient::AsyncSendVar(const std::string& ep, void ProcGetResponse(const VarHandle& var_h, const ::grpc::ByteBuffer& ret_msg) { + VLOG(100) << "ProcGetResponse"; framework::Variable* outvar = nullptr; // get response's trainer_id is not used int trainer_id; @@ -126,6 +127,24 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, const framework::Scope& scope, const std::string& var_name, int64_t time_out) { + return _AsyncGetVar(ep, ctx, scope, var_name, + "/sendrecv.SendRecvService/GetVariable", time_out); +} + +VarHandlePtr GRPCClient::AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out) { + return _AsyncGetVar(ep, ctx, scope, var_name, + "/sendrecv.SendRecvService/GetMonomerVariable", time_out); +} + +VarHandlePtr GRPCClient::_AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, + const std::string& rpc_path, + int64_t time_out) { const platform::DeviceContext* p_ctx = &ctx; const std::string ep_val = ep; const std::string var_name_val = var_name; @@ -136,7 +155,7 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, VarHandlePtr h(new VarHandle(ep, method, var_name_val, p_ctx, p_scope)); s->Prepare(h, time_out); - framework::AsyncIO([var_name_val, s, method, p_ctx, h, this] { + framework::AsyncIO([var_name_val, s, method, p_ctx, h, rpc_path, this] { // prepare input sendrecv::VariableMessage req; req.set_varname(var_name_val); @@ -151,8 +170,8 @@ VarHandlePtr GRPCClient::AsyncGetVar(const std::string& ep, platform::RecordRPCEvent record_event(method, p_ctx); - auto call = 
s->stub_g_.PrepareUnaryCall( - s->context_.get(), "/sendrecv.SendRecvService/GetVariable", buf, &cq_); + auto call = + s->stub_g_.PrepareUnaryCall(s->context_.get(), rpc_path, buf, &cq_); call->StartCall(); call->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); @@ -268,6 +287,34 @@ VarHandlePtr GRPCClient::AsyncSendFetchBarrier(const std::string& ep, return h; } +VarHandlePtr GRPCClient::AsyncGetMonomerBarrier(const std::string& ep, + const std::string& var_name, + int64_t time_out) { + const auto ch = GetChannel(ep); + BatchBarrierProcessor* s = new BatchBarrierProcessor(ch); + const std::string method = "SendMonomerFetchBarrierRPC"; + VarHandlePtr h( + new VarHandle(ep, method, FETCH_BARRIER_MESSAGE, nullptr, nullptr)); + s->Prepare(h, time_out); + + VLOG(30) << s->GetVarHandlePtr()->String() << " begin"; + + sendrecv::VariableMessage req; + req.set_varname(var_name); + + platform::RecordRPCEvent record_event(method, nullptr); + + auto rpc = s->stub_->AsyncGetMonomerBarrier(s->context_.get(), req, &cq_); + rpc->Finish(&s->reply_, &s->status_, reinterpret_cast(s)); + req_count_++; + + if (UNLIKELY(platform::IsProfileEnabled())) { + h->Wait(); + } + + return h; +} + VarHandlePtr GRPCClient::AsyncSendComplete(const std::string& ep, int64_t time_out) { const auto ch = GetChannel(ep); diff --git a/paddle/fluid/operators/distributed/grpc_client.h b/paddle/fluid/operators/distributed/grpc_client.h index a31a465645ee4256a76573576ea7fa5af7a5a101..01bf46cc313b4707c7af7a9605926a8b298d679d 100644 --- a/paddle/fluid/operators/distributed/grpc_client.h +++ b/paddle/fluid/operators/distributed/grpc_client.h @@ -189,6 +189,11 @@ class GRPCClient : public RPCClient { const std::string& var_name, int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr 
AsyncPrefetchVar(const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, @@ -200,8 +205,12 @@ class GRPCClient : public RPCClient { VarHandlePtr AsyncSendBatchBarrier( const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; - VarHandlePtr AsyncSendFetchBarrier( - const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) override; + VarHandlePtr AsyncSendFetchBarrier(const std::string& ep, + int64_t time_out) override; + + VarHandlePtr AsyncGetMonomerBarrier( + const std::string& ep, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) override; VarHandlePtr AsyncCheckpointNotify( const std::string& ep, const std::string& dir, @@ -214,21 +223,22 @@ class GRPCClient : public RPCClient { void SendComplete() override; - protected: void InitImpl() override; private: - // InitEventLoop should only be called by Init() - void InitEventLoop(); - void Proceed(); std::shared_ptr GetChannel(const std::string& ep); + VarHandlePtr _AsyncGetVar(const std::string& ep, + const platform::DeviceContext& ctx, + const framework::Scope& scope, + const std::string& var_name, const std::string& rpc, + int64_t time_out); private: grpc::CompletionQueue cq_; std::unordered_map> channels_; - std::unique_ptr client_thread_; + std::unique_ptr client_thread_{nullptr}; // mutex for Wait client sync std::mutex sync_mutex_; diff --git a/paddle/fluid/operators/distributed/grpc_server.cc b/paddle/fluid/operators/distributed/grpc_server.cc index d9200c98b23601f8ffaa8eb7a7092a9cf881ca24..c3974138f4d4665c46bdfccaef09c0bd84b9d028 100644 --- a/paddle/fluid/operators/distributed/grpc_server.cc +++ b/paddle/fluid/operators/distributed/grpc_server.cc @@ -158,6 +158,98 @@ class RequestGet final : public RequestBase { ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; }; +class RequestGetMonomerVariable final : public RequestBase { + public: + explicit RequestGetMonomerVariable(GrpcService::AsyncService* service, + 
::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, + int req_id, RPCServer* rpc_server) + : RequestBase(service, cq, request_handler, req_id), + responder_(&ctx_), + rpc_server_(rpc_server) { + auto method_id = + static_cast(distributed::GrpcMethod::kGetMonomerVariable); + service_->RequestAsyncUnary( + method_id, &ctx_, &request_, &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestGetMonomerVariable() {} + + std::string GetReqName() override { return request_.varname(); } + + void Process() override { + // proc request. + std::string varname = request_.varname(); + + rpc_server_->WaitVarCond(varname); + MonomerHandle h = rpc_server_->GetMonomer(varname); + + auto scope = h.scope_; + auto invar = scope->FindVar(varname); + framework::Variable* outvar = nullptr; + + request_handler_->Handle(varname, scope, invar, &outvar, + request_.trainer_id()); + + if (outvar) { + SerializeToByteBuffer(varname, outvar, *h.dev_ctx_, &reply_); + } + Finish(reply_, &responder_); + } + + protected: + sendrecv::VariableMessage request_; + ::grpc::ByteBuffer reply_; + ServerAsyncResponseWriter<::grpc::ByteBuffer> responder_; + RPCServer* rpc_server_{nullptr}; +}; + +class RequestGetMonomerBarrier final : public RequestBase { + public: + explicit RequestGetMonomerBarrier(GrpcService::AsyncService* service, + ::grpc::ServerCompletionQueue* cq, + RequestHandler* request_handler, int req_id, + RPCServer* rpc_server) + : RequestBase(service, cq, request_handler, req_id), + responder_(&ctx_), + rpc_server_(rpc_server) { + auto method_id = + static_cast(distributed::GrpcMethod::kGetMonomerBarrier); + service_->RequestAsyncUnary( + method_id, &ctx_, &request_, &responder_, cq_, cq_, + reinterpret_cast(static_cast(req_id))); + } + + virtual ~RequestGetMonomerBarrier() {} + + std::string GetReqName() override { return request_.varname(); } + + void Process() override { + // proc request. 
+ std::string varname = request_.varname(); + VLOG(4) << "RequestGetMonomerBarrier " << varname; + + rpc_server_->WaitVarCond(varname); + MonomerHandle h = rpc_server_->GetMonomer(varname); + + framework::Scope* scope = nullptr; + framework::Variable* invar = nullptr; + framework::Variable* outvar = nullptr; + + request_handler_->Handle(varname, scope, invar, &outvar, + request_.trainer_id()); + + Finish(reply_, &responder_); + } + + protected: + sendrecv::VariableMessage request_; + sendrecv::VoidMessage reply_; + ServerAsyncResponseWriter responder_; + RPCServer* rpc_server_{nullptr}; +}; + class RequestPrefetch final : public RequestBase { public: explicit RequestPrefetch(GrpcService::AsyncService* service, @@ -249,7 +341,7 @@ class RequestCheckpointNotify final : public RequestBase { }; void AsyncGRPCServer::WaitServerReady() { - VLOG(4) << "AsyncGRPCServer is wait server ready"; + VLOG(4) << "AsyncGRPCServer is waiting server ready"; std::unique_lock lock(this->mutex_ready_); condition_ready_.wait(lock, [=] { return this->ready_ == 1; }); VLOG(4) << "AsyncGRPCServer WaitSeverReady"; @@ -368,6 +460,12 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, b = new RequestSend(&service_, cq.get(), handler, req_id); } else if (rpc_name == kRequestGet) { b = new RequestGet(&service_, cq.get(), handler, req_id); + } else if (rpc_name == kRequestGetMonomerVariable) { + b = new RequestGetMonomerVariable(&service_, cq.get(), handler, req_id, + this); + } else if (rpc_name == kRequestGetMonomerBarrier) { + b = new RequestGetMonomerBarrier(&service_, cq.get(), handler, req_id, + this); } else if (rpc_name == kRequestPrefetch) { b = new RequestPrefetch(&service_, cq.get(), handler, req_id); } else if (rpc_name == kRequestCheckpoint) { @@ -378,7 +476,7 @@ void AsyncGRPCServer::TryToRegisterNewOne(const std::string& rpc_name, reqs[req_id] = b; - VLOG(4) << "Create RequestSend status:" << b->Status(); + VLOG(4) << "TryToRegisterNewOne status:" << 
b->Status(); } void AsyncGRPCServer::HandleRequest( diff --git a/paddle/fluid/operators/distributed/grpc_service.h b/paddle/fluid/operators/distributed/grpc_service.h index 9ae9a31a003cbb1f808fd1127a5dd78511aa3e99..537429b5fe989269d437b6dfe558c0a7dcfc2dcc 100644 --- a/paddle/fluid/operators/distributed/grpc_service.h +++ b/paddle/fluid/operators/distributed/grpc_service.h @@ -81,10 +81,12 @@ enum class GrpcMethod { kGetVariable, kPrefetchVariable, kCheckpointNotify, + kGetMonomerVariable, + kGetMonomerBarrier, }; static const int kGrpcNumMethods = - static_cast(GrpcMethod::kCheckpointNotify) + 1; + static_cast(GrpcMethod::kGetMonomerBarrier) + 1; inline const char* GrpcMethodName(GrpcMethod id) { switch (id) { @@ -92,6 +94,10 @@ inline const char* GrpcMethodName(GrpcMethod id) { return "/sendrecv.SendRecvService/SendVariable"; case GrpcMethod::kGetVariable: return "/sendrecv.SendRecvService/GetVariable"; + case GrpcMethod::kGetMonomerVariable: + return "/sendrecv.SendRecvService/GetMonomerVariable"; + case GrpcMethod::kGetMonomerBarrier: + return "/sendrecv.SendRecvService/GetMonomerBarrier"; case GrpcMethod::kPrefetchVariable: return "/sendrecv.SendRecvService/PrefetchVariable"; case GrpcMethod::kCheckpointNotify: diff --git a/paddle/fluid/operators/distributed/request_handler.h b/paddle/fluid/operators/distributed/request_handler.h index 5272afd42851160ca5352ef474d940a5d2dd2456..62b24f150b41efead24c8bdbe08c9b44e160445a 100644 --- a/paddle/fluid/operators/distributed/request_handler.h +++ b/paddle/fluid/operators/distributed/request_handler.h @@ -37,6 +37,8 @@ namespace distributed { constexpr char kRequestSend[] = "RequestSend"; constexpr char kRequestGet[] = "RequestGet"; +constexpr char kRequestGetMonomerVariable[] = "RequestGetMonomerVariable"; +constexpr char kRequestGetMonomerBarrier[] = "RequestGetMonomerBarrier"; constexpr char kRequestPrefetch[] = "RequestPrefetch"; constexpr char kRequestCheckpoint[] = "RequestCheckpoint"; constexpr char 
kRequestPassBarrier[] = "RequestPassBarrier"; diff --git a/paddle/fluid/operators/distributed/rpc_client.h b/paddle/fluid/operators/distributed/rpc_client.h index 4cd3abb5a61068bc4f9f5b38cafc2daa8406d448..b668d869787a47ebd36f570061421ddbeae5a09a 100644 --- a/paddle/fluid/operators/distributed/rpc_client.h +++ b/paddle/fluid/operators/distributed/rpc_client.h @@ -45,6 +45,11 @@ class RPCClient { const std::string& var_name, int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncGetMonomerVariable( + const std::string& ep, const platform::DeviceContext& ctx, + const framework::Scope& scope, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncPrefetchVar( const std::string& ep, const platform::DeviceContext& ctx, const framework::Scope& scope, const std::string& in_var_name, @@ -57,6 +62,10 @@ class RPCClient { virtual VarHandlePtr AsyncSendFetchBarrier( const std::string& ep, int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncGetMonomerBarrier( + const std::string& ep, const std::string& var_name, + int64_t time_out = FLAGS_rpc_deadline) = 0; + virtual VarHandlePtr AsyncCheckpointNotify( const std::string& ep, const std::string& dir, int64_t time_out = FLAGS_rpc_deadline) = 0; @@ -87,8 +96,9 @@ class RPCClient { } } - protected: virtual void InitImpl() {} + + protected: // each trainer have exact one trainer id, it should be static static int trainer_id_; diff --git a/paddle/fluid/operators/distributed/rpc_server.cc b/paddle/fluid/operators/distributed/rpc_server.cc index 3e30ed4ac86bd2cb3f7c4301163e54a947c3d5b4..122619d41b25da488742b4a7192b6a18b8bf9283 100644 --- a/paddle/fluid/operators/distributed/rpc_server.cc +++ b/paddle/fluid/operators/distributed/rpc_server.cc @@ -132,6 +132,96 @@ void RPCServer::WaitCond(const std::string& rpc_name) { lock, [=] { return (cur_cond_.load() == cond || exit_flag_.load()); }); } +void RPCServer::RegisterVar(const std::string& var_name, + 
const std::string& rpc_name, + framework::Scope* scope, + platform::DeviceContext* dev_ctx) { + MonomerHandle h; + h.var_name_ = var_name; + h.rpc_name_ = rpc_name; + h.scope_ = scope; + h.dev_ctx_ = dev_ctx; + + { + std::unique_lock lock(mutex_); + if (var_map_.find(var_name) != var_map_.end()) { + PADDLE_ENFORCE(false, "%s alreay in var_map", var_name); + } + var_map_[var_name] = h; + } + + rpc_cond_.notify_all(); + VLOG(4) << "RegisterVar context:" << h.String(); +} + +void RPCServer::IncreaseVarBarrier(const std::string& var_name) { + int b = 0; + MonomerHandle h; + { + std::unique_lock lock(mutex_); + b = ++var_map_[var_name].barrier_; + h = var_map_[var_name]; + } + + if (b >= client_num_) { + barrier_cond_.notify_all(); + } + + VLOG(4) << "IncreaseVarBarrier context:" << h.String(); +} + +void RPCServer::WaitVarBarrier(const std::string& var_name) { + VLOG(4) << "WaitBarrier var_name:" << var_name; + + std::unique_lock lock(mutex_); + barrier_cond_.wait(lock, [&]() { + return ((var_map_[var_name].barrier_ >= client_num_ && client_num_ != 0) || + exit_flag_.load()); + }); + + VLOG(4) << "WaitBarrier context: " << var_map_[var_name].String(); +} + +void RPCServer::SetVarCond(const std::string& var_name) { + VLOG(4) << "SetVarCond var_name:" << var_name; + { + std::unique_lock lock(mutex_); + if (var_map_.find(var_name) != var_map_.end()) { + rpc_cond_.notify_all(); + } + } +} + +void RPCServer::WaitVarCond(const std::string& var_name) { + VLOG(4) << "WaitVarCond var_name:" << var_name; + + std::unique_lock lock(mutex_); + rpc_cond_.wait(lock, [=] { + return (var_map_.find(var_name) != var_map_.end() || exit_flag_.load()); + }); + + VLOG(4) << "WaitVarCond var_name:" << var_name << " end"; +} + +MonomerHandle RPCServer::GetMonomer(const std::string& var_name) { + MonomerHandle h; + { + std::unique_lock lock(mutex_); + h = var_map_[var_name]; + } + + return h; +} + +void RPCServer::ClearRegisteredVars() { + std::unique_lock lock(mutex_); + var_map_.clear(); +} + 
+void RPCServer::ClearVar(const std::string& var_name) { + std::unique_lock lock(mutex_); + var_map_.erase(var_name); +} } // namespace distributed } // namespace operators } // namespace paddle diff --git a/paddle/fluid/operators/distributed/rpc_server.h b/paddle/fluid/operators/distributed/rpc_server.h index c78c5007a7f262f15305b6c284e8c4fbddef42a0..45d1d3479ce731894c26bbff40f456bbfdc13d44 100644 --- a/paddle/fluid/operators/distributed/rpc_server.h +++ b/paddle/fluid/operators/distributed/rpc_server.h @@ -21,12 +21,30 @@ #include #include +#include "paddle/fluid/framework/scope.h" #include "paddle/fluid/operators/distributed/request_handler.h" +#include "paddle/fluid/platform/device_context.h" namespace paddle { namespace operators { namespace distributed { +struct MonomerHandle { + std::string var_name_; + std::string rpc_name_; + framework::Scope* scope_{nullptr}; + platform::DeviceContext* dev_ctx_{nullptr}; + int64_t barrier_{0}; + + std::string String() { + std::stringstream ss; + ss << "var_name:" << var_name_ << ", rpc_name:" << rpc_name_ + << ", scope:" << scope_ << ", dev_ctx:" << dev_ctx_ + << ", barrier_:" << barrier_; + return ss.str(); + } +}; + class RPCServer { public: explicit RPCServer(const std::string& address, int client_num) @@ -67,6 +85,16 @@ class RPCServer { void WaitCond(const std::string& rpc_name); void IncreaseBatchBarrier(const std::string rpc_name); + void RegisterVar(const std::string& var_name, const std::string& rpc_name, + framework::Scope* scope, platform::DeviceContext* dev_ctx); + void IncreaseVarBarrier(const std::string& var_name); + void WaitVarBarrier(const std::string& var_name); + void SetVarCond(const std::string& var_name); + void WaitVarCond(const std::string& var_name); + void ClearRegisteredVars(); + void ClearVar(const std::string& var_name); + MonomerHandle GetMonomer(const std::string& var_name); + void Complete(); void ResetBarrierCounter(); @@ -95,6 +123,9 @@ class RPCServer { std::unordered_map rpc_call_map_; 
std::unordered_map rpc_thread_num_; friend class RequestHandler; + + // TODO(gongwb): use more cond to notify or wait; + std::unordered_map var_map_; }; }; // namespace distributed diff --git a/paddle/fluid/operators/distributed/send_recv.proto.in b/paddle/fluid/operators/distributed/send_recv.proto.in index 7b7d069f17fd0f9e6a776fa4d1a19cf01914cfeb..2637619f304d246fa535bbfc7be3474209b63b0f 100644 --- a/paddle/fluid/operators/distributed/send_recv.proto.in +++ b/paddle/fluid/operators/distributed/send_recv.proto.in @@ -28,6 +28,9 @@ service SendRecvService { rpc PrefetchVariable(VariableMessage) returns (VariableMessage) {} rpc CheckpointNotify(VariableMessage) returns (VoidMessage) {} + + rpc GetMonomerVariable(VariableMessage) returns (VariableMessage) {} + rpc GetMonomerBarrier(VariableMessage) returns (VoidMessage) {} } // VariableMessage is serialized paddle variable message. diff --git a/paddle/fluid/operators/elementwise/elementwise_mul_op.h b/paddle/fluid/operators/elementwise/elementwise_mul_op.h index dc25bc57103286ce183a4649964fd96c62169b7f..a8b8a67a114b956f2d6b1b072ef343a179114b34 100644 --- a/paddle/fluid/operators/elementwise/elementwise_mul_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_mul_op.h @@ -60,15 +60,37 @@ template class ElementwiseMulKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* x = ctx.Input("X"); + auto x_var = ctx.InputVar("X"); + PADDLE_ENFORCE(x_var != nullptr, + "Cannot get input Variable X, variable name = %s", + ctx.op().Input("X")); auto* y = ctx.Input("Y"); - auto* z = ctx.Output("Out"); + + framework::Tensor x, *z; + if (x_var->IsType()) { + PADDLE_ENFORCE(y->dims().size() == 1 && y->dims()[0] == 1, + "For elementwise_op, if X is Sparse, Y must be scalar."); + auto& x_sele = x_var->Get(); + auto out_sele = ctx.Output("Out"); + x = x_sele.value(); + out_sele->set_rows(x_sele.rows()); + out_sele->set_height(x_sele.height()); + 
out_sele->mutable_value()->Resize(x_sele.value().dims()); + out_sele->mutable_value()->mutable_data(ctx.GetPlace(), x.type()); + z = ctx.Output("Out")->mutable_value(); + } else if (x_var->IsType()) { + x = x_var->Get(); + z = ctx.Output("Out"); + } else { + PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", + x_var->Type().name()); + } z->mutable_data(ctx.GetPlace()); - if (x->numel() == y->numel()) { - elementwise_mul(ctx, x, y, z); + if (x.numel() == y->numel()) { + elementwise_mul(ctx, &x, y, z); } else { - default_elementwise_mul(ctx, x, y, z); + default_elementwise_mul(ctx, &x, y, z); } } }; diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index 85a7817be9b3a82d40853b417d78a7fdf67f6c1f..87bf7c6b156f32b8f6a1abc30b0676e1d4711d64 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -40,21 +40,28 @@ class ElementwiseOp : public framework::OperatorWithKernel { PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) of elementwise op should not be null."); - PADDLE_ENFORCE( - ctx->GetInputsVarType("X").front() == - framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("X").front(), ctx->GetInputsVarType("X").front()); PADDLE_ENFORCE( ctx->GetInputsVarType("Y").front() == framework::proto::VarType::LOD_TENSOR, - "The input var's type should be LoDTensor, but the received is %s", - ctx->Inputs("Y").front(), ctx->GetInputsVarType("Y").front()); - - auto x_dim = ctx->GetInputDim("X"); - auto y_dim = ctx->GetInputDim("Y"); - PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), - "Rank of first input must >= rank of second input."); + "The input var's type should be LoDTensor, but the received is %s [%s]", + ctx->GetInputsVarType("Y").front(), ctx->Inputs("Y").front()); + + if (ctx->GetInputsVarType("X").front() == + framework::proto::VarType::LOD_TENSOR) { + 
auto x_dim = ctx->GetInputDim("X"); + auto y_dim = ctx->GetInputDim("Y"); + PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), + "Rank of first input must >= rank of second input."); + } else if (ctx->GetInputsVarType("X").front() == + framework::proto::VarType::SELECTED_ROWS) { + PADDLE_ENFORCE((ctx->GetInputDim("Y").size() == 1u) && + (ctx->GetInputDim("Y")[0] == 1), + "For elementwise_op, if X is Sparse, " + "Y must be scalar."); + } else { + PADDLE_THROW("X's type[%s] is not supported by elementwise_op.", + ctx->GetInputsVarType("X").front()); + } ctx->ShareDim("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out"); diff --git a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc index 6d463538d232e1a38f845e7abc3786568ca3bb21..1eb6523a2dfb358490a07bf1b806d5638442a4d5 100644 --- a/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc +++ b/paddle/fluid/operators/fused/fused_embedding_fc_lstm_op.cc @@ -217,13 +217,13 @@ class FusedEmbeddingFCLSTMKernel : public framework::OpKernel { auto& act_gate_str = ctx.Attr("gate_activation"); \ auto& act_cell_str = ctx.Attr("cell_activation"); \ auto& act_cand_str = ctx.Attr("candidate_activation"); \ - if (platform::jit::MayIUse(platform::jit::avx)) { \ - math::VecActivations act_functor; \ + if (platform::MayIUse(platform::avx)) { \ + math::VecActivations act_functor; \ act_gate = act_functor(act_gate_str); \ act_cell = act_functor(act_cell_str); \ act_cand = act_functor(act_cand_str); \ } else { \ - math::VecActivations act_functor; \ + math::VecActivations act_functor; \ act_gate = act_functor(act_gate_str); \ act_cell = act_functor(act_cell_str); \ act_cand = act_functor(act_cand_str); \ diff --git a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc index 288b56fc2485138b20c5b53af3e950f1c1886ba5..17ed9771d074cf7ae8c6735e4cb859139503a0af 100644 --- 
a/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc +++ b/paddle/fluid/operators/fused/fusion_seqexpand_concat_fc_op.cc @@ -151,11 +151,11 @@ class FusionSeqExpandConcatFCOpKernel : public framework::OpKernel { std::function fc_act; auto& fc_act_str = ctx.Attr("fc_activation"); - if (platform::jit::MayIUse(platform::jit::avx)) { - math::VecActivations act_functor; + if (platform::MayIUse(platform::avx)) { + math::VecActivations act_functor; fc_act = act_functor(fc_act_str); } else { - math::VecActivations act_functor; + math::VecActivations act_functor; fc_act = act_functor(fc_act_str); } diff --git a/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a4ae19d9c1e3bb2af3eb95650fbb5aabb8944a36 --- /dev/null +++ b/paddle/fluid/operators/get_tensor_from_selected_rows_op.cc @@ -0,0 +1,117 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor_util.h" + +namespace paddle { +namespace operators { + +class GetTensorFromSelectedRowsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext *ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "GetTensorFromSelectedRowsOp must has input X."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "GetTensorFromSelectedRowsOp must has output Out."); + PADDLE_ENFORCE( + ctx->GetInputsVarType("X").front() == + framework::proto::VarType::SELECTED_ROWS, + "The input X's type should be SelectedRows, but the received is %s", + ctx->Inputs("X").front(), ctx->GetInputsVarType("X").front()); + PADDLE_ENFORCE( + ctx->GetOutputsVarType("Out").front() == + framework::proto::VarType::LOD_TENSOR, + "The output Out's type should be LoDTensor, but the received is %s", + ctx->Outputs("Out").front(), ctx->GetOutputsVarType("Out").front()); + + ctx->SetOutputDim("Out", ctx->GetInputDim("X")); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext &ctx) const override { + return framework::OpKernelType( + framework::GetDataTypeOfVar(ctx.InputVar("X")), ctx.device_context()); + } +}; + +class GetTensorFromSelectedRowsKernel { + public: + void operator()(const framework::ExecutionContext &ctx) const { + auto *x = ctx.Input("X"); + auto *out = ctx.Output("Out"); + + out->Resize(x->value().dims()); + out->mutable_data(ctx.GetPlace(), x->value().type()); + framework::TensorCopy(x->value(), ctx.GetPlace(), ctx.device_context(), + out); + } +}; + +class GetTensorFromSelectedRowsOpProtoMaker + : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", "The input type is SelectedRows."); + AddOutput("Out", "The output type is LoDTensor."); + AddComment( + R"DOC( +GetTensorFromSelectedRows Operator + 
+GetTensorFromSelectedRows is used to get the tensor from SelectedRows. + +)DOC"); + } +}; + +class GetTensorFromSelectedRowsOpVarTypeInference + : public framework::VarTypeInference { + public: + void operator()(const framework::OpDesc &op_desc, + framework::BlockDesc *block) const final { + auto out_var_name = op_desc.Output("Out").front(); + auto in_var_name = op_desc.Input("X").front(); + + auto out_var = block->FindRecursiveOrCreateVar(out_var_name); + auto in_var = block->FindRecursiveOrCreateVar(in_var_name); + out_var.SetType(framework::proto::VarType::LOD_TENSOR); + out_var.SetDataType(in_var.GetDataType()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(get_tensor_from_selected_rows, + ops::GetTensorFromSelectedRowsOp, + ops::GetTensorFromSelectedRowsOpProtoMaker, + ops::GetTensorFromSelectedRowsOpVarTypeInference); + +REGISTER_OP_CPU_KERNEL_FUNCTOR(get_tensor_from_selected_rows, float, + ops::GetTensorFromSelectedRowsKernel, double, + ops::GetTensorFromSelectedRowsKernel, int, + ops::GetTensorFromSelectedRowsKernel, int64_t, + ops::GetTensorFromSelectedRowsKernel); + +#ifdef PADDLE_WITH_CUDA +REGISTER_OP_CUDA_KERNEL_FUNCTOR(get_tensor_from_selected_rows, float, + ops::GetTensorFromSelectedRowsKernel, double, + ops::GetTensorFromSelectedRowsKernel, int, + ops::GetTensorFromSelectedRowsKernel, int64_t, + ops::GetTensorFromSelectedRowsKernel); +#endif diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index 972dcf5494e9acd47e7ff615db45f056a43724a6..0dbcc442dfa1a395cdb0ffbd69eb78ad66cfaa17 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -150,14 +150,14 @@ class HierarchicalSigmoidGradOp : public framework::OperatorWithKernel { "Output(W@Grad should not be null."); PADDLE_ENFORCE(ctx->HasOutput(framework::GradVarName("X")), "Output(X@Grad should not be 
null."); - if (!ctx->Attrs().Get("is_sparse")) { - if (ctx->HasOutput(framework::GradVarName("Bias"))) { - ctx->SetOutputDim(framework::GradVarName("Bias"), - ctx->GetInputDim("Bias")); - } - ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); + + if (ctx->HasOutput(framework::GradVarName("Bias"))) { + ctx->SetOutputDim(framework::GradVarName("Bias"), + ctx->GetInputDim("Bias")); } + ctx->SetOutputDim(framework::GradVarName("W"), ctx->GetInputDim("W")); ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); + ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); } protected: diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.h b/paddle/fluid/operators/hierarchical_sigmoid_op.h index 07ff8f947e59d2954783e2ba537bfce3cb320f22..b73a32af89e882ac02623dd1d312f400a78fc47a 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.h +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.h @@ -185,7 +185,6 @@ class HierarchicalSigmoidGradOpKernel : public framework::OpKernel { ctx.Output(framework::GradVarName("W")); w_grad->set_rows(real_rows); // Build a map of id -> row_index to speed up finding the index of one id - w_grad->SyncIndex(); w_grad->set_height(w.dims()[0]); auto* w_grad_value = w_grad->mutable_value(); framework::DDim temp_dim(w.dims()); diff --git a/paddle/fluid/operators/load_combine_op.cc b/paddle/fluid/operators/load_combine_op.cc index 0522a94195786c767194ec727d982a60451e7c62..9d1423915afc25889b9fa96963d6f9514bea2870 100644 --- a/paddle/fluid/operators/load_combine_op.cc +++ b/paddle/fluid/operators/load_combine_op.cc @@ -32,16 +32,26 @@ class LoadCombineOp : public framework::OperatorBase { const platform::Place &place) const override { auto filename = Attr("file_path"); auto load_as_fp16 = Attr("load_as_fp16"); - - std::ifstream fin(filename); - PADDLE_ENFORCE(static_cast(fin), - "Cannot open file %s for load_combine op", filename); - + auto model_from_memory = Attr("model_from_memory"); auto out_var_names = 
Outputs("Out"); PADDLE_ENFORCE_GT( static_cast(out_var_names.size()), 0, "The number of output variables should be greater than 0."); - + if (!model_from_memory) { + std::ifstream fin(filename); + PADDLE_ENFORCE(static_cast(fin), + "Cannot open file %s for load_combine op", filename); + LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names); + } else { + PADDLE_ENFORCE(!filename.empty(), "Cannot load file from memory"); + std::stringstream fin(filename); + LoadParamsFromBuffer(scope, place, &fin, load_as_fp16, out_var_names); + } + } + void LoadParamsFromBuffer( + const framework::Scope &scope, const platform::Place &place, + std::istream *buffer, bool load_as_fp16, + const std::vector &out_var_names) const { platform::DeviceContextPool &pool = platform::DeviceContextPool::Instance(); auto &dev_ctx = *pool.Get(place); @@ -54,11 +64,10 @@ class LoadCombineOp : public framework::OperatorBase { auto *tensor = out_var->GetMutable(); // Error checking - PADDLE_ENFORCE(static_cast(fin), "Cannot read more from file %s", - filename); + PADDLE_ENFORCE(static_cast(buffer), "Cannot read more"); // Get data from fin to tensor - DeserializeFromStream(fin, tensor, dev_ctx); + DeserializeFromStream(*buffer, tensor, dev_ctx); auto in_dtype = framework::ToDataType(tensor->type()); auto out_dtype = @@ -103,11 +112,17 @@ class LoadCombineOpProtoMaker : public framework::OpProtoAndCheckerMaker { "LoDTensors will be loaded from \"file_path\".") .AddCustomChecker( [](const std::string &path) { return !path.empty(); }); + AddAttr("model_from_memory", + "(boolean, default false)" + "If true, file_path is in memory, and LoDTensors will be " + "loaded directly from memory") + .SetDefault(false); AddComment(R"DOC( LoadCombine Operator. -LoadCombine operator loads LoDTensor variables from a file. The file should -contain one or more LoDTensors serialized using the SaveCombine operator. 
The +LoadCombine operator loads LoDTensor variables from a file, which could be +loaded in memory already. The file should contain one or more LoDTensors +serialized using the SaveCombine operator. The LoadCombine operator applies a deserialization strategy to appropriately load the LodTensors, and this strategy complements the serialization strategy used in the SaveCombine operator. Hence, the LoadCombine operator is tightly coupled diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt index 63363086adbf12c38ac09949ac20483116ccf4ee..b3d2ea38eb1bfffadc1f68c5a34bc4d557bdea3b 100644 --- a/paddle/fluid/operators/math/CMakeLists.txt +++ b/paddle/fluid/operators/math/CMakeLists.txt @@ -59,6 +59,7 @@ math_library(matrix_bit_code) math_library(unpooling) math_library(vol2col) +math_library(prelu) cc_test(math_function_test SRCS math_function_test.cc DEPS math_function) cc_test(selected_rows_functor_test SRCS selected_rows_functor_test.cc DEPS selected_rows_functor) diff --git a/paddle/fluid/operators/math/cpu_vec.h b/paddle/fluid/operators/math/cpu_vec.h index 7d81aee596934308763002d440f52400f45b5f20..e1e4d168db3ca594b44396a6e30c5bfc03483eaf 100644 --- a/paddle/fluid/operators/math/cpu_vec.h +++ b/paddle/fluid/operators/math/cpu_vec.h @@ -77,7 +77,7 @@ inline void vec_scal(const int n, const double a, double* x) { #endif // MKL scal only support inplace, choose this if src and dst are not equal -template +template inline void vec_scal(const int n, const T a, const T* x, T* y) { for (int i = 0; i < n; ++i) { y[i] = a * x[i]; @@ -85,12 +85,12 @@ inline void vec_scal(const int n, const T a, const T* x, T* y) { } template <> -inline void vec_scal(const int n, const float a, - const float* x, float* y) { +inline void vec_scal(const int n, const float a, + const float* x, float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_scal(n, a, x, y); + vec_scal(n, a, x, y); return; } const int rest = n % 
block; @@ -114,24 +114,24 @@ inline void vec_scal(const int n, const float a, y[i] = a * x[i]; } #else - vec_scal(n, a, x, y); + vec_scal(n, a, x, y); #endif } template <> -inline void vec_scal(const int n, const float a, - const float* x, float* y) { - vec_scal(n, a, x, y); +inline void vec_scal(const int n, const float a, + const float* x, float* y) { + vec_scal(n, a, x, y); } template <> -inline void vec_scal(const int n, const float a, - const float* x, float* y) { +inline void vec_scal(const int n, const float a, + const float* x, float* y) { // TODO(TJ): enable me - vec_scal(n, a, x, y); + vec_scal(n, a, x, y); } -template +template inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { for (int i = 0; i < n; ++i) { y[i] = a - x[i]; @@ -139,12 +139,12 @@ inline void vec_bias_sub(const int n, const T a, const T* x, T* y) { } template <> -inline void vec_bias_sub(const int n, const float a, - const float* x, float* y) { +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_bias_sub(n, a, x, y); + vec_bias_sub(n, a, x, y); return; } const int rest = n % block; @@ -168,27 +168,25 @@ inline void vec_bias_sub(const int n, const float a, y[i] = a - x[i]; } #else - vec_bias_sub(n, a, x, y); + vec_bias_sub(n, a, x, y); #endif } template <> -inline void vec_bias_sub(const int n, const float a, - const float* x, float* y) { - vec_bias_sub(n, a, x, y); +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { + vec_bias_sub(n, a, x, y); } template <> -inline void vec_bias_sub(const int n, - const float a, - const float* x, - float* y) { +inline void vec_bias_sub(const int n, const float a, + const float* x, float* y) { // TODO(TJ): enable me - vec_bias_sub(n, a, x, y); + vec_bias_sub(n, a, x, y); } // out = x*y + (1-x)*z -template +template inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) { for 
(int i = 0; i < n; ++i) { out[i] = x[i] * y[i] + (static_cast(1) - x[i]) * z[i]; @@ -196,13 +194,13 @@ inline void vec_cross(const int n, const T* x, const T* y, const T* z, T* out) { } template <> -inline void vec_cross(const int n, const float* x, - const float* y, const float* z, - float* out) { +inline void vec_cross(const int n, const float* x, + const float* y, const float* z, + float* out) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_cross(n, x, y, z, out); + vec_cross(n, x, y, z, out); return; } const int rest = n % block; @@ -228,25 +226,26 @@ inline void vec_cross(const int n, const float* x, out[i] = x[i] * y[i] + (1.f - x[i]) * z[i]; } #else - vec_cross(n, x, y, z, out); + vec_cross(n, x, y, z, out); #endif } template <> -inline void vec_cross(const int n, const float* x, - const float* y, - const float* z, float* out) { - vec_cross(n, x, y, z, out); +inline void vec_cross(const int n, const float* x, + const float* y, const float* z, + float* out) { + vec_cross(n, x, y, z, out); } template <> -inline void vec_cross( - const int n, const float* x, const float* y, const float* z, float* out) { +inline void vec_cross(const int n, const float* x, + const float* y, const float* z, + float* out) { // TODO(TJ): enable me - vec_cross(n, x, y, z, out); + vec_cross(n, x, y, z, out); } -template +template inline void vec_add_bias(const int n, const T a, const T* x, T* y) { for (int i = 0; i < n; ++i) { y[i] = x[i] + a; @@ -254,12 +253,12 @@ inline void vec_add_bias(const int n, const T a, const T* x, T* y) { } template <> -inline void vec_add_bias(const int n, const float a, - const float* x, float* y) { +inline void vec_add_bias(const int n, const float a, + const float* x, float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_add_bias(n, a, x, y); + vec_add_bias(n, a, x, y); return; } const int rest = n % block; @@ -283,32 +282,30 @@ inline void vec_add_bias(const int n, const float a, y[i] = 
x[i] + a; } #else - vec_add_bias(n, a, x, y); + vec_add_bias(n, a, x, y); #endif } template <> -inline void vec_add_bias(const int n, const float a, - const float* x, float* y) { - vec_add_bias(n, a, x, y); +inline void vec_add_bias(const int n, const float a, + const float* x, float* y) { + vec_add_bias(n, a, x, y); } template <> -inline void vec_add_bias(const int n, - const float a, - const float* x, - float* y) { +inline void vec_add_bias(const int n, const float a, + const float* x, float* y) { // TODO(TJ): enable me - vec_add_bias(n, a, x, y); + vec_add_bias(n, a, x, y); } -template +template inline void vec_identity(const int n, const T* x, T* y) { // do nothing return; } -template +template inline void vec_sigmoid(const int n, const T* x, T* y) { const T min = SIGMOID_THRESHOLD_MIN; const T max = SIGMOID_THRESHOLD_MAX; @@ -323,12 +320,12 @@ inline void vec_sigmoid(const int n, const T* x, T* y) { } template <> -inline void vec_sigmoid(const int n, const float* x, - float* y) { +inline void vec_sigmoid(const int n, const float* x, + float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block) { - vec_sigmoid(n, x, y); + vec_sigmoid(n, x, y); return; } const int rest = n % block; @@ -377,25 +374,24 @@ inline void vec_sigmoid(const int n, const float* x, y[i] = 1.f / (1.f + y[i]); } #else - vec_sigmoid(n, x, y); + vec_sigmoid(n, x, y); #endif } template <> -inline void vec_sigmoid(const int n, const float* x, - float* y) { - vec_sigmoid(n, x, y); +inline void vec_sigmoid(const int n, const float* x, + float* y) { + vec_sigmoid(n, x, y); } template <> -inline void vec_sigmoid(const int n, - const float* x, - float* y) { +inline void vec_sigmoid(const int n, const float* x, + float* y) { // TODO(TJ): enable me - vec_sigmoid(n, x, y); + vec_sigmoid(n, x, y); } -template +template inline void vec_tanh(const int n, const T* x, T* y) { vec_scal(n, static_cast(2), x, y); vec_sigmoid(n, y, y); @@ -404,7 +400,7 @@ inline void vec_tanh(const int n, 
const T* x, T* y) { } // TODO(TJ): make relu clip -template +template inline void vec_relu(const int n, const T* x, T* y) { for (int i = 0; i < n; ++i) { y[i] = x[i] > 0 ? x[i] : 0; @@ -412,12 +408,12 @@ inline void vec_relu(const int n, const T* x, T* y) { } template <> -inline void vec_relu(const int n, const float* x, - float* y) { +inline void vec_relu(const int n, const float* x, + float* y) { #ifdef __AVX__ constexpr int block = YMM_FLOAT_BLOCK; if (n < block * 4) { - vec_relu(n, x, y); + vec_relu(n, x, y); return; } @@ -441,26 +437,26 @@ inline void vec_relu(const int n, const float* x, #undef MOVE_ONE_STEP #else - vec_relu(n, x, y); + vec_relu(n, x, y); #endif } template <> -inline void vec_relu(const int n, const float* x, - float* y) { - vec_relu(n, x, y); +inline void vec_relu(const int n, const float* x, + float* y) { + vec_relu(n, x, y); } template <> -inline void vec_relu(const int n, const float* x, - float* y) { +inline void vec_relu(const int n, const float* x, + float* y) { // TODO(TJ): enable me - vec_relu(n, x, y); + vec_relu(n, x, y); } // TODO(TJ): optimize double of sigmoid, tanh and relu if necessary -template +template class VecActivations { public: std::function operator()( diff --git a/paddle/fluid/operators/math/cpu_vec_test.cc b/paddle/fluid/operators/math/cpu_vec_test.cc index c37fa291a259550a3cb6d4f3dd9d5a415c3a2130..28eb9cadc9d4258bf4f8f71a06e029531e448014 100644 --- a/paddle/fluid/operators/math/cpu_vec_test.cc +++ b/paddle/fluid/operators/math/cpu_vec_test.cc @@ -104,38 +104,42 @@ void TestAndBench(const int n, std::function tgt, } TEST(CpuVecTest, sigmoid) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_sigmoid, ref_sigmoid); - TestAndBench(sz, vec_sigmoid, ref_sigmoid); - TestAndBench(sz, vec_sigmoid, ref_sigmoid); - TestAndBench(sz, vec_sigmoid, + TestAndBench(sz, 
vec_sigmoid, + ref_sigmoid); + TestAndBench(sz, vec_sigmoid, + ref_sigmoid); + TestAndBench(sz, vec_sigmoid, ref_sigmoid); } TestAndBench(30, vec_sigmoid, ref_sigmoid); } TEST(CpuVecTest, tanh) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_tanh, ref_tanh); - TestAndBench(sz, vec_tanh, ref_tanh); - TestAndBench(sz, vec_tanh, ref_tanh); - TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, ref_tanh); + TestAndBench(sz, vec_tanh, + ref_tanh); } TestAndBench(30, vec_tanh, ref_tanh); } TEST(CpuVecTest, relu) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestAndBench(sz, vec_relu, ref_relu); - TestAndBench(sz, vec_relu, ref_relu); - TestAndBench(sz, vec_relu, ref_relu); - TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, ref_relu); + TestAndBench(sz, vec_relu, + ref_relu); } TestAndBench(30, vec_relu, ref_relu); } @@ -162,38 +166,40 @@ void TestInplace(const int n, std::function tgt, } TEST(CpuVecTest, inplace_sigmoid) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestInplace(sz, vec_sigmoid, ref_sigmoid); - TestInplace(sz, vec_sigmoid, ref_sigmoid); - TestInplace(sz, vec_sigmoid, ref_sigmoid); - TestInplace(sz, vec_sigmoid, + TestInplace(sz, vec_sigmoid, + ref_sigmoid); + TestInplace(sz, vec_sigmoid, + ref_sigmoid); + TestInplace(sz, vec_sigmoid, ref_sigmoid); } TestInplace(30, vec_sigmoid, ref_sigmoid); } TEST(CpuVecTest, inplace_tanh) { - namespace jit = paddle::platform::jit; + namespace platform = 
paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestInplace(sz, vec_tanh, ref_tanh); - TestInplace(sz, vec_tanh, ref_tanh); - TestInplace(sz, vec_tanh, ref_tanh); - TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); + TestInplace(sz, vec_tanh, ref_tanh); } TestInplace(30, vec_tanh, ref_tanh); } TEST(CpuVecTest, inplace_relu) { - namespace jit = paddle::platform::jit; + namespace platform = paddle::platform; using namespace paddle::operators::math; // NOLINT for (auto sz : {1, 2, 15, 16, 30, 32, 128, 200, 512}) { TestInplace(sz, vec_relu, ref_relu); - TestInplace(sz, vec_relu, ref_relu); - TestInplace(sz, vec_relu, ref_relu); - TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, ref_relu); + TestInplace(sz, vec_relu, ref_relu); } TestInplace(30, vec_relu, ref_relu); } diff --git a/paddle/fluid/operators/math/jit_code.cc b/paddle/fluid/operators/math/jit_code.cc index 52cbdf685dee651cbc1490dc6faacb8680004c89..78d0c3e8808f0daf6a18d2217664e965773b95ff 100644 --- a/paddle/fluid/operators/math/jit_code.cc +++ b/paddle/fluid/operators/math/jit_code.cc @@ -22,7 +22,7 @@ namespace math { namespace jitkernel { namespace gen { -using namespace platform::jit; // NOLINT +using namespace platform; // NOLINT bool VXXJitCode::init(int d, int scalar_index) { // It's not necessary to use avx512 since it would slow down the frequency diff --git a/paddle/fluid/operators/math/jit_code.h b/paddle/fluid/operators/math/jit_code.h index a9214621295a7740b804b26c02d216dd5118d8bb..e2b4761435594fdc952ff5dba5b5fa4f4aa98e6c 100644 --- a/paddle/fluid/operators/math/jit_code.h +++ b/paddle/fluid/operators/math/jit_code.h @@ -179,7 +179,7 @@ class VActJitCode : public JitCode { template void exp_jmm(JMM& dst, JMM& src, int src_idx = 11, int fx_idx = 12, // NOLINT int fy_idx = 13, int mask_idx = 14, int 
tmp_idx = 15) { - using namespace platform::jit; // NOLINT + using namespace platform; // NOLINT // check all idx can not equal JMM jmm_src = JMM(src_idx); JMM jmm_fx = JMM(fx_idx); diff --git a/paddle/fluid/operators/math/jit_gen.cc b/paddle/fluid/operators/math/jit_gen.cc index 6af39518ed926554c8c839bba701d3827923dba0..5c6672928e8c03ccb1920bd828f785084e422fc2 100644 --- a/paddle/fluid/operators/math/jit_gen.cc +++ b/paddle/fluid/operators/math/jit_gen.cc @@ -36,7 +36,7 @@ void JitCode::preCode() { for (int i = 0; i < num_g_abi_regs; ++i) { push(Xbyak::Reg64(g_abi_regs[i])); } - if (platform::jit::MayIUse(platform::jit::avx512f)) { + if (platform::MayIUse(platform::avx512f)) { mov(reg_EVEX_max_8b_offt, 2 * EVEX_max_8b_offt); } } diff --git a/paddle/fluid/operators/math/jit_kernel.cc b/paddle/fluid/operators/math/jit_kernel.cc index 68b708b345334bc63b5e2e88c308d20ca6378e6b..118696ba47986e2dbf97535333c9817b7c264a54 100644 --- a/paddle/fluid/operators/math/jit_kernel.cc +++ b/paddle/fluid/operators/math/jit_kernel.cc @@ -21,8 +21,6 @@ namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; - KernelPool& KernelPool::Instance() { static thread_local KernelPool g_jit_kernels; return g_jit_kernels; diff --git a/paddle/fluid/operators/math/jit_kernel_blas.cc b/paddle/fluid/operators/math/jit_kernel_blas.cc index a0f93fd8e7eb7d81211724a6991a681e2a0ed9ce..8cf588efba52314650bfd376b95b10e6d4336b2e 100644 --- a/paddle/fluid/operators/math/jit_kernel_blas.cc +++ b/paddle/fluid/operators/math/jit_kernel_blas.cc @@ -30,7 +30,6 @@ namespace paddle { namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; #ifdef PADDLE_WITH_MKLML template @@ -125,7 +124,7 @@ bool VMulKernelImpl::useJIT(int d) { #ifdef PADDLE_WITH_MKLML template <> bool VMulKernelImpl::useMKL(int d) { - return jit::MayIUse(jit::avx512f) && d > 512; + return platform::MayIUse(platform::avx512f) && d > 512; } template <> diff --git 
a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc index 4d26b81948238f18b097f535534fcfe9049b93c3..eeb305a88bee8f0e21b205684d24b19ca4631f65 100644 --- a/paddle/fluid/operators/math/jit_kernel_crf_decode.cc +++ b/paddle/fluid/operators/math/jit_kernel_crf_decode.cc @@ -25,10 +25,8 @@ namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; - /* CRF Decode JitKernel */ -template +template class CRFDecodeKernelImpl : public CRFDecodeKernel { public: explicit CRFDecodeKernelImpl(int tag_num) : CRFDecodeKernel() { @@ -101,7 +99,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { #define INTRIAVX_FLOAT(block) \ template <> \ - CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ + CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ int tag_num) \ : CRFDecodeKernel() { \ this->num_ = tag_num; \ @@ -109,7 +107,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { this->rest_ = this->num_ % YMM_FLOAT_BLOCK; \ } \ template <> \ - void CRFDecodeKernelImpl::Compute( \ + void CRFDecodeKernelImpl::Compute( \ const int seq_len, const float* x, const float* w, float* alpha, \ int* track) const { \ INIT_ALPHA(YMM_FLOAT_BLOCK) \ @@ -204,7 +202,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { #define INTRIAVX512_FLOAT(block) \ template <> \ - CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ + CRFDecodeKernelImpl::CRFDecodeKernelImpl( \ int tag_num) \ : CRFDecodeKernel() { \ this->num_ = tag_num; \ @@ -212,7 +210,7 @@ class CRFDecodeKernelImpl : public CRFDecodeKernel { this->rest_ = this->num_ % ZMM_FLOAT_BLOCK; \ } \ template <> \ - void CRFDecodeKernelImpl::Compute( \ + void CRFDecodeKernelImpl::Compute( \ const int seq_len, const float* x, const float* w, float* alpha, \ int* track) const { \ INIT_ALPHA(ZMM_FLOAT_BLOCK) \ @@ -270,14 +268,14 @@ INTRIAVX_FLOAT(kEQ16); INTRIAVX_FLOAT(kGT16); #endif #ifdef __AVX2__ -INTRIAVX2_FLOAT(jit::avx2, kEQ8); -INTRIAVX2_FLOAT(jit::avx2, kGT8LT16); 
-INTRIAVX2_FLOAT(jit::avx2, kEQ16); -INTRIAVX2_FLOAT(jit::avx2, kGT16); +INTRIAVX2_FLOAT(platform::avx2, kEQ8); +INTRIAVX2_FLOAT(platform::avx2, kGT8LT16); +INTRIAVX2_FLOAT(platform::avx2, kEQ16); +INTRIAVX2_FLOAT(platform::avx2, kGT16); #endif #ifdef __AVX512F__ -INTRIAVX2_FLOAT(jit::avx512f, kEQ8); -INTRIAVX2_FLOAT(jit::avx512f, kGT8LT16); +INTRIAVX2_FLOAT(platform::avx512f, kEQ8); +INTRIAVX2_FLOAT(platform::avx512f, kGT8LT16); INTRIAVX512_FLOAT(kEQ16); INTRIAVX512_FLOAT(kGT16); #endif diff --git a/paddle/fluid/operators/math/jit_kernel_exp.cc b/paddle/fluid/operators/math/jit_kernel_exp.cc index 686f3dd9836cb9192088771753065c6add639620..7945cfb253a61b7d1191c39537254126e2bb85dd 100644 --- a/paddle/fluid/operators/math/jit_kernel_exp.cc +++ b/paddle/fluid/operators/math/jit_kernel_exp.cc @@ -29,7 +29,6 @@ namespace paddle { namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; #ifdef PADDLE_WITH_MKLML // try to use MKL to speedup diff --git a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc index 49904e6e8c7cd346bcbfb67c3a7574118b36e058..fead13ebadcd131afafc308740cdd39b1c53bc08 100644 --- a/paddle/fluid/operators/math/jit_kernel_layer_norm.cc +++ b/paddle/fluid/operators/math/jit_kernel_layer_norm.cc @@ -22,10 +22,8 @@ namespace operators { namespace math { namespace jitkernel { -namespace jit = platform::jit; - /* Layer Norm JitKernel */ -template +template class LayerNormKernelImpl : public LayerNormKernel { public: explicit LayerNormKernelImpl(int right) : LayerNormKernel() { @@ -90,7 +88,7 @@ class LayerNormKernelImpl : public LayerNormKernel { this->end_ = this->num_ - this->rest_; \ } \ template <> \ - void LayerNormKernelImpl::Compute( \ + void LayerNormKernelImpl::Compute( \ float* x, float* out, float* mean, float* var, const float* scale, \ const float* bias, int height, const float epsilon) const { \ __m256 sum; \ @@ -219,16 +217,16 @@ class 
LayerNormKernelImpl : public LayerNormKernel { } #ifdef __AVX__ -INTRIAVX_FLOAT(jit::avx, kEQ8); -INTRIAVX_FLOAT(jit::avx, kGT8LT16); -INTRIAVX_FLOAT(jit::avx, kEQ16); -INTRIAVX_FLOAT(jit::avx, kGT16); +INTRIAVX_FLOAT(platform::avx, kEQ8); +INTRIAVX_FLOAT(platform::avx, kGT8LT16); +INTRIAVX_FLOAT(platform::avx, kEQ16); +INTRIAVX_FLOAT(platform::avx, kGT16); #endif #ifdef __AVX2__ -INTRIAVX_FLOAT(jit::avx2, kEQ8); -INTRIAVX_FLOAT(jit::avx2, kGT8LT16); -INTRIAVX_FLOAT(jit::avx2, kEQ16); -INTRIAVX_FLOAT(jit::avx2, kGT16); +INTRIAVX_FLOAT(platform::avx2, kEQ8); +INTRIAVX_FLOAT(platform::avx2, kGT8LT16); +INTRIAVX_FLOAT(platform::avx2, kEQ16); +INTRIAVX_FLOAT(platform::avx2, kGT16); #endif #undef INTRIAVX_FLOAT diff --git a/paddle/fluid/operators/math/jit_kernel_macro.h b/paddle/fluid/operators/math/jit_kernel_macro.h index 5a3efd979f803d396a5084c199b1d71b88a77126..4dba3b56810794cb4839d26386ae77a8f4507977 100644 --- a/paddle/fluid/operators/math/jit_kernel_macro.h +++ b/paddle/fluid/operators/math/jit_kernel_macro.h @@ -92,7 +92,6 @@ namespace jitkernel { JITKERNEL_DECLARE, JITKERNEL_FIND_KEY, \ JITKERNEL_IMPL) -namespace jit = platform::jit; // TODO(TJ): below defines are deprecated, would be remove recently #define SEARCH_BLOCK(macro_, ker, dtype, isa) \ if (d < YMM_FLOAT_BLOCK) { \ @@ -107,15 +106,15 @@ namespace jit = platform::jit; macro_(ker, dtype, isa, kGT16); \ } -#define SEARCH_ISA_BLOCK(macro_, ker, dtype) \ - if (jit::MayIUse(jit::avx512f)) { \ - SEARCH_BLOCK(macro_, ker, dtype, jit::avx512f); \ - } else if (jit::MayIUse(jit::avx2)) { \ - SEARCH_BLOCK(macro_, ker, dtype, jit::avx2); \ - } else if (jit::MayIUse(jit::avx)) { \ - SEARCH_BLOCK(macro_, ker, dtype, jit::avx); \ - } else { \ - SEARCH_BLOCK(macro_, ker, dtype, jit::isa_any); \ +#define SEARCH_ISA_BLOCK(macro_, ker, dtype) \ + if (platform::MayIUse(platform::avx512f)) { \ + SEARCH_BLOCK(macro_, ker, dtype, platform::avx512f); \ + } else if (platform::MayIUse(platform::avx2)) { \ + 
SEARCH_BLOCK(macro_, ker, dtype, platform::avx2); \ + } else if (platform::MayIUse(platform::avx)) { \ + SEARCH_BLOCK(macro_, ker, dtype, platform::avx); \ + } else { \ + SEARCH_BLOCK(macro_, ker, dtype, platform::isa_any); \ } #define JITKERNEL_KEY(ker_key, dtype_key) \ @@ -156,10 +155,10 @@ namespace jit = platform::jit; marco_declare, macro_key, macro_impl) #define FOR_EACH_ISA(macro_, block) \ - macro_(jit::avx512f, block); \ - macro_(jit::avx2, block); \ - macro_(jit::avx, block); \ - macro_(jit::isa_any, block) + macro_(platform::avx512f, block); \ + macro_(platform::avx2, block); \ + macro_(platform::avx, block); \ + macro_(platform::isa_any, block) #define FOR_EACH_BLOCK(macro_, isa) \ macro_(isa, kLT8); \ @@ -168,11 +167,11 @@ namespace jit = platform::jit; macro_(isa, kEQ16); \ macro_(isa, kGT16) -#define FOR_EACH_ISA_BLOCK(macro_) \ - FOR_EACH_BLOCK(macro_, jit::avx512f); \ - FOR_EACH_BLOCK(macro_, jit::avx2); \ - FOR_EACH_BLOCK(macro_, jit::avx); \ - FOR_EACH_BLOCK(macro_, jit::isa_any) +#define FOR_EACH_ISA_BLOCK(macro_) \ + FOR_EACH_BLOCK(macro_, platform::avx512f); \ + FOR_EACH_BLOCK(macro_, platform::avx2); \ + FOR_EACH_BLOCK(macro_, platform::avx); \ + FOR_EACH_BLOCK(macro_, platform::isa_any) } // namespace jitkernel } // namespace math diff --git a/paddle/fluid/operators/math/jit_kernel_test.cc b/paddle/fluid/operators/math/jit_kernel_test.cc index ed86a47e159cacd4f5572e22c7633f725aaeb516..19f7bd8909499c12fd5bee4db0d0a71a632e7f19 100644 --- a/paddle/fluid/operators/math/jit_kernel_test.cc +++ b/paddle/fluid/operators/math/jit_kernel_test.cc @@ -705,7 +705,7 @@ TEST(JitKernel, pool) { jit::lstm_attr_t attr(frame_size, act_gate, act_cand, act_cell, false); // empty call it to avoid unknown flag 'use_pinned_memory' on Mac - paddle::platform::jit::MayIUse(paddle::platform::jit::avx); + paddle::platform::MayIUse(paddle::platform::avx); const auto& plstm1 = jit::KernelPool::Instance() .template Get, const jit::lstm_attr_t&>(attr); diff --git 
a/paddle/fluid/operators/math/matrix_bit_code.cc b/paddle/fluid/operators/math/matrix_bit_code.cc index 71b9293eeded77553ca06a8574cca3941fa36b6a..5a6e64b6f87d33249f0153e5f391deaf78e53de5 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.cc +++ b/paddle/fluid/operators/math/matrix_bit_code.cc @@ -89,6 +89,8 @@ template void MatrixBitCodeFunctor::Mul(framework::Tensor* tmat, const framework::Tensor& weight, const framework::Tensor& input) { + auto blas = + GetBlas(platform::CPUDeviceContext()); size_t num_samples = tmat->dims()[0]; size_t tmat_width = tmat->dims()[1]; size_t input_width = input.dims()[1]; @@ -99,13 +101,12 @@ void MatrixBitCodeFunctor::Mul(framework::Tensor* tmat, for (size_t i = 0; i < num_samples; ++i) { auto code = code_table_->get_code(i); int code_length = code->get_length(); + const T* input_row = input_value + input_width * i; for (int j = 0; j < code_length; ++j) { size_t index = code->calc_index(j); + const T* weight_row = weight_value + weight_width * index; T sum = static_cast(0.0); - for (size_t k = 0; k < input_width; ++k) { - sum += weight_value[weight_width * index + k] * - input_value[input_width * i + k]; - } + sum = blas.DOT(input_width, weight_row, input_row); tmat_value[i * tmat_width + j] += sum; } } @@ -115,6 +116,8 @@ template void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, framework::Tensor* weight, const framework::Tensor& input) { + auto blas = + GetBlas(platform::CPUDeviceContext()); size_t num_samples = tmat.dims()[0]; size_t input_width = input.dims()[1]; size_t tmat_width = tmat.dims()[1]; @@ -122,16 +125,25 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, auto tmat_value = tmat.data(); auto weight_value = weight->data(); auto input_value = input.data(); + + std::unordered_map>> ops; + for (size_t i = 0; i < num_samples; ++i) { auto code = code_table_->get_code(i); int code_length = code->get_length(); + const T* input_value_row = input_value + input_width * i; + 
const T* tmat_row = tmat_value + i * tmat_width; for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - - for (size_t k = 0; k < input_width; ++k) { - weight_value[weight_width * index + k] += - tmat_value[i * tmat_width + j] * input_value[input_width * i + k]; - } + ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row); + } + } + for (auto& op : ops) { + auto& op_in_row = op.second; + for (auto& pair : op_in_row) { + auto& scale = pair.first; + auto* input_row = pair.second; + T* weight_row = weight_value + op.first * weight_width; + blas.AXPY(input_width, scale, input_row, weight_row); } } } @@ -140,6 +152,8 @@ template void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, framework::SelectedRows* weight, const framework::Tensor& input) { + auto blas = + GetBlas(platform::CPUDeviceContext()); size_t num_samples = tmat.dims()[0]; size_t input_width = input.dims()[1]; size_t tmat_width = tmat.dims()[1]; @@ -147,17 +161,28 @@ void MatrixBitCodeFunctor::MulGradWeight(const framework::Tensor& tmat, auto tmat_value = tmat.data(); auto weight_value = weight->mutable_value()->data(); auto input_value = input.data(); + + std::unordered_map>> ops; + ops.reserve(weight->rows().size()); + for (size_t i = 0; i < num_samples; ++i) { auto code = code_table_->get_code(i); int code_length = code->get_length(); + const T* input_value_row = input_value + input_width * i; + const T* tmat_row = tmat_value + i * tmat_width; for (int j = 0; j < code_length; ++j) { - size_t index = code->calc_index(j); - for (size_t k = 0; k < input_width; ++k) { - int64_t row_index = weight->GetIndexFromId(static_cast(index)); - weight_value[row_index * weight_width + k] += - tmat_value[i * tmat_width + j] * input_value[input_width * i + k]; - } + ops[code->calc_index(j)].emplace_back(tmat_row[j], input_value_row); + } + } + + for (auto& row : weight->rows()) { + auto& op_in_row = ops[row]; + for (auto& pair : op_in_row) { + auto& scale = 
pair.first; + auto* input_row = pair.second; + blas.AXPY(input_width, scale, input_row, weight_value); } + weight_value += weight_width; } } diff --git a/paddle/fluid/operators/math/matrix_bit_code.h b/paddle/fluid/operators/math/matrix_bit_code.h index c30bb52641e865efe57659a551bc4b493634c6b9..35ca73802b48982ddf3ed7485b56f50221c9f28c 100644 --- a/paddle/fluid/operators/math/matrix_bit_code.h +++ b/paddle/fluid/operators/math/matrix_bit_code.h @@ -13,10 +13,14 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include +#include +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/lod_tensor.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/tensor.h" +#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/platform/device_context.h" #if defined(_WIN32) diff --git a/paddle/fluid/operators/math/prelu.cu b/paddle/fluid/operators/math/prelu.cu new file mode 100644 index 0000000000000000000000000000000000000000..701a802080f65ea32b95402682dc46362ccf0966 --- /dev/null +++ b/paddle/fluid/operators/math/prelu.cu @@ -0,0 +1,148 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/math/prelu.h" + +namespace paddle { +namespace operators { +namespace math { + +static const int CUDA_NUM_THREADS = 1024; +static const int CUDA_MAX_NUM_BLOCKS = 65535; +inline static int GET_NUM_BLOCKS(const int N) { + return (N + CUDA_NUM_THREADS - 1) / CUDA_NUM_THREADS; +} + +template +__global__ void PReluChannelWiseKernel(const T *input, const T *alpha, + T *output, int channel, + size_t spatial_size) { + size_t offset = blockIdx.x * spatial_size; + const T *in = input + offset; + T *out = output + offset; + T scale = alpha[blockIdx.x % channel]; + + for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) { + T x = in[i]; + out[i] = (x > 0) ? x : scale * x; + } +} + +template +__global__ void PReluElementWiseKernel(const T *input, const T *alpha, + T *output, size_t spatial_size) { + size_t offset = blockIdx.x * spatial_size; + const T *in = input + offset; + const T *scale = alpha + offset; + T *out = output + offset; + + for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) { + T x = in[i]; + out[i] = (x > 0) ? x : scale[i] * x; + } +} + +template +__global__ void PReluScalarKernel(const T *input, const T *alpha, T *output, + size_t spatial_size) { + size_t offset = blockIdx.x * spatial_size; + const T *in = input + offset; + T scale = *alpha; + T *out = output + offset; + + for (size_t i = threadIdx.x; i < spatial_size; i += blockDim.x) { + T x = in[i]; + out[i] = (x > 0) ? 
x : scale * x; + } +} + +template +static inline void PReluChannelWise(cudaStream_t stream, const T *input, + const T *alpha, T *output, + std::vector input_shape) { + size_t unroll = input_shape[0] * input_shape[1]; + size_t spatial_size = input_shape[2] * input_shape[3]; + CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); + PReluChannelWiseKernel<<>>( + input, alpha, output, input_shape[1], spatial_size); +} + +template +static inline void PReluElementWise(cudaStream_t stream, const T *input, + const T *alpha, T *output, + std::vector input_shape) { + size_t unroll = input_shape[0] * input_shape[1]; + size_t spatial_size = input_shape[2] * input_shape[3]; + CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); + PReluElementWiseKernel<<>>( + input, alpha, output, spatial_size); +} + +template +static inline void PReluScalar(cudaStream_t stream, const T *input, + const T *alpha, T *output, + std::vector input_shape) { + size_t unroll = input_shape[0] * input_shape[1]; + size_t spatial_size = input_shape[2] * input_shape[3]; + CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); + PReluScalarKernel<<>>( + input, alpha, output, spatial_size); +} + +template +void PreluChannelWiseDirectCUDAFunctor::operator()( + cudaStream_t stream, const T *input, const T *alpha, T *output, + std::vector input_shape) { + size_t unroll = input_shape[0] * input_shape[1]; + size_t spatial_size = input_shape[2] * input_shape[3]; + CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); + PReluChannelWiseKernel<<>>( + input, alpha, output, input_shape[1], spatial_size); +} + +template +void PreluElementWiseDirectCUDAFunctor::operator()( + cudaStream_t stream, const T *input, const T *alpha, T *output, + std::vector input_shape) { + size_t unroll = input_shape[0] * input_shape[1]; + size_t spatial_size = input_shape[2] * input_shape[3]; + CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); + PReluElementWiseKernel<<>>( + input, alpha, output, spatial_size); +} + +template +void PreluScalarDirectCUDAFunctor::operator()(cudaStream_t stream, + const T *input, 
const T *alpha, + T *output, + std::vector input_shape) { + size_t unroll = input_shape[0] * input_shape[1]; + size_t spatial_size = input_shape[2] * input_shape[3]; + CHECK_LT(unroll, CUDA_MAX_NUM_BLOCKS); + PReluScalarKernel<<>>( + input, alpha, output, spatial_size); +} + +template class PreluChannelWiseDirectCUDAFunctor; +template class PreluChannelWiseDirectCUDAFunctor; + +template class PreluElementWiseDirectCUDAFunctor; +template class PreluElementWiseDirectCUDAFunctor; + +template class PreluScalarDirectCUDAFunctor; +template class PreluScalarDirectCUDAFunctor; + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/prelu.h b/paddle/fluid/operators/math/prelu.h new file mode 100644 index 0000000000000000000000000000000000000000..3237c6d4cbf956aafb4046ea2ffa42efe62e7b28 --- /dev/null +++ b/paddle/fluid/operators/math/prelu.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/fluid/operators/math/math_function.h" +#include "paddle/fluid/platform/cudnn_helper.h" + +namespace paddle { +namespace operators { +namespace math { + +#ifdef PADDLE_WITH_CUDA +template +class PreluChannelWiseDirectCUDAFunctor { + public: + void operator()(cudaStream_t stream, const T *input, const T *alpha, + T *output, std::vector input_shape); +}; + +template +class PreluElementWiseDirectCUDAFunctor { + public: + void operator()(cudaStream_t stream, const T *input, const T *alpha, + T *output, std::vector input_shape); +}; + +template +class PreluScalarDirectCUDAFunctor { + public: + void operator()(cudaStream_t stream, const T *input, const T *alpha, + T *output, std::vector input_shape); +}; +#endif + +} // namespace math +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/math/softmax_impl.h b/paddle/fluid/operators/math/softmax_impl.h index 31ed5196668954bc387423c34a0667622db71373..9e99e44822b2fce971b751967ca8076a1f1384ec 100644 --- a/paddle/fluid/operators/math/softmax_impl.h +++ b/paddle/fluid/operators/math/softmax_impl.h @@ -13,6 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once +#include #include "paddle/fluid/framework/eigen.h" #include "paddle/fluid/framework/tensor.h" diff --git a/paddle/fluid/operators/merge_selected_rows_op.cc b/paddle/fluid/operators/merge_selected_rows_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..3c15c839554599104d21a5225c078d41735c4a60 --- /dev/null +++ b/paddle/fluid/operators/merge_selected_rows_op.cc @@ -0,0 +1,72 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include "paddle/fluid/operators/merge_selected_rows_op.h" + +namespace paddle { +namespace operators { + +class MergeSelectedRowsOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of MergeSelectedRowsOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Out"), + "Output(Out) of MergeSelectedRowsOp should not be null."); + ctx->ShareDim("X", /*->*/ "Out"); + } +}; + +class MergeSelectedRowsOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input type is SelectedRows, and the selected rows may be " + "duplicated."); + AddOutput("Out", + "The output type is SelectedRows, and the selected rows are not " + "duplicated."); + AddComment( + R"DOC( +MergeSelectedRows Operator. + +MergeSelectedRows is used to merge the duplicated rows of the input. 
+)DOC"); + } +}; + +class MergeSelectedRowsOpInferVarType + : public framework::PassInDtypeAndVarTypeToOutput { + protected: + std::unordered_map GetInputOutputWithSameType() + const override { + return std::unordered_map{{"X", /*->*/ "Out"}}; + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +namespace plat = paddle::platform; +REGISTER_OPERATOR(merge_selected_rows, ops::MergeSelectedRowsOp, + ops::MergeSelectedRowsOpMaker, + ops::MergeSelectedRowsOpInferVarType); + +REGISTER_OP_CPU_KERNEL( + merge_selected_rows, + ops::MergeSelectedRowsKernel, + ops::MergeSelectedRowsKernel); diff --git a/paddle/fluid/operators/merge_selected_rows_op.cu.cc b/paddle/fluid/operators/merge_selected_rows_op.cu.cc new file mode 100644 index 0000000000000000000000000000000000000000..90d5fb3eaeb1f155eeea29ea0cf3f5ecd610f5f0 --- /dev/null +++ b/paddle/fluid/operators/merge_selected_rows_op.cu.cc @@ -0,0 +1,23 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/merge_selected_rows_op.h" + +namespace ops = paddle::operators; +namespace plat = paddle::platform; + +REGISTER_OP_CUDA_KERNEL( + merge_selected_rows, + ops::MergeSelectedRowsKernel, + ops::MergeSelectedRowsKernel); diff --git a/paddle/fluid/operators/merge_selected_rows_op.h b/paddle/fluid/operators/merge_selected_rows_op.h new file mode 100644 index 0000000000000000000000000000000000000000..4c977e94b175c988e4253b273365b0cabc4b87aa --- /dev/null +++ b/paddle/fluid/operators/merge_selected_rows_op.h @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/selected_rows_functor.h" + +namespace paddle { +namespace operators { + +template +class MergeSelectedRowsKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* out = context.Output("Out"); + + math::scatter::MergeAdd merge_func; + merge_func(context.template device_context(), *x, out); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc index 58cfbb76e93a1c15c9b7cf9f9e596066c29b7ebb..64d94ab6044c1992145062319120b0372f5061c0 100644 --- a/paddle/fluid/operators/prelu_op.cc +++ b/paddle/fluid/operators/prelu_op.cc @@ -58,7 +58,7 @@ class PReluOp : public framework::OperatorWithKernel { const framework::ExecutionContext &ctx) const override { return framework::OpKernelType( framework::ToDataType(ctx.Input("X")->type()), - platform::CPUPlace()); + ctx.device_context()); } }; diff --git a/paddle/fluid/operators/prelu_op.cu b/paddle/fluid/operators/prelu_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..36b5259ae5106914f5668625cad535ebc8aa72ec --- /dev/null +++ b/paddle/fluid/operators/prelu_op.cu @@ -0,0 +1,64 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include +#include +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/operators/math/prelu.h" +#include "paddle/fluid/operators/prelu_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; + +template +class CUDAPReluKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& context) const override { + auto* x = context.Input("X"); + auto* alpha = context.Input("Alpha"); + auto* out = context.Output("Out"); + + const T* x_ptr = x->data(); + T* o_ptr = out->mutable_data(context.GetPlace()); + + const T* alpha_ptr = alpha->data(); + auto& mode = context.Attr("mode"); + + int numel = x->numel(); + auto dim = x->dims(); + std::vector input_shape = framework::vectorize2int(dim); + + if (mode == "channel") { + math::PreluChannelWiseDirectCUDAFunctor prelu_channel_wise; + prelu_channel_wise(context.cuda_device_context().stream(), x_ptr, + alpha_ptr, o_ptr, input_shape); + } else if (mode == "element") { + math::PreluElementWiseDirectCUDAFunctor prelu_element_wise; + prelu_element_wise(context.cuda_device_context().stream(), x_ptr, + alpha_ptr, o_ptr, input_shape); + } else { + math::PreluScalarDirectCUDAFunctor prelu_scalar; + prelu_scalar(context.cuda_device_context().stream(), x_ptr, alpha_ptr, + o_ptr, input_shape); + } + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL( + prelu, ops::CUDAPReluKernel, + ops::CUDAPReluKernel); diff --git a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h index 18acb735cecabd1e01f7821c880fd8ed5e52971f..8fceed3558b4357b7863368c18add329ea9922b3 100644 --- a/paddle/fluid/operators/sequence_ops/sequence_mask_op.h +++ b/paddle/fluid/operators/sequence_ops/sequence_mask_op.h @@ -36,12 +36,10 @@ class SequenceMaskOp : public framework::OperatorWithKernel { 
PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) must exist"); PADDLE_ENFORCE(ctx->HasOutput("Y"), "Output(Y) must exist"); - auto maxlen = ctx->Attrs().Get("maxlen"); - if (maxlen > 0) { // We can only infershape when maxlen > 0 - auto dim = framework::vectorize2int(ctx->GetInputDim("X")); - dim.push_back(maxlen); - ctx->SetOutputDim("Y", framework::make_ddim(dim)); - } + int maxlen = ctx->Attrs().Get("maxlen"); + auto dim = framework::vectorize2int(ctx->GetInputDim("X")); + dim.push_back(maxlen > 0 ? maxlen : -1); + ctx->SetOutputDim("Y", framework::make_ddim(dim)); } }; diff --git a/paddle/fluid/operators/yolov3_loss_op.cc b/paddle/fluid/operators/yolov3_loss_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..e7597f732430038a4a180297e730340d1bc47b8c --- /dev/null +++ b/paddle/fluid/operators/yolov3_loss_op.cc @@ -0,0 +1,221 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#include "paddle/fluid/operators/yolov3_loss_op.h" +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using framework::Tensor; + +class Yolov3LossOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), + "Input(X) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("GTBox"), + "Input(GTBox) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasInput("GTLabel"), + "Input(GTLabel) of Yolov3LossOp should not be null."); + PADDLE_ENFORCE(ctx->HasOutput("Loss"), + "Output(Loss) of Yolov3LossOp should not be null."); + + auto dim_x = ctx->GetInputDim("X"); + auto dim_gtbox = ctx->GetInputDim("GTBox"); + auto dim_gtlabel = ctx->GetInputDim("GTLabel"); + auto anchors = ctx->Attrs().Get>("anchors"); + auto class_num = ctx->Attrs().Get("class_num"); + PADDLE_ENFORCE_EQ(dim_x.size(), 4, "Input(X) should be a 4-D tensor."); + PADDLE_ENFORCE_EQ(dim_x[2], dim_x[3], + "Input(X) dim[3] and dim[4] should be euqal."); + PADDLE_ENFORCE_EQ(dim_x[1], anchors.size() / 2 * (5 + class_num), + "Input(X) dim[1] should be equal to (anchor_number * (5 " + "+ class_num))."); + PADDLE_ENFORCE_EQ(dim_gtbox.size(), 3, + "Input(GTBox) should be a 3-D tensor"); + PADDLE_ENFORCE_EQ(dim_gtbox[2], 4, "Input(GTBox) dim[2] should be 5"); + PADDLE_ENFORCE_EQ(dim_gtlabel.size(), 2, + "Input(GTBox) should be a 2-D tensor"); + PADDLE_ENFORCE_EQ(dim_gtlabel[0], dim_gtbox[0], + "Input(GTBox) and Input(GTLabel) dim[0] should be same"); + PADDLE_ENFORCE_EQ(dim_gtlabel[1], dim_gtbox[1], + "Input(GTBox) and Input(GTLabel) dim[1] should be same"); + PADDLE_ENFORCE_GT(anchors.size(), 0, + "Attr(anchors) length should be greater then 0."); + PADDLE_ENFORCE_EQ(anchors.size() % 2, 0, + "Attr(anchors) length should be even integer."); + PADDLE_ENFORCE_GT(class_num, 0, + 
"Attr(class_num) should be an integer greater then 0."); + + std::vector dim_out({1}); + ctx->SetOutputDim("Loss", framework::make_ddim(dim_out)); + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); + } +}; + +class Yolov3LossOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("X", + "The input tensor of YOLO v3 loss operator, " + "This is a 4-D tensor with shape of [N, C, H, W]." + "H and W should be same, and the second dimention(C) stores" + "box locations, confidence score and classification one-hot" + "key of each anchor box"); + AddInput("GTBox", + "The input tensor of ground truth boxes, " + "This is a 3-D tensor with shape of [N, max_box_num, 5], " + "max_box_num is the max number of boxes in each image, " + "In the third dimention, stores x, y, w, h coordinates, " + "x, y is the center cordinate of boxes and w, h is the " + "width and height and x, y, w, h should be divided by " + "input image height to scale to [0, 1]."); + AddInput("GTLabel", + "The input tensor of ground truth label, " + "This is a 2-D tensor with shape of [N, max_box_num], " + "and each element shoudl be an integer to indicate the " + "box class id."); + AddOutput("Loss", + "The output yolov3 loss tensor, " + "This is a 1-D tensor with shape of [1]"); + + AddAttr("class_num", "The number of classes to predict."); + AddAttr>("anchors", + "The anchor width and height, " + "it will be parsed pair by pair."); + AddAttr("ignore_thresh", + "The ignore threshold to ignore confidence loss."); + AddAttr("loss_weight_xy", "The weight of x, y location loss.") + .SetDefault(1.0); + AddAttr("loss_weight_wh", "The weight of w, h location loss.") + .SetDefault(1.0); + AddAttr( + "loss_weight_conf_target", + "The weight of confidence score loss in locations with target 
object.") + .SetDefault(1.0); + AddAttr("loss_weight_conf_notarget", + "The weight of confidence score loss in locations without " + "target object.") + .SetDefault(1.0); + AddAttr("loss_weight_class", "The weight of classification loss.") + .SetDefault(1.0); + AddComment(R"DOC( + This operator generate yolov3 loss by given predict result and ground + truth boxes. + + The output of previous network is in shape [N, C, H, W], while H and W + should be the same, specify the grid size, each grid point predict given + number boxes, this given number is specified by anchors, it should be + half anchors length, which following will be represented as S. In the + second dimention(the channel dimention), C should be S * (class_num + 5), + class_num is the box categoriy number of source dataset(such as coco), + so in the second dimention, stores 4 box location coordinates x, y, w, h + and confidence score of the box and class one-hot key of each anchor box. + + While the 4 location coordinates if $$tx, ty, tw, th$$, the box predictions + correspnd to: + + $$ + b_x = \sigma(t_x) + c_x + b_y = \sigma(t_y) + c_y + b_w = p_w e^{t_w} + b_h = p_h e^{t_h} + $$ + + While $$c_x, c_y$$ is the left top corner of current grid and $$p_w, p_h$$ + is specified by anchors. + + As for confidence score, it is the logistic regression value of IoU between + anchor boxes and ground truth boxes, the score of the anchor box which has + the max IoU should be 1, and if the anchor box has IoU bigger then ignore + thresh, the confidence score loss of this anchor box will be ignored. + + Therefore, the yolov3 loss consist of three major parts, box location loss, + confidence score loss, and classification loss. The MSE loss is used for + box location, and binary cross entropy loss is used for confidence score + loss and classification loss. + + Final loss will be represented as follow. 
+ + $$ + loss = \loss_weight_{xy} * loss_{xy} + \loss_weight_{wh} * loss_{wh} + + \loss_weight_{conf_target} * loss_{conf_target} + + \loss_weight_{conf_notarget} * loss_{conf_notarget} + + \loss_weight_{class} * loss_{class} + $$ + )DOC"); + } +}; + +class Yolov3LossOpGrad : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null"); + PADDLE_ENFORCE(ctx->HasInput(framework::GradVarName("Loss")), + "Input(Loss@GRAD) should not be null"); + auto dim_x = ctx->GetInputDim("X"); + if (ctx->HasOutput(framework::GradVarName("X"))) { + ctx->SetOutputDim(framework::GradVarName("X"), dim_x); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType( + framework::ToDataType(ctx.Input("X")->type()), + platform::CPUPlace()); + } +}; + +class Yolov3LossGradMaker : public framework::SingleGradOpDescMaker { + public: + using framework::SingleGradOpDescMaker::SingleGradOpDescMaker; + + protected: + std::unique_ptr Apply() const override { + auto* op = new framework::OpDesc(); + op->SetType("yolov3_loss_grad"); + op->SetInput("X", Input("X")); + op->SetInput("GTBox", Input("GTBox")); + op->SetInput("GTLabel", Input("GTLabel")); + op->SetInput(framework::GradVarName("Loss"), OutputGrad("Loss")); + + op->SetAttrMap(Attrs()); + + op->SetOutput(framework::GradVarName("X"), InputGrad("X")); + op->SetOutput(framework::GradVarName("GTBox"), {}); + op->SetOutput(framework::GradVarName("GTLabel"), {}); + return std::unique_ptr(op); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(yolov3_loss, ops::Yolov3LossOp, ops::Yolov3LossOpMaker, + ops::Yolov3LossGradMaker); +REGISTER_OPERATOR(yolov3_loss_grad, ops::Yolov3LossOpGrad); 
+REGISTER_OP_CPU_KERNEL(yolov3_loss, ops::Yolov3LossKernel, + ops::Yolov3LossKernel); +REGISTER_OP_CPU_KERNEL(yolov3_loss_grad, ops::Yolov3LossGradKernel, + ops::Yolov3LossGradKernel); diff --git a/paddle/fluid/operators/yolov3_loss_op.h b/paddle/fluid/operators/yolov3_loss_op.h new file mode 100644 index 0000000000000000000000000000000000000000..0bb285722ddedf721d98237760ec9868e2134442 --- /dev/null +++ b/paddle/fluid/operators/yolov3_loss_op.h @@ -0,0 +1,483 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserve. + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + http://www.apache.org/licenses/LICENSE-2.0 + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. 
*/ + +#pragma once +#include +#include +#include "paddle/fluid/framework/op_registry.h" + +namespace paddle { +namespace operators { + +using Tensor = framework::Tensor; +template +using EigenTensor = framework::EigenTensor; +template +using EigenVector = framework::EigenVector; + +using Array5 = Eigen::DSizes; + +template +static inline bool isZero(T x) { + return fabs(x) < 1e-6; +} + +template +static inline T sigmoid(T x) { + return 1.0 / (exp(-1.0 * x) + 1.0); +} + +template +static inline T CalcMaskPointNum(const Tensor& mask) { + auto mask_t = EigenVector::Flatten(mask); + T count = 0.0; + for (int i = 0; i < mask_t.dimensions()[0]; i++) { + if (mask_t(i)) { + count += 1.0; + } + } + return count; +} + +template +static inline T CalcMSEWithMask(const Tensor& x, const Tensor& y, + const Tensor& mask) { + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); + + T error_sum = 0.0; + T points = 0.0; + for (int i = 0; i < x_t.dimensions()[0]; i++) { + if (mask_t(i)) { + error_sum += pow(x_t(i) - y_t(i), 2); + points += 1; + } + } + return (error_sum / points); +} + +template +static void CalcMSEGradWithMask(Tensor* grad, const Tensor& x, const Tensor& y, + const Tensor& mask, T mf) { + auto grad_t = EigenVector::Flatten(*grad).setConstant(0.0); + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); + + for (int i = 0; i < x_t.dimensions()[0]; i++) { + if (mask_t(i)) { + grad_t(i) = 2.0 * (x_t(i) - y_t(i)) / mf; + } + } +} + +template +static inline T CalcBCEWithMask(const Tensor& x, const Tensor& y, + const Tensor& mask) { + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); + + T error_sum = 0.0; + T points = 0.0; + for (int i = 0; i < x_t.dimensions()[0]; i++) { + if (mask_t(i)) { + error_sum += + -1.0 * (y_t(i) * log(x_t(i)) + (1.0 - y_t(i)) * log(1.0 - x_t(i))); + 
points += 1; + } + } + return (error_sum / points); +} + +template +static inline void CalcBCEGradWithMask(Tensor* grad, const Tensor& x, + const Tensor& y, const Tensor& mask, + T mf) { + auto grad_t = EigenVector::Flatten(*grad).setConstant(0.0); + auto x_t = EigenVector::Flatten(x); + auto y_t = EigenVector::Flatten(y); + auto mask_t = EigenVector::Flatten(mask); + + for (int i = 0; i < x_t.dimensions()[0]; i++) { + if (mask_t(i)) { + grad_t(i) = ((1.0 - y_t(i)) / (1.0 - x_t(i)) - y_t(i) / x_t(i)) / mf; + } + } +} + +template +static void CalcPredResult(const Tensor& input, Tensor* pred_conf, + Tensor* pred_class, Tensor* pred_x, Tensor* pred_y, + Tensor* pred_w, Tensor* pred_h, const int anchor_num, + const int class_num) { + const int n = input.dims()[0]; + const int h = input.dims()[2]; + const int w = input.dims()[3]; + const int box_attr_num = 5 + class_num; + + auto input_t = EigenTensor::From(input); + auto pred_conf_t = EigenTensor::From(*pred_conf); + auto pred_class_t = EigenTensor::From(*pred_class); + auto pred_x_t = EigenTensor::From(*pred_x); + auto pred_y_t = EigenTensor::From(*pred_y); + auto pred_w_t = EigenTensor::From(*pred_w); + auto pred_h_t = EigenTensor::From(*pred_h); + + for (int i = 0; i < n; i++) { + for (int an_idx = 0; an_idx < anchor_num; an_idx++) { + for (int j = 0; j < h; j++) { + for (int k = 0; k < w; k++) { + pred_x_t(i, an_idx, j, k) = + sigmoid(input_t(i, box_attr_num * an_idx, j, k)); + pred_y_t(i, an_idx, j, k) = + sigmoid(input_t(i, box_attr_num * an_idx + 1, j, k)); + pred_w_t(i, an_idx, j, k) = + input_t(i, box_attr_num * an_idx + 2, j, k); + pred_h_t(i, an_idx, j, k) = + input_t(i, box_attr_num * an_idx + 3, j, k); + + pred_conf_t(i, an_idx, j, k) = + sigmoid(input_t(i, box_attr_num * an_idx + 4, j, k)); + + for (int c = 0; c < class_num; c++) { + pred_class_t(i, an_idx, j, k, c) = + sigmoid(input_t(i, box_attr_num * an_idx + 5 + c, j, k)); + } + } + } + } + } +} + +template +static T CalcBoxIoU(std::vector box1, 
std::vector box2) { + T b1_x1 = box1[0] - box1[2] / 2; + T b1_x2 = box1[0] + box1[2] / 2; + T b1_y1 = box1[1] - box1[3] / 2; + T b1_y2 = box1[1] + box1[3] / 2; + T b2_x1 = box2[0] - box2[2] / 2; + T b2_x2 = box2[0] + box2[2] / 2; + T b2_y1 = box2[1] - box2[3] / 2; + T b2_y2 = box2[1] + box2[3] / 2; + + T b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1); + T b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1); + + T inter_rect_x1 = std::max(b1_x1, b2_x1); + T inter_rect_y1 = std::max(b1_y1, b2_y1); + T inter_rect_x2 = std::min(b1_x2, b2_x2); + T inter_rect_y2 = std::min(b1_y2, b2_y2); + T inter_area = std::max(inter_rect_x2 - inter_rect_x1, static_cast(0.0)) * + std::max(inter_rect_y2 - inter_rect_y1, static_cast(0.0)); + + return inter_area / (b1_area + b2_area - inter_area); +} + +template +static void PreProcessGTBox(const Tensor& gt_box, const Tensor& gt_label, + const float ignore_thresh, std::vector anchors, + const int grid_size, Tensor* obj_mask, + Tensor* noobj_mask, Tensor* tx, Tensor* ty, + Tensor* tw, Tensor* th, Tensor* tconf, + Tensor* tclass) { + const int n = gt_box.dims()[0]; + const int b = gt_box.dims()[1]; + const int anchor_num = anchors.size() / 2; + auto gt_box_t = EigenTensor::From(gt_box); + auto gt_label_t = EigenTensor::From(gt_label); + auto obj_mask_t = EigenTensor::From(*obj_mask).setConstant(0); + auto noobj_mask_t = EigenTensor::From(*noobj_mask).setConstant(1); + auto tx_t = EigenTensor::From(*tx).setConstant(0.0); + auto ty_t = EigenTensor::From(*ty).setConstant(0.0); + auto tw_t = EigenTensor::From(*tw).setConstant(0.0); + auto th_t = EigenTensor::From(*th).setConstant(0.0); + auto tconf_t = EigenTensor::From(*tconf).setConstant(0.0); + auto tclass_t = EigenTensor::From(*tclass).setConstant(0.0); + + for (int i = 0; i < n; i++) { + for (int j = 0; j < b; j++) { + if (isZero(gt_box_t(i, j, 0)) && isZero(gt_box_t(i, j, 1)) && + isZero(gt_box_t(i, j, 2)) && isZero(gt_box_t(i, j, 3))) { + continue; + } + + int cur_label = gt_label_t(i, j); + T gx = 
gt_box_t(i, j, 0) * grid_size; + T gy = gt_box_t(i, j, 1) * grid_size; + T gw = gt_box_t(i, j, 2) * grid_size; + T gh = gt_box_t(i, j, 3) * grid_size; + int gi = static_cast(gx); + int gj = static_cast(gy); + + T max_iou = static_cast(0); + T iou; + int best_an_index = -1; + std::vector gt_box_shape({0, 0, gw, gh}); + for (int an_idx = 0; an_idx < anchor_num; an_idx++) { + std::vector anchor_shape({0, 0, static_cast(anchors[2 * an_idx]), + static_cast(anchors[2 * an_idx + 1])}); + iou = CalcBoxIoU(gt_box_shape, anchor_shape); + if (iou > max_iou) { + max_iou = iou; + best_an_index = an_idx; + } + if (iou > ignore_thresh) { + noobj_mask_t(i, an_idx, gj, gi) = 0; + } + } + obj_mask_t(i, best_an_index, gj, gi) = 1; + noobj_mask_t(i, best_an_index, gj, gi) = 0; + tx_t(i, best_an_index, gj, gi) = gx - gi; + ty_t(i, best_an_index, gj, gi) = gy - gj; + tw_t(i, best_an_index, gj, gi) = log(gw / anchors[2 * best_an_index]); + th_t(i, best_an_index, gj, gi) = log(gh / anchors[2 * best_an_index + 1]); + tclass_t(i, best_an_index, gj, gi, cur_label) = 1; + tconf_t(i, best_an_index, gj, gi) = 1; + } + } +} + +static void ExpandObjMaskByClassNum(Tensor* obj_mask_expand, + const Tensor& obj_mask) { + const int n = obj_mask_expand->dims()[0]; + const int an_num = obj_mask_expand->dims()[1]; + const int h = obj_mask_expand->dims()[2]; + const int w = obj_mask_expand->dims()[3]; + const int class_num = obj_mask_expand->dims()[4]; + auto obj_mask_expand_t = EigenTensor::From(*obj_mask_expand); + auto obj_mask_t = EigenTensor::From(obj_mask); + + obj_mask_expand_t = obj_mask_t.reshape(Array5(n, an_num, h, w, 1)) + .broadcast(Array5(1, 1, 1, 1, class_num)); +} + +template +static void AddAllGradToInputGrad( + Tensor* grad, T loss, const Tensor& pred_x, const Tensor& pred_y, + const Tensor& pred_conf, const Tensor& pred_class, const Tensor& grad_x, + const Tensor& grad_y, const Tensor& grad_w, const Tensor& grad_h, + const Tensor& grad_conf_target, const Tensor& grad_conf_notarget, + 
const Tensor& grad_class, const int class_num, const float loss_weight_xy, + const float loss_weight_wh, const float loss_weight_conf_target, + const float loss_weight_conf_notarget, const float loss_weight_class) { + const int n = pred_x.dims()[0]; + const int an_num = pred_x.dims()[1]; + const int h = pred_x.dims()[2]; + const int w = pred_x.dims()[3]; + const int attr_num = class_num + 5; + auto grad_t = EigenTensor::From(*grad).setConstant(0.0); + auto pred_x_t = EigenTensor::From(pred_x); + auto pred_y_t = EigenTensor::From(pred_y); + auto pred_conf_t = EigenTensor::From(pred_conf); + auto pred_class_t = EigenTensor::From(pred_class); + auto grad_x_t = EigenTensor::From(grad_x); + auto grad_y_t = EigenTensor::From(grad_y); + auto grad_w_t = EigenTensor::From(grad_w); + auto grad_h_t = EigenTensor::From(grad_h); + auto grad_conf_target_t = EigenTensor::From(grad_conf_target); + auto grad_conf_notarget_t = EigenTensor::From(grad_conf_notarget); + auto grad_class_t = EigenTensor::From(grad_class); + + for (int i = 0; i < n; i++) { + for (int j = 0; j < an_num; j++) { + for (int k = 0; k < h; k++) { + for (int l = 0; l < w; l++) { + grad_t(i, j * attr_num, k, l) = + grad_x_t(i, j, k, l) * pred_x_t(i, j, k, l) * + (1.0 - pred_x_t(i, j, k, l)) * loss * loss_weight_xy; + grad_t(i, j * attr_num + 1, k, l) = + grad_y_t(i, j, k, l) * pred_y_t(i, j, k, l) * + (1.0 - pred_y_t(i, j, k, l)) * loss * loss_weight_xy; + grad_t(i, j * attr_num + 2, k, l) = + grad_w_t(i, j, k, l) * loss * loss_weight_wh; + grad_t(i, j * attr_num + 3, k, l) = + grad_h_t(i, j, k, l) * loss * loss_weight_wh; + grad_t(i, j * attr_num + 4, k, l) = + grad_conf_target_t(i, j, k, l) * pred_conf_t(i, j, k, l) * + (1.0 - pred_conf_t(i, j, k, l)) * loss * loss_weight_conf_target; + grad_t(i, j * attr_num + 4, k, l) += + grad_conf_notarget_t(i, j, k, l) * pred_conf_t(i, j, k, l) * + (1.0 - pred_conf_t(i, j, k, l)) * loss * + loss_weight_conf_notarget; + + for (int c = 0; c < class_num; c++) { + grad_t(i, j 
* attr_num + 5 + c, k, l) = + grad_class_t(i, j, k, l, c) * pred_class_t(i, j, k, l, c) * + (1.0 - pred_class_t(i, j, k, l, c)) * loss * loss_weight_class; + } + } + } + } + } +} + +template +class Yolov3LossKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* gt_box = ctx.Input("GTBox"); + auto* gt_label = ctx.Input("GTLabel"); + auto* loss = ctx.Output("Loss"); + auto anchors = ctx.Attr>("anchors"); + int class_num = ctx.Attr("class_num"); + float ignore_thresh = ctx.Attr("ignore_thresh"); + float loss_weight_xy = ctx.Attr("loss_weight_xy"); + float loss_weight_wh = ctx.Attr("loss_weight_wh"); + float loss_weight_conf_target = ctx.Attr("loss_weight_conf_target"); + float loss_weight_conf_notarget = + ctx.Attr("loss_weight_conf_notarget"); + float loss_weight_class = ctx.Attr("loss_weight_class"); + + const int n = input->dims()[0]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int an_num = anchors.size() / 2; + + Tensor pred_x, pred_y, pred_w, pred_h; + Tensor pred_conf, pred_class; + pred_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_conf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + CalcPredResult(*input, &pred_conf, &pred_class, &pred_x, &pred_y, + &pred_w, &pred_h, an_num, class_num); + + Tensor obj_mask, noobj_mask; + Tensor tx, ty, tw, th, tconf, tclass; + obj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + noobj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tx.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + ty.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tw.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + th.mutable_data({n, an_num, 
h, w}, ctx.GetPlace()); + tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + PreProcessGTBox(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask, + &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); + + Tensor obj_mask_expand; + obj_mask_expand.mutable_data({n, an_num, h, w, class_num}, + ctx.GetPlace()); + ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask); + + T loss_x = CalcMSEWithMask(pred_x, tx, obj_mask); + T loss_y = CalcMSEWithMask(pred_y, ty, obj_mask); + T loss_w = CalcMSEWithMask(pred_w, tw, obj_mask); + T loss_h = CalcMSEWithMask(pred_h, th, obj_mask); + T loss_conf_target = CalcBCEWithMask(pred_conf, tconf, obj_mask); + T loss_conf_notarget = CalcBCEWithMask(pred_conf, tconf, noobj_mask); + T loss_class = CalcBCEWithMask(pred_class, tclass, obj_mask_expand); + + auto* loss_data = loss->mutable_data({1}, ctx.GetPlace()); + loss_data[0] = loss_weight_xy * (loss_x + loss_y) + + loss_weight_wh * (loss_w + loss_h) + + loss_weight_conf_target * loss_conf_target + + loss_weight_conf_notarget * loss_conf_notarget + + loss_weight_class * loss_class; + } +}; + +template +class Yolov3LossGradKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + auto* input = ctx.Input("X"); + auto* gt_box = ctx.Input("GTBox"); + auto* gt_label = ctx.Input("GTLabel"); + auto anchors = ctx.Attr>("anchors"); + int class_num = ctx.Attr("class_num"); + float ignore_thresh = ctx.Attr("ignore_thresh"); + auto* input_grad = ctx.Output(framework::GradVarName("X")); + auto* output_grad = ctx.Input(framework::GradVarName("Loss")); + const T loss = output_grad->data()[0]; + float loss_weight_xy = ctx.Attr("loss_weight_xy"); + float loss_weight_wh = ctx.Attr("loss_weight_wh"); + float loss_weight_conf_target = ctx.Attr("loss_weight_conf_target"); + float loss_weight_conf_notarget = + ctx.Attr("loss_weight_conf_notarget"); + float 
loss_weight_class = ctx.Attr("loss_weight_class"); + + const int n = input->dims()[0]; + const int c = input->dims()[1]; + const int h = input->dims()[2]; + const int w = input->dims()[3]; + const int an_num = anchors.size() / 2; + + Tensor pred_x, pred_y, pred_w, pred_h; + Tensor pred_conf, pred_class; + pred_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_conf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + pred_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + CalcPredResult(*input, &pred_conf, &pred_class, &pred_x, &pred_y, + &pred_w, &pred_h, an_num, class_num); + + Tensor obj_mask, noobj_mask; + Tensor tx, ty, tw, th, tconf, tclass; + obj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + noobj_mask.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tx.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + ty.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tw.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + th.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tconf.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + tclass.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + PreProcessGTBox(*gt_box, *gt_label, ignore_thresh, anchors, h, &obj_mask, + &noobj_mask, &tx, &ty, &tw, &th, &tconf, &tclass); + + Tensor obj_mask_expand; + obj_mask_expand.mutable_data({n, an_num, h, w, class_num}, + ctx.GetPlace()); + ExpandObjMaskByClassNum(&obj_mask_expand, obj_mask); + + Tensor grad_x, grad_y, grad_w, grad_h; + Tensor grad_conf_target, grad_conf_notarget, grad_class; + grad_x.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_y.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_w.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_h.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_conf_target.mutable_data({n, an_num, h, w}, 
ctx.GetPlace()); + grad_conf_notarget.mutable_data({n, an_num, h, w}, ctx.GetPlace()); + grad_class.mutable_data({n, an_num, h, w, class_num}, ctx.GetPlace()); + T obj_mf = CalcMaskPointNum(obj_mask); + T noobj_mf = CalcMaskPointNum(noobj_mask); + T obj_expand_mf = CalcMaskPointNum(obj_mask_expand); + CalcMSEGradWithMask(&grad_x, pred_x, tx, obj_mask, obj_mf); + CalcMSEGradWithMask(&grad_y, pred_y, ty, obj_mask, obj_mf); + CalcMSEGradWithMask(&grad_w, pred_w, tw, obj_mask, obj_mf); + CalcMSEGradWithMask(&grad_h, pred_h, th, obj_mask, obj_mf); + CalcBCEGradWithMask(&grad_conf_target, pred_conf, tconf, obj_mask, + obj_mf); + CalcBCEGradWithMask(&grad_conf_notarget, pred_conf, tconf, noobj_mask, + noobj_mf); + CalcBCEGradWithMask(&grad_class, pred_class, tclass, obj_mask_expand, + obj_expand_mf); + + input_grad->mutable_data({n, c, h, w}, ctx.GetPlace()); + AddAllGradToInputGrad( + input_grad, loss, pred_x, pred_y, pred_conf, pred_class, grad_x, grad_y, + grad_w, grad_h, grad_conf_target, grad_conf_notarget, grad_class, + class_num, loss_weight_xy, loss_weight_wh, loss_weight_conf_target, + loss_weight_conf_notarget, loss_weight_class); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/platform/cpu_info.cc b/paddle/fluid/platform/cpu_info.cc index d466f28d1ea0a8327f8d7a45c3e55c5aacd61544..f9a32bfa4c15261ba6b79fc4efd3a1961f7c6d4d 100644 --- a/paddle/fluid/platform/cpu_info.cc +++ b/paddle/fluid/platform/cpu_info.cc @@ -123,7 +123,6 @@ size_t CUDAPinnedMaxChunkSize() { return CUDAPinnedMaxAllocSize() / 256; } -namespace jit { #ifdef PADDLE_WITH_XBYAK static Xbyak::util::Cpu cpu; bool MayIUse(const cpu_isa_t cpu_isa) { @@ -165,6 +164,5 @@ bool MayIUse(const cpu_isa_t cpu_isa) { } #endif -} // namespace jit } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/cpu_info.h b/paddle/fluid/platform/cpu_info.h index fd31ef77b46d5b5b641983a0421da31914c87c18..55dba545ff133b1c219ee58f6d1bb2d2130d1a59 100644 --- 
a/paddle/fluid/platform/cpu_info.h +++ b/paddle/fluid/platform/cpu_info.h @@ -39,7 +39,6 @@ size_t CUDAPinnedMinChunkSize(); //! Get the maximum chunk size for buddy allocator. size_t CUDAPinnedMaxChunkSize(); -namespace jit { typedef enum { isa_any, sse42, @@ -55,7 +54,5 @@ typedef enum { // May I use some instruction bool MayIUse(const cpu_isa_t cpu_isa); -} // namespace jit - } // namespace platform } // namespace paddle diff --git a/paddle/fluid/platform/device_context.cc b/paddle/fluid/platform/device_context.cc index d0a108f905f46135bcd2b68be19ab396ab897272..bd81d4dd1f1073edffcb9fd4a02b455db27361d5 100644 --- a/paddle/fluid/platform/device_context.cc +++ b/paddle/fluid/platform/device_context.cc @@ -120,15 +120,24 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { } void* allocate(size_t num_bytes) const override { + if (UNLIKELY(num_bytes == 0)) { + return nullptr; + } auto buf = paddle::memory::Alloc(place_, num_bytes, memory::Allocator::kScratchpad); void* retv = buf->ptr(); - allocations_[buf->ptr()] = std::move(buf); + { + std::lock_guard lock(mtx_); + allocations_.emplace(retv, std::move(buf)); + } return retv; } void deallocate(void* buffer) const override { - allocations_.erase(allocations_.find(buffer)); + if (LIKELY(buffer)) { + std::lock_guard lock(mtx_); + allocations_.erase(buffer); + } } void* scratchpad() const override { @@ -155,6 +164,7 @@ class EigenCudaStreamDevice : public Eigen::StreamInterface { const cudaDeviceProp* device_prop_; // not owned; mutable void* scratch_; mutable unsigned int* semaphore_; + mutable std::mutex mtx_; // to protect allocations_ mutable std::unordered_map allocations_; }; @@ -210,6 +220,40 @@ CUDADeviceContext::CUDADeviceContext(CUDAPlace place) LOG_FIRST_N(WARNING, 1) << "device: " << place_.device << ", cuDNN Version: " << cudnn_dso_ver / 1000 << "." 
<< (cudnn_dso_ver % 100) / 10 << "."; + + { + // Check CUDA/CUDNN version compatiblity + auto local_cuda_version = runtime_version_ / 100; + auto compile_cuda_version = CUDA_VERSION / 100; + if (local_cuda_version < compile_cuda_version) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place_.device + << ". The installed Paddle is compiled with CUDA " + << compile_cuda_version / 10 << "." << compile_cuda_version % 10 + << ", but CUDA runtime version in your machine is " + << local_cuda_version / 10 << "." << local_cuda_version % 10 + << ", which may cause serious incompatible bug. " + << "Please recompile or reinstall Paddle with compatible CUDA " + "version."; + } + + if (dynload::HasCUDNN()) { + auto local_cudnn_version = cudnn_dso_ver / 100; + auto compile_cudnn_version = CUDNN_VERSION / 100; + if (local_cuda_version < compile_cuda_version) { + LOG_FIRST_N(WARNING, 1) + << "WARNING: device: " << place_.device + << ". The installed Paddle is compiled with CUDNN " + << compile_cudnn_version / 10 << "." << compile_cudnn_version % 10 + << ", but CUDNN version in your machine is " + << local_cudnn_version / 10 << "." << local_cudnn_version % 10 + << ", which may cause serious incompatible bug. 
" + << "Please recompile or reinstall Paddle with compatible CUDNN " + "version."; + } + } + } + callback_manager_.reset(new StreamCallbackManager(stream_)); } diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index dc1d751141187edb7738e42c41514614d4d399b0..0a4563ead65b1e45adca1d1a1fce066a1a55d932 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -143,7 +143,7 @@ void CUPTIAPI bufferCompleted(CUcontext ctx, uint32_t streamId, uint8_t *buffer, case CUPTI_ACTIVITY_KIND_CONCURRENT_KERNEL: { auto *kernel = reinterpret_cast(record); - tracer->AddKernelRecords(kernel->start, kernel->end, + tracer->AddKernelRecords(kernel->name, kernel->start, kernel->end, kernel->deviceId, kernel->streamId, kernel->correlationId); break; @@ -224,8 +224,9 @@ class DeviceTracerImpl : public DeviceTracer { stream_id, correlation_id, bytes}); } - void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id, - int64_t stream_id, uint32_t correlation_id) { + void AddKernelRecords(std::string name, uint64_t start, uint64_t end, + int64_t device_id, int64_t stream_id, + uint32_t correlation_id) { // 0 means timestamp information could not be collected for the kernel. 
if (start == 0 || end == 0) { VLOG(3) << correlation_id << " cannot be traced"; @@ -233,7 +234,7 @@ class DeviceTracerImpl : public DeviceTracer { } std::lock_guard l(trace_mu_); kernel_records_.push_back( - KernelRecord{start, end, device_id, stream_id, correlation_id}); + KernelRecord{name, start, end, device_id, stream_id, correlation_id}); } bool IsEnabled() { @@ -276,13 +277,13 @@ class DeviceTracerImpl : public DeviceTracer { profile_pb.set_start_ns(start_ns_); profile_pb.set_end_ns(end_ns_); for (const KernelRecord &r : kernel_records_) { - if (correlations_.find(r.correlation_id) == correlations_.end()) { - fprintf(stderr, "cannot relate a kernel activity\n"); - continue; - } auto *event = profile_pb.add_events(); event->set_type(proto::Event::GPUKernel); - event->set_name(correlations_.at(r.correlation_id)); + if (correlations_.find(r.correlation_id) != correlations_.end()) { + event->set_name(correlations_.at(r.correlation_id)); + } else { + event->set_name(r.name); + } event->set_start_ns(r.start_ns); event->set_end_ns(r.end_ns); event->set_sub_device_id(r.stream_id); diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index eaf047d4744762f69d50bff8d467da8e3b8317cc..bf0786be2d0fafbf4b610d16ef587ac219399203 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -39,6 +39,7 @@ inline uint64_t PosixInNsec() { class DeviceTracer { public: struct KernelRecord { + std::string name; uint64_t start_ns; uint64_t end_ns; int64_t device_id; @@ -84,8 +85,9 @@ class DeviceTracer { // Add a cuda kernel stats. `correlation_id` will be mapped to annotation // added before for human readability. 
- virtual void AddKernelRecords(uint64_t start, uint64_t end, int64_t device_id, - int64_t stream_id, uint32_t correlation_id) = 0; + virtual void AddKernelRecords(std::string name, uint64_t start, uint64_t end, + int64_t device_id, int64_t stream_id, + uint32_t correlation_id) = 0; // Generate a proto after done (Disabled). virtual proto::Profile GenProfile(const std::string& profile_path) = 0; diff --git a/paddle/fluid/platform/dynload/cudnn.h b/paddle/fluid/platform/dynload/cudnn.h index 213cd8a9ce094512cea6f6405492ec8feff11516..550fe2edee13d628e761eca194809823537a4024 100644 --- a/paddle/fluid/platform/dynload/cudnn.h +++ b/paddle/fluid/platform/dynload/cudnn.h @@ -125,8 +125,7 @@ extern void EnforceCUDNNLoaded(const char* fn_name); __macro(cudnnRNNBackwardWeights); \ __macro(cudnnRNNForwardInference); \ __macro(cudnnDestroyDropoutDescriptor); \ - __macro(cudnnDestroyRNNDescriptor); \ - __macro(cudnnSetRNNDescriptor_v6); + __macro(cudnnDestroyRNNDescriptor); CUDNN_DNN_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) @@ -165,6 +164,12 @@ CUDNN_DNN_ROUTINE_EACH_AFTER_R4(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) CUDNN_DNN_ROUTINE_EACH_R5(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) #endif +// APIs in R6 +#if CUDNN_VERSION >= 6000 +#define CUDNN_DNN_ROUTINE_EACH_R6(__macro) __macro(cudnnSetRNNDescriptor_v6); +CUDNN_DNN_ROUTINE_EACH_R6(DECLARE_DYNAMIC_LOAD_CUDNN_WRAP) +#endif + #if CUDNN_VERSION >= 7001 #define CUDNN_DNN_ROUTINE_EACH_R7(__macro) \ __macro(cudnnSetConvolutionGroupCount); \ diff --git a/paddle/fluid/platform/gpu_info.cc b/paddle/fluid/platform/gpu_info.cc index 6954e4c6a9df8dea01ec2b0f193965d835503b17..ca89d91aadb2d3e9005e6dd06cef124428d7e250 100644 --- a/paddle/fluid/platform/gpu_info.cc +++ b/paddle/fluid/platform/gpu_info.cc @@ -18,6 +18,7 @@ limitations under the License. 
*/ #include "gflags/gflags.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/fluid/string/split.h" #ifndef _WIN32 constexpr static float fraction_of_gpu_memory_to_use = 0.92f; @@ -45,6 +46,15 @@ DEFINE_bool( "input and output must be half precision) and recurrent neural networks " "(RNNs)."); +DEFINE_string(selected_gpus, "", + "A list of device ids separated by comma, like: 0,1,2,3. " + "This option is useful when doing multi process training and " + "each process have only one device (GPU). If you want to use " + "all visible devices, set this to empty string. NOTE: the " + "reason of doing this is that we want to use P2P communication" + "between GPU devices, use CUDA_VISIBLE_DEVICES can only use" + "share-memory only."); + namespace paddle { namespace platform { @@ -121,6 +131,24 @@ int GetCurrentDeviceId() { return device_id; } +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedDevices() { + // use user specified GPUs in single-node multi-process mode. + std::vector devices; + if (!FLAGS_selected_gpus.empty()) { + auto devices_str = paddle::string::Split(FLAGS_selected_gpus, ','); + for (auto id : devices_str) { + devices.push_back(atoi(id.c_str())); + } + } else { + int count = GetCUDADeviceCount(); + for (int i = 0; i < count; ++i) { + devices.push_back(i); + } + } + return devices; +} + void SetDeviceId(int id) { // TODO(qijun): find a better way to cache the cuda device count PADDLE_ENFORCE_LT(id, GetCUDADeviceCount(), "id must less than GPU count"); diff --git a/paddle/fluid/platform/gpu_info.h b/paddle/fluid/platform/gpu_info.h index 6a0b3c8e02d49068c2dbe14c7feea7e139947694..1e1ab2503f53fe20bbe62c48f65d8535947f1aa8 100644 --- a/paddle/fluid/platform/gpu_info.h +++ b/paddle/fluid/platform/gpu_info.h @@ -19,6 +19,7 @@ limitations under the License. */ #include #include #include +#include namespace paddle { namespace platform { @@ -47,6 +48,9 @@ int GetCUDAMaxThreadsPerMultiProcessor(int i); //! 
Get the current GPU device id in system. int GetCurrentDeviceId(); +//! Get a list of device ids from environment variable or use all. +std::vector GetSelectedDevices(); + //! Set the GPU device id for next execution. void SetDeviceId(int device_id); diff --git a/paddle/fluid/platform/init.cc b/paddle/fluid/platform/init.cc index 258779ba51026d0cc418257a37b78f346fa48efa..0d10d82d74a2011b1b2bc088fe88cbfdb49600b8 100644 --- a/paddle/fluid/platform/init.cc +++ b/paddle/fluid/platform/init.cc @@ -19,6 +19,7 @@ limitations under the License. */ #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/cpu_helper.h" #include "paddle/fluid/platform/cpu_info.h" +#include "paddle/fluid/string/split.h" #ifdef PADDLE_WITH_CUDA #include "paddle/fluid/platform/cuda_device_guard.h" #endif @@ -82,10 +83,8 @@ void InitDevices(bool init_p2p) { std::vector devices; #ifdef PADDLE_WITH_CUDA try { - int count = platform::GetCUDADeviceCount(); - for (int i = 0; i < count; ++i) { - devices.push_back(i); - } + // use user specified GPUs in single-node multi-process mode. + devices = platform::GetSelectedDevices(); } catch (const std::exception &exp) { LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; } @@ -95,20 +94,15 @@ void InitDevices(bool init_p2p) { void InitDevices(bool init_p2p, const std::vector devices) { std::vector places; - int count = 0; -#ifdef PADDLE_WITH_CUDA - try { - count = platform::GetCUDADeviceCount(); - } catch (const std::exception &exp) { - LOG(WARNING) << "Compiled with WITH_GPU, but no GPU found in runtime."; - } -#endif for (size_t i = 0; i < devices.size(); ++i) { - if (devices[i] >= count || devices[i] < 0) { + // In multi process multi gpu mode, we may have gpuid = 7 + // but count = 1. 
+ if (devices[i] < 0) { LOG(WARNING) << "Invalid devices id."; continue; } + places.emplace_back(platform::CUDAPlace(devices[i])); } if (init_p2p) { @@ -122,7 +116,7 @@ void InitDevices(bool init_p2p, const std::vector devices) { #endif #if !defined(_WIN32) && !defined(__APPLE__) && !defined(__OSX__) - if (platform::jit::MayIUse(platform::jit::avx)) { + if (platform::MayIUse(platform::avx)) { #ifndef __AVX__ LOG(WARNING) << "AVX is available, Please re-compile on local machine"; #endif @@ -137,10 +131,10 @@ void InitDevices(bool init_p2p, const std::vector devices) { " version or compile from source code." #ifdef __AVX512F__ - if (!platform::jit::MayIUse(platform::jit::avx512f)) { - if (platform::jit::MayIUse(platform::jit::avx2)) { + if (!platform::MayIUse(platform::avx512f)) { + if (platform::MayIUse(platform::avx2)) { AVX_GUIDE(AVX512, AVX2); - } else if (platform::jit::MayIUse(platform::jit::avx)) { + } else if (platform::MayIUse(platform::avx)) { AVX_GUIDE(AVX512, AVX); } else { AVX_GUIDE(AVX512, NonAVX); @@ -149,8 +143,8 @@ void InitDevices(bool init_p2p, const std::vector devices) { #endif #ifdef __AVX2__ - if (!platform::jit::MayIUse(platform::jit::avx2)) { - if (platform::jit::MayIUse(platform::jit::avx)) { + if (!platform::MayIUse(platform::avx2)) { + if (platform::MayIUse(platform::avx)) { AVX_GUIDE(AVX2, AVX); } else { AVX_GUIDE(AVX2, NonAVX); @@ -159,7 +153,7 @@ void InitDevices(bool init_p2p, const std::vector devices) { #endif #ifdef __AVX__ - if (!platform::jit::MayIUse(platform::jit::avx)) { + if (!platform::MayIUse(platform::avx)) { AVX_GUIDE(AVX, NonAVX); } #endif diff --git a/paddle/fluid/platform/mkldnn_helper.h b/paddle/fluid/platform/mkldnn_helper.h index 167bd4e81d0ddbbba260417b460d083dbeb932b6..e53064893ee89f663a76483b92de32b318b6c61f 100644 --- a/paddle/fluid/platform/mkldnn_helper.h +++ b/paddle/fluid/platform/mkldnn_helper.h @@ -113,6 +113,18 @@ inline mkldnn::memory::format MKLDNNFormatForSize( return mkldnn::memory::format::x; } else 
if (dims_size == 2) { return mkldnn::memory::format::nc; + } else if (dims_size == 3) { + if (data_format == mkldnn::memory::format::nchw) { + return mkldnn::memory::format::ncw; + } else if (data_format == mkldnn::memory::format::nhwc) { + return mkldnn::memory::format::nwc; + } + } else if (dims_size == 5) { + if (data_format == mkldnn::memory::format::nchw) { + return mkldnn::memory::format::ncdhw; + } else if (data_format == mkldnn::memory::format::nhwc) { + return mkldnn::memory::format::ndhwc; + } } return data_format; } diff --git a/paddle/fluid/platform/nccl_helper.h b/paddle/fluid/platform/nccl_helper.h index fc903b548c70e9b72c6121dd24c014973e3cd1d4..7c539d25f6dd02fc09aa1234d7bf0164b54a610f 100644 --- a/paddle/fluid/platform/nccl_helper.h +++ b/paddle/fluid/platform/nccl_helper.h @@ -97,7 +97,7 @@ struct NCCLContextMap { order_.size(), contexts_.size(), "NCCL Context Map does not support contain two or more same device"); - if (places.size() <= 1) { + if (places.size() <= 1 && num_trainers == 1) { return; } std::unique_ptr comms(new ncclComm_t[order_.size()]); @@ -111,12 +111,19 @@ struct NCCLContextMap { { int nranks = num_trainers * order_.size(); NCCLGroupGuard gurad; - for (auto &gpu_id : order_) { - int rank = trainer_id * order_.size() + gpu_id; - VLOG(3) << "init nccl rank: " << rank << " nranks: " << nranks; + for (size_t i = 0; i < order_.size(); ++i) { + int gpu_id = order_[i]; + int rank; + if (order_.size() > 1) { + rank = trainer_id * order_.size() + i; + } else { + rank = trainer_id; + } + VLOG(30) << "init nccl rank: " << rank << " nranks: " << nranks + << "gpu id: " << gpu_id; PADDLE_ENFORCE(cudaSetDevice(gpu_id)); PADDLE_ENFORCE(platform::dynload::ncclCommInitRank( - comms.get() + gpu_id, nranks, *nccl_id, rank)); + comms.get() + i, nranks, *nccl_id, rank)); } } } diff --git a/paddle/fluid/pybind/CMakeLists.txt b/paddle/fluid/pybind/CMakeLists.txt index d602613fc82223e14f48830a87533880696eb550..b8954cb12628d1f4f333956e0213ddf9c01e592c 
100644 --- a/paddle/fluid/pybind/CMakeLists.txt +++ b/paddle/fluid/pybind/CMakeLists.txt @@ -1,6 +1,7 @@ -set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler) -set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc) +set(PYBIND_DEPS pybind python proto_desc memory executor async_executor prune feed_fetch_method pass_builder parallel_executor profiler layer) +set(PYBIND_SRCS pybind.cc exception.cc protobuf.cc const_value.cc recordio.cc async_executor_py.cc imperative.cc) + if(WITH_PYTHON) if(WITH_AMD_GPU) hip_library(paddle_pybind SHARED diff --git a/paddle/fluid/pybind/imperative.cc b/paddle/fluid/pybind/imperative.cc new file mode 100644 index 0000000000000000000000000000000000000000..34e9c897d9e95feb185083b7c0a6a824d8dc809c --- /dev/null +++ b/paddle/fluid/pybind/imperative.cc @@ -0,0 +1,36 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/pybind/imperative.h" +#include "paddle/fluid/framework/block_desc.h" +#include "paddle/fluid/framework/scope.h" +#include "paddle/fluid/imperative/tracer.h" + +namespace paddle { +namespace pybind { + +// Bind Methods +void BindTracer(pybind11::module *m) { + pybind11::class_(*m, "Tracer", "") + .def("__init__", + [](imperative::Tracer &self, framework::BlockDesc *root_block) { + new (&self) imperative::Tracer(root_block); + }) + .def("trace", &imperative::Tracer::Trace) + .def("get_scope", &imperative::Tracer::GetScope, + pybind11::return_value_policy::reference); +} + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/imperative.h b/paddle/fluid/pybind/imperative.h new file mode 100644 index 0000000000000000000000000000000000000000..7a9d3a01ea81f11ac85000c3d0153f20e108789a --- /dev/null +++ b/paddle/fluid/pybind/imperative.h @@ -0,0 +1,53 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#pragma once + +#include +#include +#include "paddle/fluid/imperative/layer.h" +#include "pybind11/pybind11.h" +#include "pybind11/stl.h" + +namespace paddle { +namespace pybind { + +class PyLayer : public imperative::Layer { + public: + using imperative::Layer::Layer; // Inherit constructors + + std::vector Forward( + const std::vector& inputs) override { + PYBIND11_OVERLOAD(std::vector, Layer, Forward, + inputs); // NOLINT + } + + void Backward() override { + PYBIND11_OVERLOAD(void, Layer, Backward, ); // NOLINT + } +}; + +class PyOpBase : public imperative::OpBase { + public: + using imperative::OpBase::OpBase; // Inherit constructors +}; + +class PyVarBase : public imperative::VarBase { + public: + using imperative::VarBase::VarBase; // Inherit constructors +}; + +void BindTracer(pybind11::module* m); + +} // namespace pybind +} // namespace paddle diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc index fc7991d2974c9262e6225de1537025944c1068c1..dca0c01ab229ce8c1f8578ad489ef927e15a1068 100644 --- a/paddle/fluid/pybind/pybind.cc +++ b/paddle/fluid/pybind/pybind.cc @@ -34,6 +34,7 @@ limitations under the License. */ #include "paddle/fluid/framework/reader.h" #include "paddle/fluid/framework/selected_rows.h" #include "paddle/fluid/framework/version.h" +#include "paddle/fluid/imperative/layer.h" #include "paddle/fluid/memory/allocation/allocator_strategy.h" #include "paddle/fluid/operators/activation_op.h" #include "paddle/fluid/operators/reader/lod_tensor_blocking_queue.h" @@ -45,6 +46,7 @@ limitations under the License. 
*/ #include "paddle/fluid/pybind/async_executor_py.h" #include "paddle/fluid/pybind/const_value.h" #include "paddle/fluid/pybind/exception.h" +#include "paddle/fluid/pybind/imperative.h" #include "paddle/fluid/pybind/protobuf.h" #include "paddle/fluid/pybind/pybind.h" // NOLINT #include "paddle/fluid/pybind/recordio.h" @@ -100,6 +102,42 @@ PYBIND11_MODULE(core, m) { BindException(&m); + py::class_(m, "VarBase", R"DOC()DOC") + .def(py::init<>()) + .def("_run_backward", + [](imperative::VarBase &self, framework::Scope *scope) { + self.RunBackward(scope); + }) + .def("_grad", &imperative::VarBase::Grad) + .def_property( + "desc", + [](const imperative::VarBase &self) { return self.var_desc_; }, + [](imperative::VarBase &self, framework::VarDesc *var_desc) { + self.var_desc_ = var_desc; + }, + py::return_value_policy::reference); + + py::class_(m, "OpBase", R"DOC()DOC") + .def(py::init<>()) + .def_property( + "desc", [](const imperative::OpBase &self) { return self.op_desc_; }, + [](imperative::OpBase &self, framework::OpDesc *op_desc) { + if (op_desc) { + self.op_desc_ = op_desc; + } + }, + py::return_value_policy::reference); + + py::class_ layer(m, "Layer"); + layer.def(py::init<>()) + .def("forward", + [](imperative::Layer &self, + const std::vector &inputs) { + return self.Forward(inputs); + }) + .def("backward", &imperative::Layer::Backward); + BindTracer(&m); + py::class_(m, "Tensor", py::buffer_protocol()) .def_buffer( [](Tensor &self) -> py::buffer_info { return CastToPyBuffer(self); }) @@ -601,6 +639,7 @@ All parameter, weight, gradient are variables in Paddle. m.def("set_feed_variable", framework::SetFeedVariable); m.def("get_fetch_variable", framework::GetFetchVariable); + m.def("get_variable_tensor", framework::GetVariableTensor); m.def("_is_program_version_supported", IsProgramVersionSupported); @@ -886,6 +925,18 @@ All parameter, weight, gradient are variables in Paddle. 
[](BuildStrategy &self, int num_trainers) { self.num_trainers_ = num_trainers; }) + .def_property( + "trainers_endpoints", + [](const BuildStrategy &self) { return self.trainers_endpoints_; }, + [](BuildStrategy &self, + const std::vector &trainers_endpoints) { + self.trainers_endpoints_ = trainers_endpoints; + }) + .def_property("trainer_id", + [](const BuildStrategy &self) { return self.trainer_id_; }, + [](BuildStrategy &self, int trainer_id) { + self.trainer_id_ = trainer_id; + }) .def_property( "fuse_elewise_add_act_ops", [](const BuildStrategy &self) { diff --git a/paddle/fluid/string/CMakeLists.txt b/paddle/fluid/string/CMakeLists.txt index 8572dc1e8e543b552e3ed5a180ec942faf90a624..169a925d12328e7d1df744635445b5674c19b125 100644 --- a/paddle/fluid/string/CMakeLists.txt +++ b/paddle/fluid/string/CMakeLists.txt @@ -3,3 +3,4 @@ cc_library(pretty_log SRCS pretty_log.cc) cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags) cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags) cc_test(to_string_test SRCS to_string_test.cc) +cc_test(split_test SRCS split_test.cc) diff --git a/paddle/fluid/string/split.h b/paddle/fluid/string/split.h new file mode 100644 index 0000000000000000000000000000000000000000..ccb96b8a9cb68f03acbca592a2149ba5001f34d2 --- /dev/null +++ b/paddle/fluid/string/split.h @@ -0,0 +1,37 @@ +/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#include +#include +#include + +namespace paddle { +namespace string { + +static inline std::vector Split(std::string const& original, + char separator) { + std::vector results; + std::string token; + std::istringstream is(original); + while (std::getline(is, token, separator)) { + if (!token.empty()) { + results.push_back(token); + } + } + return results; +} + +} // namespace string +} // namespace paddle diff --git a/paddle/fluid/string/split_test.cc b/paddle/fluid/string/split_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..c85dc1eed40dbe25d922c0f4810a747d1bd2d60f --- /dev/null +++ b/paddle/fluid/string/split_test.cc @@ -0,0 +1,28 @@ +// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/string/split.h" + +#include + +#include "gtest/gtest.h" + +TEST(StringSplit, StringSplit) { + std::string to_split = "0,1,2,3,4,5"; + int i = 0; + for (auto s : paddle::string::Split(to_split, ',')) { + EXPECT_EQ(atoi(s.c_str()), i); + i++; + } +} diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index dbb73f7a27ac815ecfeee2efcc09bb2cafb8395e..6299b166af8a5f65cf587ae282c955f33db0044b 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -437,7 +437,7 @@ EOF export http_proxy= export https_proxy= # TODO: jiabin need to refine this part when these tests fixed on mac - ctest --output-on-failure -j $1 + ctest --output-on-failure -j $2 # make install should also be test when unittest make install -j 8 if [ "$1" == "cp27-cp27m" ]; then @@ -449,7 +449,7 @@ EOF elif [ "$1" == "cp37-cp37m" ]; then pip3.7 install --user ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl fi - + if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]] ; then paddle version fi @@ -472,12 +472,15 @@ function assert_api_not_changed() { virtualenv .env source .env/bin/activate pip install ${PADDLE_ROOT}/build/python/dist/*whl - python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid > new.spec + python ${PADDLE_ROOT}/tools/print_signatures.py paddle.fluid,paddle.reader > new.spec if [ "$1" == "cp35-cp35m" ] || [ "$1" == "cp36-cp36m" ] || [ "$1" == "cp37-cp37m" ]; then # Use sed to make python2 and python3 sepc keeps the same sed -i 's/arg0: str/arg0: unicode/g' new.spec sed -i "s/\(.*Transpiler.*\).__init__ ArgSpec(args=\['self'].*/\1.__init__ /g" new.spec fi + # ComposeNotAligned has significant difference between py2 and py3 + sed -i '/.*ComposeNotAligned.*/d' new.spec + python ${PADDLE_ROOT}/tools/diff_api.py ${PADDLE_ROOT}/paddle/fluid/API.spec new.spec deactivate } @@ -487,7 +490,19 @@ function assert_api_spec_approvals() { BRANCH="develop" fi - API_FILES=("paddle/fluid/API.spec" 
"paddle/fluid/framework/operator.h") + API_FILES=("paddle/fluid/API.spec" + "paddle/fluid/framework/operator.h" + "paddle/fluid/framework/tensor.h" + "paddle/fluid/framework/lod_tensor.h" + "paddle/fluid/framework/selected_rows.h" + "paddle/fluid/framework/op_desc.h" + "paddle/fluid/framework/block_desc.h" + "paddle/fluid/framework/var_desc.h" + "paddle/fluid/framework/scope.h" + "paddle/fluid/framework/ir/node.h" + "paddle/fluid/framework/ir/graph.h" + "paddle/fluid/framework/framework.proto" + "paddle/fluid/operators/distributed/send_recv.proto.in") for API_FILE in ${API_FILES[*]}; do API_CHANGE=`git diff --name-only upstream/$BRANCH | grep "${API_FILE}" || true` echo "checking ${API_FILE} change, PR: ${GIT_PR_ID}, changes: ${API_CHANGE}" @@ -901,7 +916,7 @@ function main() { maccheck) cmake_gen ${PYTHON_ABI:-""} build_mac - run_mac_test ${PROC_RUN:-1} + run_mac_test ${PYTHON_ABI:-""} ${PROC_RUN:-1} ;; macbuild) cmake_gen ${PYTHON_ABI:-""} diff --git a/python/paddle/dataset/image.py b/python/paddle/dataset/image.py index 19fc229e6fa84792f58aeeb00be09eb2401b19c7..57547f1867a937d16fb2dfc9b84e1a30759a527e 100644 --- a/python/paddle/dataset/image.py +++ b/python/paddle/dataset/image.py @@ -32,11 +32,28 @@ the image layout as follows. 
from __future__ import print_function +import six import numpy as np -try: - import cv2 -except ImportError: - cv2 = None +# FIXME(minqiyang): this is an ugly fix for the numpy bug reported here +# https://github.com/numpy/numpy/issues/12497 +if six.PY3: + import subprocess + import sys + import_cv2_proc = subprocess.Popen( + [sys.executable, "-c", "import cv2"], + stdout=subprocess.PIPE, + stderr=subprocess.PIPE) + out, err = import_cv2_proc.communicate() + retcode = import_cv2_proc.poll() + if retcode != 0: + cv2 = None + else: + import cv2 +else: + try: + import cv2 + except ImportError: + cv2 = None import os import tarfile import six.moves.cPickle as pickle diff --git a/python/paddle/fluid/__init__.py b/python/paddle/fluid/__init__.py index d5150f5e54db1de48de463f623e00ba6c083f3c1..dc1d152a093b1df4f1d6b76f83b00fe827016ae9 100644 --- a/python/paddle/fluid/__init__.py +++ b/python/paddle/fluid/__init__.py @@ -34,6 +34,7 @@ from . import io from . import evaluator from . import initializer from . import layers +from . import imperative from . import contrib from . import nets from . import optimizer @@ -67,6 +68,7 @@ __all__ = framework.__all__ + executor.__all__ + \ 'initializer', 'layers', 'contrib', + 'imperative', 'transpiler', 'nets', 'optimizer', @@ -148,7 +150,7 @@ def __bootstrap__(): read_env_flags += [ 'fraction_of_gpu_memory_to_use', 'cudnn_deterministic', 'enable_cublas_tensor_op_math', 'conv_workspace_size_limit', - 'cudnn_exhaustive_search' + 'cudnn_exhaustive_search', 'selected_gpus' ] core.init_gflags([sys.argv[0]] + ["--tryfromenv=" + ",".join(read_env_flags)]) diff --git a/python/paddle/fluid/clip.py b/python/paddle/fluid/clip.py index 1738afe93e99f1de28bec2fb23be8e1a309d9288..0f7dd531b3e5992caa558def6bbdf446a7d2ffaa 100644 --- a/python/paddle/fluid/clip.py +++ b/python/paddle/fluid/clip.py @@ -134,12 +134,12 @@ class GradientClipByValue(BaseGradientClipAttr): Examples: .. 
code-block:: python - w_param_attrs = ParamAttr(name=None, - initializer=UniformInitializer(low=-1.0, high=1.0, seed=0), + w_param_attrs = fluid.ParamAttr(name=None, + initializer=fluid.initializer.UniformInitializer(low=-1.0, high=1.0, seed=0), learning_rate=1.0, - regularizer=L1Decay(1.0), + regularizer=fluid.regularizer.L1Decay(1.0), trainable=True, - clip=GradientClipByValue(-1.0, 1.0)) + clip=fluid.clip.GradientClipByValue(-1.0, 1.0)) y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs) """ @@ -185,12 +185,12 @@ class GradientClipByNorm(BaseGradientClipAttr): Examples: .. code-block:: python - w_param_attrs = ParamAttr(name=None, - initializer=UniformInitializer(low=-1.0, high=1.0, seed=0), + w_param_attrs = fluid.ParamAttr(name=None, + initializer=fluid.initializer.UniformInitializer(low=-1.0, high=1.0, seed=0), learning_rate=1.0, - regularizer=L1Decay(1.0), + regularizer=fluid.regularizer.L1Decay(1.0), trainable=True, - clip=GradientClipByNorm(clip_norm=2.0)) + clip=fluid.clip.GradientClipByNorm(clip_norm=2.0)) y_predict = fluid.layers.fc(input=x, size=1, param_attr=w_param_attrs) """ @@ -271,7 +271,12 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): "All parameters' 'clip_norm' of a same group should be the same" ) - square = grad * grad + merge_grad = grad + if grad.type == core.VarDesc.VarType.SELECTED_ROWS: + merge_grad = layers.merge_selected_rows(grad) + merge_grad = layers.get_tensor_from_selected_rows(merge_grad) + + square = layers.square(merge_grad) local_norm_var = layers.reduce_sum(input=square) context[self.group_name].append(local_norm_var) @@ -292,6 +297,7 @@ class GradientClipByGlobalNorm(BaseGradientClipAttr): new_grad = layers.elementwise_mul( x=grad, y=self.context[group_scale_name]) + return param, new_grad diff --git a/python/paddle/fluid/data_feeder.py b/python/paddle/fluid/data_feeder.py index 5102a558fd3fdfd89ad769cd3a10f5dc3ea78716..13d2893fd146b5a3d9100ee1ba6c2243cb9c411b 100644 --- 
a/python/paddle/fluid/data_feeder.py +++ b/python/paddle/fluid/data_feeder.py @@ -258,10 +258,13 @@ class DataFeeder(object): multiple mini-batches. Each mini-batch will be feed on each device. Args: - reader(fun): the input data. - multi_devices(bool): the number of places. Default None. - num_places(int): the number of places. Default None. - drop_last(bool): the number of places. Default None. + reader(function): the reader is the function which can generate data. + multi_devices(bool): whether to use multiple devices or not. + num_places(int): if the multi_devices is True, you can specify the number + of GPU to use, if 'num_places' is None, the function will use all the + GPU of the current machine. Default None. + drop_last(bool): whether to drop the last batch if the + size of the last batch is less than batch_size. Default True. Returns: dict: the result of conversion. diff --git a/python/paddle/fluid/executor.py b/python/paddle/fluid/executor.py index 42c2484b284844a1f1acf53f79296e13da72676a..f2886090d75f87654b33cf7aa6f98ebf6f2e27d1 100644 --- a/python/paddle/fluid/executor.py +++ b/python/paddle/fluid/executor.py @@ -20,7 +20,7 @@ import six from .framework import Program, default_main_program, Variable from . 
import core -__all__ = ['Executor', 'global_scope', 'scope_guard', '_switch_scope'] +__all__ = ['Executor', 'global_scope', 'scope_guard'] g_scope = core.Scope() @@ -407,16 +407,17 @@ class Executor(object): Examples: - >>> data = layers.data(name='X', shape=[1], dtype='float32') - >>> hidden = layers.fc(input=data, size=10) - >>> layers.assign(hidden, out) - >>> loss = layers.mean(out) + >>> data = fluid.layers.data(name='X', shape=[1], dtype='float32') + >>> out = fluid.layers.create_tensor(dtype='float32') + >>> hidden = fluid.layers.fc(input=data, size=10) + >>> fluid.layers.assign(hidden,out) + >>> loss = fluid.layers.mean(out) >>> adam = fluid.optimizer.Adam() - >>> adam.minimize(loss) + >>> adam.minimize(loss) >>> cpu = core.CPUPlace() - >>> exe = Executor(cpu) - >>> exe.run(default_startup_program()) + >>> exe = fluid.Executor(cpu) + >>> exe.run(fluid.default_startup_program()) >>> x = numpy.random.random(size=(10, 1)).astype('float32') >>> outs = exe.run( diff --git a/python/paddle/fluid/framework.py b/python/paddle/fluid/framework.py index b991187d424108db176ebd6996d7d161f11dcd3d..089792059465c60da43d02e8389f4e36900c2292 100644 --- a/python/paddle/fluid/framework.py +++ b/python/paddle/fluid/framework.py @@ -18,6 +18,7 @@ import collections import contextlib import re import six +import sys import numpy as np @@ -49,6 +50,16 @@ GRAD_VAR_SUFFIX = core.kGradVarSuffix() ZERO_VAR_SUFFIX = core.kZeroVarSuffix() CONTROL_DEP_VAR_PREFIX = core.kControlDepVarName() +_imperative_tracer_ = None + + +def _in_imperative_mode(): + return _imperative_tracer_ is not None + + +def _imperative_tracer(): + return _imperative_tracer_ + class NameScope(object): def __init__(self, name="", parent=None): @@ -89,12 +100,13 @@ def name_scope(prefix=None): Examples: .. code-block:: python + with name_scope("encoder"): ... with name_scope("decoder"): ... - with name_scope("attention"): - ... + with name_scope("attention"): + ... """ # TODO(panyx0718): Only [0-9a-z]. 
assert prefix, "namescope prefix cannot be empty." @@ -344,6 +356,21 @@ class Variable(object): self.op = None self.stop_gradient = stop_gradient self.is_data = is_data + if _in_imperative_mode(): + self._ivar = core.VarBase() + self._ivar.desc = self.desc + + def _numpy(self): + scope = _imperative_tracer().get_scope(self.block.desc) + tensor = core.get_variable_tensor(scope, self.desc.name()) + return np.array(tensor) + + def _backward(self): + scope = _imperative_tracer().get_scope(self.block.desc) + self._ivar._run_backward(scope) + + def _gradient(self): + return np.array(self._ivar._grad()) def __str__(self): return self.to_string(True) @@ -654,6 +681,23 @@ class Operator(object): if self._has_kernel(type): self.desc.infer_var_type(self.block.desc) self.desc.infer_shape(self.block.desc) + if _in_imperative_mode(): + self.iop = core.OpBase() + self.iop.desc = self.desc + self.inputs = [] + if inputs is not None: + for inp in inputs.values(): + if isinstance(inp, Variable): + self.inputs.append(inp) + elif isinstance(inp, list) or isinstance(inp, tuple): + self.inputs.extend(inp[:]) + self.outputs = [] + if outputs is not None: + for out in outputs.values(): + if isinstance(out, Variable): + self.outputs.append(out) + elif isinstance(out, list) or isinstance(out, tuple): + self.outputs.extend(out[:]) def _has_kernel(self, op_type): return op_type not in self.OP_WITHOUT_KERNEL_SET @@ -1040,19 +1084,15 @@ class Block(object): raise ValueError("var %s not in this block" % name) return v - def _var_recursive(self, name): + def _find_var_recursive(self, name): """ Get a Variable by name from this block recursively. Args: name(str): the Variable's name. - Raises: - ValueError: this block and this parent block doesn't - have a Variable with the giving name. - Returns: - Variable: the Variable with the giving name. + Variable: the Variable with the giving name. Or None if not found. 
""" frontier = list() visited = set() @@ -1078,8 +1118,27 @@ class Block(object): frontier.append(prog.block(cur.forward_block_idx)) visited.add(id(cur)) + return None - raise ValueError("Var {0} is not found recursively".format(name)) + def _var_recursive(self, name): + """ + Get a Variable by name from this block recursively. + + Args: + name(str): the Variable's name. + + Raises: + ValueError: this block and this parent block doesn't + have a Variable with the giving name. + + Returns: + Variable: the Variable with the giving name. + """ + var = self._find_var_recursive(name) + if var: + return var + else: + raise ValueError("Var {0} is not found recursively".format(name)) def all_parameters(self): return list(self.iter_parameters()) @@ -1205,6 +1264,9 @@ class Block(object): """ op_desc = self.desc.append_op() op = Operator(block=self, desc=op_desc, *args, **kwargs) + if _in_imperative_mode(): + _imperative_tracer().trace(op.iop, [v._ivar for v in op.inputs], + [v._ivar for v in op.outputs], self.desc) self.ops.append(op) return op @@ -1441,6 +1503,7 @@ class Program(object): self._is_chief = False self._slice_vars_and_attrs = [] self._endpoints = [] + self._trainers_endpoints = [] self._distributed_lookup_table = None @property @@ -2208,3 +2271,12 @@ def _get_var(name, program=None): assert isinstance(program, Program) return program.global_block().var(name) + + +@contextlib.contextmanager +def _imperative_guard(tracer): + global _imperative_tracer_ + tmp_trace = _imperative_tracer_ + _imperative_tracer_ = tracer + yield + _imperative_tracer_ = tmp_trace diff --git a/python/paddle/fluid/imperative/__init__.py b/python/paddle/fluid/imperative/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..922308b6b18b335535d41f24d544cde04991b794 --- /dev/null +++ b/python/paddle/fluid/imperative/__init__.py @@ -0,0 +1,25 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from . import base +from .base import * + +from . import layers +from .layers import * + +__all__ = [] +__all__ += layers.__all__ +__all__ += base.__all__ diff --git a/python/paddle/fluid/imperative/base.py b/python/paddle/fluid/imperative/base.py new file mode 100644 index 0000000000000000000000000000000000000000..15d38ddb56c71ef7de67f79cf52cd26070f470cb --- /dev/null +++ b/python/paddle/fluid/imperative/base.py @@ -0,0 +1,56 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import contextlib +import numpy as np + +from paddle.fluid import core +from paddle.fluid import framework + +__all__ = ['enabled', 'guard', 'to_variable'] + + +def enabled(): + return framework._in_imperative_mode() + + +@contextlib.contextmanager +def guard(): + train = framework.Program() + startup = framework.Program() + tracer = core.Tracer(train.current_block().desc) + with framework.program_guard(train, startup): + with framework.unique_name.guard(): + with framework._imperative_guard(tracer): + yield + + +def to_variable(value, block=None): + if isinstance(value, np.ndarray): + if not block: + block = framework.default_main_program().current_block() + py_var = framework.Variable( + block, + type=core.VarDesc.VarType.LOD_TENSOR, + name=None, + shape=value.shape, + dtype=value.dtype) + scope = framework._imperative_tracer().get_scope(block.desc) + var = scope.var(py_var.name) + tensor = var.get_tensor() + tensor.set(value, core.CPUPlace()) + return py_var + elif isinstance(value, framework.Variable): + return value + else: + raise ValueError("Unsupported type %s" % type(value)) diff --git a/python/paddle/fluid/imperative/layers.py b/python/paddle/fluid/imperative/layers.py new file mode 100644 index 0000000000000000000000000000000000000000..1a28f7f4ae35295394b560d79e3dc0cdd5f2beab --- /dev/null +++ b/python/paddle/fluid/imperative/layers.py @@ -0,0 +1,44 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. + +import contextlib +import sys +import numpy as np + +from paddle.fluid import core +from paddle.fluid import framework +from paddle.fluid.imperative import base + +__all__ = ['PyLayer'] + + +class PyLayer(core.Layer): + def __init__(self): + pass + + def __call__(self, inputs): + # TODO(panyx0718): Support declarative mode as well. + assert base.enabled() + if not isinstance(inputs, list) and not isinstance(inputs, tuple): + inputs = [inputs] + + var_inputs = [] + for x in inputs: + py_var = base.to_variable(x) + var_inputs.append(py_var) + outputs = self.forward(var_inputs) + return outputs + + def forward(self, inputs): + return [] diff --git a/python/paddle/fluid/io.py b/python/paddle/fluid/io.py index 0782933c6c4851b410ee3fdf14d4f9d9e83d49cc..e74a87fc68db0e126098f7188db4a712dff2612d 100644 --- a/python/paddle/fluid/io.py +++ b/python/paddle/fluid/io.py @@ -145,7 +145,7 @@ def save_vars(executor, prog = fluid.default_main_program() fluid.io.save_vars(executor=exe, dirname=path, main_program=prog, - vars=None) + vars=None, predicate = name_has_fc) # All variables in `main_program` whose name includes "fc" will be saved. # And variables are going to be saved separately. @@ -369,7 +369,7 @@ def load_vars(executor, prog = fluid.default_main_program() fluid.io.load_vars(executor=exe, dirname=path, main_program=prog, - vars=None) + vars=None, predicate=name_has_fc) # All variables in `main_program` whose name includes "fc" will be loaded. # And all the variables are supposed to have been saved in differnet files. 
diff --git a/python/paddle/fluid/layer_helper.py b/python/paddle/fluid/layer_helper.py index dc317de9abbd06f4021e64b87ea88ba6af8809c9..74b4a977db6b69d4d256e1f7b36eb53524269bb1 100644 --- a/python/paddle/fluid/layer_helper.py +++ b/python/paddle/fluid/layer_helper.py @@ -17,10 +17,13 @@ from __future__ import print_function import copy import itertools import six +import sys +import numpy as np from .framework import Variable, Parameter, default_main_program, default_startup_program, dtype_is_floating from . import unique_name from paddle.fluid.initializer import Constant, Xavier +from paddle.fluid.imperative import base from .param_attr import ParamAttr, WeightNormParamAttr from . import core from six.moves import zip @@ -46,23 +49,21 @@ class LayerHelper(object): def startup_program(self): return default_startup_program() + def to_variable(self, x): + return base.to_variable(x, self.main_program.current_block()) + def append_op(self, *args, **kwargs): return self.main_program.current_block().append_op(*args, **kwargs) def multiple_input(self, input_param_name='input'): inputs = self.kwargs.get(input_param_name, []) - type_error = TypeError( - "Input of {0} layer should be Variable or sequence of Variable". 
- format(self.layer_type)) - if isinstance(inputs, Variable): - inputs = [inputs] - elif not isinstance(inputs, list) and not isinstance(inputs, tuple): - raise type_error + ret = [] + if isinstance(inputs, list) or isinstance(inputs, tuple): + for inp in inputs: + ret.append(self.to_variable(inp)) else: - for each in inputs: - if not isinstance(each, Variable): - raise type_error - return inputs + ret.append(self.to_variable(inputs)) + return ret def input(self, input_param_name='input'): inputs = self.multiple_input(input_param_name) diff --git a/python/paddle/fluid/layers/control_flow.py b/python/paddle/fluid/layers/control_flow.py index 05138bf94598f649ef7fdbaa94896b6ba0884416..b7e39685691809d04ecddc21d2d04a7a85e478d5 100644 --- a/python/paddle/fluid/layers/control_flow.py +++ b/python/paddle/fluid/layers/control_flow.py @@ -717,8 +717,9 @@ class While(object): out_vars = [] for inner_out_name in inner_outputs: - if inner_out_name in parent_block.vars: - out_vars.append(parent_block.var(inner_out_name)) + inner_var = parent_block._find_var_recursive(inner_out_name) + if inner_var: + out_vars.append(inner_var) step_scope = parent_block.create_var( type=core.VarDesc.VarType.STEP_SCOPES) @@ -1264,10 +1265,11 @@ class ConditionalBlock(object): if each_name not in input_set ] - out_list = [ - parent_block.var(var_name) for var_name in parent_block.vars - if var_name in intermediate - ] + out_list = [] + for inner_out_name in intermediate: + inner_var = parent_block._find_var_recursive(inner_out_name) + if inner_var: + out_list.append(inner_var) step_scope = parent_block.create_var( type=core.VarDesc.VarType.STEP_SCOPES) diff --git a/python/paddle/fluid/layers/detection.py b/python/paddle/fluid/layers/detection.py index 4843af8340310e0f47964d41708b13216fcd2161..ce731f39ea099a4d8948812989ad19b3cce119ff 100644 --- a/python/paddle/fluid/layers/detection.py +++ b/python/paddle/fluid/layers/detection.py @@ -20,6 +20,7 @@ from __future__ import print_function from 
.layer_function_generator import generate_layer_fn from .layer_function_generator import autodoc, templatedoc from ..layer_helper import LayerHelper +from ..framework import Variable from . import tensor from . import nn from . import ops @@ -46,6 +47,7 @@ __all__ = [ 'iou_similarity', 'box_coder', 'polygon_box_transform', + 'yolov3_loss', ] @@ -401,6 +403,113 @@ def polygon_box_transform(input, name=None): return output +@templatedoc(op_type="yolov3_loss") +def yolov3_loss(x, + gtbox, + gtlabel, + anchors, + class_num, + ignore_thresh, + loss_weight_xy=None, + loss_weight_wh=None, + loss_weight_conf_target=None, + loss_weight_conf_notarget=None, + loss_weight_class=None, + name=None): + """ + ${comment} + + Args: + x (Variable): ${x_comment} + gtbox (Variable): groud truth boxes, should be in shape of [N, B, 4], + in the third dimenstion, x, y, w, h should be stored + and x, y, w, h should be relative value of input image. + N is the batch number and B is the max box number in + an image. + gtlabel (Variable): class id of ground truth boxes, shoud be ins shape + of [N, B]. 
+ anchors (list|tuple): ${anchors_comment} + class_num (int): ${class_num_comment} + ignore_thresh (float): ${ignore_thresh_comment} + loss_weight_xy (float|None): ${loss_weight_xy_comment} + loss_weight_wh (float|None): ${loss_weight_wh_comment} + loss_weight_conf_target (float|None): ${loss_weight_conf_target_comment} + loss_weight_conf_notarget (float|None): ${loss_weight_conf_notarget_comment} + loss_weight_class (float|None): ${loss_weight_class_comment} + name (string): the name of yolov3 loss + + Returns: + Variable: A 1-D tensor with shape [1], the value of yolov3 loss + + Raises: + TypeError: Input x of yolov3_loss must be Variable + TypeError: Input gtbox of yolov3_loss must be Variable" + TypeError: Input gtlabel of yolov3_loss must be Variable" + TypeError: Attr anchors of yolov3_loss must be list or tuple + TypeError: Attr class_num of yolov3_loss must be an integer + TypeError: Attr ignore_thresh of yolov3_loss must be a float number + + Examples: + .. code-block:: python + + x = fluid.layers.data(name='x', shape=[255, 13, 13], dtype='float32') + gtbox = fluid.layers.data(name='gtbox', shape=[6, 5], dtype='float32') + gtlabel = fluid.layers.data(name='gtlabel', shape=[6, 1], dtype='int32') + anchors = [10, 13, 16, 30, 33, 23] + loss = fluid.layers.yolov3_loss(x=x, gtbox=gtbox, class_num=80 + anchors=anchors, ignore_thresh=0.5) + """ + helper = LayerHelper('yolov3_loss', **locals()) + + if not isinstance(x, Variable): + raise TypeError("Input x of yolov3_loss must be Variable") + if not isinstance(gtbox, Variable): + raise TypeError("Input gtbox of yolov3_loss must be Variable") + if not isinstance(gtlabel, Variable): + raise TypeError("Input gtlabel of yolov3_loss must be Variable") + if not isinstance(anchors, list) and not isinstance(anchors, tuple): + raise TypeError("Attr anchors of yolov3_loss must be list or tuple") + if not isinstance(class_num, int): + raise TypeError("Attr class_num of yolov3_loss must be an integer") + if not 
isinstance(ignore_thresh, float): + raise TypeError( + "Attr ignore_thresh of yolov3_loss must be a float number") + + if name is None: + loss = helper.create_variable_for_type_inference(dtype=x.dtype) + else: + loss = helper.create_variable( + name=name, dtype=x.dtype, persistable=False) + + attrs = { + "anchors": anchors, + "class_num": class_num, + "ignore_thresh": ignore_thresh, + } + + if loss_weight_xy is not None and isinstance(loss_weight_xy, float): + self.attrs['loss_weight_xy'] = loss_weight_xy + if loss_weight_wh is not None and isinstance(loss_weight_wh, float): + self.attrs['loss_weight_wh'] = loss_weight_wh + if loss_weight_conf_target is not None and isinstance( + loss_weight_conf_target, float): + self.attrs['loss_weight_conf_target'] = loss_weight_conf_target + if loss_weight_conf_notarget is not None and isinstance( + loss_weight_conf_notarget, float): + self.attrs['loss_weight_conf_notarget'] = loss_weight_conf_notarget + if loss_weight_class is not None and isinstance(loss_weight_class, float): + self.attrs['loss_weight_class'] = loss_weight_class + + helper.append_op( + type='yolov3_loss', + inputs={"X": x, + "GTBox": gtbox, + "GTLabel": gtlabel}, + outputs={'Loss': loss}, + attrs=attrs) + return loss + + @templatedoc() def detection_map(detect_res, label, diff --git a/python/paddle/fluid/layers/io.py b/python/paddle/fluid/layers/io.py index 3f47053961bcc41b82f1b6776e9365166e78ddbf..42f4959a83fe113d6cbbe0db355249a9c203d602 100644 --- a/python/paddle/fluid/layers/io.py +++ b/python/paddle/fluid/layers/io.py @@ -943,7 +943,18 @@ def __create_unshared_decorated_reader__(op_type, reader, attrs, name=None): def shuffle(reader, buffer_size): """ - Shuffle the reader. + Creates a data reader whose data output is shuffled. + Output from the iterator that created by original reader will be + buffered into shuffle buffer, and then shuffled. The size of shuffle buffer + is determined by argument buf_size. 
+ + Args: + param reader: the original reader whose output will be shuffled. + type reader: callable + param buf_size: shuffle buffer size. + type buf_size: int + return: the new reader whose output is shuffled. + rtype: callable """ return __create_unshared_decorated_reader__( 'create_shuffle_reader', reader, {'buffer_size': int(buffer_size)}) diff --git a/python/paddle/fluid/layers/layer_function_generator.py b/python/paddle/fluid/layers/layer_function_generator.py index eea0a362a0c31083f304a2167d0fdadfb30fb640..09b1b30216b03e71253ca8da1d462db897e1a607 100644 --- a/python/paddle/fluid/layers/layer_function_generator.py +++ b/python/paddle/fluid/layers/layer_function_generator.py @@ -20,7 +20,7 @@ import string from six.moves import cStringIO from ..proto import framework_pb2 -from ..framework import OpProtoHolder, Variable +from ..framework import OpProtoHolder, Variable, core, convert_np_dtype_to_dtype_ from ..layer_helper import LayerHelper __all__ = [ @@ -178,6 +178,15 @@ def generate_layer_fn(op_type): "operator {0} must input same dtype. {1} vs {2}".format( op_type, dtype, each.dtype)) + if dtype is None: + arg_dtype = kwargs.get("dtype") + if arg_dtype: + if not isinstance(arg_dtype, core.VarDesc.VarType): + dtype = convert_np_dtype_to_dtype_(arg_dtype) + else: + dtype = arg_dtype + else: + dtype = core.VarDesc.VarType.FP32 return dtype def func(*args, **kwargs): diff --git a/python/paddle/fluid/layers/learning_rate_scheduler.py b/python/paddle/fluid/layers/learning_rate_scheduler.py index 149224bb68ac869dec14ac9f953f0072bd24c7e2..dde05189722fef77e03a1c2d8f3cbae44a3e8245 100644 --- a/python/paddle/fluid/layers/learning_rate_scheduler.py +++ b/python/paddle/fluid/layers/learning_rate_scheduler.py @@ -308,13 +308,9 @@ def piecewise_decay(boundaries, values): def append_LARS(params_grads, learning_rate, weight_decay): - """Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for - each layer. 
- - ```python - learning_rate *= local_gw_ratio * sqrt(sumsq(param)) - / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param))) - ``` + """ + Applies LARS (LAYER-WISE ADAPTIVE RATE SCALING) to learning rate for + each layer. Args: learning_rate: A learning rate Variable. This @@ -323,6 +319,11 @@ def append_LARS(params_grads, learning_rate, weight_decay): Returns: The decayed learning rate + Examples: + .. code-block:: python + + learning_rate *= local_gw_ratio * sqrt(sumsq(param)) + / (sqrt(sumsq(gradient))+ weight_decay * sqrt(sumsq(param))) """ def _balanced_weight(param_norm, grad_norm): diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index ac1c8657759cad33f939c64413e669ba9ae2aad3..e25eaaa9fda6add9d8e81d9e6bdfb711cee3648e 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -41,6 +41,7 @@ __all__ = [ 'crf_decoding', 'cos_sim', 'cross_entropy', + 'bpr_loss', 'square_error_cost', 'chunk_eval', 'sequence_conv', @@ -169,6 +170,8 @@ __all__ = [ 'log_loss', 'add_position_encoding', 'bilinear_tensor_product', + 'merge_selected_rows', + 'get_tensor_from_selected_rows', 'lstm', ] @@ -928,7 +931,7 @@ def dynamic_gru(input, emb = fluid.layers.embedding(input=data, size=[dict_dim, emb_dim]) hidden_dim = 512 x = fluid.layers.fc(input=emb, size=hidden_dim * 3) - hidden = fluid.layers.dynamic_gru(input=x, dim=hidden_dim) + hidden = fluid.layers.dynamic_gru(input=x, size=hidden_dim) """ helper = LayerHelper('gru', **locals()) @@ -1346,6 +1349,44 @@ def cross_entropy(input, label, soft_label=False, ignore_index=kIgnoreIndex): return out +def bpr_loss(input, label, name=None): + """ + Bayesian Personalized Ranking Loss Operator. + + This operator belongs to pairwise ranking loss. Label is the desired item. 
+ The loss at a given point in one session is defined as: + $Y[i] = -\frac{1}{N_{i}-1} * \sum_{0\le j(https://arxiv.org/abs/1511.06939) + + Args: + input (Variable|list): a 2-D tensor with shape [N x D], where N is the + batch size and D is the number of classes. + This input is not probability but logits. + label (Variable|list): the ground truth which is a 2-D tensor. `label` + is a tensor with shape [N x 1]. + name (str|None): A name for this layer(optional). If set None, the + layer will be named automatically. Default: None. + Returns: + A 2-D tensor with shape [N x 1], the bpr loss. + + Examples: + .. code-block:: python + + cost = fluid.layers.bpr_loss(input=predict, label=label) + """ + + helper = LayerHelper('bpr_loss', **locals()) + out = helper.create_variable_for_type_inference(dtype=input.dtype) + helper.append_op( + type='bpr_loss', + inputs={'X': [input], + 'Label': [label]}, + outputs={'Y': [out]}) + return out + + def square_error_cost(input, label): """ **Square error cost layer** @@ -3586,6 +3627,7 @@ def beam_search_decode(ids, scores, beam_size, end_id, name=None): Examples: .. code-block:: python + # Suppose `ids` and `scores` are LodTensorArray variables reserving # the selected ids and scores of all steps finished_ids, finished_scores = layers.beam_search_decode( @@ -5083,7 +5125,7 @@ def im2sequence(input, output.lod = [[4, 4]] - Examples: + Examples: .. 
code-block:: python @@ -5870,24 +5912,23 @@ def pad_constant_like(x, y, pad_value=0., name=None): [[38, 39, 40]], [[41, 42, 43]]]] Y.shape = (1, 3, 1, 3) + And + pad_value = -1, - And - pad_value = -1, - - Return: - Out = [[[[35, 36, 37], - [-1, -1, -1]], - [[38, 39, 40], - [-1, -1, -1]], - [[41, 42, 43], - [-1, -1, -1]]], - [[[-1, -1, -1], - [-1, -1, -1]], - [[-1, -1, -1], - [-1, -1, -1]], - [[-1, -1, -1], - [-1, -1, -1]]]] - Out.shape = (2, 3, 2, 3) + Return: + Out = [[[[35, 36, 37], + [-1, -1, -1]], + [[38, 39, 40], + [-1, -1, -1]], + [[41, 42, 43], + [-1, -1, -1]]], + [[[-1, -1, -1], + [-1, -1, -1]], + [[-1, -1, -1], + [-1, -1, -1]], + [[-1, -1, -1], + [-1, -1, -1]]]] + Out.shape = (2, 3, 2, 3) Args: x (Variable): The input tensor variable. @@ -6126,6 +6167,7 @@ def image_resize(input, Supporting resample methods: 'BILINEAR' : Bilinear interpolation + 'NEAREST' : Nearest neighbor interpolation Args: @@ -6620,7 +6662,8 @@ def relu(x, name=None): helper = LayerHelper('relu', **locals()) dtype = helper.input_dtype(input_param_name='x') out = helper.create_variable_for_type_inference(dtype) - helper.append_op(type="relu", inputs={"X": x}, outputs={"Out": out}) + helper.append_op( + type="relu", inputs={"X": helper.input('x')}, outputs={"Out": out}) return out @@ -6781,7 +6824,7 @@ def crop(x, shape=None, offsets=None, name=None): # or z = fluid.layers.data(name="z", shape=[3, 5], dtype="float32") - crop = fluid.layers.crop(z, shape=[2, 3]) + crop = fluid.layers.crop(z, shape=[-1, 2, 3]) """ helper = LayerHelper('crop', **locals()) @@ -7062,39 +7105,40 @@ def pad2d(input, than height-1. And the width dimension has the same condition. Example: + .. 
code-block:: text - Given that X is a channel of image from input: + Given that X is a channel of image from input: - X = [[1, 2, 3], - [4, 5, 6]] + X = [[1, 2, 3], + [4, 5, 6]] - Case 0: + Case 0: - paddings = [0, 1, 2, 3], - mode = 'constant' - pad_value = 0 + paddings = [0, 1, 2, 3], + mode = 'constant' + pad_value = 0 - Out = [[0, 0, 1, 2, 3, 0, 0, 0] - [0, 0, 4, 5, 6, 0, 0, 0] - [0, 0, 0, 0, 0, 0, 0, 0]] + Out = [[0, 0, 1, 2, 3, 0, 0, 0] + [0, 0, 4, 5, 6, 0, 0, 0] + [0, 0, 0, 0, 0, 0, 0, 0]] - Case 1: + Case 1: - paddings = [0, 1, 2, 1], - mode = 'reflect' + paddings = [0, 1, 2, 1], + mode = 'reflect' - Out = [[3, 2, 1, 2, 3, 2] - [6, 5, 4, 5, 6, 5] - [3, 2, 1, 2, 3, 2]] + Out = [[3, 2, 1, 2, 3, 2] + [6, 5, 4, 5, 6, 5] + [3, 2, 1, 2, 3, 2]] - Case 2: + Case 2: - paddings = [0, 1, 2, 1], - mode = 'edge' + paddings = [0, 1, 2, 1], + mode = 'edge' - Out = [[1, 1, 1, 2, 3, 3] - [4, 4, 4, 5, 6, 6] - [4, 4, 4, 5, 6, 6]] + Out = [[1, 1, 1, 2, 3, 3] + [4, 4, 4, 5, 6, 6] + [4, 4, 4, 5, 6, 6]] Args: @@ -7332,13 +7376,13 @@ def prelu(x, mode, param_attr=None, name=None): Args: x (Variable): The input tensor. param_attr(ParamAttr|None): The parameter attribute for the learnable - weight (alpha). + weight (alpha). mode (string): The mode for weight sharing. It supports all, channel - and element. all: all elements share same weight - channel:elements in a channel share same weight - element:each element has a weight + and element. all: all elements share same weight + channel:elements in a channel share same weight + element:each element has a weight name(str|None): A name for this layer(optional). If set None, the layer - will be named automatically. + will be named automatically. Returns: Variable: The output tensor with the same shape as input. @@ -8380,6 +8424,29 @@ def mean(x, name=None): return out +@templatedoc() +def merge_selected_rows(x, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + name(basestring|None): Name of the output. 
+ + Returns: + out(${out_type}): ${out_comment} + """ + + helper = LayerHelper("merge_selected_rows", **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type="merge_selected_rows", + inputs={"X": x}, + attrs={}, + outputs={"Out": out}) + return out + + @templatedoc() def mul(x, y, x_num_col_dims=1, y_num_col_dims=1, name=None): """ @@ -9032,3 +9099,26 @@ def bilinear_tensor_product(x, # add activation return helper.append_activation(out) + + +@templatedoc() +def get_tensor_from_selected_rows(x, name=None): + """ + ${comment} + + Args: + x(${x_type}): ${x_comment} + name(basestring|None): Name of the output. + + Returns: + out(${out_type}): ${out_comment} + """ + + helper = LayerHelper('get_tensor_from_selected_rows', **locals()) + out = helper.create_variable_for_type_inference(dtype=x.dtype) + helper.append_op( + type='get_tensor_from_selected_rows', + inputs={'X': x}, + outputs={'Out': out}, + attrs={}) + return out diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py index ff32c00104171bf42c00be33f05758a4387228e1..49a486cf0c3d11b18417e8838aead07d748f3e02 100644 --- a/python/paddle/fluid/layers/tensor.py +++ b/python/paddle/fluid/layers/tensor.py @@ -622,7 +622,7 @@ def reverse(x, axis): out = helper.create_variable_for_type_inference(dtype=x.dtype) helper.append_op( type='reverse', - inputs={'Input': x}, + inputs={'X': x}, outputs={'Out': [out]}, attrs={'axis': axis}) return out diff --git a/python/paddle/fluid/metrics.py b/python/paddle/fluid/metrics.py index 829154f1b23d6e49bf963762be6b6488c98ec94a..85af8fea13d5b9a1e22014fbd727e1baed3247be 100644 --- a/python/paddle/fluid/metrics.py +++ b/python/paddle/fluid/metrics.py @@ -222,13 +222,13 @@ class Precision(MetricBase): Examples: .. 
code-block:: python - metric = fluid.metrics.Precision() - for pass in range(PASSES): - metric.reset() - for data in train_reader(): - loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) - metric.update(preds=preds, labels=labels) - numpy_precision = metric.eval() + metric = fluid.metrics.Precision() + for pass in range(PASSES): + metric.reset() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + metric.update(preds=preds, labels=labels) + numpy_precision = metric.eval() """ def __init__(self, name=None): @@ -267,13 +267,13 @@ class Recall(MetricBase): Examples: .. code-block:: python - metric = fluid.metrics.Recall() - for pass in range(PASSES): - metric.reset() - for data in train_reader(): - loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) - metric.update(preds=preds, labels=labels) - numpy_recall = metric.eval() + metric = fluid.metrics.Recall() + for pass in range(PASSES): + metric.reset() + for data in train_reader(): + loss, preds, labels = exe.run(fetch_list=[cost, preds, labels]) + metric.update(preds=preds, labels=labels) + numpy_recall = metric.eval() """ def __init__(self, name=None): @@ -449,8 +449,9 @@ class EditDistance(MetricBase): distance_evaluator.update(distances, seq_num) distance, instance_error = distance_evaluator.eval() - In the above example: + In the above example: 'distance' is the average of the edit distance in a pass. + 'instance_error' is the instance error rate in a pass. 
""" diff --git a/python/paddle/fluid/parallel_executor.py b/python/paddle/fluid/parallel_executor.py index bdcd045341212d6cf9dbfbc3cebc72f320e37e9d..c54c3963a152851f5396c2ba71c28cc09c1cd523 100644 --- a/python/paddle/fluid/parallel_executor.py +++ b/python/paddle/fluid/parallel_executor.py @@ -95,7 +95,14 @@ class ParallelExecutor(object): self._places = [] self._act_places = [] if use_cuda: - for i in six.moves.range(core.get_cuda_device_count()): + gpus = [] + gpus_env = os.getenv("FLAGS_selected_gpus") + if gpus_env: + gpus = [int(s) for s in gpus_env.split(",")] + else: + for i in six.moves.range(core.get_cuda_device_count()): + gpus.append(i) + for i in gpus: p = core.Place() self._act_places.append(core.CUDAPlace(i)) p.set_place(self._act_places[-1]) @@ -128,9 +135,17 @@ class ParallelExecutor(object): build_strategy = BuildStrategy() build_strategy.num_trainers = num_trainers + build_strategy.trainer_id = trainer_id main = main_program main = main if main else framework.default_main_program() + + trainers_endpoints = main._trainers_endpoints + if num_trainers > 1 and trainers_endpoints: + assert num_trainers == len( + trainers_endpoints), "num_trainers == len(end_points)" + build_strategy.trainers_endpoints = trainers_endpoints + if scope == None: scope = executor.global_scope() diff --git a/python/paddle/fluid/param_attr.py b/python/paddle/fluid/param_attr.py index a51607bfdb1dde3d25f490770cc2ba368ceb27ff..38ddf93198d7c58382e36a5b7af488f56e6f9878 100644 --- a/python/paddle/fluid/param_attr.py +++ b/python/paddle/fluid/param_attr.py @@ -50,8 +50,9 @@ class ParamAttr(object): w_param_attrs = fluid.ParamAttr(name="fc_weight", learning_rate=0.5, - regularizer=fluid.L2Decay(1.0), + regularizer=fluid.regularizer.L2Decay(1.0), trainable=True) + x = fluid.layers.data(name='X', shape=[1], dtype='float32') y_predict = fluid.layers.fc(input=x, size=10, param_attr=w_param_attrs) """ diff --git a/python/paddle/fluid/tests/test_detection.py 
b/python/paddle/fluid/tests/test_detection.py index a2eca5541a152ca99804a7f87c9b0bc3d12d4eee..d99eaa0634f93dcd16dd80ae172f11e8090a2623 100644 --- a/python/paddle/fluid/tests/test_detection.py +++ b/python/paddle/fluid/tests/test_detection.py @@ -388,5 +388,18 @@ class TestGenerateProposals(unittest.TestCase): print(rpn_rois.shape) +class TestYoloDetection(unittest.TestCase): + def test_yolov3_loss(self): + program = Program() + with program_guard(program): + x = layers.data(name='x', shape=[30, 7, 7], dtype='float32') + gtbox = layers.data(name='gtbox', shape=[10, 4], dtype='float32') + gtlabel = layers.data(name='gtlabel', shape=[10], dtype='int32') + loss = layers.yolov3_loss(x, gtbox, gtlabel, [10, 13, 30, 13], 10, + 0.5) + + self.assertIsNotNone(loss) + + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/test_gradient_clip.py b/python/paddle/fluid/tests/test_gradient_clip.py deleted file mode 100644 index 266687fcd092dfdeec9343e2592f4c22b683d588..0000000000000000000000000000000000000000 --- a/python/paddle/fluid/tests/test_gradient_clip.py +++ /dev/null @@ -1,84 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -from __future__ import print_function - -import numpy as np -import paddle -import paddle.fluid as fluid - -BATCH_SIZE = 128 -CLIP = 1 - -prog = fluid.framework.Program() -with fluid.program_guard(main_program=prog): - image = fluid.layers.data(name='x', shape=[784], dtype='float32') - - hidden1 = fluid.layers.fc(input=image, size=128, act='relu') - hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu') - predict = fluid.layers.fc(input=hidden2, size=10, act='softmax') - - label = fluid.layers.data(name='y', shape=[1], dtype='int64') - - cost = fluid.layers.cross_entropy(input=predict, label=label) - avg_cost = fluid.layers.mean(cost) - -prog_clip = prog.clone() - -avg_cost_clip = prog_clip.block(0).var(avg_cost.name) - -p_g = fluid.backward.append_backward(loss=avg_cost) -p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip) - -with fluid.program_guard(main_program=prog_clip): - fluid.clip.set_gradient_clip( - fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP)) - p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip) - -grad_list = [elem[1] for elem in p_g] -grad_clip_list = [elem[1] for elem in p_g_clip] - -train_reader = paddle.batch( - paddle.reader.shuffle( - paddle.dataset.mnist.train(), buf_size=8192), - batch_size=BATCH_SIZE) - -place = fluid.CPUPlace() -exe = fluid.Executor(place) -feeder = fluid.DataFeeder(feed_list=[image, label], place=place) -exe.run(fluid.default_startup_program()) - -count = 0 -for data in train_reader(): - count += 1 - if count > 5: - break - out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list) - out_clip = exe.run(prog_clip, - feed=feeder.feed(data), - fetch_list=grad_clip_list) - global_norm = 0 - for v in out[1:]: - global_norm += np.sum(np.power(v, 2)) - global_norm = np.sqrt(global_norm) - - global_norm_clip = 0 - for v in out_clip[1:]: - global_norm_clip += np.sum(np.power(v, 2)) - global_norm_clip = np.sqrt(global_norm_clip) - - if not np.isclose( - a=global_norm_clip, 
b=np.minimum(global_norm, CLIP), rtol=5e-3): - exit(1) -exit(0) diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 26035f303e72a87b81fdb120fbb92894d78e996b..a4089ba3ca08bed5702a66ed370da52ecd9b58c6 100644 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -43,7 +43,7 @@ if(APPLE) list(REMOVE_ITEM TEST_OPS test_desc_clone) list(REMOVE_ITEM TEST_OPS test_program_code) endif(NOT WITH_DISTRIBUTE) - message(WARNING "These tests has been disabled in OSX before being fixed: \n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext") + message(WARNING "These tests has been disabled in OSX before being fixed:\n test_fuse_elewise_add_act_pass \n test_detection_map_op \n test_dist_se_resnext") # this op is not support on mac list(REMOVE_ITEM TEST_OPS test_fusion_seqexpand_concat_fc_op) # TODO: add the unitest back when it fixed @@ -95,13 +95,12 @@ if(WITH_DISTRIBUTE) if(NOT APPLE) set_tests_properties(test_dist_mnist PROPERTIES TIMEOUT 200) set_tests_properties(test_dist_word2vec PROPERTIES TIMEOUT 200) + py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext) + set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000) # FIXME(typhoonzero): add these tests back - # py_test_modules(test_dist_se_resnext MODULES test_dist_se_resnext) - # set_tests_properties(test_dist_se_resnext PROPERTIES TIMEOUT 1000) # py_test_modules(test_dist_transformer MODULES test_dist_transformer) # set_tests_properties(test_dist_transformer PROPERTIES TIMEOUT 1000) - # TODO(typhoonzero): make dist test parallel when fix port management issue - set_tests_properties(test_dist_mnist test_dist_word2vec test_dist_ctr test_dist_simnet_bow test_dist_save_load test_dist_text_classification test_dist_mnist_batch_merge PROPERTIES RUN_SERIAL TRUE) + set_tests_properties(test_dist_ctr test_dist_mnist test_dist_mnist_batch_merge 
test_dist_save_load test_dist_se_resnext test_dist_simnet_bow test_dist_text_classification test_dist_train test_dist_word2vec PROPERTIES RUN_SERIAL TRUE) endif(NOT APPLE) py_test_modules(test_dist_transpiler MODULES test_dist_transpiler) endif() diff --git a/python/paddle/fluid/tests/unittests/dist_save_load.py b/python/paddle/fluid/tests/unittests/dist_save_load.py index cf62817956c12cd4487eba88bf49ed43331dff03..faec5350424668fca6416e91c3e58174bd4ec877 100644 --- a/python/paddle/fluid/tests/unittests/dist_save_load.py +++ b/python/paddle/fluid/tests/unittests/dist_save_load.py @@ -102,7 +102,7 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): if args.mem_opt: fluid.memory_optimize(fluid.default_main_program(), skip_grads=True) - if args.is_dist: + if args.update_method == "pserver": t = self.get_transpiler(args.trainer_id, fluid.default_main_program(), args.endpoints, args.trainers, @@ -147,7 +147,7 @@ class TestDistSaveLoad2x2(TestDistSimnetBow2x2): def get_data(): origin_batch = next(reader_generator) - if args.is_dist and args.use_reader_alloc: + if args.update_method == "pserver" and args.use_reader_alloc: new_batch = [] for offset, item in enumerate(origin_batch): if offset % 2 == args.trainer_id: diff --git a/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py new file mode 100644 index 0000000000000000000000000000000000000000..c8dc5fbd237d17f2d4e45b06e5806fff5cbf58fe --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_bpr_loss_op.py @@ -0,0 +1,52 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np +from op_test import OpTest, randomize_probability + + +class TestBprLossOp1(OpTest): + """Test BprLoss with discrete one-hot labels. + """ + + def setUp(self): + self.op_type = "bpr_loss" + batch_size = 40 + class_num = 5 + X = randomize_probability(batch_size, class_num, dtype='float64') + label = np.random.randint(0, class_num, (batch_size, 1), dtype="int64") + bpr_loss_result = [] + for i in range(batch_size): + sum = 0.0 + for j in range(class_num): + if j == label[i][0]: + continue + sum += (-np.log(1.0 + np.exp(X[i][j] - X[i][label[i][0]]))) + bpr_loss_result.append(-sum / (class_num - 1)) + bpr_loss = np.asmatrix([[x] for x in bpr_loss_result], dtype="float64") + self.inputs = {"X": X, "Label": label} + self.outputs = {"Y": bpr_loss} + + def test_check_output(self): + self.check_output() + + def test_check_grad(self): + self.check_grad(["X"], "Y", numeric_grad_delta=0.001) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py index 9f3f2f348166864be9583855fcd1949fd4ac818c..6cd71e39e41dae5d07e5761fc9caeca113f3b47e 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py @@ -128,6 +128,12 @@ class TestIdentityActivation(TestConv2dFusionOp): self.activation = 'identity' +class TestIdentityActivation(TestConv2dFusionOp): + def init_activation(self): + 
self.activation = 'identity' + self.add_residual_data = False + + class TestWithGroup(TestConv2dFusionOp): def init_group(self): self.groups = 3 diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..f0e1265e142b800587599783367eca2203033bf1 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_conv3d_mkldnn_op.py @@ -0,0 +1,59 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest + +from test_conv3d_op import TestConv3dOp, TestCase1, TestWithGroup1, TestWithGroup2, TestWith1x1, TestWithInput1x1Filter1x1 + + +class TestMKLDNN(TestConv3dOp): + def init_kernel_type(self): + self.use_mkldnn = True + self.data_format = "NCHW" + + +class TestMKLDNNCase1(TestCase1): + def init_kernel_type(self): + self.use_mkldnn = True + self.data_format = "NCHW" + + +class TestMKLDNNGroup1(TestWithGroup1): + def init_kernel_type(self): + self.use_mkldnn = True + self.data_format = "NCHW" + + +class TestMKLDNNGroup2(TestWithGroup2): + def init_kernel_type(self): + self.use_mkldnn = True + self.data_format = "NCHW" + + +class TestMKLDNNWith1x1(TestWith1x1): + def init_kernel_type(self): + self.use_mkldnn = True + self.data_format = "NCHW" + + +class TestMKLDNNWithInput1x1Filter1x1(TestWithInput1x1Filter1x1): + def init_kernel_type(self): + self.use_mkldnn = True + self.data_format = "NCHW" + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv3d_op.py b/python/paddle/fluid/tests/unittests/test_conv3d_op.py index 69c5ab7a4a4cbd552d27dcb07052d46752eeb54a..c6b749fe09b18b1d704f45a5a5b3adbd5c6a6d0b 100644 --- a/python/paddle/fluid/tests/unittests/test_conv3d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv3d_op.py @@ -74,6 +74,8 @@ class TestConv3dOp(OpTest): def setUp(self): self.op_type = "conv3d" self.use_cudnn = False + self.use_mkldnn = False + self.data_format = "AnyLayout" self.dtype = np.float32 self.init_kernel_type() self.init_group() @@ -83,8 +85,7 @@ class TestConv3dOp(OpTest): conv3d_param = { 'stride': self.stride, 'pad': self.pad, - 'dilations': self.dilations, - 'data_format': 'AnyLayout' # TODO(dzhwinter) : should be fix latter + 'dilations': self.dilations } input = np.random.random(self.input_size).astype(self.dtype) @@ -101,7 +102,9 @@ class TestConv3dOp(OpTest): 'paddings': self.pad, 'groups': self.groups, 'dilations': 
self.dilations, - 'use_cudnn': self.use_cudnn + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format } self.outputs = {'Output': output} @@ -109,59 +112,35 @@ class TestConv3dOp(OpTest): return core.is_compiled_with_cuda() and self.use_cudnn def test_check_output(self): - if self.testcudnn(): - place = core.CUDAPlace(0) - self.check_output_with_place(place, atol=1e-5) - else: - self.check_output() + place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + self.check_output_with_place(place, atol=1e-5) def test_check_grad(self): if self.dtype == np.float16: return - if self.testcudnn(): - place = core.CUDAPlace(0) - self.check_grad_with_place( - place, - set(['Input', 'Filter']), - 'Output', - max_relative_error=0.03) - else: - self.check_grad( - set(['Input', 'Filter']), 'Output', max_relative_error=0.03) + place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + self.check_grad_with_place( + place, {'Input', 'Filter'}, 'Output', max_relative_error=0.03) def test_check_grad_no_filter(self): if self.dtype == np.float16: return - if self.testcudnn(): - place = core.CUDAPlace(0) - self.check_grad_with_place( - place, ['Input'], - 'Output', - max_relative_error=0.03, - no_grad_set=set(['Filter'])) - else: - self.check_grad( - ['Input'], - 'Output', - max_relative_error=0.03, - no_grad_set=set(['Filter'])) + place = core.CUDAPlace(0) if self.testcudnn() else core.CPUPlace() + self.check_grad_with_place( + place, ['Input'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Filter'])) def test_check_grad_no_input(self): if self.dtype == np.float16: return - if self.testcudnn(): - place = core.CUDAPlace(0) - self.check_grad_with_place( - place, ['Filter'], - 'Output', - max_relative_error=0.03, - no_grad_set=set(['Input'])) - else: - self.check_grad( - ['Filter'], - 'Output', - max_relative_error=0.03, - no_grad_set=set(['Input'])) + place = core.CUDAPlace(0) if self.testcudnn() else 
core.CPUPlace() + self.check_grad_with_place( + place, ['Input'], + 'Output', + max_relative_error=0.03, + no_grad_set=set(['Input'])) def init_test_case(self): self.pad = [0, 0, 0] diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 97e7ee6229f081ff67ca3e2aedcad0a2e3d9cabf..26fa20291b52e469066e23b5c29a8e11b40a1270 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -76,12 +76,24 @@ class TestDistRunnerBase(object): if args.mem_opt: fluid.memory_optimize(fluid.default_main_program(), skip_grads=True) - if args.is_dist: + if args.update_method == "pserver": t = self.get_transpiler(args.trainer_id, fluid.default_main_program(), args.endpoints, args.trainers, args.sync_mode, args.dc_asgd) trainer_prog = t.get_trainer_program() + elif args.update_method == "nccl2": + # transpile for nccl2 + config = fluid.DistributeTranspilerConfig() + config.mode = "nccl2" + nccl2_t = fluid.DistributeTranspiler(config=config) + nccl2_t.transpile( + args.trainer_id, + program=fluid.default_main_program(), + startup_program=fluid.default_startup_program(), + trainers=args.endpoints, + current_endpoint=args.current_endpoint) + trainer_prog = fluid.default_main_program() else: trainer_prog = fluid.default_main_program() @@ -110,11 +122,20 @@ class TestDistRunnerBase(object): len(pass_builder.all_passes()) - 2, "multi_batch_merge_pass") mypass.set_int("num_repeats", args.batch_merge_repeat) + if args.update_method == "nccl2": + num_trainers = len(args.endpoints.split(",")) + trainer_id = args.trainer_id + else: + num_trainers = 1 + trainer_id = 0 + exe = fluid.ParallelExecutor( args.use_cuda, loss_name=avg_cost.name, exec_strategy=strategy, - build_strategy=build_stra) + build_strategy=build_stra, + num_trainers=num_trainers, + trainer_id=trainer_id) feed_var_list = [ var for var in trainer_prog.global_block().vars.values() @@ -126,7 
+147,7 @@ class TestDistRunnerBase(object): def get_data(): origin_batch = next(reader_generator) - if args.is_dist and args.use_reader_alloc: + if args.update_method != "local" and args.use_reader_alloc: new_batch = [] for offset, item in enumerate(origin_batch): if offset % 2 == args.trainer_id: @@ -151,7 +172,11 @@ def runtime_main(test_class): parser.add_argument( '--role', type=str, required=True, choices=['pserver', 'trainer']) parser.add_argument('--endpoints', type=str, required=False, default="") - parser.add_argument('--is_dist', action='store_true') + parser.add_argument( + '--update_method', + type=str, + default="local", + choices=["pserver", "nccl2", "local"]) parser.add_argument('--trainer_id', type=int, required=False, default=0) parser.add_argument('--trainers', type=int, required=False, default=1) parser.add_argument( @@ -170,7 +195,7 @@ def runtime_main(test_class): args = parser.parse_args() model = test_class() - if args.role == "pserver" and args.is_dist: + if args.role == "pserver" and args.update_method == "pserver": model.run_pserver(args) else: model.run_trainer(args) @@ -208,6 +233,7 @@ class TestDistBase(unittest.TestCase): self._use_reduce = False self._dc_asgd = False # must use with async mode self._use_reader_alloc = True + self._nccl2_mode = False self._setup_config() self._after_setup_config() @@ -218,7 +244,7 @@ class TestDistBase(unittest.TestCase): def start_pserver(self, model_file, check_error_log, required_envs): ps0_ep, ps1_ep = self._ps_endpoints.split(",") - ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --is_dist" + ps_cmd = "%s %s --role pserver --endpoints %s --trainer_id 0 --current_endpoint %s --trainers %d --update_method pserver" ps0_cmd = ps_cmd % \ (self._python_interp, model_file, self._ps_endpoints, ps0_ep, self._trainers) @@ -270,7 +296,8 @@ class TestDistBase(unittest.TestCase): else: env_local = {'CPU_NUM': '1'} - envs.update(env_local) + 
env_local.update(envs) + print("local_cmd: {}, env: {}".format(cmd, env_local)) if check_error_log: err_log = open("/tmp/trainer.err.log", "wb") @@ -278,21 +305,21 @@ class TestDistBase(unittest.TestCase): cmd.split(" "), stdout=subprocess.PIPE, stderr=err_log, - env=envs) + env=env_local) else: local_proc = subprocess.Popen( cmd.split(" "), stdout=subprocess.PIPE, stderr=subprocess.PIPE, - env=envs) + env=env_local) local_out, local_err = local_proc.communicate() if check_error_log: err_log.close() - sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out)) sys.stderr.write('local_stderr: %s\n' % local_err) + sys.stderr.write('local_stdout: %s\n' % pickle.loads(local_out)) return pickle.loads(local_out) @@ -303,7 +330,7 @@ class TestDistBase(unittest.TestCase): ps0_ep, ps1_ep = self._ps_endpoints.split(",") - tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --is_dist" + tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --trainers %d --update_method pserver" tr0_cmd = tr_cmd % \ (self._python_interp, model, self._ps_endpoints, 0, ps0_ep, self._trainers) @@ -335,8 +362,8 @@ class TestDistBase(unittest.TestCase): env0.update(envs) env1.update(envs) - print("tr0_cmd:{}".format(tr0_cmd)) - print("tr1_cmd:{}".format(tr1_cmd)) + print("tr0_cmd: {}, env: {}".format(tr0_cmd, env0)) + print("tr1_cmd: {}, env: {}".format(tr1_cmd, env1)) tr0_pipe = open("/tmp/tr0_err.log", "wb") tr1_pipe = open("/tmp/tr1_err.log", "wb") @@ -351,28 +378,111 @@ class TestDistBase(unittest.TestCase): stderr=tr1_pipe, env=env1) + # Wait until trainer process terminate + while True: + stat0 = tr0_proc.poll() + time.sleep(0.1) + if stat0 is not None: + break + while True: + stat1 = tr1_proc.poll() + time.sleep(0.1) + if stat1 is not None: + break + tr0_out, tr0_err = tr0_proc.communicate() tr1_out, tr1_err = tr1_proc.communicate() # close trainer file tr0_pipe.close() tr1_pipe.close() - ps0_pipe.close() 
ps1_pipe.close() - # FIXME: use terminate() instead of sigkill. - os.kill(ps0.pid, signal.SIGKILL) - os.kill(ps1.pid, signal.SIGKILL) + ps0.terminate() ps1.terminate() + # print server log + with open("/tmp/ps0_err.log", "r") as fn: + sys.stderr.write("ps0 stderr: %s\n" % fn.read()) + with open("/tmp/ps1_err.log", "r") as fn: + sys.stderr.write("ps1 stderr: %s\n" % fn.read()) + + # print log + if stat0 == 0: + sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out)) + with open("/tmp/tr0_err.log", "r") as fn: + sys.stderr.write('trainer 0 stderr: %s\n' % fn.read()) + if stat1 == 0: + sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out)) + with open("/tmp/tr1_err.log", "r") as fn: + sys.stderr.write('trainer 1 stderr: %s\n' % fn.read()) + + return pickle.loads(tr0_out), pickle.loads(tr1_out) + + def _run_cluster_nccl2(self, model, envs, check_error_log): + # NOTE: we reuse ps_endpoints as nccl2 worker endpoints + worker_endpoints = self._ps_endpoints.split(",") + w0_ep, w1_ep = worker_endpoints + + tr_cmd = "%s %s --role trainer --endpoints %s --trainer_id %d --current_endpoint %s --update_method nccl2" + tr0_cmd = tr_cmd % \ + (self._python_interp, model, self._ps_endpoints, + 0, w0_ep) + tr1_cmd = tr_cmd % \ + (self._python_interp, model, self._ps_endpoints, + 1, w1_ep) + + if self._mem_opt: + tr0_cmd += " --mem_opt" + tr1_cmd += " --mem_opt" + if self._use_reduce: + tr0_cmd += " --use_reduce" + tr1_cmd += " --use_reduce" + if self._use_reader_alloc: + tr0_cmd += " --use_reader_alloc" + tr1_cmd += " --use_reader_alloc" + if self.__use_cuda: + tr0_cmd += " --use_cuda" + tr1_cmd += " --use_cuda" + env0 = {"CUDA_VISIBLE_DEVICES": "0"} + env1 = {"CUDA_VISIBLE_DEVICES": "1"} + else: + env0 = {'CPU_NUM': '1'} + env1 = {'CPU_NUM': '1'} + + env0.update(envs) + env1.update(envs) + + print("tr0_cmd:{}, env: {}".format(tr0_cmd, env0)) + print("tr1_cmd:{}, env: {}".format(tr1_cmd, env1)) + tr0_pipe = open("/tmp/tr0_err.log", "wb") + tr1_pipe = 
open("/tmp/tr1_err.log", "wb") + + tr0_proc = subprocess.Popen( + tr0_cmd.strip().split(" "), + stdout=subprocess.PIPE, + stderr=tr0_pipe, + env=env0) + tr1_proc = subprocess.Popen( + tr1_cmd.strip().split(" "), + stdout=subprocess.PIPE, + stderr=tr1_pipe, + env=env1) + + tr0_out, tr0_err = tr0_proc.communicate() + tr1_out, tr1_err = tr1_proc.communicate() + + # close trainer file + tr0_pipe.close() + tr1_pipe.close() + # print log - sys.stderr.write('trainer 0 stdout: %s\n' % pickle.loads(tr0_out)) sys.stderr.write('trainer 0 stderr: %s\n' % tr0_err) - sys.stderr.write('trainer 1 stdout: %s\n' % pickle.loads(tr1_out)) sys.stderr.write('trainer 1 stderr: %s\n' % tr1_err) + sys.stderr.write('trainer 0 stdout: %s\n' % tr0_out) + sys.stderr.write('trainer 1 stdout: %s\n' % tr1_out) - # return tr0_losses, tr1_losses return pickle.loads(tr0_out), pickle.loads(tr1_out) def check_with_place(self, @@ -386,21 +496,27 @@ class TestDistBase(unittest.TestCase): "PYTHONPATH": os.getenv("PYTHONPATH", ""), "LD_LIBRARY_PATH": os.getenv("LD_LIBRARY_PATH", ""), "FLAGS_fraction_of_gpu_memory_to_use": "0.15", + "FLAGS_rpc_deadline": "5000", # 5sec to fail fast "FLAGS_cudnn_deterministic": "1", - "http_proxy": "" + "http_proxy": "", + "NCCL_P2P_DISABLE": "1" } required_envs.update(need_envs) if check_error_log: - required_envs["GLOG_v"] = "7" + required_envs["GLOG_v"] = "3" required_envs["GLOG_logtostderr"] = "1" local_losses\ = self._run_local(model_file, required_envs, check_error_log) - tr0_losses, tr1_losses = self._run_cluster(model_file, required_envs, - check_error_log) + if self._nccl2_mode: + tr0_losses, tr1_losses = self._run_cluster_nccl2( + model_file, required_envs, check_error_log) + else: + tr0_losses, tr1_losses = self._run_cluster( + model_file, required_envs, check_error_log) for step_id in range(RUN_STEP): local_loss = local_losses[step_id] diff --git a/python/paddle/fluid/tests/unittests/test_dist_mnist.py b/python/paddle/fluid/tests/unittests/test_dist_mnist.py 
index 81eb651878209164b3f339cc5030dbac847942d1..630bed198f4fc382d716373ea872e24b1b45bbf3 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_mnist.py +++ b/python/paddle/fluid/tests/unittests/test_dist_mnist.py @@ -26,6 +26,19 @@ class TestDistMnist2x2(TestDistBase): self.check_with_place("dist_mnist.py", delta=1e-5) +class TestDistMnistNCCL2(TestDistBase): + def _setup_config(self): + self._sync_mode = True + self._use_reduce = False + self._use_reader_alloc = False + self._nccl2_mode = True + + def test_dist_train(self): + import paddle.fluid as fluid + if fluid.core.is_compiled_with_cuda(): + self.check_with_place("dist_mnist.py", delta=1) + + class TestDistMnist2x2Lars(TestDistBase): def _setup_config(self): self._sync_mode = True diff --git a/python/paddle/fluid/tests/unittests/test_dist_save_load.py b/python/paddle/fluid/tests/unittests/test_dist_save_load.py index ea2b554dac83988955e3a7e8919e57a4ed7a8215..4588ca7c17ba5db893f080813d299feaa47626a7 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_save_load.py +++ b/python/paddle/fluid/tests/unittests/test_dist_save_load.py @@ -44,7 +44,7 @@ class TestDistSaveLoadDense2x2(TestDistBase): required_envs.update(need_envs) if check_error_log: - required_envs["GLOG_v"] = "7" + required_envs["GLOG_v"] = "3" required_envs["GLOG_logtostderr"] = "1" model_dir = tempfile.mkdtemp() diff --git a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py index 194387bc98752e66acd2c08a4abcaddfc34ad155..d9ad4e2e2c7b8d0a99d917495fbc8efc6cbd188d 100644 --- a/python/paddle/fluid/tests/unittests/test_dist_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_dist_transpiler.py @@ -769,6 +769,7 @@ class TestNCCL2Transpile(TranspilerTest): config = fluid.DistributeTranspilerConfig() config.mode = "nccl2" + config.wait_port = False t = fluid.DistributeTranspiler(config=config) t.transpile( 0, diff --git 
a/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py new file mode 100644 index 0000000000000000000000000000000000000000..021b950b3b6245caecab22d476bbb9d6b6b45c5e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_get_tensor_from_selected_rows_op.py @@ -0,0 +1,65 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import paddle.fluid.core as core +import numpy as np +from paddle.fluid.op import Operator + + +class TestGetTensorFromSelectedRows(unittest.TestCase): + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + def check_with_place(self, place): + scope = core.Scope() + x_rows = [0, 5, 5, 4, 20] + height = 20 + row_numel = 2 + + np_array = np.ones((len(x_rows), row_numel)).astype("float32") + np_array[1, :] = 2.0 + np_array[2, :] = 3.0 + np_array[3, :] = 4.0 + + # initialize input variable X + x = scope.var('X').get_selected_rows() + x.set_rows(x_rows) + x.set_height(height) + x_tensor = x.get_tensor() + x_tensor.set(np_array, place) + + # initialize input variable Out + out = scope.var("Out").get_tensor() + + op = Operator("get_tensor_from_selected_rows", X="X", Out="Out") + + op.run(scope, place) + + out_array = np.array(out) + 
self.assertEqual((5, 2), out_array.shape) + assert (out_array == np_array).all() + + def test_check_output(self): + for place in self.get_places(): + self.check_with_place(place) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_gradient_clip.py b/python/paddle/fluid/tests/unittests/test_gradient_clip.py new file mode 100644 index 0000000000000000000000000000000000000000..e49239da6d3918211fbbc302d2c56818460b6d51 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_gradient_clip.py @@ -0,0 +1,161 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import numpy as np +import paddle +import paddle.fluid.core as core +import paddle.fluid as fluid + + +def bow_net(data, + label, + dict_dim, + emb_dim=128, + hid_dim=128, + hid_dim2=96, + class_dim=2): + """ + BOW net + This model is from https://github.com/PaddlePaddle/models: + fluid/PaddleNLP/text_classification/nets.py + """ + emb = fluid.layers.embedding( + input=data, is_sparse=True, size=[dict_dim, emb_dim]) + bow = fluid.layers.sequence_pool(input=emb, pool_type='sum') + bow_tanh = fluid.layers.tanh(bow) + fc_1 = fluid.layers.fc(input=bow_tanh, size=hid_dim, act="tanh") + fc_2 = fluid.layers.fc(input=fc_1, size=hid_dim2, act="tanh") + prediction = fluid.layers.fc(input=[fc_2], size=class_dim, act="softmax") + cost = fluid.layers.cross_entropy(input=prediction, label=label) + avg_cost = fluid.layers.mean(x=cost) + + return avg_cost + + +class TestGradientClip(unittest.TestCase): + def setUp(self): + self.word_dict = paddle.dataset.imdb.word_dict() + self.BATCH_SIZE = 2 + self.train_data = paddle.batch( + paddle.dataset.imdb.train(self.word_dict), + batch_size=self.BATCH_SIZE) + + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + def check_operators(self, place): + CLIP = 1 + + prog = fluid.framework.Program() + startup_program = fluid.framework.Program() + with fluid.program_guard( + main_program=prog, startup_program=startup_program): + image = fluid.layers.data(name='x', shape=[784], dtype='float32') + label = fluid.layers.data(name='y', shape=[1], dtype='int64') + + hidden1 = fluid.layers.fc(input=image, size=128, act='relu') + hidden2 = fluid.layers.fc(input=hidden1, size=64, act='relu') + predict = fluid.layers.fc(input=hidden2, size=10, act='softmax') + + cost = fluid.layers.cross_entropy(input=predict, label=label) + avg_cost = fluid.layers.mean(cost) + + prog_clip = prog.clone() + avg_cost_clip = 
prog_clip.block(0).var(avg_cost.name) + + p_g = fluid.backward.append_backward(loss=avg_cost) + p_g_clip = fluid.backward.append_backward(loss=avg_cost_clip) + + with fluid.program_guard( + main_program=prog_clip, startup_program=startup_program): + fluid.clip.set_gradient_clip( + fluid.clip.GradientClipByGlobalNorm(clip_norm=CLIP)) + p_g_clip = fluid.clip.append_gradient_clip_ops(p_g_clip) + + grad_list = [elem[1] for elem in p_g] + grad_clip_list = [elem[1] for elem in p_g_clip] + + train_reader = paddle.batch( + paddle.reader.shuffle( + paddle.dataset.mnist.train(), buf_size=8192), + batch_size=128) + + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[image, label], place=place) + exe.run(startup_program) + + count = 0 + for data in train_reader(): + count += 1 + if count > 5: + break + out = exe.run(prog, feed=feeder.feed(data), fetch_list=grad_list) + out_clip = exe.run(prog_clip, + feed=feeder.feed(data), + fetch_list=grad_clip_list) + global_norm = 0 + for v in out: + global_norm += np.sum(np.power(v, 2)) + global_norm = np.sqrt(global_norm) + + global_norm_clip = 0 + for v in out_clip: + global_norm_clip += np.sum(np.power(v, 2)) + global_norm_clip = np.sqrt(global_norm_clip) + + assert np.isclose( + a=global_norm_clip, b=np.minimum(global_norm, CLIP), rtol=5e-3) + + def check_sparse_gradient_clip(self, place): + prog = fluid.framework.Program() + startup_program = fluid.framework.Program() + with fluid.program_guard( + main_program=prog, startup_program=startup_program): + data = fluid.layers.data( + name="words", shape=[1], dtype="int64", lod_level=1) + label = fluid.layers.data(name="label", shape=[1], dtype="int64") + cost = bow_net(data, label, len(self.word_dict)) + + fluid.clip.set_gradient_clip( + clip=fluid.clip.GradientClipByGlobalNorm(clip_norm=5.0)) + + sgd_optimizer = fluid.optimizer.SGD(learning_rate=0.01) + sgd_optimizer.minimize(cost) + + exe = fluid.Executor(place) + feeder = fluid.DataFeeder(feed_list=[data, label], 
place=place) + exe.run(startup_program) + + data = next(self.train_data()) + val = exe.run(prog, feed=feeder.feed(data), fetch_list=[cost])[0] + self.assertEqual((1, ), val.shape) + print(val) + self.assertFalse(np.isnan(val)) + + def test_operators(self): + self.check_operators(core.CPUPlace()) + + def test_sparse_gradient_clip(self): + for place in self.get_places(): + self.check_sparse_gradient_clip(place) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_imperative.py b/python/paddle/fluid/tests/unittests/test_imperative.py new file mode 100644 index 0000000000000000000000000000000000000000..b5b6305155d1ef3dcf6ce590c221664754c5bdc8 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_imperative.py @@ -0,0 +1,52 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import sys +import numpy as np + +import paddle.fluid as fluid +from paddle.fluid import core + + +class MyLayer(fluid.imperative.PyLayer): + def __init__(self): + super(MyLayer, self).__init__() + + def forward(self, inputs): + x = fluid.layers.relu(inputs[0]) + self._x_for_debug = x + return [fluid.layers.elementwise_mul(x, x)] + + +class TestImperative(unittest.TestCase): + def test_layer(self): + with fluid.imperative.guard(): + cl = core.Layer() + cl.forward([]) + l = fluid.imperative.PyLayer() + l.forward([]) + + def test_layer_in_out(self): + with fluid.imperative.guard(): + l = MyLayer() + x = l(np.array([1.0, 2.0, -1.0], dtype=np.float32))[0] + self.assertIsNotNone(x) + sys.stderr.write("%s output: %s\n" % (x, x._numpy())) + x._backward() + sys.stderr.write("grad %s\n" % l._x_for_debug._gradient()) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_layers.py b/python/paddle/fluid/tests/unittests/test_layers.py index be51fb06a37a376f6f410336184c95981ded35dc..10e8bb5a86691d8654c5ae48794e49f30f47500d 100644 --- a/python/paddle/fluid/tests/unittests/test_layers.py +++ b/python/paddle/fluid/tests/unittests/test_layers.py @@ -846,6 +846,15 @@ class TestBook(unittest.TestCase): out = layers.cross_entropy(x, label, False, 4) self.assertIsNotNone(out) + def test_bpr_loss(self): + program = Program() + with program_guard(program): + x = layers.data(name="x", shape=[30, 10], dtype="float32") + label = layers.data(name="label", shape=[30, 1], dtype="int32") + out = layers.bpr_loss(x, label) + self.assertIsNotNone(out) + print(str(program)) + def test_expand(self): program = Program() with program_guard(program): diff --git a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py index 275e5c49d5c298a95b012582a74f8073b800991e..fa16f082880eb97f54abe8bf75e26321f72b3bd3 100644 --- 
a/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py +++ b/python/paddle/fluid/tests/unittests/test_memory_optimization_transpiler.py @@ -22,6 +22,15 @@ from paddle.fluid.framework import Program, program_guard from paddle.fluid.transpiler import memory_optimize +def _get_vars(prog): + assert (isinstance(prog, Program)) + all_vars = set() + for op in prog.global_block().ops: + all_vars.update(op.input_arg_names) + all_vars.update(op.output_arg_names) + return all_vars + + class TestControlFlowGraph(unittest.TestCase): def setUp(self): program = Program() @@ -37,11 +46,11 @@ class TestControlFlowGraph(unittest.TestCase): self.program = program def test_control_flow_graph(self): - print("before optimization") - print(str(self.program)) - result_program = memory_optimize(self.program) - print("after optimization") - print(str(result_program)) + result_program = self.program.clone() + memory_optimize(self.program) + old_vars = _get_vars(self.program) + new_vars = _get_vars(result_program) + self.assertTrue(old_vars != new_vars) class TestMemoryTranspiler2(unittest.TestCase): @@ -58,14 +67,22 @@ class TestMemoryTranspiler2(unittest.TestCase): avg_cost = layers.mean(cost) opt = optimizer.SGD(learning_rate=0.001) opt.minimize(avg_cost) + self.skip_set = set([cost.name, fc.name]) self.program = program def test_inplace_ops(self): - print("before optimization") - print(str(self.program)) - result_program = memory_optimize(self.program) - print("after optimization") - print(str(result_program)) + result_program = self.program.clone() + memory_optimize(self.program) + old_vars = _get_vars(self.program) + new_vars = _get_vars(result_program) + self.assertTrue(old_vars != new_vars) + + def test_skip_opt(self): + result_program = self.program.clone() + memory_optimize(self.program, skip_opt_set=self.skip_set) + old_vars = _get_vars(self.program) + new_vars = _get_vars(result_program) + self.assertTrue(old_vars != new_vars) class 
TestMemoryTranspiler3(unittest.TestCase): diff --git a/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py b/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py new file mode 100644 index 0000000000000000000000000000000000000000..ce64da0478d3997f4889ca942c67e0defac80b45 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_merge_selectedrows_op.py @@ -0,0 +1,73 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +from __future__ import print_function + +import unittest +import paddle.fluid.core as core +import numpy as np +from paddle.fluid.op import Operator + + +class TestMergeSelectedRows(unittest.TestCase): + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + def check_with_place(self, place): + scope = core.Scope() + x_rows = [0, 5, 5, 4, 20] + out_rows = [0, 4, 5, 20] + height = 20 + row_numel = 2 + + np_array = np.ones((len(x_rows), row_numel)).astype("float32") + np_array[1, :] = 2.0 + np_array[2, :] = 3.0 + np_array[3, :] = 4.0 + + # initialize input variable X + x = scope.var('X').get_selected_rows() + x.set_rows(x_rows) + x.set_height(height) + x_tensor = x.get_tensor() + x_tensor.set(np_array, place) + + # initialize input variable Out + out = scope.var("Out").get_selected_rows() + + op = Operator("merge_selected_rows", X="X", Out="Out") + + op.run(scope, place) + + self.assertEqual(out.rows(), out_rows) + self.assertEqual(out.height(), height) + + out_array = np.array(out.get_tensor()) + self.assertEqual((4, 2), out_array.shape) + + assert (out_array[0, :] == 1.0).all() + assert (out_array[1, :] == 4.0).all() + assert (out_array[2, :] == 5.0).all() + assert (out_array[3, :] == 1.0).all() + + def test_check_output(self): + for place in self.get_places(): + self.check_with_place(place) + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py new file mode 100644 index 0000000000000000000000000000000000000000..544fe4b4f81909b69a05d9751316e3d3137fdc45 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_yolov3_loss_op.py @@ -0,0 +1,215 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import division + +import unittest +import numpy as np +from op_test import OpTest + +from paddle.fluid import core + + +def sigmoid(x): + return 1.0 / (1.0 + np.exp(-1.0 * x)) + + +def mse(x, y, num): + return ((y - x)**2).sum() / num + + +def bce(x, y, mask): + x = x.reshape((-1)) + y = y.reshape((-1)) + mask = mask.reshape((-1)) + + error_sum = 0.0 + count = 0 + for i in range(x.shape[0]): + if mask[i] > 0: + error_sum += y[i] * np.log(x[i]) + (1 - y[i]) * np.log(1 - x[i]) + count += 1 + return error_sum / (-1.0 * count) + + +def box_iou(box1, box2): + b1_x1 = box1[0] - box1[2] / 2 + b1_x2 = box1[0] + box1[2] / 2 + b1_y1 = box1[1] - box1[3] / 2 + b1_y2 = box1[1] + box1[3] / 2 + b2_x1 = box2[0] - box2[2] / 2 + b2_x2 = box2[0] + box2[2] / 2 + b2_y1 = box2[1] - box2[3] / 2 + b2_y2 = box2[1] + box2[3] / 2 + + b1_area = (b1_x2 - b1_x1) * (b1_y2 - b1_y1) + b2_area = (b2_x2 - b2_x1) * (b2_y2 - b2_y1) + + inter_rect_x1 = max(b1_x1, b2_x1) + inter_rect_y1 = max(b1_y1, b2_y1) + inter_rect_x2 = min(b1_x2, b2_x2) + inter_rect_y2 = min(b1_y2, b2_y2) + inter_area = max(inter_rect_x2 - inter_rect_x1, 0) * max( + inter_rect_y2 - inter_rect_y1, 0) + + return inter_area / (b1_area + b2_area + inter_area) + + +def build_target(gtboxs, gtlabel, attrs, grid_size): + n, b, _ = gtboxs.shape + ignore_thresh = attrs["ignore_thresh"] + anchors = attrs["anchors"] + class_num = attrs["class_num"] + an_num = len(anchors) // 2 + obj_mask = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + noobj_mask = np.ones((n, an_num, grid_size, 
grid_size)).astype('float32') + tx = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + ty = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + tw = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + th = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + tconf = np.zeros((n, an_num, grid_size, grid_size)).astype('float32') + tcls = np.zeros( + (n, an_num, grid_size, grid_size, class_num)).astype('float32') + + for i in range(n): + for j in range(b): + if gtboxs[i, j, :].sum() == 0: + continue + + gt_label = gtlabel[i, j] + gx = gtboxs[i, j, 0] * grid_size + gy = gtboxs[i, j, 1] * grid_size + gw = gtboxs[i, j, 2] * grid_size + gh = gtboxs[i, j, 3] * grid_size + + gi = int(gx) + gj = int(gy) + + gtbox = [0, 0, gw, gh] + max_iou = 0 + for k in range(an_num): + anchor_box = [0, 0, anchors[2 * k], anchors[2 * k + 1]] + iou = box_iou(gtbox, anchor_box) + if iou > max_iou: + max_iou = iou + best_an_index = k + if iou > ignore_thresh: + noobj_mask[i, best_an_index, gj, gi] = 0 + + obj_mask[i, best_an_index, gj, gi] = 1 + noobj_mask[i, best_an_index, gj, gi] = 0 + tx[i, best_an_index, gj, gi] = gx - gi + ty[i, best_an_index, gj, gi] = gy - gj + tw[i, best_an_index, gj, gi] = np.log(gw / anchors[2 * + best_an_index]) + th[i, best_an_index, gj, gi] = np.log( + gh / anchors[2 * best_an_index + 1]) + tconf[i, best_an_index, gj, gi] = 1 + tcls[i, best_an_index, gj, gi, gt_label] = 1 + + return (tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask) + + +def YoloV3Loss(x, gtbox, gtlabel, attrs): + n, c, h, w = x.shape + an_num = len(attrs['anchors']) // 2 + class_num = attrs["class_num"] + x = x.reshape((n, an_num, 5 + class_num, h, w)).transpose((0, 1, 3, 4, 2)) + pred_x = sigmoid(x[:, :, :, :, 0]) + pred_y = sigmoid(x[:, :, :, :, 1]) + pred_w = x[:, :, :, :, 2] + pred_h = x[:, :, :, :, 3] + pred_conf = sigmoid(x[:, :, :, :, 4]) + pred_cls = sigmoid(x[:, :, :, :, 5:]) + + tx, ty, tw, th, tconf, tcls, obj_mask, noobj_mask = 
build_target( + gtbox, gtlabel, attrs, x.shape[2]) + + obj_mask_expand = np.tile( + np.expand_dims(obj_mask, 4), (1, 1, 1, 1, int(attrs['class_num']))) + loss_x = mse(pred_x * obj_mask, tx * obj_mask, obj_mask.sum()) + loss_y = mse(pred_y * obj_mask, ty * obj_mask, obj_mask.sum()) + loss_w = mse(pred_w * obj_mask, tw * obj_mask, obj_mask.sum()) + loss_h = mse(pred_h * obj_mask, th * obj_mask, obj_mask.sum()) + loss_conf_target = bce(pred_conf * obj_mask, tconf * obj_mask, obj_mask) + loss_conf_notarget = bce(pred_conf * noobj_mask, tconf * noobj_mask, + noobj_mask) + loss_class = bce(pred_cls * obj_mask_expand, tcls * obj_mask_expand, + obj_mask_expand) + + return attrs['loss_weight_xy'] * (loss_x + loss_y) \ + + attrs['loss_weight_wh'] * (loss_w + loss_h) \ + + attrs['loss_weight_conf_target'] * loss_conf_target \ + + attrs['loss_weight_conf_notarget'] * loss_conf_notarget \ + + attrs['loss_weight_class'] * loss_class + + +class TestYolov3LossOp(OpTest): + def setUp(self): + self.loss_weight_xy = 1.0 + self.loss_weight_wh = 1.0 + self.loss_weight_conf_target = 1.0 + self.loss_weight_conf_notarget = 1.0 + self.loss_weight_class = 1.0 + self.initTestCase() + self.op_type = 'yolov3_loss' + x = np.random.random(size=self.x_shape).astype('float32') + gtbox = np.random.random(size=self.gtbox_shape).astype('float32') + gtlabel = np.random.randint(0, self.class_num, + self.gtbox_shape[:2]).astype('int32') + + self.attrs = { + "anchors": self.anchors, + "class_num": self.class_num, + "ignore_thresh": self.ignore_thresh, + "loss_weight_xy": self.loss_weight_xy, + "loss_weight_wh": self.loss_weight_wh, + "loss_weight_conf_target": self.loss_weight_conf_target, + "loss_weight_conf_notarget": self.loss_weight_conf_notarget, + "loss_weight_class": self.loss_weight_class, + } + + self.inputs = {'X': x, 'GTBox': gtbox, 'GTLabel': gtlabel} + self.outputs = { + 'Loss': np.array( + [YoloV3Loss(x, gtbox, gtlabel, self.attrs)]).astype('float32') + } + + def test_check_output(self): + 
place = core.CPUPlace() + self.check_output_with_place(place, atol=1e-3) + + def test_check_grad_ignore_gtbox(self): + place = core.CPUPlace() + self.check_grad_with_place( + place, ['X'], + 'Loss', + no_grad_set=set(["GTBox", "GTLabel"]), + max_relative_error=0.06) + + def initTestCase(self): + self.anchors = [10, 13, 12, 12] + self.class_num = 10 + self.ignore_thresh = 0.5 + self.x_shape = (5, len(self.anchors) // 2 * (5 + self.class_num), 7, 7) + self.gtbox_shape = (5, 10, 4) + self.loss_weight_xy = 2.5 + self.loss_weight_wh = 0.8 + self.loss_weight_conf_target = 1.5 + self.loss_weight_conf_notarget = 0.5 + self.loss_weight_class = 1.2 + + +if __name__ == "__main__": + unittest.main() diff --git a/python/paddle/fluid/transpiler/distribute_transpiler.py b/python/paddle/fluid/transpiler/distribute_transpiler.py index 5d348f0995fbff7bbefa3324caffb448c98f552f..d21ec42dccde80fd354a730274edb04f654113c3 100644 --- a/python/paddle/fluid/transpiler/distribute_transpiler.py +++ b/python/paddle/fluid/transpiler/distribute_transpiler.py @@ -125,13 +125,14 @@ def slice_variable(var_list, slice_count, min_block_size): class DistributeTranspilerConfig(object): """ - slice_var_up (bool): Do Tensor slice for pservers, default is True. - split_method (PSDispatcher): RoundRobin or HashName can be used - try to choose the best method to balance loads for pservers. - min_block_size (int): Minimum splitted element number in block. - According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156 - We can use bandwidth effiently when data size is larger than 2MB.If you - want to change it, please be sure you see the slice_variable function. + Args: + slice_var_up (bool): Do Tensor slice for pservers, default is True. + split_method (PSDispatcher): RoundRobin or HashName can be used + try to choose the best method to balance loads for pservers. + min_block_size (int): Minimum splitted element number in block. 
+ According:https://github.com/PaddlePaddle/Paddle/issues/8638#issuecomment-369912156 + We can use bandwidth efficiently when data size is larger than 2MB. If you + want to change it, please be sure you see the slice_variable function. """ slice_var_up = True @@ -141,6 +142,7 @@ class DistributeTranspilerConfig(object): # supported modes: pserver, nccl2 mode = "pserver" print_log = False + wait_port = True class DistributeTranspiler(object): @@ -163,35 +165,34 @@ class DistributeTranspiler(object): Examples: .. code-block:: python - # for pserver mode - pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" - current_endpoint = "192.168.0.1:6174" - trainer_id = 0 - trainers = 4 - role = os.getenv("PADDLE_TRAINING_ROLE") - - t = fluid.DistributeTranspiler() - t.transpile( - trainer_id, pservers=pserver_endpoints, trainers=trainers) - if role == "PSERVER": - pserver_program = t.get_pserver_program(current_endpoint) - pserver_startup_program = t.get_startup_program(current_endpoint, + # for pserver mode + pserver_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + trainer_endpoints = "192.168.0.1:6174,192.168.0.2:6174" + current_endpoint = "192.168.0.1:6174" + trainer_id = 0 + trainers = 4 + role = os.getenv("PADDLE_TRAINING_ROLE") + t = fluid.DistributeTranspiler() + t.transpile( + trainer_id, pservers=pserver_endpoints, trainers=trainers) + if role == "PSERVER": + pserver_program = t.get_pserver_program(current_endpoint) + pserver_startup_program = t.get_startup_program(current_endpoint, pserver_program) - elif role == "TRAINER": - trainer_program = t.get_trainer_program() - - # for nccl2 mode - config = fluid.DistributeTranspilerConfig() - config.mode = "nccl2" - t = fluid.DistributeTranspiler(config=config) - t.transpile(trainer_id, workers=workers, current_endpoint=curr_ep) - exe = fluid.ParallelExecutor( - use_cuda, - loss_name=loss_var.name, - num_trainers=len(trainers.split(",)), - trainer_id=trainer_id - ) 
+ elif role == "TRAINER": + trainer_program = t.get_trainer_program() + + # for nccl2 mode + config = fluid.DistributeTranspilerConfig() + config.mode = "nccl2" + t = fluid.DistributeTranspiler(config=config) + t.transpile(trainer_id, workers=workers, current_endpoint=curr_ep) + exe = fluid.ParallelExecutor( + use_cuda, + loss_name=loss_var.name, + num_trainers=len(trainers.split(",")), + trainer_id=trainer_id + ) """ def __init__(self, config=None): @@ -213,13 +214,16 @@ class DistributeTranspiler(object): trainer_id, trainers, current_endpoint, - startup_program=None): + startup_program=None, + wait_port=True): if not startup_program: startup_program = default_startup_program() if trainer_id >= 0: worker_endpoints = trainers.split(",") # send NCCL_ID to others or recv from trainer 0 worker_endpoints.remove(current_endpoint) + if trainer_id == 0 and wait_port: + wait_server_ready(worker_endpoints) nccl_id_var = startup_program.global_block().create_var( name="NCCLID", persistable=True, type=core.VarDesc.VarType.RAW) @@ -301,11 +305,13 @@ class DistributeTranspiler(object): if self.config.mode == "nccl2": assert (isinstance(trainers, str)) + self.origin_program._trainers_endpoints = trainers.split(",") self._transpile_nccl2( trainer_id, trainers, current_endpoint, - startup_program=startup_program) + startup_program=startup_program, + wait_port=self.config.wait_port) return self.trainer_num = trainers @@ -651,9 +657,6 @@ class DistributeTranspiler(object): # NOTE: assume blocks of the same variable is not distributed # on the same pserver, only change param/grad varnames for # trainers to fetch. - sys.stderr.write("get_pserver_program() is deprecated, call \ -get_pserver_programs() to get pserver main and startup \ -in a single call.") # step1 pserver_program = Program() pserver_program.random_seed = self.origin_program.random_seed @@ -921,18 +924,6 @@ in a single call.") Returns: Program: parameter server side startup program. 
""" - sys.stderr.write("get_startup_program() is deprecated, call \ -get_pserver_programs() to get pserver main and startup \ -in a single call.") - if pserver_program != None: - sys.stderr.write("passing pserver_program to get_startup_program() \ -is deprecated, you can use new API get_pserver_programs() to \ -get both pserver main program and startup program.") - if startup_program != None: - sys.stderr.write("passing startup_program to get_startup_program() \ -is deprecated, use fluid.program_guard() or pass this argument \ -to transpile() call.") - s_prog = Program() orig_s_prog = self.startup_program s_prog.random_seed = orig_s_prog.random_seed diff --git a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py index c9f1be934773cc28f026f2b867b9e3a4f7aa8472..95aafec05361a8b66b849268c7a738bb2ee5da86 100755 --- a/python/paddle/fluid/transpiler/memory_optimization_transpiler.py +++ b/python/paddle/fluid/transpiler/memory_optimization_transpiler.py @@ -14,6 +14,7 @@ from __future__ import print_function +import six from collections import defaultdict, MutableSet from .. import core from ... 
import compat as cpt @@ -470,8 +471,21 @@ def memory_optimize(input_program, Returns: None """ + + def to_name_str(var): + if isinstance(var, Variable): + return var.desc.name() + elif isinstance(var, str): + return var + elif isinstance(var, six.string_types): + return str(var) + else: + raise TypeError(str(var) + " should be Variable or str") + if level != 0 and level != 1: raise ValueError("only support opt_level 0 or 1.") + if skip_opt_set is not None and not isinstance(skip_opt_set, set): + raise ValueError("only support skip_opt_set as set.") global PRINT_LOG PRINT_LOG = print_log if skip_grads: @@ -486,6 +500,8 @@ def memory_optimize(input_program, skip_opt_set = grad_set else: skip_opt_set.update(grad_set) + if skip_opt_set is not None: + skip_opt_set = set(map(to_name_str, skip_opt_set)) cfgs = _get_cfgs(input_program) for cfg in cfgs: cfg.memory_optimize(skip_opt_set=skip_opt_set, level=level) diff --git a/python/setup.py.in b/python/setup.py.in index 5aee26b63832889272cde09c553b4615efb8872a..0eb69cdb5c7d140527dba7a648728750bfb404f7 100644 --- a/python/setup.py.in +++ b/python/setup.py.in @@ -101,6 +101,7 @@ packages=['paddle', 'paddle.dataset', 'paddle.reader', 'paddle.fluid', + 'paddle.fluid.imperative', 'paddle.fluid.proto', 'paddle.fluid.proto.profiler', 'paddle.fluid.layers', diff --git a/tools/print_signatures.py b/tools/print_signatures.py index e2805c4e7e6aa26a5865b64a874feef672bf9b36..7e61dde0a446cf5bfe656105ffd2472f03576f05 100644 --- a/tools/print_signatures.py +++ b/tools/print_signatures.py @@ -15,7 +15,7 @@ Print all signature of a python module in alphabet order. 
Usage: - ./print_signature "paddle.fluid" > signature.txt + ./print_signature "paddle.fluid,paddle.reader" > signature.txt """ from __future__ import print_function @@ -27,6 +27,8 @@ import pydoc member_dict = collections.OrderedDict() +experimental_namespace = {"paddle.fluid.imperative"} + def visit_member(parent_name, member): cur_name = ".".join([parent_name, member.__name__]) @@ -43,13 +45,16 @@ def visit_member(parent_name, member): line.strip() for line in pydoc.render_doc(member).split('\n') if "->" in line ]) - + elif inspect.isgetsetdescriptor(member): + return else: raise RuntimeError("Unsupported generate signature of member, type {0}". format(str(type(member)))) def visit_all_module(mod): + if (mod.__name__ in experimental_namespace): + return for member_name in ( name for name in (mod.__all__ if hasattr(mod, "__all__") else dir(mod)) @@ -63,7 +68,9 @@ def visit_all_module(mod): visit_member(mod.__name__, instance) -visit_all_module(importlib.import_module(sys.argv[1])) +modules = sys.argv[1].split(",") +for m in modules: + visit_all_module(importlib.import_module(m)) for name in member_dict: print(name, member_dict[name])