diff --git a/.github/PULL_REQUEST_TEMPLATE.md b/.github/PULL_REQUEST_TEMPLATE.md
index 9b5014f5aa6ee482cdae190371bca3b4bb3363ba..5ba1cd6ba7cb2c942015e78fa2fe8fbcf97bbe59 100644
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@@ -1,22 +1,8 @@
-#### Required(必填, multiple choices, two at most)
-- **PR type(PR 类型) is ( ):**
-A. New features(新功能)---------------- D. Performance optimization(性能优化)
-B. Bug fixes(问题修复)------------------ E. Breaking changes(向后不兼容的改变)
-C. Function optimization(功能优化)------F. Others(其它)
-
-- **PR changes(改动点)is ( ):**
-A. OPs(operators)---------------------- C. Docs(文档)
-B. APIs(接口)--------------------------- D. Others(其它)
-
-- **Use one sentence to describe what this PR does.(简述本次PR的目的和改动)**
-
------------------------
-#### Optional(选填, If None, please delete it)
-
-- **Describe what this PR does in detail. If this PR fixes an issue, please give the issue id.**
-
-
-- **If you modified docs, please make sure that both Chinese and English docs were modified and provide a preview screenshot. (文档必填)**
-
-
-- **Please write down other information you want to tell reviewers.**
+
+
+PR types:
+
+
+PR changes:
+
+Describe:
diff --git a/paddle/fluid/framework/data_feed.cc b/paddle/fluid/framework/data_feed.cc
index 7ac023f140ecbd209e902ba67dd64bf8f5fef806..794ddf14dad7816079660261be86f5ee49c587ac 100644
--- a/paddle/fluid/framework/data_feed.cc
+++ b/paddle/fluid/framework/data_feed.cc
@@ -41,44 +41,44 @@ namespace paddle {
 namespace framework {
 
 void RecordCandidateList::ReSize(size_t length) {
-  _mutex.lock();
-  _capacity = length;
-  CHECK(_capacity > 0);  // NOLINT
-  _candidate_list.clear();
-  _candidate_list.resize(_capacity);
-  _full = false;
-  _cur_size = 0;
-  _total_size = 0;
-  _mutex.unlock();
+  mutex_.lock();
+  capacity_ = length;
+  CHECK(capacity_ > 0);  // NOLINT
+  candidate_list_.clear();
+  candidate_list_.resize(capacity_);
+  full_ = false;
+  cur_size_ = 0;
+  total_size_ = 0;
+  mutex_.unlock();
 }
 
 void RecordCandidateList::ReInit() {
-  _mutex.lock();
-  _full = false;
-  _cur_size = 0;
-  _total_size = 0;
-  _mutex.unlock();
+  mutex_.lock();
+  full_ = false;
+  cur_size_ = 0;
+  total_size_ = 0;
+  mutex_.unlock();
 }
 
 void RecordCandidateList::AddAndGet(const Record& record,
                                     RecordCandidate* result) {
-  _mutex.lock();
+  mutex_.lock();
   size_t index = 0;
-  ++_total_size;
+  ++total_size_;
   auto fleet_ptr = FleetWrapper::GetInstance();
-  if (!_full) {
-    _candidate_list[_cur_size++] = record;
-    _full = (_cur_size == _capacity);
+  if (!full_) {
+    candidate_list_[cur_size_++] = record;
+    full_ = (cur_size_ == capacity_);
   } else {
-    CHECK(_cur_size == _capacity);
-    index = fleet_ptr->LocalRandomEngine()() % _total_size;
-    if (index < _capacity) {
-      _candidate_list[index] = record;
+    CHECK(cur_size_ == capacity_);
+    index = fleet_ptr->LocalRandomEngine()() % total_size_;
+    if (index < capacity_) {
+      candidate_list_[index] = record;
     }
   }
-  index = fleet_ptr->LocalRandomEngine()() % _cur_size;
-  *result = _candidate_list[index];
-  _mutex.unlock();
+  index = fleet_ptr->LocalRandomEngine()() % cur_size_;
+  *result = candidate_list_[index];
+  mutex_.unlock();
 }
 
 void DataFeed::AddFeedVar(Variable* var, const std::string& name) {
@@ -1452,7 +1452,11 @@ void PaddleBoxDataFeed::PutToFeedVec(const std::vector<PvInstance>& pv_vec) {
 int PaddleBoxDataFeed::GetCurrentPhase() {
 #ifdef PADDLE_WITH_BOX_PS
   auto box_ptr = paddle::framework::BoxWrapper::GetInstance();
-  return box_ptr->PassFlag();  // join: 1, update: 0
+  if (box_ptr->Mode() == 1) {  // For AucRunner
+    return 1;
+  } else {
+    return box_ptr->Phase();
+  }
 #else
   LOG(WARNING) << "It should be complied with BOX_PS...";
   return current_phase_;
diff --git a/paddle/fluid/framework/data_feed.h b/paddle/fluid/framework/data_feed.h
index 8390453d318f2e7b64f4dccc1b45fa8f07259cdf..b4b8f465742254e6942235ab71db661a5a8255be 100644
--- a/paddle/fluid/framework/data_feed.h
+++ b/paddle/fluid/framework/data_feed.h
@@ -27,6 +27,7 @@ limitations under the License. */
 #include
 #include  // NOLINT
 #include
+#include
 #include
 #include
 
@@ -34,6 +35,7 @@ limitations under the License. */
 #include "paddle/fluid/framework/blocking_queue.h"
 #include "paddle/fluid/framework/channel.h"
 #include "paddle/fluid/framework/data_feed.pb.h"
+#include "paddle/fluid/framework/fleet/fleet_wrapper.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/reader.h"
 #include "paddle/fluid/framework/variable.h"
@@ -484,13 +486,25 @@ paddle::framework::Archive<AR>& operator>>(paddle::framework::Archive<AR>& ar,
 
 struct RecordCandidate {
   std::string ins_id_;
-  std::unordered_multimap<uint16_t, FeatureKey> feas;
+  std::unordered_multimap<uint16_t, FeatureKey> feas_;
+  size_t shadow_index_ = -1;  // Optimization for Reservoir Sample
+
+  RecordCandidate() {}
+  RecordCandidate(const Record& rec,
+                  const std::unordered_set<uint16_t>& slot_index_to_replace) {
+    for (const auto& fea : rec.uint64_feasigns_) {
+      if (slot_index_to_replace.find(fea.slot()) !=
+          slot_index_to_replace.end()) {
+        feas_.insert({fea.slot(), fea.sign()});
+      }
+    }
+  }
 
   RecordCandidate& operator=(const Record& rec) {
-    feas.clear();
+    feas_.clear();
     ins_id_ = rec.ins_id_;
     for (auto& fea : rec.uint64_feasigns_) {
-      feas.insert({fea.slot(), fea.sign()});
+      feas_.insert({fea.slot(), fea.sign()});
     }
     return *this;
   }
@@ -499,22 +513,67 @@ struct RecordCandidate {
 class RecordCandidateList {
  public:
   RecordCandidateList() = default;
-  RecordCandidateList(const RecordCandidateList&) = delete;
-  RecordCandidateList& operator=(const RecordCandidateList&) = delete;
+  RecordCandidateList(const RecordCandidateList&) {}
+
+  size_t Size() { return cur_size_; }
   void ReSize(size_t length);
 
   void ReInit();
+  void ReInitPass() {
+    for (size_t i = 0; i < cur_size_; ++i) {
+      if (candidate_list_[i].shadow_index_ != i) {
+        candidate_list_[i].ins_id_ =
+            candidate_list_[candidate_list_[i].shadow_index_].ins_id_;
+        candidate_list_[i].feas_.swap(
+            candidate_list_[candidate_list_[i].shadow_index_].feas_);
+        candidate_list_[i].shadow_index_ = i;
+      }
+    }
+    candidate_list_.resize(cur_size_);
+  }
 
   void AddAndGet(const Record& record, RecordCandidate* result);
+  void AddAndGet(const Record& record, size_t& index_result) {  // NOLINT
+    // std::unique_lock<std::mutex> lock(mutex_);
+    size_t index = 0;
+    ++total_size_;
+    auto fleet_ptr = FleetWrapper::GetInstance();
+    if (!full_) {
+      candidate_list_.emplace_back(record, slot_index_to_replace_);
+      candidate_list_.back().shadow_index_ = cur_size_;
+      ++cur_size_;
+      full_ = (cur_size_ == capacity_);
+    } else {
+      index = fleet_ptr->LocalRandomEngine()() % total_size_;
+      if (index < capacity_) {
+        candidate_list_.emplace_back(record, slot_index_to_replace_);
+        candidate_list_[index].shadow_index_ = candidate_list_.size() - 1;
+      }
+    }
+    index = fleet_ptr->LocalRandomEngine()() % cur_size_;
+    index_result = candidate_list_[index].shadow_index_;
+  }
+  const RecordCandidate& Get(size_t index) const {
+    PADDLE_ENFORCE_LT(
+        index, candidate_list_.size(),
+        platform::errors::OutOfRange("Your index [%lu] exceeds the number of "
+                                     "elements in candidate_list[%lu].",
+                                     index, candidate_list_.size()));
+    return candidate_list_[index];
+  }
+  void SetSlotIndexToReplace(
+      const std::unordered_set<uint16_t>& slot_index_to_replace) {
+    slot_index_to_replace_ = slot_index_to_replace;
+  }
 
  private:
-  size_t _capacity = 0;
-  std::mutex _mutex;
-  bool _full = false;
-  size_t _cur_size = 0;
-  size_t _total_size = 0;
-  std::vector<RecordCandidate> _candidate_list;
+  size_t capacity_ = 0;
+  std::mutex mutex_;
+  bool full_ = false;
+  size_t cur_size_ = 0;
+  size_t total_size_ = 0;
+  std::vector<RecordCandidate> candidate_list_;
+  std::unordered_set<uint16_t> slot_index_to_replace_;
 };
 
 template <class AR>
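The renamed RecordCandidateList above is a reservoir sampler: the first capacity_ records fill the pool, after which the t-th record evicts a uniformly chosen entry with probability capacity_ / t, so every record of a pass survives with equal probability. The new shadow_index_ field is the optimization the comment names: AddAndGet(record, index_result) only records which pool entry a record would occupy, and ReInitPass() compacts the pool once per pass instead of copying on every insertion. A minimal standalone sketch of the sampling rule, assuming illustrative names and std::mt19937_64 in place of FleetWrapper::LocalRandomEngine():

    // Minimal reservoir-sampling pool; a sketch of the AddAndGet logic above.
    // Names and the engine are illustrative, not the Paddle implementation.
    #include <cstddef>
    #include <random>
    #include <vector>

    template <typename T>
    class ReservoirPool {
     public:
      explicit ReservoirPool(size_t capacity) : capacity_(capacity) {}

      // After n calls, each of the n items remains in the pool with
      // probability capacity_ / n (classic reservoir sampling).
      void Add(const T& item) {
        ++total_;
        if (pool_.size() < capacity_) {
          pool_.push_back(item);
        } else {
          size_t idx = engine_() % total_;
          if (idx < capacity_) pool_[idx] = item;
        }
      }

      // Uniformly sample one element currently held in the pool.
      const T& Sample() { return pool_[engine_() % pool_.size()]; }

     private:
      size_t capacity_;
      size_t total_ = 0;
      std::vector<T> pool_;
      std::mt19937_64 engine_{std::random_device{}()};
    };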
diff --git a/paddle/fluid/framework/data_set.cc b/paddle/fluid/framework/data_set.cc
index 48cb4131d584244563a00dfa9dce54f9007effb8..712592357cbd708bbca49532e93a25222fead78a 100644
--- a/paddle/fluid/framework/data_set.cc
+++ b/paddle/fluid/framework/data_set.cc
@@ -1141,13 +1141,15 @@ void MultiSlotDataset::MergeByInsId() {
   VLOG(3) << "MultiSlotDataset::MergeByInsId end";
 }
 
-void MultiSlotDataset::GetRandomData(const std::set<uint16_t>& slots_to_replace,
-                                     std::vector<Record>* result) {
+void MultiSlotDataset::GetRandomData(
+    const std::unordered_set<uint16_t>& slots_to_replace,
+    std::vector<Record>* result) {
   int debug_erase_cnt = 0;
   int debug_push_cnt = 0;
   auto multi_slot_desc = data_feed_desc_.multi_slot_desc();
   slots_shuffle_rclist_.ReInit();
-  for (const auto& rec : slots_shuffle_original_data_) {
+  const auto& slots_shuffle_original_data = GetSlotsOriginalData();
+  for (const auto& rec : slots_shuffle_original_data) {
     RecordCandidate rand_rec;
     Record new_rec = rec;
     slots_shuffle_rclist_.AddAndGet(rec, &rand_rec);
@@ -1161,7 +1163,7 @@ void MultiSlotDataset::GetRandomData(const std::set<uint16_t>& slots_to_replace,
       }
     }
     for (auto slot : slots_to_replace) {
-      auto range = rand_rec.feas.equal_range(slot);
+      auto range = rand_rec.feas_.equal_range(slot);
       for (auto it = range.first; it != range.second; ++it) {
         new_rec.uint64_feasigns_.push_back({it->second, it->first});
         debug_push_cnt += 1;
@@ -1173,9 +1175,9 @@ void MultiSlotDataset::GetRandomData(const std::set<uint16_t>& slots_to_replace,
           << " repush feasign num: " << debug_push_cnt;
 }
 
-// slots shuffle to input_channel_ with needed-shuffle slots
-void MultiSlotDataset::SlotsShuffle(
-    const std::set<std::string>& slots_to_replace) {
+void MultiSlotDataset::PreprocessChannel(
+    const std::set<std::string>& slots_to_replace,
+    std::unordered_set<uint16_t>& index_slots) {  // NOLINT
   int out_channel_size = 0;
   if (cur_channel_ == 0) {
     for (size_t i = 0; i < multi_output_channel_.size(); ++i) {
@@ -1189,20 +1191,14 @@ void MultiSlotDataset::SlotsShuffle(
   VLOG(2) << "DatasetImpl<T>::SlotsShuffle() begin with input channel size: "
          << input_channel_->Size()
          << " output channel size: " << out_channel_size;
-  if (!slots_shuffle_fea_eval_) {
-    VLOG(3) << "DatasetImpl<T>::SlotsShuffle() end,"
-               "fea eval mode off, need to set on for slots shuffle";
-    return;
-  }
+
   if ((!input_channel_ || input_channel_->Size() == 0) &&
      slots_shuffle_original_data_.size() == 0 && out_channel_size == 0) {
    VLOG(3) << "DatasetImpl<T>::SlotsShuffle() end, no data to slots shuffle";
    return;
  }
-  platform::Timer timeline;
-  timeline.Start();
+
   auto multi_slot_desc = data_feed_desc_.multi_slot_desc();
-  std::set<uint16_t> index_slots;
   for (int i = 0; i < multi_slot_desc.slots_size(); ++i) {
     std::string cur_slot = multi_slot_desc.slots(i).name();
     if (slots_to_replace.find(cur_slot) != slots_to_replace.end()) {
@@ -1287,6 +1283,19 @@ void MultiSlotDataset::SlotsShuffle(
   }
   CHECK(input_channel_->Size() == 0)
       << "input channel should be empty before slots shuffle";
+}
+
+// slots shuffle to input_channel_ with needed-shuffle slots
+void MultiSlotDataset::SlotsShuffle(
+    const std::set<std::string>& slots_to_replace) {
+  PADDLE_ENFORCE_EQ(slots_shuffle_fea_eval_, true,
+                    platform::errors::PreconditionNotMet(
+                        "fea eval mode off, need to set on for slots shuffle"));
+  platform::Timer timeline;
+  timeline.Start();
+  std::unordered_set<uint16_t> index_slots;
+  PreprocessChannel(slots_to_replace, index_slots);
+
   std::vector<Record> random_data;
   random_data.clear();
   // get slots shuffled random_data
diff --git a/paddle/fluid/framework/data_set.h b/paddle/fluid/framework/data_set.h
index bb6210b1f014d62f8174c275b32e1d3ae73fc506..fdfacccac285e1c0c3e1f9656eb02552d0016d76 100644
--- a/paddle/fluid/framework/data_set.h
+++ b/paddle/fluid/framework/data_set.h
@@ -67,6 +67,7 @@ class Dataset {
   virtual void SetParseContent(bool parse_content) = 0;
   virtual void SetParseLogKey(bool parse_logkey) = 0;
   virtual void SetEnablePvMerge(bool enable_pv_merge) = 0;
+  virtual bool EnablePvMerge() = 0;
   virtual void SetMergeBySid(bool is_merge) = 0;
   // set merge by ins id
   virtual void SetMergeByInsId(int merge_size) = 0;
@@ -108,10 +109,7 @@ class Dataset {
   virtual void LocalShuffle() = 0;
   // global shuffle data
   virtual void GlobalShuffle(int thread_num = -1) = 0;
-  // for slots shuffle
   virtual void SlotsShuffle(const std::set<std::string>& slots_to_replace) = 0;
-  virtual void GetRandomData(const std::set<uint16_t>& slots_to_replace,
-                             std::vector<Record>* result) = 0;
   // create readers
   virtual void CreateReaders() = 0;
   // destroy readers
@@ -183,6 +181,9 @@ class DatasetImpl : public Dataset {
   virtual int GetThreadNum() { return thread_num_; }
   virtual int GetTrainerNum() { return trainer_num_; }
   virtual Channel<T> GetInputChannel() { return input_channel_; }
+  virtual void SetInputChannel(const Channel<T>& input_channel) {
+    input_channel_ = input_channel;
+  }
   virtual int64_t GetFleetSendBatchSize() { return fleet_send_batch_size_; }
   virtual std::pair<std::string, std::string> GetHdfsConfig() {
     return std::make_pair(fs_name_, fs_ugi_);
@@ -192,6 +193,7 @@ class DatasetImpl : public Dataset {
     return data_feed_desc_;
   }
   virtual int GetChannelNum() { return channel_num_; }
+  virtual bool EnablePvMerge() { return enable_pv_merge_; }
   virtual std::vector<paddle::framework::DataFeed*> GetReaders();
   virtual void CreateChannel();
   virtual void RegisterClientToClientMsgHandler();
@@ -202,8 +204,9 @@ class DatasetImpl : public Dataset {
   virtual void LocalShuffle();
   virtual void GlobalShuffle(int thread_num = -1);
   virtual void SlotsShuffle(const std::set<std::string>& slots_to_replace) {}
-  virtual void GetRandomData(const std::set<uint16_t>& slots_to_replace,
-                             std::vector<Record>* result) {}
+  virtual const std::vector<T>& GetSlotsOriginalData() {
+    return slots_shuffle_original_data_;
+  }
   virtual void CreateReaders();
   virtual void DestroyReaders();
   virtual int64_t GetMemoryDataSize();
@@ -293,9 +296,13 @@ class MultiSlotDataset : public DatasetImpl<Record> {
     }
     std::vector<std::unordered_set<uint64_t>>().swap(local_tables_);
   }
+  virtual void PreprocessChannel(
+      const std::set<std::string>& slots_to_replace,
+      std::unordered_set<uint16_t>& index_slot);  // NOLINT
   virtual void SlotsShuffle(const std::set<std::string>& slots_to_replace);
-  virtual void GetRandomData(const std::set<uint16_t>& slots_to_replace,
-                             std::vector<Record>* result);
+  virtual void GetRandomData(
+      const std::unordered_set<uint16_t>& slots_to_replace,
+      std::vector<Record>* result);
   virtual ~MultiSlotDataset() {}
 };
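GetRandomData above performs the actual slot shuffle on each record: it erases the record's own feasigns for the slots being evaluated and splices in the feasigns of a randomly sampled donor candidate. A self-contained sketch of that replacement step, with FeaItem and a plain uint64_t sign as illustrative stand-ins for FeatureItem/FeatureKey:

    // Sketch of the slot-replacement step in GetRandomData above: drop the
    // record's own feasigns for the chosen slots, then splice in the sampled
    // donor candidate's feasigns. Types are illustrative stand-ins.
    #include <algorithm>
    #include <cstdint>
    #include <unordered_map>
    #include <unordered_set>
    #include <vector>

    struct FeaItem {
      uint64_t sign;
      uint16_t slot;
    };

    void ReplaceSlots(std::vector<FeaItem>* feas,
                      const std::unordered_set<uint16_t>& slots_to_replace,
                      const std::unordered_multimap<uint16_t, uint64_t>& donor) {
      // 1) erase feasigns belonging to the slots being shuffled
      feas->erase(std::remove_if(feas->begin(), feas->end(),
                                 [&](const FeaItem& f) {
                                   return slots_to_replace.count(f.slot) > 0;
                                 }),
                  feas->end());
      // 2) append the donor candidate's feasigns for those slots
      for (uint16_t slot : slots_to_replace) {
        auto range = donor.equal_range(slot);
        for (auto it = range.first; it != range.second; ++it) {
          feas->push_back({it->second, slot});
        }
      }
    }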
diff --git a/paddle/fluid/framework/fleet/box_wrapper.cc b/paddle/fluid/framework/fleet/box_wrapper.cc
index 55512f124b4344f9d8ccaaabfa1e8d6607e0f5d9..2d3e6943822f823d40a21e4e60ec87abf7bfbaef 100644
--- a/paddle/fluid/framework/fleet/box_wrapper.cc
+++ b/paddle/fluid/framework/fleet/box_wrapper.cc
@@ -28,6 +28,8 @@ std::shared_ptr<BoxWrapper> BoxWrapper::s_instance_ = nullptr;
 cudaStream_t BoxWrapper::stream_list_[8];
 std::shared_ptr<boxps::BoxPSBase> BoxWrapper::boxps_ptr_ = nullptr;
 AfsManager* BoxWrapper::afs_manager = nullptr;
+int BoxWrapper::embedx_dim_ = 8;
+int BoxWrapper::expand_embed_dim_ = 0;
 
 void BasicAucCalculator::compute() {
   double* table[2] = {&_table[0][0], &_table[1][0]};
@@ -57,6 +59,94 @@ void BasicAucCalculator::compute() {
   _size = fp + tp;
 }
 
+void BoxWrapper::CheckEmbedSizeIsValid(int embedx_dim, int expand_embed_dim) {
+  PADDLE_ENFORCE_EQ(
+      embedx_dim_, embedx_dim,
+      platform::errors::InvalidArgument("SetInstance(): invalid embedx_dim. "
+                                        "When embedx_dim = %d, but got %d.",
+                                        embedx_dim_, embedx_dim));
+  PADDLE_ENFORCE_EQ(expand_embed_dim_, expand_embed_dim,
+                    platform::errors::InvalidArgument(
+                        "SetInstance(): invalid expand_embed_dim. When "
+                        "expand_embed_dim = %d, but got %d.",
+                        expand_embed_dim_, expand_embed_dim));
+}
+
+void BoxWrapper::PullSparse(const paddle::platform::Place& place,
+                            const std::vector<const uint64_t*>& keys,
+                            const std::vector<float*>& values,
+                            const std::vector<int64_t>& slot_lengths,
+                            const int hidden_size, const int expand_embed_dim) {
+#define EMBEDX_CASE(i, ...)                                                  \
+  case i: {                                                                  \
+    constexpr size_t EmbedxDim = i;                                          \
+    switch (expand_embed_dim) {                                              \
+      __VA_ARGS__                                                            \
+      default:                                                               \
+        PADDLE_THROW(platform::errors::InvalidArgument(                      \
+            "Unsupport this expand embedding size [%d]", expand_embed_dim)); \
+    }                                                                        \
+  } break
+
+#define PULLSPARSE_CASE(i, ...)                                              \
+  case i: {                                                                  \
+    constexpr size_t ExpandDim = i;                                          \
+    PullSparseCase<EmbedxDim, ExpandDim>(place, keys, values, slot_lengths,  \
+                                         hidden_size, expand_embed_dim);     \
+  } break
+
+  CheckEmbedSizeIsValid(hidden_size - 3, expand_embed_dim);
+  switch (hidden_size - 3) {
+    EMBEDX_CASE(8, PULLSPARSE_CASE(0); PULLSPARSE_CASE(8);
+                PULLSPARSE_CASE(64););
+    EMBEDX_CASE(16, PULLSPARSE_CASE(0););
+    default:
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Unsupport this embedding size [%d]", hidden_size - 3));
+  }
+#undef PULLSPARSE_CASE
+#undef EMBEDX_CASE
+}
+
+void BoxWrapper::PushSparseGrad(const paddle::platform::Place& place,
+                                const std::vector<const uint64_t*>& keys,
+                                const std::vector<const float*>& grad_values,
+                                const std::vector<int64_t>& slot_lengths,
+                                const int hidden_size,
+                                const int expand_embed_dim,
+                                const int batch_size) {
+#define EMBEDX_CASE(i, ...)                                                  \
+  case i: {                                                                  \
+    constexpr size_t EmbedxDim = i;                                          \
+    switch (expand_embed_dim) {                                              \
+      __VA_ARGS__                                                            \
+      default:                                                               \
+        PADDLE_THROW(platform::errors::InvalidArgument(                      \
+            "Unsupport this expand embedding size [%d]", expand_embed_dim)); \
+    }                                                                        \
+  } break
+
+#define PUSHSPARSE_CASE(i, ...)                                              \
+  case i: {                                                                  \
+    constexpr size_t ExpandDim = i;                                          \
+    PushSparseGradCase<EmbedxDim, ExpandDim>(place, keys, grad_values,       \
+                                             slot_lengths, hidden_size,      \
+                                             expand_embed_dim, batch_size);  \
+  } break
+
+  CheckEmbedSizeIsValid(hidden_size - 3, expand_embed_dim);
+  switch (hidden_size - 3) {
+    EMBEDX_CASE(8, PUSHSPARSE_CASE(0); PUSHSPARSE_CASE(8);
+                PUSHSPARSE_CASE(64););
+    EMBEDX_CASE(16, PUSHSPARSE_CASE(0););
+    default:
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Unsupport this embedding size [%d]", hidden_size - 3));
+  }
+#undef PUSHSPARSE_CASE
+#undef EMBEDX_CASE
+}
+
 void BasicAucCalculator::calculate_bucket_error() {
   double last_ctr = -1;
   double impression_sum = 0;
@@ -128,133 +218,112 @@ void BoxWrapper::EndPass(bool need_save_delta) const {
       ret, 0, platform::errors::PreconditionNotMet("EndPass failed in BoxPS."));
 }
 
-void BoxWrapper::PullSparse(const paddle::platform::Place& place,
-                            const std::vector<const uint64_t*>& keys,
-                            const std::vector<float*>& values,
-                            const std::vector<int64_t>& slot_lengths,
-                            const int hidden_size) {
-  VLOG(3) << "Begin PullSparse";
-  platform::Timer all_timer;
-  platform::Timer pull_boxps_timer;
-  all_timer.Start();
-
-  int64_t total_length =
-      std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
-  auto buf =
-      memory::AllocShared(place, total_length * sizeof(boxps::FeatureValueGpu));
-  boxps::FeatureValueGpu* total_values_gpu =
-      reinterpret_cast<boxps::FeatureValueGpu*>(buf->ptr());
-
-  if (platform::is_cpu_place(place)) {
-    PADDLE_THROW(platform::errors::Unimplemented(
-        "Warning:: CPUPlace is not supported in PaddleBox now."));
-  } else if (platform::is_gpu_place(place)) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
-    int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId();
-    LoDTensor& total_keys_tensor = keys_tensor[device_id];
-    uint64_t* total_keys = reinterpret_cast<uint64_t*>(
-        total_keys_tensor.mutable_data<int64_t>({total_length, 1}, place));
-
-    // construct slot_level lod info
-    auto slot_lengths_lod = slot_lengths;
-    for (size_t i = 1; i < slot_lengths_lod.size(); i++) {
-      slot_lengths_lod[i] += slot_lengths_lod[i - 1];
-    }
-    auto buf_key = memory::AllocShared(place, keys.size() * sizeof(uint64_t*));
-    auto buf_length =
-        memory::AllocShared(place, slot_lengths.size() * sizeof(int64_t));
-    uint64_t** gpu_keys = reinterpret_cast<uint64_t**>(buf_key->ptr());
-    int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
-    cudaMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*),
-               cudaMemcpyHostToDevice);
-    cudaMemcpy(gpu_len, slot_lengths_lod.data(),
-               slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice);
-
-    this->CopyKeys(place, gpu_keys, total_keys, gpu_len,
-                   static_cast<int>(slot_lengths.size()),
-                   static_cast<int>(total_length));
-    VLOG(3) << "Begin call PullSparseGPU in BoxPS";
-    pull_boxps_timer.Start();
-    int ret =
-        boxps_ptr_->PullSparseGPU(total_keys, total_values_gpu,
-                                  static_cast<int>(total_length), device_id);
-    PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
-                                  "PullSparseGPU failed in BoxPS."));
-    pull_boxps_timer.Pause();
-
-    VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length
-            << "]";
-    this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len,
-                      static_cast<int>(slot_lengths.size()), hidden_size,
-                      total_length);
-#else
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "Please compile WITH_GPU option, because NCCL doesn't support "
-        "windows."));
-#endif
-  } else {
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddleBox: PullSparse Only Support CPUPlace or CUDAPlace Now."));
+void BoxWrapper::GetRandomReplace(const std::vector<Record>& pass_data) {
+  VLOG(0) << "Begin GetRandomReplace";
+  size_t ins_num = pass_data.size();
+  replace_idx_.resize(ins_num);
+  for (auto& cand_list : random_ins_pool_list) {
+    cand_list.ReInitPass();
+  }
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < auc_runner_thread_num_; ++tid) {
+    threads.push_back(std::thread([this, &pass_data, tid, ins_num]() {
+      int start = tid * ins_num / auc_runner_thread_num_;
+      int end = (tid + 1) * ins_num / auc_runner_thread_num_;
+      VLOG(3) << "GetRandomReplace begin for thread[" << tid
+              << "], and process [" << start << ", " << end
+              << "), total ins: " << ins_num;
+      auto& random_pool = random_ins_pool_list[tid];
+      for (int i = start; i < end; ++i) {
+        const auto& ins = pass_data[i];
+        random_pool.AddAndGet(ins, replace_idx_[i]);
+      }
+    }));
+  }
+  for (int tid = 0; tid < auc_runner_thread_num_; ++tid) {
+    threads[tid].join();
   }
-  all_timer.Pause();
-  VLOG(1) << "PullSparse total costs: " << all_timer.ElapsedSec()
-          << " s, of which BoxPS costs: " << pull_boxps_timer.ElapsedSec()
-          << " s";
-  VLOG(3) << "End PullSparse";
+  pass_done_semi_->Put(1);
+  VLOG(0) << "End GetRandomReplace";
 }
 
-void BoxWrapper::PushSparseGrad(const paddle::platform::Place& place,
-                                const std::vector<const uint64_t*>& keys,
-                                const std::vector<const float*>& grad_values,
-                                const std::vector<int64_t>& slot_lengths,
-                                const int hidden_size, const int batch_size) {
-  VLOG(3) << "Begin PushSparseGrad";
-  platform::Timer all_timer;
-  platform::Timer push_boxps_timer;
-  all_timer.Start();
-  int64_t total_length =
-      std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
-  auto buf = memory::AllocShared(
-      place, total_length * sizeof(boxps::FeaturePushValueGpu));
-  boxps::FeaturePushValueGpu* total_grad_values_gpu =
-      reinterpret_cast<boxps::FeaturePushValueGpu*>(buf->ptr());
-  if (platform::is_cpu_place(place)) {
-    PADDLE_THROW(platform::errors::Unimplemented(
-        "Warning:: CPUPlace is not supported in PaddleBox now."));
-  } else if (platform::is_gpu_place(place)) {
-#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
-    int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId();
-    LoDTensor& cached_total_keys_tensor = keys_tensor[device_id];
-    uint64_t* total_keys =
-        reinterpret_cast<uint64_t*>(cached_total_keys_tensor.data<int64_t>());
-    VLOG(3) << "Begin copy grad tensor to boxps struct";
-    this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths,
-                      hidden_size, total_length, batch_size);
+void BoxWrapper::GetRandomData(
+    const std::vector<Record>& pass_data,
+    const std::unordered_set<uint16_t>& slots_to_replace,
+    std::vector<Record>* result) {
+  VLOG(0) << "Begin GetRandomData";
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < auc_runner_thread_num_; ++tid) {
+    threads.push_back(std::thread([this, &pass_data, tid, &slots_to_replace,
+                                   result]() {
+      int debug_erase_cnt = 0;
+      int debug_push_cnt = 0;
+      size_t ins_num = pass_data.size();
+      int start = tid * ins_num / auc_runner_thread_num_;
+      int end = (tid + 1) * ins_num / auc_runner_thread_num_;
+      VLOG(3) << "GetRandomData begin for thread[" << tid << "], and process ["
+              << start << ", " << end << "), total ins: " << ins_num;
+      const auto& random_pool = random_ins_pool_list[tid];
+      for (int i = start; i < end; ++i) {
+        const auto& ins = pass_data[i];
+        const RecordCandidate& rand_rec = random_pool.Get(replace_idx_[i]);
+        Record new_rec = ins;
+        for (auto it = new_rec.uint64_feasigns_.begin();
+             it != new_rec.uint64_feasigns_.end();) {
+          if (slots_to_replace.find(it->slot()) != slots_to_replace.end()) {
+            it = new_rec.uint64_feasigns_.erase(it);
+            debug_erase_cnt += 1;
+          } else {
+            ++it;
+          }
+        }
+        for (auto slot : slots_to_replace) {
+          auto range = rand_rec.feas_.equal_range(slot);
+          for (auto it = range.first; it != range.second; ++it) {
+            new_rec.uint64_feasigns_.push_back({it->second, it->first});
+            debug_push_cnt += 1;
+          }
+        }
+        (*result)[i] = std::move(new_rec);
+      }
+      VLOG(3) << "thread[" << tid << "]: erase feasign num: " << debug_erase_cnt
+              << " repush feasign num: " << debug_push_cnt;
+    }));
+  }
+  for (int tid = 0; tid < auc_runner_thread_num_; ++tid) {
+    threads[tid].join();
+  }
+  VLOG(0) << "End GetRandomData";
+}
 
-    VLOG(3) << "Begin call PushSparseGPU in BoxPS";
-    push_boxps_timer.Start();
-    int ret = boxps_ptr_->PushSparseGPU(
-        total_keys, total_grad_values_gpu, static_cast<int>(total_length),
-        BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId());
-    PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
-                                  "PushSparseGPU failed in BoxPS."));
-    push_boxps_timer.Pause();
-#else
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "Please compile WITH_GPU option, because NCCL doesn't support "
-        "windows."));
-#endif
-  } else {
-    PADDLE_THROW(platform::errors::PreconditionNotMet(
-        "PaddleBox: PushSparseGrad Only Support CPUPlace or CUDAPlace Now."));
+void BoxWrapper::AddReplaceFeasign(boxps::PSAgentBase* p_agent,
+                                   int feed_pass_thread_num) {
+  VLOG(0) << "Enter AddReplaceFeasign Function";
+  int semi;
+  pass_done_semi_->Get(semi);
+  VLOG(0) << "Last Pass had updated random pool done. Begin AddReplaceFeasign";
+  std::vector<std::thread> threads;
+  for (int tid = 0; tid < feed_pass_thread_num; ++tid) {
+    threads.push_back(std::thread([this, tid, p_agent, feed_pass_thread_num]() {
+      VLOG(3) << "AddReplaceFeasign begin for thread[" << tid << "]";
+      for (size_t pool_id = tid; pool_id < random_ins_pool_list.size();
+           pool_id += feed_pass_thread_num) {
+        auto& random_pool = random_ins_pool_list[pool_id];
+        for (size_t i = 0; i < random_pool.Size(); ++i) {
+          auto& ins_candidate = random_pool.Get(i);
+          for (const auto& pair : ins_candidate.feas_) {
+            p_agent->AddKey(pair.second.uint64_feasign_, tid);
+          }
+        }
+      }
+    }));
   }
-  all_timer.Pause();
-  VLOG(1) << "PushSparseGrad total cost: " << all_timer.ElapsedSec()
-          << " s, of which BoxPS cost: " << push_boxps_timer.ElapsedSec()
-          << " s";
-  VLOG(3) << "End PushSparseGrad";
+  for (int tid = 0; tid < feed_pass_thread_num; ++tid) {
+    threads[tid].join();
+  }
+  VLOG(0) << "End AddReplaceFeasign";
 }
+
 }  // end namespace framework
 }  // end namespace paddle
 #endif
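The EMBEDX_CASE / PULLSPARSE_CASE / PUSHSPARSE_CASE macros above convert two runtime dimensions into compile-time template instantiations through nested switches, so each supported (embedx, expand) pair gets its own fully typed kernel path. A de-macroed sketch of the same dispatch pattern; DoPull is a hypothetical stand-in for PullSparseCase, and only the (8, {0, 8, 64}) and (16, {0}) combinations are instantiated, mirroring the cases in the diff:

    // De-macroed sketch of the runtime-dim -> template-dim dispatch above.
    #include <cstdio>
    #include <stdexcept>

    template <size_t EmbedxDim, size_t ExpandDim>
    void DoPull() {  // stand-in for PullSparseCase<EmbedxDim, ExpandDim>
      std::printf("instantiated for <%zu, %zu>\n", EmbedxDim, ExpandDim);
    }

    void Dispatch(int embedx_dim, int expand_dim) {
      switch (embedx_dim) {
        case 8:
          switch (expand_dim) {
            case 0:  DoPull<8, 0>();  break;
            case 8:  DoPull<8, 8>();  break;
            case 64: DoPull<8, 64>(); break;
            default: throw std::invalid_argument("unsupported expand dim");
          }
          break;
        case 16:
          switch (expand_dim) {
            case 0: DoPull<16, 0>(); break;
            default: throw std::invalid_argument("unsupported expand dim");
          }
          break;
        default:
          throw std::invalid_argument("unsupported embedx dim");
      }
    }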
diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu
index a24627c068fc06bcec1ac6e92a3f1d66f36782f4..c315abd737c9bd42106f27b0ba11fece8163820d 100644
--- a/paddle/fluid/framework/fleet/box_wrapper.cu
+++ b/paddle/fluid/framework/fleet/box_wrapper.cu
@@ -27,9 +27,12 @@ namespace framework {
   for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \
        i += blockDim.x * gridDim.x)
 
-__global__ void PullCopy(float** dest, const boxps::FeatureValueGpu* src,
-                         const int64_t* len, int hidden, int slot_num,
-                         int total_len, uint64_t** keys) {
+template <size_t EMBEDX_DIM, size_t EXPAND_EMBED_DIM>
+__global__ void PullCopy(
+    float** dest,
+    const boxps::FeatureValueGpu<EMBEDX_DIM, EXPAND_EMBED_DIM>* src,
+    const int64_t* len, int hidden, int expand_dim, int slot_num, int total_len,
+    uint64_t** keys) {
   CUDA_KERNEL_LOOP(i, total_len) {
     int low = 0;
     int high = slot_num - 1;
@@ -52,15 +55,28 @@ __global__ void PullCopy(float** dest, const boxps::FeatureValueGpu* src,
       *(dest[x] + y * hidden + 2) = (src + i)->embed_w;
     }
     if ((src + i)->embedding_size == 0 || *(keys[x] + y) == 0) {
-      for (int j = 0; j < 8; j++) {
+      for (int j = 0; j < hidden - 3; j++) {
         *(dest[x] + y * hidden + 3 + j) = 0;
       }
     } else {
-      for (int j = 0; j < 8; j++) {
+      for (int j = 0; j < hidden - 3; j++) {
         *(dest[x] + y * hidden + 3 + j) = (src + i)->embedx[1 + j];
       }
     }
-  }
+    // process embed_expand
+    if (expand_dim > 0) {
+      int z = x + slot_num;
+      if ((src + i)->embed_expand_size[0] == 0 || *(keys[x] + y) == 0) {
+        for (int j = 0; j < expand_dim; j++) {
+          *(dest[z] + y * expand_dim + j) = 0;
+        }
+      } else {
+        for (int j = 0; j < expand_dim; j++) {
+          *(dest[z] + y * expand_dim + j) = (src + i)->embed_expand[1 + j];
+        }
+      }
+    }
+  }  // end kernel loop
 }
 
 __global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys,
@@ -82,9 +98,11 @@ __global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys,
   }
 }
 
-__global__ void PushCopy(boxps::FeaturePushValueGpu* dest, float** src,
-                         int64_t* len, int hidden, int slot_num, int total_len,
-                         int bs, int* slot_vector) {
+template <size_t EMBEDX_DIM, size_t EXPAND_EMBED_DIM>
+__global__ void PushCopy(
+    boxps::FeaturePushValueGpu<EMBEDX_DIM, EXPAND_EMBED_DIM>* dest, float** src,
+    int64_t* len, int hidden, int expand_dim, int slot_num, int total_len,
+    int bs, int* slot_vector) {
   CUDA_KERNEL_LOOP(i, total_len) {
     int low = 0;
     int high = slot_num - 1;
@@ -101,18 +119,25 @@ __global__ void PushCopy(boxps::FeaturePushValueGpu* dest, float** src,
     (dest + i)->show = *(src[x] + y * hidden);
     (dest + i)->clk = *(src[x] + y * hidden + 1);
     (dest + i)->embed_g = *(src[x] + y * hidden + 2) * -1. * bs;
-    for (int j = 0; j < 8; j++) {
+    for (int j = 0; j < hidden - 3; j++) {
       (dest + i)->embedx_g[j] = *(src[x] + y * hidden + 3 + j) * -1. * bs;
     }
+    if (expand_dim > 0) {
+      int z = x + slot_num;
+      for (int j = 0; j < expand_dim; j++) {
+        (dest + i)->embed_expand_g[j] =
+            *(src[z] + y * expand_dim + j) * -1. * bs;
+      }
+    }
   }
 }
 
 void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
                              uint64_t** gpu_keys,
                              const std::vector<float*>& values,
-                             const boxps::FeatureValueGpu* total_values_gpu,
-                             const int64_t* gpu_len, const int slot_num,
-                             const int hidden_size,
+                             void* total_values_gpu, const int64_t* gpu_len,
+                             const int slot_num, const int hidden_size,
+                             const int expand_embed_dim,
                              const int64_t total_length) {
   auto stream = dynamic_cast<platform::CUDADeviceContext*>(
                     platform::DeviceContextPool::Instance().Get(
@@ -122,11 +147,40 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place,
   float** gpu_values = reinterpret_cast<float**>(buf_value->ptr());
   cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*),
              cudaMemcpyHostToDevice);
+#define EMBEDX_CASE(i, ...)                                                  \
+  case i: {                                                                  \
+    constexpr size_t EmbedxDim = i;                                          \
+    switch (expand_embed_dim) {                                              \
+      __VA_ARGS__                                                            \
+      default:                                                               \
+        PADDLE_THROW(platform::errors::InvalidArgument(                      \
+            "Unsupport this expand embedding size [%d]", expand_embed_dim)); \
+    }                                                                        \
+  } break
+
+#define EXPAND_EMBED_PULL_CASE(i, ...)                                       \
+  case i: {                                                                  \
+    constexpr size_t ExpandDim = i;                                          \
+    PullCopy<EmbedxDim,                                                      \
+             ExpandDim><<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( \
+        gpu_values,                                                          \
+        reinterpret_cast<boxps::FeatureValueGpu<EmbedxDim, ExpandDim>*>(     \
+            total_values_gpu),                                               \
+        gpu_len, hidden_size, expand_embed_dim, slot_num, total_length,      \
+        gpu_keys);                                                           \
+  } break
 
-  PullCopy<<<(total_length + 512 - 1) / 512, 512, 0, stream>>>(
-      gpu_values, total_values_gpu, gpu_len, hidden_size, slot_num,
-      total_length, gpu_keys);
+  switch (hidden_size - 3) {
+    EMBEDX_CASE(8, EXPAND_EMBED_PULL_CASE(0); EXPAND_EMBED_PULL_CASE(8);
+                EXPAND_EMBED_PULL_CASE(64););
+    EMBEDX_CASE(16, EXPAND_EMBED_PULL_CASE(0););
+    default:
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Unsupport this embedding size [%d]", hidden_size - 3));
+  }
   cudaStreamSynchronize(stream);
+#undef EXPAND_EMBED_PULL_CASE
+#undef EMBEDX_CASE
 }
 
 void BoxWrapper::CopyKeys(const paddle::platform::Place& place,
@@ -143,10 +197,10 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place,
 
 void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
                              const std::vector<const float*>& grad_values,
-                             boxps::FeaturePushValueGpu* total_grad_values_gpu,
+                             void* total_grad_values_gpu,
                              const std::vector<int64_t>& slot_lengths,
-                             const int hidden_size, const int64_t total_length,
-                             const int batch_size) {
+                             const int hidden_size, const int expand_embed_dim,
+                             const int64_t total_length, const int batch_size) {
   auto stream = dynamic_cast<platform::CUDADeviceContext*>(
                     platform::DeviceContextPool::Instance().Get(
                         BOOST_GET_CONST(platform::CUDAPlace, place)))
                    ->stream();
@@ -173,11 +227,42 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place,
   cudaMemcpy(d_slot_vector, slot_vector_.data(),
              slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice);
 
-  PushCopy<<<(total_length + 512 - 1) / 512, 512, 0, stream>>>(
-      total_grad_values_gpu, gpu_values, gpu_len, hidden_size,
-      slot_lengths.size(), total_length, batch_size, d_slot_vector);
+#define EMBEDX_CASE(i, ...)                                                  \
+  case i: {                                                                  \
+    constexpr size_t EmbedxDim = i;                                          \
+    switch (expand_embed_dim) {                                              \
+      __VA_ARGS__                                                            \
+      default:                                                               \
+        PADDLE_THROW(platform::errors::InvalidArgument(                      \
+            "Unsupport this expand embedding size [%d]", expand_embed_dim)); \
+    }                                                                        \
+  } break
+
+#define EXPAND_EMBED_PUSH_CASE(i, ...)                                       \
+  case i: {                                                                  \
+    constexpr size_t ExpandDim = i;                                          \
+    PushCopy<EmbedxDim,                                                      \
+             ExpandDim><<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( \
+        reinterpret_cast<boxps::FeaturePushValueGpu<EmbedxDim, ExpandDim>*>( \
+            total_grad_values_gpu),                                          \
+        gpu_values, gpu_len, hidden_size, expand_embed_dim,                  \
+        slot_lengths.size(), total_length, batch_size, d_slot_vector);       \
+  } break
+
+  switch (hidden_size - 3) {
+    EMBEDX_CASE(8, EXPAND_EMBED_PUSH_CASE(0); EXPAND_EMBED_PUSH_CASE(8);
+                EXPAND_EMBED_PUSH_CASE(64););
+    EMBEDX_CASE(16, EXPAND_EMBED_PUSH_CASE(0););
+    default:
+      PADDLE_THROW(platform::errors::InvalidArgument(
+          "Unsupport this embedding size [%d]", hidden_size - 3));
+  }
+  cudaStreamSynchronize(stream);
+#undef EXPAND_EMBED_PUSH_CASE
+#undef EMBEDX_CASE
 }
+
 }  // end namespace framework
 }  // end namespace paddle
 #endif
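Each thread of the PullCopy/PushCopy kernels above maps its flat element index back to a (slot, offset) pair by binary-searching the prefix-summed slot lengths in `len`. A host-side sketch of that index math, under the same layout assumption (len holds inclusive prefix sums):

    // Host-side sketch of the low/high binary search in the kernels above.
    #include <cstdint>
    #include <utility>
    #include <vector>

    std::pair<int, int64_t> LocateSlot(const std::vector<int64_t>& len,
                                       int64_t i) {
      int low = 0, high = static_cast<int>(len.size()) - 1;
      while (low < high) {
        int mid = (low + high) / 2;
        if (i < len[mid])
          high = mid;
        else
          low = mid + 1;
      }
      int x = low;                           // slot index
      int64_t y = i - (x ? len[x - 1] : 0);  // offset within slot x
      return {x, y};
    }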
diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h
index 0b15a6dfa7a7c26afda7c19b7f715971d9768334..af533fe22e0032745b2520461d7e4425c85f46f4 100644
--- a/paddle/fluid/framework/fleet/box_wrapper.h
+++ b/paddle/fluid/framework/fleet/box_wrapper.h
@@ -31,10 +31,12 @@ limitations under the License. */
 #include
 #include
 #include  // NOLINT
+#include
 #include
 #include
 #include
 #include
+#include "paddle/fluid/framework/data_feed.h"
 #include "paddle/fluid/framework/data_set.h"
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/scope.h"
@@ -339,30 +341,54 @@ class BoxWrapper {
   void BeginPass() const;
   void EndPass(bool need_save_delta) const;
   void SetTestMode(bool is_test) const;
+
+  template <size_t EMBEDX_DIM, size_t EXPAND_EMBED_DIM>
+  void PullSparseCase(const paddle::platform::Place& place,
+                      const std::vector<const uint64_t*>& keys,
+                      const std::vector<float*>& values,
+                      const std::vector<int64_t>& slot_lengths,
+                      const int hidden_size, const int expand_embed_dim);
+
   void PullSparse(const paddle::platform::Place& place,
                   const std::vector<const uint64_t*>& keys,
                   const std::vector<float*>& values,
                   const std::vector<int64_t>& slot_lengths,
-                  const int hidden_size);
+                  const int hidden_size, const int expand_embed_dim);
+
+  template <size_t EMBEDX_DIM, size_t EXPAND_EMBED_DIM>
+  void PushSparseGradCase(const paddle::platform::Place& place,
+                          const std::vector<const uint64_t*>& keys,
+                          const std::vector<const float*>& grad_values,
+                          const std::vector<int64_t>& slot_lengths,
+                          const int hidden_size, const int expand_embed_dim,
+                          const int batch_size);
+
   void PushSparseGrad(const paddle::platform::Place& place,
                       const std::vector<const uint64_t*>& keys,
                       const std::vector<const float*>& grad_values,
                       const std::vector<int64_t>& slot_lengths,
-                      const int hidden_size, const int batch_size);
+                      const int hidden_size, const int expand_embed_dim,
+                      const int batch_size);
+
   void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys,
-                   const std::vector<float*>& values,
-                   const boxps::FeatureValueGpu* total_values_gpu,
+                   const std::vector<float*>& values, void* total_values_gpu,
                    const int64_t* gpu_len, const int slot_num,
-                   const int hidden_size, const int64_t total_length);
+                   const int hidden_size, const int expand_embed_dim,
+                   const int64_t total_length);
+
   void CopyForPush(const paddle::platform::Place& place,
                    const std::vector<const float*>& grad_values,
-                   boxps::FeaturePushValueGpu* total_grad_values_gpu,
+                   void* total_grad_values_gpu,
                    const std::vector<int64_t>& slot_lengths,
-                   const int hidden_size, const int64_t total_length,
-                   const int batch_size);
+                   const int hidden_size, const int expand_embed_dim,
+                   const int64_t total_length, const int batch_size);
+
   void CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys,
                 uint64_t* total_keys, const int64_t* gpu_len, int slot_num,
                 int total_len);
+
+  void CheckEmbedSizeIsValid(int embedx_dim, int expand_embed_dim);
+
   boxps::PSAgentBase* GetAgent() { return p_agent_; }
   void InitializeGPUAndLoadModel(
       const char* conf_file, const std::vector<int>& slot_vector,
@@ -440,6 +466,15 @@
   }
 
   static std::shared_ptr<BoxWrapper> GetInstance() {
+    PADDLE_ENFORCE_EQ(
+        s_instance_ == nullptr, false,
+        platform::errors::PreconditionNotMet(
+            "GetInstance failed in BoxPs, you should use SetInstance firstly"));
+    return s_instance_;
+  }
+
+  static std::shared_ptr<BoxWrapper> SetInstance(int embedx_dim = 8,
+                                                 int expand_embed_dim = 0) {
     if (nullptr == s_instance_) {
       // If main thread is guaranteed to init this, this lock can be removed
       static std::mutex mutex;
@@ -447,8 +482,13 @@
       if (nullptr == s_instance_) {
         VLOG(3) << "s_instance_ is null";
         s_instance_.reset(new paddle::framework::BoxWrapper());
-        s_instance_->boxps_ptr_.reset(boxps::BoxPSBase::GetIns());
+        s_instance_->boxps_ptr_.reset(
+            boxps::BoxPSBase::GetIns(embedx_dim, expand_embed_dim));
+        embedx_dim_ = embedx_dim;
+        expand_embed_dim_ = expand_embed_dim;
       }
+    } else {
+      LOG(WARNING) << "You have already used SetInstance() before";
     }
     return s_instance_;
   }
@@ -469,16 +509,16 @@ class BoxWrapper {
    public:
    MetricMsg() {}
    MetricMsg(const std::string& label_varname, const std::string& pred_varname,
-             int is_join, int bucket_size = 1000000)
+             int metric_phase, int bucket_size = 1000000)
        : label_varname_(label_varname),
          pred_varname_(pred_varname),
-         is_join_(is_join) {
+         metric_phase_(metric_phase) {
      calculator = new BasicAucCalculator();
      calculator->init(bucket_size);
    }
    virtual ~MetricMsg() {}
 
-   int IsJoin() const { return is_join_; }
+   int MetricPhase() const { return metric_phase_; }
    BasicAucCalculator* GetCalculator() { return calculator; }
    virtual void add_data(const Scope* exe_scope) {
      std::vector<int64_t> label_data;
@@ -514,20 +554,20 @@ class BoxWrapper {
    protected:
     std::string label_varname_;
     std::string pred_varname_;
-    int is_join_;
+    int metric_phase_;
     BasicAucCalculator* calculator;
   };
 
   class MultiTaskMetricMsg : public MetricMsg {
    public:
    MultiTaskMetricMsg(const std::string& label_varname,
-                      const std::string& pred_varname_list, int is_join,
+                      const std::string& pred_varname_list, int metric_phase,
                       const std::string& cmatch_rank_group,
                       const std::string& cmatch_rank_varname,
                       int bucket_size = 1000000) {
      label_varname_ = label_varname;
      cmatch_rank_varname_ = cmatch_rank_varname;
-     is_join_ = is_join;
+     metric_phase_ = metric_phase;
      calculator = new BasicAucCalculator();
      calculator->init(bucket_size);
      for (auto& cmatch_rank : string::split_string(cmatch_rank_group)) {
@@ -594,14 +634,14 @@ class BoxWrapper {
   class CmatchRankMetricMsg : public MetricMsg {
    public:
    CmatchRankMetricMsg(const std::string& label_varname,
-                       const std::string& pred_varname, int is_join,
+                       const std::string& pred_varname, int metric_phase,
                        const std::string& cmatch_rank_group,
                        const std::string& cmatch_rank_varname,
                        int bucket_size = 1000000) {
      label_varname_ = label_varname;
      pred_varname_ = pred_varname;
      cmatch_rank_varname_ = cmatch_rank_varname;
-     is_join_ = is_join;
+     metric_phase_ = metric_phase;
      calculator = new BasicAucCalculator();
      calculator->init(bucket_size);
      for (auto& cmatch_rank : string::split_string(cmatch_rank_group)) {
@@ -653,12 +693,12 @@ class BoxWrapper {
   class MaskMetricMsg : public MetricMsg {
    public:
    MaskMetricMsg(const std::string& label_varname,
-                 const std::string& pred_varname, int is_join,
+                 const std::string& pred_varname, int metric_phase,
                  const std::string& mask_varname, int bucket_size = 1000000) {
      label_varname_ = label_varname;
      pred_varname_ = pred_varname;
      mask_varname_ = mask_varname;
-     is_join_ = is_join;
+     metric_phase_ = metric_phase;
      calculator = new BasicAucCalculator();
      calculator->init(bucket_size);
    }
@@ -682,36 +722,59 @@ class BoxWrapper {
    protected:
     std::string mask_varname_;
   };
-  const std::vector<std::string>& GetMetricNameList() const {
-    return metric_name_list_;
+  const std::vector<std::string> GetMetricNameList(
+      int metric_phase = -1) const {
+    VLOG(0) << "Want to Get metric phase: " << metric_phase;
+    if (metric_phase == -1) {
+      return metric_name_list_;
+    } else {
+      std::vector<std::string> ret;
+      for (const auto& name : metric_name_list_) {
+        const auto iter = metric_lists_.find(name);
+        PADDLE_ENFORCE_NE(
+            iter, metric_lists_.end(),
+            platform::errors::InvalidArgument(
+                "The metric name you provided is not registered."));
+
+        if (iter->second->MetricPhase() == metric_phase) {
+          VLOG(0) << name << "'s phase is " << iter->second->MetricPhase()
+                  << ", we want";
+          ret.push_back(name);
+        } else {
+          VLOG(0) << name << "'s phase is " << iter->second->MetricPhase()
+                  << ", not we want";
+        }
+      }
+      return ret;
+    }
   }
 
-  int PassFlag() const { return pass_flag_; }
-  void FlipPassFlag() { pass_flag_ = 1 - pass_flag_; }
+  int Phase() const { return phase_; }
+  void FlipPhase() { phase_ = (phase_ + 1) % phase_num_; }
   std::map<std::string, MetricMsg*>& GetMetricList() { return metric_lists_; }
 
   void InitMetric(const std::string& method, const std::string& name,
                   const std::string& label_varname,
                   const std::string& pred_varname,
                   const std::string& cmatch_rank_varname,
-                  const std::string& mask_varname, bool is_join,
+                  const std::string& mask_varname, int metric_phase,
                   const std::string& cmatch_rank_group,
                   int bucket_size = 1000000) {
     if (method == "AucCalculator") {
       metric_lists_.emplace(name, new MetricMsg(label_varname, pred_varname,
-                                                is_join ? 1 : 0, bucket_size));
+                                                metric_phase, bucket_size));
     } else if (method == "MultiTaskAucCalculator") {
       metric_lists_.emplace(
           name, new MultiTaskMetricMsg(label_varname, pred_varname,
-                                       is_join ? 1 : 0, cmatch_rank_group,
+                                       metric_phase, cmatch_rank_group,
                                        cmatch_rank_varname, bucket_size));
     } else if (method == "CmatchRankAucCalculator") {
       metric_lists_.emplace(
          name, new CmatchRankMetricMsg(label_varname, pred_varname,
-                                        is_join ? 1 : 0, cmatch_rank_group,
+                                        metric_phase, cmatch_rank_group,
                                         cmatch_rank_varname, bucket_size));
     } else if (method == "MaskAucCalculator") {
       metric_lists_.emplace(
-          name, new MaskMetricMsg(label_varname, pred_varname, is_join ? 1 : 0,
+          name, new MaskMetricMsg(label_varname, pred_varname, metric_phase,
                                   mask_varname, bucket_size));
     } else {
       PADDLE_THROW(platform::errors::Unimplemented(
@@ -751,9 +814,13 @@ class BoxWrapper {
   const int feedpass_thread_num_ = 30;  // magic number
   static std::shared_ptr<BoxWrapper> s_instance_;
   std::unordered_set<std::string> slot_name_omited_in_feedpass_;
+  // EMBEDX_DIM and EXPAND_EMBED_DIM
+  static int embedx_dim_;
+  static int expand_embed_dim_;
 
   // Metric Related
-  int pass_flag_ = 1;  // join: 1, update: 0
+  int phase_ = 1;
+  int phase_num_ = 2;
   std::map<std::string, MetricMsg*> metric_lists_;
   std::vector<std::string> metric_name_list_;
   std::vector<int> slot_vector_;
@@ -762,6 +829,57 @@ class BoxWrapper {
  public:
   static AfsManager* afs_manager;
 
+  // Auc Runner
+ public:
+  void InitializeAucRunner(std::vector<std::vector<std::string>> slot_eval,
+                           int thread_num, int pool_size,
+                           std::vector<std::string> slot_list) {
+    mode_ = 1;
+    phase_num_ = static_cast<int>(slot_eval.size());
+    phase_ = phase_num_ - 1;
+    auc_runner_thread_num_ = thread_num;
+    pass_done_semi_ = paddle::framework::MakeChannel<int>();
+    pass_done_semi_->Put(1);  // Note: At most 1 pipeline in AucRunner
+    random_ins_pool_list.resize(thread_num);
+
+    std::unordered_set<std::string> slot_set;
+    for (size_t i = 0; i < slot_eval.size(); ++i) {
+      for (const auto& slot : slot_eval[i]) {
+        slot_set.insert(slot);
+      }
+    }
+    for (size_t i = 0; i < slot_list.size(); ++i) {
+      if (slot_set.find(slot_list[i]) != slot_set.end()) {
+        slot_index_to_replace_.insert(static_cast<uint16_t>(i));
+      }
+    }
+    for (int i = 0; i < auc_runner_thread_num_; ++i) {
+      random_ins_pool_list[i].SetSlotIndexToReplace(slot_index_to_replace_);
+    }
+    VLOG(0) << "AucRunner configuration: thread number[" << thread_num
+            << "], pool size[" << pool_size << "], runner_group[" << phase_num_
+            << "]";
+    VLOG(0) << "Slots that need to be evaluated:";
+    for (auto e : slot_index_to_replace_) {
+      VLOG(0) << e << ": " << slot_list[e];
+    }
+  }
+  void GetRandomReplace(const std::vector<Record>& pass_data);
+  void AddReplaceFeasign(boxps::PSAgentBase* p_agent, int feed_pass_thread_num);
+  void GetRandomData(const std::vector<Record>& pass_data,
+                     const std::unordered_set<uint16_t>& slots_to_replace,
+                     std::vector<Record>* result);
+  int Mode() const { return mode_; }
+
+ private:
+  int mode_ = 0;  // 0 means train/test 1 means auc_runner
+  int auc_runner_thread_num_ = 1;
+  bool init_done_ = false;
+  paddle::framework::Channel<int> pass_done_semi_;
+  std::unordered_set<uint16_t> slot_index_to_replace_;
+  std::vector<RecordCandidateList> random_ins_pool_list;
+  std::vector<size_t> replace_idx_;
 };
 #endif
 
@@ -810,7 +928,38 @@ class BoxHelper {
     VLOG(3) << "After PreLoadIntoMemory()";
   }
   void WaitFeedPassDone() { feed_data_thread_->join(); }
+  void SlotsShuffle(const std::set<std::string>& slots_to_replace) {
+#ifdef PADDLE_WITH_BOX_PS
+    auto box_ptr = BoxWrapper::GetInstance();
+    PADDLE_ENFORCE_EQ(box_ptr->Mode(), 1,
+                      platform::errors::PreconditionNotMet(
+                          "Should call InitForAucRunner first."));
+    box_ptr->FlipPhase();
+
+    std::unordered_set<uint16_t> index_slots;
+    dynamic_cast<MultiSlotDataset*>(dataset_)->PreprocessChannel(
+        slots_to_replace, index_slots);
+    const std::vector<Record>& pass_data =
+        dynamic_cast<MultiSlotDataset*>(dataset_)->GetSlotsOriginalData();
+    if (!get_random_replace_done_) {
+      box_ptr->GetRandomReplace(pass_data);
+      get_random_replace_done_ = true;
+    }
+    std::vector<Record> random_data;
+    random_data.resize(pass_data.size());
+    box_ptr->GetRandomData(pass_data, index_slots, &random_data);
+
+    auto new_input_channel = paddle::framework::MakeChannel<Record>();
+    new_input_channel->Open();
+    new_input_channel->Write(std::move(random_data));
+    new_input_channel->Close();
+    dynamic_cast<MultiSlotDataset*>(dataset_)->SetInputChannel(
+        new_input_channel);
+    if (dataset_->EnablePvMerge()) {
+      dataset_->PreprocessInstance();
+    }
+#endif
+  }
 
 #ifdef PADDLE_WITH_BOX_PS
   // notify boxps to feed this pass feasigns from SSD to memory
   static void FeedPassThread(const std::deque<Record>& t, int begin_index,
@@ -881,6 +1030,10 @@ class BoxHelper {
     for (size_t i = 0; i < tnum; ++i) {
       threads[i].join();
     }
+
+    if (box_ptr->Mode() == 1) {
+      box_ptr->AddReplaceFeasign(p_agent, tnum);
+    }
     VLOG(3) << "Begin call EndFeedPass in BoxPS";
     box_ptr->EndFeedPass(p_agent);
 #endif
@@ -892,7 +1045,10 @@ class BoxHelper {
   int year_;
   int month_;
   int day_;
+  bool get_random_replace_done_ = false;
 };
 
 }  // end namespace framework
 }  // end namespace paddle
+
+#include "paddle/fluid/framework/fleet/box_wrapper_impl.h"
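The new GetInstance/SetInstance split above makes construction explicit: SetInstance performs double-checked locking and must be called (once) before any GetInstance, which now only enforces that the instance exists. A condensed sketch of that contract, with Wrapper as a hypothetical stand-in; as the diff's own comment notes, the first call is expected to come from the main thread, so the double-checked pattern here is a belt-and-braces guard rather than a fully fenced atomic publication:

    #include <cassert>
    #include <memory>
    #include <mutex>

    class Wrapper {
     public:
      // Must be called once, before any GetInstance().
      static std::shared_ptr<Wrapper> SetInstance(int dim) {
        if (instance_ == nullptr) {
          static std::mutex mu;
          std::lock_guard<std::mutex> lock(mu);
          if (instance_ == nullptr) {
            instance_.reset(new Wrapper(dim));
          }
        }
        return instance_;
      }
      static std::shared_ptr<Wrapper> GetInstance() {
        assert(instance_ != nullptr && "call SetInstance() first");
        return instance_;
      }

     private:
      explicit Wrapper(int dim) : dim_(dim) {}
      int dim_;
      static std::shared_ptr<Wrapper> instance_;
    };
    std::shared_ptr<Wrapper> Wrapper::instance_ = nullptr;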
diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h
new file mode 100644
index 0000000000000000000000000000000000000000..b4e414dc83ef1000f2e1e09525699b5bb47d2441
--- /dev/null
+++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h
@@ -0,0 +1,163 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#ifdef PADDLE_WITH_BOX_PS
+#include
+namespace paddle {
+namespace framework {
+
+template <size_t EMBEDX_DIM, size_t EXPAND_EMBED_DIM>
+void BoxWrapper::PullSparseCase(const paddle::platform::Place& place,
+                                const std::vector<const uint64_t*>& keys,
+                                const std::vector<float*>& values,
+                                const std::vector<int64_t>& slot_lengths,
+                                const int hidden_size,
+                                const int expand_embed_dim) {
+  VLOG(3) << "Begin PullSparse";
+  platform::Timer all_timer;
+  platform::Timer pull_boxps_timer;
+  all_timer.Start();
+
+  int64_t total_length =
+      std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
+  auto buf = memory::AllocShared(
+      place, total_length *
+                 sizeof(boxps::FeatureValueGpu<EMBEDX_DIM, EXPAND_EMBED_DIM>));
+  boxps::FeatureValueGpu<EMBEDX_DIM, EXPAND_EMBED_DIM>* total_values_gpu =
+      reinterpret_cast<boxps::FeatureValueGpu<EMBEDX_DIM, EXPAND_EMBED_DIM>*>(
+          buf->ptr());
+
+  if (platform::is_cpu_place(place)) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Warning:: CPUPlace is not supported in PaddleBox now."));
+  } else if (platform::is_gpu_place(place)) {
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    VLOG(3) << "Begin copy keys, key_num[" << total_length << "]";
+    int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId();
+    LoDTensor& total_keys_tensor = keys_tensor[device_id];
+    uint64_t* total_keys = reinterpret_cast<uint64_t*>(
+        total_keys_tensor.mutable_data<int64_t>({total_length, 1}, place));
+
+    // construct slot_level lod info
+    auto slot_lengths_lod = slot_lengths;
+    for (size_t i = 1; i < slot_lengths_lod.size(); i++) {
+      slot_lengths_lod[i] += slot_lengths_lod[i - 1];
+    }
+    auto buf_key = memory::AllocShared(place, keys.size() * sizeof(uint64_t*));
+    auto buf_length =
+        memory::AllocShared(place, slot_lengths.size() * sizeof(int64_t));
+    uint64_t** gpu_keys = reinterpret_cast<uint64_t**>(buf_key->ptr());
+    int64_t* gpu_len = reinterpret_cast<int64_t*>(buf_length->ptr());
+    cudaMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*),
+               cudaMemcpyHostToDevice);
+    cudaMemcpy(gpu_len, slot_lengths_lod.data(),
+               slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice);
+
+    this->CopyKeys(place, gpu_keys, total_keys, gpu_len,
+                   static_cast<int>(slot_lengths.size()),
+                   static_cast<int>(total_length));
+    VLOG(3) << "Begin call PullSparseGPU in BoxPS";
+    pull_boxps_timer.Start();
+    int ret = boxps_ptr_->PullSparseGPU(
+        total_keys, reinterpret_cast<void*>(total_values_gpu),
+        static_cast<int>(total_length), device_id);
+    PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
+                                  "PullSparseGPU failed in BoxPS."));
+    pull_boxps_timer.Pause();
+
+    VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length
+            << "]";
+    this->CopyForPull(place, gpu_keys, values,
+                      reinterpret_cast<void*>(total_values_gpu), gpu_len,
+                      static_cast<int>(slot_lengths.size()), hidden_size,
+                      expand_embed_dim, total_length);
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "Please compile WITH_GPU option, because NCCL doesn't support "
+        "windows."));
+#endif
+  } else {
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddleBox: PullSparse Only Support CPUPlace or CUDAPlace Now."));
+  }
+  all_timer.Pause();
+  VLOG(1) << "PullSparse total costs: " << all_timer.ElapsedSec()
+          << " s, of which BoxPS costs: " << pull_boxps_timer.ElapsedSec()
+          << " s";
+  VLOG(3) << "End PullSparse";
+}
+
+template <size_t EMBEDX_DIM, size_t EXPAND_EMBED_DIM>
+void BoxWrapper::PushSparseGradCase(
+    const paddle::platform::Place& place,
+    const std::vector<const uint64_t*>& keys,
+    const std::vector<const float*>& grad_values,
+    const std::vector<int64_t>& slot_lengths, const int hidden_size,
+    const int expand_embed_dim, const int batch_size) {
+  VLOG(3) << "Begin PushSparseGrad";
+  platform::Timer all_timer;
+  platform::Timer push_boxps_timer;
+  all_timer.Start();
+  int64_t total_length =
+      std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL);
+  auto buf = memory::AllocShared(
+      place,
+      total_length *
+          sizeof(boxps::FeaturePushValueGpu<EMBEDX_DIM, EXPAND_EMBED_DIM>));
+  boxps::FeaturePushValueGpu<EMBEDX_DIM, EXPAND_EMBED_DIM>*
+      total_grad_values_gpu = reinterpret_cast<
+          boxps::FeaturePushValueGpu<EMBEDX_DIM, EXPAND_EMBED_DIM>*>(
+          buf->ptr());
+  if (platform::is_cpu_place(place)) {
+    PADDLE_THROW(platform::errors::Unimplemented(
+        "Warning:: CPUPlace is not supported in PaddleBox now."));
+  } else if (platform::is_gpu_place(place)) {
+#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32)
+    int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId();
+    LoDTensor& cached_total_keys_tensor = keys_tensor[device_id];
+    uint64_t* total_keys =
+        reinterpret_cast<uint64_t*>(cached_total_keys_tensor.data<int64_t>());
+    VLOG(3) << "Begin copy grad tensor to boxps struct";
+    this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths,
+                      hidden_size, expand_embed_dim, total_length, batch_size);
+
+    VLOG(3) << "Begin call PushSparseGPU in BoxPS";
+    push_boxps_timer.Start();
+    int ret = boxps_ptr_->PushSparseGPU(
+        total_keys, reinterpret_cast<void*>(total_grad_values_gpu),
+        static_cast<int>(total_length),
+        BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId());
+    PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet(
+                                  "PushSparseGPU failed in BoxPS."));
+    push_boxps_timer.Pause();
+#else
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "Please compile WITH_GPU option, because NCCL doesn't support "
+        "windows."));
+#endif
+  } else {
+    PADDLE_THROW(platform::errors::PreconditionNotMet(
+        "PaddleBox: PushSparseGrad Only Support CPUPlace or CUDAPlace Now."));
+  }
+  all_timer.Pause();
+  VLOG(1) << "PushSparseGrad total cost: " << all_timer.ElapsedSec()
+          << " s, of which BoxPS cost: " << push_boxps_timer.ElapsedSec()
+          << " s";
+  VLOG(3) << "End PushSparseGrad";
+}
+
+}  // namespace framework
+}  // namespace paddle
+#endif
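PullSparseCase/PushSparseGradCase above first reduce the per-slot lengths to a total (std::accumulate) and then turn slot_lengths into an inclusive prefix sum (the "lod") before copying both to the GPU; the kernels later binary-search this lod. A small sketch of those two reductions:

    // Sketch of the slot_lengths -> slot_lengths_lod conversion above.
    #include <cstdint>
    #include <numeric>
    #include <vector>

    // In-place inclusive prefix sum: {3, 2, 4} -> {3, 5, 9};
    // lod[i] is the end offset of slot i, lod.back() the total length.
    std::vector<int64_t> ToLod(std::vector<int64_t> lengths) {
      for (size_t i = 1; i < lengths.size(); ++i) lengths[i] += lengths[i - 1];
      return lengths;
    }

    // Same reduction the diff performs with std::accumulate.
    int64_t TotalLength(const std::vector<int64_t>& lengths) {
      return std::accumulate(lengths.begin(), lengths.end(), int64_t{0});
    }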
*/ #include "paddle/fluid/framework/ir/memory_optimize_pass/memory_optimization_var_info.h" #include "paddle/fluid/framework/ir/memory_optimize_pass/reference_count_pass_helper.h" #include "paddle/fluid/framework/ir/multi_devices_graph_pass/set_reader_device_info_utils.h" +#include "paddle/fluid/platform/event.h" #include "paddle/fluid/platform/profiler.h" DECLARE_double(eager_delete_tensor_gb); @@ -820,6 +821,8 @@ void ParallelExecutor::BCastParamsToDevices( FetchResultType ParallelExecutor::Run( const std::vector &fetch_tensors, bool return_merged) { VLOG(3) << "enter ParallelExecutor Run"; + platform::RecordEvent parallel_executor_event( + "ParallelExecutor::Run", paddle::platform::EventRole::kSpecial); #ifdef WITH_GPERFTOOLS if (gProfileStarted) { ProfilerFlush(); diff --git a/paddle/fluid/framework/section_worker.cc b/paddle/fluid/framework/section_worker.cc index 1d644cdd7fb76ff731c4533b3129ad3fa2c724c2..df8bd61554e590fb0d83960a0fca63f78229c9a4 100644 --- a/paddle/fluid/framework/section_worker.cc +++ b/paddle/fluid/framework/section_worker.cc @@ -211,7 +211,7 @@ void SectionWorker::TrainFiles() { auto& metric_list = box_ptr->GetMetricList(); for (auto iter = metric_list.begin(); iter != metric_list.end(); iter++) { auto* metric_msg = iter->second; - if (metric_msg->IsJoin() != box_ptr->PassFlag()) { + if (box_ptr->Phase() != metric_msg->MetricPhase()) { continue; } metric_msg->add_data(exe_scope); @@ -367,7 +367,7 @@ void SectionWorker::TrainFilesWithProfiler() { auto& metric_list = box_ptr->GetMetricList(); for (auto iter = metric_list.begin(); iter != metric_list.end(); iter++) { auto* metric_msg = iter->second; - if (metric_msg->IsJoin() != box_ptr->PassFlag()) { + if (box_ptr->Phase() != metric_msg->MetricPhase()) { continue; } metric_msg->add_data(exe_scope); diff --git a/paddle/fluid/operators/controlflow/op_variant.h b/paddle/fluid/operators/controlflow/op_variant.h index 3b1e15b1017abf8b90c0f9d7aaf106655ac70d31..9af993f1006c1e0107951c89460abe76c1561064 100644 --- a/paddle/fluid/operators/controlflow/op_variant.h +++ b/paddle/fluid/operators/controlflow/op_variant.h @@ -43,7 +43,8 @@ class OpVariant { const AttrType &Attr(const std::string &name) const { auto &attrs = Attrs(); auto it = attrs.find(name); - PADDLE_ENFORCE(it != attrs.end(), "Cannot find attribute %s", name); + PADDLE_ENFORCE_NE(it, attrs.end(), platform::errors::NotFound( + "Cannot find attribute %s.", name)); return BOOST_GET_CONST(AttrType, it->second); } diff --git a/paddle/fluid/operators/dequantize_log_op.cc b/paddle/fluid/operators/dequantize_log_op.cc index bfd26061e329f0bd6bf52d5fec818ec168eaf1df..2ecd54f7edde63ab5f5256694117cfb15be69384 100644 --- a/paddle/fluid/operators/dequantize_log_op.cc +++ b/paddle/fluid/operators/dequantize_log_op.cc @@ -31,9 +31,9 @@ struct DequantizeFunctor { int ind = in->numel(); for (size_t i = 0; i < (unsigned)ind; i++) { if (input_data[i] < 0) { - output_data[i] = -std::pow(2.0, dict_data[input_data[i] + 128]); + output_data[i] = -dict_data[input_data[i] + 128]; } else { - output_data[i] = std::pow(2.0, dict_data[input_data[i]]); + output_data[i] = dict_data[input_data[i]]; } } } diff --git a/paddle/fluid/operators/dequantize_log_op.cu b/paddle/fluid/operators/dequantize_log_op.cu index 57bad318ab6b649b9ad9dd1a316f75ff7e8d86a5..9f63f8ed6f52019a8d15d2a4ecc3ec0ecc85e165 100644 --- a/paddle/fluid/operators/dequantize_log_op.cu +++ b/paddle/fluid/operators/dequantize_log_op.cu @@ -26,9 +26,9 @@ __global__ void KeDequantize(const T* in, const float* dict, int num, const int 
idx = threadIdx.x + blockIdx.x * blockDim.x; if (idx < num) { if (in[idx] < 0) { - out[idx] = -std::pow(static_cast(2.0), dict[in[idx] + 128]); + out[idx] = -dict[in[idx] + 128]; } else { - out[idx] = std::pow(static_cast(2.0), dict[in[idx]]); + out[idx] = dict[in[idx]]; } } } diff --git a/paddle/fluid/operators/elementwise/elementwise_op.h b/paddle/fluid/operators/elementwise/elementwise_op.h index deff0ff8d5aa4656372a0d18489cd704de8c9efa..85d501f6bf7f8f856040c120d49a73a4f4d6696d 100644 --- a/paddle/fluid/operators/elementwise/elementwise_op.h +++ b/paddle/fluid/operators/elementwise/elementwise_op.h @@ -104,7 +104,7 @@ class ElementwiseOp : public framework::OperatorWithKernel { int axis = ctx.Attr("axis"); int rankdiff = ctx.Input("X")->dims().size() - ctx.Input("Y")->dims().size(); - return (axis == -1) || (axis == rankdiff); + return (rankdiff == 0) || (axis == -1) || (axis == rankdiff); }; if (platform::CanMKLDNNBeUsed(ctx) && @@ -243,9 +243,7 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { #ifdef PADDLE_WITH_MKLDNN // If broadcasting is needed, use native implementation auto CanMKLDNNElementwiseAddGradBeUsed = [&]() { - auto dx = ctx.Output(framework::GradVarName("X")); - auto dy = ctx.Output(framework::GradVarName("Y")); - return (dx != nullptr && dy != nullptr && dx->dims() == dy->dims()); + return (ctx.Input("X")->dims() == ctx.Input("Y")->dims()); }; if (platform::CanMKLDNNBeUsed(ctx) && diff --git a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc index 89face8faaeed8c306ebd482dfb5d4371a92b6a3..98b79d6bb22fcff09533c2e9325d94659b3ef0c1 100644 --- a/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc +++ b/paddle/fluid/operators/elementwise/mkldnn/elementwise_add_mkldnn_op.cc @@ -85,6 +85,7 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel { in->set_format(out->format()); }; + // TODO(jczaja): Double check if vcopy works for blocked data auto blas = math::GetBlas(ctx); if (dx) { blas.VCOPY(dout->numel(), dout->data(), diff --git a/paddle/fluid/operators/hierarchical_sigmoid_op.cc b/paddle/fluid/operators/hierarchical_sigmoid_op.cc index dd72a212d3642ce4ae081125cc8e8bd12fb86af1..05d521be5a10643906b9cd5a98a19873b4ee64f4 100644 --- a/paddle/fluid/operators/hierarchical_sigmoid_op.cc +++ b/paddle/fluid/operators/hierarchical_sigmoid_op.cc @@ -257,7 +257,7 @@ class HierarchicalSigmoidGradOpGradVarTypeInference }; DECLARE_NO_NEED_BUFFER_VARS_INFERER( - HierarchicalSigmoidGradOpNoNeedBufferVarInference, "Bias"); + HierarchicalSigmoidGradOpNoNeedBufferVarInferer, "Bias"); } // namespace operators } // namespace paddle @@ -270,7 +270,7 @@ REGISTER_OPERATOR( ops::HierarchicalSigmoidGradMaker); REGISTER_OPERATOR(hierarchical_sigmoid_grad, ops::HierarchicalSigmoidGradOp, ops::HierarchicalSigmoidGradOpGradVarTypeInference, - ops::HierarchicalSigmoidGradOpNoNeedBufferVarInference); + ops::HierarchicalSigmoidGradOpNoNeedBufferVarInferer); REGISTER_OP_CPU_KERNEL( hierarchical_sigmoid, ops::HierarchicalSigmoidOpKernel, diff --git a/paddle/fluid/operators/index_select_op.cc b/paddle/fluid/operators/index_select_op.cc index 31ef7b7662b6996f4402cb2120f4a0029cf78ddf..60ca7e2fe7cfd3070b353bca380d48acadaebe8a 100644 --- a/paddle/fluid/operators/index_select_op.cc +++ b/paddle/fluid/operators/index_select_op.cc @@ -138,7 +138,7 @@ class IndexSelectGradMaker : public framework::SingleGradOpMaker { } }; 
-DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSelectGradNoNeedBufferVarsInference, +DECLARE_NO_NEED_BUFFER_VARS_INFERER(IndexSelectGradNoNeedBufferVarsInferer, "X"); } // namespace operators } // namespace paddle @@ -148,7 +148,7 @@ REGISTER_OPERATOR(index_select, ops::IndexSelectOp, ops::IndexSelectOpMaker, ops::IndexSelectGradMaker, ops::IndexSelectGradMaker); REGISTER_OPERATOR(index_select_grad, ops::IndexSelectGradOp, - ops::IndexSelectGradNoNeedBufferVarsInference); + ops::IndexSelectGradNoNeedBufferVarsInferer); REGISTER_OP_CPU_KERNEL( index_select, ops::IndexSelectKernel, diff --git a/paddle/fluid/operators/instance_norm_op.cc b/paddle/fluid/operators/instance_norm_op.cc index 2609d243705bbcdfb1acc89c36d5660cc657182c..a915c018ab9224e68e57e0f4125e5ee192521f14 100644 --- a/paddle/fluid/operators/instance_norm_op.cc +++ b/paddle/fluid/operators/instance_norm_op.cc @@ -603,7 +603,7 @@ class InstanceNormDoubleGradKernel } }; -DECLARE_INPLACE_OP_INFERER(InstanceNormDoubleGradOpInplaceInference, +DECLARE_INPLACE_OP_INFERER(InstanceNormDoubleGradOpInplaceInferer, {"DY", "DDY"}); } // namespace operators @@ -618,7 +618,7 @@ REGISTER_OPERATOR(instance_norm_grad, ops::InstanceNormGradOp, ops::InstanceNormDoubleGradMaker, ops::InstanceNormDoubleGradMaker); REGISTER_OPERATOR(instance_norm_grad_grad, ops::InstanceNormDoubleGradOp, - ops::InstanceNormDoubleGradOpInplaceInference); + ops::InstanceNormDoubleGradOpInplaceInferer); REGISTER_OP_CPU_KERNEL( instance_norm, diff --git a/paddle/fluid/operators/interpolate_op.cc b/paddle/fluid/operators/interpolate_op.cc index 49da719880d249e7b3d9bdb21f253105f7270576..1e99e22e12b2a23685dad742f175fd2b0684d334 100644 --- a/paddle/fluid/operators/interpolate_op.cc +++ b/paddle/fluid/operators/interpolate_op.cc @@ -585,7 +585,7 @@ class InterpolateGradMaker : public framework::SingleGradOpMaker { } }; -DECLARE_NO_NEED_BUFFER_VARS_INFERER(InterpolateGradNoNeedBufferVarsInference, +DECLARE_NO_NEED_BUFFER_VARS_INFERER(InterpolateGradNoNeedBufferVarsInferer, "X"); } // namespace operators @@ -596,22 +596,22 @@ REGISTER_OPERATOR(bilinear_interp, ops::InterpolateOp, ops::InterpolateOpMaker, ops::InterpolateGradMaker, ops::InterpolateGradMaker); REGISTER_OPERATOR(bilinear_interp_grad, ops::InterpolateOpGrad, - ops::InterpolateGradNoNeedBufferVarsInference); + ops::InterpolateGradNoNeedBufferVarsInferer); REGISTER_OPERATOR(nearest_interp, ops::InterpolateOp, ops::InterpolateOpMaker, ops::InterpolateGradMaker, ops::InterpolateGradMaker); REGISTER_OPERATOR(nearest_interp_grad, ops::InterpolateOpGrad, - ops::InterpolateGradNoNeedBufferVarsInference); + ops::InterpolateGradNoNeedBufferVarsInferer); REGISTER_OPERATOR(trilinear_interp, ops::InterpolateOp, ops::InterpolateOpMaker, ops::InterpolateGradMaker, ops::InterpolateGradMaker); REGISTER_OPERATOR(trilinear_interp_grad, ops::InterpolateOpGrad, - ops::InterpolateGradNoNeedBufferVarsInference); + ops::InterpolateGradNoNeedBufferVarsInferer); REGISTER_OPERATOR(bicubic_interp, ops::InterpolateOp, ops::InterpolateOpMaker, ops::InterpolateGradMaker, ops::InterpolateGradMaker); REGISTER_OPERATOR(bicubic_interp_grad, ops::InterpolateOpGrad, - ops::InterpolateGradNoNeedBufferVarsInference); + ops::InterpolateGradNoNeedBufferVarsInferer); REGISTER_OP_CPU_KERNEL(bilinear_interp, ops::InterpolateKernel, ops::InterpolateKernel, ops::InterpolateKernel); @@ -631,7 +631,7 @@ REGISTER_OPERATOR(linear_interp, ops::InterpolateOp, ops::InterpolateOpMaker, ops::InterpolateGradMaker, ops::InterpolateGradMaker); 
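// The Inference -> Inferer renames in this patch are mechanical; the
// declare-then-register pattern stays exactly as before. For reference,
// with a hypothetical operator foo_grad (names illustrative only):
// DECLARE_NO_NEED_BUFFER_VARS_INFERER(FooGradNoNeedBufferVarsInferer, "X");
// REGISTER_OPERATOR(foo_grad, ops::FooGradOp,
//                   ops::FooGradNoNeedBufferVarsInferer);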
REGISTER_OPERATOR(linear_interp_grad, ops::InterpolateOpGrad, - ops::InterpolateGradNoNeedBufferVarsInference); + ops::InterpolateGradNoNeedBufferVarsInferer); REGISTER_OP_CPU_KERNEL(linear_interp, ops::InterpolateKernel, ops::InterpolateKernel, ops::InterpolateKernel); diff --git a/paddle/fluid/operators/kldiv_loss_op.cc b/paddle/fluid/operators/kldiv_loss_op.cc index 7286e2e6d3144528d39a3980908aeeddd1ad9823..a78d8ec10149db5a1f8d585cb06bb08ea6ca5a5f 100644 --- a/paddle/fluid/operators/kldiv_loss_op.cc +++ b/paddle/fluid/operators/kldiv_loss_op.cc @@ -166,7 +166,7 @@ class KLDivLossOpGradMaker : public framework::SingleGradOpMaker { } }; -DECLARE_NO_NEED_BUFFER_VARS_INFERER(KLDivLossGradNoNeedBufferVarInference, "X"); +DECLARE_NO_NEED_BUFFER_VARS_INFERER(KLDivLossGradNoNeedBufferVarInferer, "X"); } // namespace operators } // namespace paddle @@ -176,7 +176,7 @@ REGISTER_OPERATOR(kldiv_loss, ops::KLDivLossOp, ops::KLDivLossOpMaker, ops::KLDivLossOpGradMaker, ops::KLDivLossOpGradMaker); REGISTER_OPERATOR(kldiv_loss_grad, ops::KLDivLossOpGrad, - ops::KLDivLossGradNoNeedBufferVarInference); + ops::KLDivLossGradNoNeedBufferVarInferer); REGISTER_OP_CPU_KERNEL( kldiv_loss, ops::KLDivLossKernel, ops::KLDivLossKernel); diff --git a/paddle/fluid/operators/layer_norm_op.cc b/paddle/fluid/operators/layer_norm_op.cc index 3c308ecd5100fe4f8c82d706489a11acf15a61ea..89d8b57505da242f365d5bc5c03dce492edc76d4 100644 --- a/paddle/fluid/operators/layer_norm_op.cc +++ b/paddle/fluid/operators/layer_norm_op.cc @@ -220,7 +220,7 @@ class LayerNormGradOpMaker : public framework::SingleGradOpMaker { } }; -DECLARE_NO_NEED_BUFFER_VARS_INFERER(LayerNormGradNoNeedBufferVarInference, +DECLARE_NO_NEED_BUFFER_VARS_INFERER(LayerNormGradNoNeedBufferVarInferer, "Bias"); } // namespace operators @@ -231,7 +231,7 @@ REGISTER_OPERATOR(layer_norm, ops::LayerNormOp, ops::LayerNormOpMaker, ops::LayerNormGradOpMaker, ops::LayerNormGradOpMaker); REGISTER_OPERATOR(layer_norm_grad, ops::LayerNormGradOp, - ops::LayerNormGradNoNeedBufferVarInference); + ops::LayerNormGradNoNeedBufferVarInferer); REGISTER_OP_CPU_KERNEL( layer_norm, ops::LayerNormKernel, ops::LayerNormKernel); diff --git a/paddle/fluid/operators/linear_chain_crf_op.cc b/paddle/fluid/operators/linear_chain_crf_op.cc index 72eb7fb21d110be19517adcebd554772905d4057..f2ccbb2f21a8fd14e67e9fb165cf9b884539b302 100644 --- a/paddle/fluid/operators/linear_chain_crf_op.cc +++ b/paddle/fluid/operators/linear_chain_crf_op.cc @@ -345,7 +345,7 @@ class LinearChainCRFGradMaker : public framework::SingleGradOpMaker { } }; -DECLARE_NO_NEED_BUFFER_VARS_INFERER(LinearChainCRFGradNoNeedBufferVarsInference, +DECLARE_NO_NEED_BUFFER_VARS_INFERER(LinearChainCRFGradNoNeedBufferVarsInferer, "Transition", "Emission"); } // namespace operators @@ -357,7 +357,7 @@ REGISTER_OPERATOR(linear_chain_crf, ops::LinearChainCRFOp, ops::LinearChainCRFGradMaker, ops::LinearChainCRFGradMaker); REGISTER_OPERATOR(linear_chain_crf_grad, ops::LinearChainCRFGradOp, - ops::LinearChainCRFGradNoNeedBufferVarsInference); + ops::LinearChainCRFGradNoNeedBufferVarsInferer); REGISTER_OP_CPU_KERNEL( linear_chain_crf, ops::LinearChainCRFOpKernel, diff --git a/paddle/fluid/operators/lod_reset_op.cc b/paddle/fluid/operators/lod_reset_op.cc index 377ecacabc1750aa2c4b6e6883cf2b2d423f5d3c..5616309683365c30fea9907268ed87a6f2cd4a8d 100644 --- a/paddle/fluid/operators/lod_reset_op.cc +++ b/paddle/fluid/operators/lod_reset_op.cc @@ -223,7 +223,7 @@ DECLARE_INPLACE_OP_INFERER(LoDResetGradInplaceInferer, {framework::GradVarName("Out"), 
framework::GradVarName("X")}); -DECLARE_NO_NEED_BUFFER_VARS_INFERER(LoDResetGradNoNeedBufferVarInference, "X"); +DECLARE_NO_NEED_BUFFER_VARS_INFERER(LoDResetGradNoNeedBufferVarInferer, "X"); } // namespace operators } // namespace paddle @@ -234,7 +234,7 @@ REGISTER_OPERATOR(lod_reset, ops::LoDResetOp, ops::LoDResetOpMaker, ops::LoDResetGradMaker, ops::LoDResetOpVarTypeInference, ops::LoDResetInplaceInferer); REGISTER_OPERATOR(lod_reset_grad, ops::LoDResetGradOp, - ops::LoDResetGradNoNeedBufferVarInference, + ops::LoDResetGradNoNeedBufferVarInferer, ops::LoDResetGradInplaceInferer); REGISTER_OP_CPU_KERNEL( diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc index 158080cf8a390eb627ae592c5aa6ec0d4a960c72..9b1519b54696c8ecd90c98f46d3826d31526894a 100644 --- a/paddle/fluid/operators/lookup_table_op.cc +++ b/paddle/fluid/operators/lookup_table_op.cc @@ -130,7 +130,7 @@ or not. And the output only shares the LoD information with input Ids. } }; -DECLARE_NO_NEED_BUFFER_VARS_INFERER(LookupTableGradOpNoBuffer, "W"); +DECLARE_NO_NEED_BUFFER_VARS_INFERER(LookupTableGradOpNoBufferVarsInferer, "W"); template class LookupTableGradOpMaker : public framework::SingleGradOpMaker { @@ -198,7 +198,7 @@ REGISTER_OPERATOR(lookup_table, ops::LookupTableOp, ops::LookupTableOpMaker, ops::LookupTableGradOpMaker); REGISTER_OPERATOR(lookup_table_grad, ops::LookupTableOpGrad, - ops::LookupTableGradOpNoBuffer, + ops::LookupTableGradOpNoBufferVarsInferer, ops::LookupTableOpGradVarTypeInference); REGISTER_OP_CPU_KERNEL(lookup_table, ops::LookupTableKernel, diff --git a/paddle/fluid/operators/lookup_table_v2_op.cc b/paddle/fluid/operators/lookup_table_v2_op.cc index 53c9e70dc7fb882c5e4f89f2f6623f92495a51e6..122e01f146ccddbdc8e72aba67d47855ad30b0eb 100644 --- a/paddle/fluid/operators/lookup_table_v2_op.cc +++ b/paddle/fluid/operators/lookup_table_v2_op.cc @@ -118,7 +118,8 @@ or not. And the output only shares the LoD information with input Ids. 
} }; -DECLARE_NO_NEED_BUFFER_VARS_INFERER(LookupTableV2GradOpNoBuffer, "W"); +DECLARE_NO_NEED_BUFFER_VARS_INFERER(LookupTableV2GradOpNoBufferVarsInferer, + "W"); template class LookupTableV2GradOpMaker : public framework::SingleGradOpMaker { @@ -187,7 +188,7 @@ REGISTER_OPERATOR(lookup_table_v2, ops::LookupTableV2Op, ops::LookupTableV2GradOpMaker); REGISTER_OPERATOR(lookup_table_v2_grad, ops::LookupTableV2OpGrad, - ops::LookupTableV2GradOpNoBuffer, + ops::LookupTableV2GradOpNoBufferVarsInferer, ops::LookupTableV2OpGradVarTypeInference); REGISTER_OP_CPU_KERNEL(lookup_table_v2, ops::LookupTableV2Kernel, diff --git a/paddle/fluid/operators/mean_op.cc b/paddle/fluid/operators/mean_op.cc index 7e75905bc4975b59772cb0d22d8a6db3520e1803..764529a15b6a2b2c98f9ac727d971b8b0b8d1855 100644 --- a/paddle/fluid/operators/mean_op.cc +++ b/paddle/fluid/operators/mean_op.cc @@ -83,7 +83,7 @@ class MeanGradMaker : public framework::SingleGradOpMaker { } }; -DECLARE_NO_NEED_BUFFER_VARS_INFERER(MeanGradNoNeedBufferVarsInference, "X"); +DECLARE_NO_NEED_BUFFER_VARS_INFERER(MeanGradNoNeedBufferVarsInferer, "X"); } // namespace operators } // namespace paddle @@ -93,7 +93,7 @@ REGISTER_OPERATOR(mean, ops::MeanOp, ops::MeanOpMaker, ops::MeanOpInferVarType, ops::MeanGradMaker, ops::MeanGradMaker); REGISTER_OPERATOR(mean_grad, ops::MeanGradOp, - ops::MeanGradNoNeedBufferVarsInference); + ops::MeanGradNoNeedBufferVarsInferer); REGISTER_OP_CPU_KERNEL( mean, ops::MeanKernel, ops::MeanKernel); diff --git a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc index c8e81362c3fa967b600af9af2f6f5490e648dda0..86fe40c4f6a825116cdf8fe884ae06cc3e7bbc34 100644 --- a/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/activation_mkldnn_op.cc @@ -62,8 +62,9 @@ class MKLDNNActivationGradKernel template void eltwise_forward(const framework::ExecutionContext &ctx, mkldnn::algorithm algorithm) { - PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet( + "Operator DNNL eltwise_forward must use CPUPlace")); auto &dev_ctx = ctx.template device_context(); const auto *x = ctx.Input("X"); diff --git a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc index b7be0045258e7aafb64912f2cc75c9c9e05413b6..fde4900c6d3c876151adae061182277482899739 100644 --- a/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/batch_norm_mkldnn_op.cc @@ -144,7 +144,11 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel { auto src_tz = paddle::framework::vectorize(x->dims()); auto scale_tz = paddle::framework::vectorize(scale->dims()); - PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1"); + PADDLE_ENFORCE_EQ( + scale_tz.size(), 1, + platform::errors::InvalidArgument( + "Dims of scale tensor must be 1, but received scale's size is %d", + scale_tz.size())); const unsigned int C = scale_tz[0]; // MKLDNN requires a single piece of memory for scale and shift/bias data @@ -248,7 +252,11 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel { auto src_tz = paddle::framework::vectorize(x->dims()); auto scale_tz = paddle::framework::vectorize(scale->dims()); - PADDLE_ENFORCE(scale_tz.size() == 1, "Dims of scale tensor is NOT 1"); + PADDLE_ENFORCE_EQ( + scale_tz.size(), 1, +
platform::errors::InvalidArgument( + "Dims of scale tensor must be 1, but received scale's size is %d", + scale_tz.size())); const unsigned int C = scale_tz[0]; diff --git a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc index bd9bee8873250da1cefba7ef8903a61447a438da..40f64800a0b81a161805857cb3e0a3855f386720 100644 --- a/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/concat_mkldnn_op.cc @@ -134,6 +134,15 @@ class ConcatMKLDNNOpKernel : public paddle::framework::OpKernel { EnforceLayouts(multi_input); Tensor* output = ctx.Output("Out"); int concat_axis = ctx.Attr("axis"); + const int rank = multi_input[0]->dims().size(); + PADDLE_ENFORCE_EQ( + concat_axis >= -rank && concat_axis < rank, true, + platform::errors::InvalidArgument( + "The axis is expected to be in range of [%d, %d), but got %d", + -rank, rank, concat_axis)); + if (concat_axis < 0) { + concat_axis = concat_axis + rank; + } auto& dev_ctx = ctx.template device_context(); auto place = GetCpuPlace(ctx); diff --git a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc index c6f782046c95271aa4c63106ca3bd00617eaf43c..a01bf8f9b9cfc04d69d5acf8316a2d1f68142ee5 100644 --- a/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_mkldnn_op.cc @@ -94,8 +94,9 @@ template class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), - platform::errors::InvalidArgument("It must use CPUPlace.")); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet( + "Operator DNNL Conv must use CPUPlace")); bool is_INT8 = std::is_same::value || std::is_same::value; if (!is_INT8) { @@ -784,9 +785,9 @@ template class ConvMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), - platform::errors::InvalidArgument("It must use CPUPlace.")); - + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet( + "Operator DNNL ConvGrad must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); diff --git a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc index bed0885c0d262da9f8c964da86a5f5aa3ea9d50b..48279658c80e93428f940c40e61d7b9af23f4ee3 100644 --- a/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/conv_transpose_mkldnn_op.cc @@ -29,9 +29,9 @@ template class ConvTransposeMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), - platform::errors::InvalidArgument("It must use CPUPlace.")); - + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet( + "Operator DNNL ConvTranspose must use CPUPlace")); const bool is_test = ctx.Attr("is_test"); PADDLE_ENFORCE_EQ(is_test, true, platform::errors::InvalidArgument( diff --git a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc 
index 5b025fa11e3f306597fc0888dd3b7ff798606b41..817711f3157b1bd1e5fda335c62f6e04c486e479 100644 --- a/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/lrn_mkldnn_op.cc @@ -27,10 +27,12 @@ class LRNMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { const bool is_float_type = std::is_same::value; - PADDLE_ENFORCE(is_float_type, "MKLDNN LRN must use float data."); - PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), - "MKLDNN LRN must use CPUPlace."); - + PADDLE_ENFORCE_EQ( + is_float_type, true, + platform::errors::PreconditionNotMet("DNNL LRN must use float data.")); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet( + "Operator DNNL LRN must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); auto x = ctx.Input("X"); @@ -93,12 +95,16 @@ class LRNMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { const bool is_float_type = std::is_same::value; - PADDLE_ENFORCE(is_float_type, "MKLDNN LRN must use float data."); - PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), - "MKLDNN LRN must use CPUPlace."); - PADDLE_ENFORCE( - !ctx.Attr("is_test"), - "is_test attribute should be set to False in training phase."); + PADDLE_ENFORCE_EQ(is_float_type, true, + platform::errors::PreconditionNotMet( + "DNNL LRN GradOpKernel must use float data.")); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet( + "Operator DNNL LRNGrad must use CPUPlace")); + PADDLE_ENFORCE_EQ( + ctx.Attr("is_test"), false, + platform::errors::PreconditionNotMet( + "is_test attribute should be set to False in training phase.")); auto x = ctx.Input("X"); auto mid = ctx.Input("MidOut"); diff --git a/paddle/fluid/operators/mkldnn/mkldnn_activation_op.h b/paddle/fluid/operators/mkldnn/mkldnn_activation_op.h index 6c294a9518653ed6de6b8699cfc44c4539661fde..a7a4f9c6975b3c7220c1922dd3fbcb0e03ab163c 100644 --- a/paddle/fluid/operators/mkldnn/mkldnn_activation_op.h +++ b/paddle/fluid/operators/mkldnn/mkldnn_activation_op.h @@ -30,12 +30,8 @@ class MKLDNNActivationKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& context) const override { - PADDLE_ENFORCE(context.Input("X") != nullptr, - "Cannot get input tensor X, variable name = %s", - context.InputName("X")); - PADDLE_ENFORCE(context.Output("Out") != nullptr, - "Cannot find output tensor Out, variable name = %s", - context.OutputName("Out")); + OP_INOUT_CHECK(context.HasInput("X"), "Input", "X", "Activation"); + OP_INOUT_CHECK(context.HasOutput("Out"), "Output", "Out", "Activation"); Functor functor; auto attrs = functor.GetAttrs(); diff --git a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc index 862d6508c5132ea71930fe2a62a8d33ac7036246..1dd1ad117862d92aa8d358f04f8b03fec7abafff 100644 --- a/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/mul_mkldnn_op.cc @@ -333,9 +333,9 @@ template class MulMKLDNNKernel : public framework::OpKernel { public: void Compute(const ExecutionContext &ctx) const override { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); - + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet( +
"Operator DNNL Mul must use CPUPlace")); auto &dev_ctx = ctx.template device_context(); const auto &mkldnn_engine = dev_ctx.GetEngine(); diff --git a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc index 4164e067e5dfffbd3e4166ac642d8ae1e20fb186..2a8b332521804ccebdbd4e6914b2763abfb5dbdc 100644 --- a/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/pool_mkldnn_op.cc @@ -33,61 +33,19 @@ template class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet( + "Operator DNNL Pool must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); const Tensor* input = ctx.Input("X"); Tensor* output = ctx.Output("Out"); - PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, - "Wrong layout set for Input tensor"); - PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, - "Wrong format set for Input tensor"); - - std::string pooling_type = ctx.Attr("pooling_type"); - - std::vector ksize_temp = ctx.Attr>("ksize"); - std::vector ksize(begin(ksize_temp), end(ksize_temp)); - - std::vector strides_temp = ctx.Attr>("strides"); - std::vector strides(begin(strides_temp), end(strides_temp)); - - std::vector paddings_temp = ctx.Attr>("paddings"); - std::vector paddings(begin(paddings_temp), end(paddings_temp)); - - bool global_pooling = ctx.Attr("global_pooling"); - std::string padding_algorithm = ctx.Attr("padding_algorithm"); - - // Only 2D pooling is supported now - PADDLE_ENFORCE_EQ(ksize.size(), 2, "ksize must be 2D, i.e. 2D pooling"); - PADDLE_ENFORCE_EQ(pooling_type == "max" || pooling_type == "avg", true, - "pooling_type must be 'max' or 'avg'"); - PADDLE_ENFORCE_EQ(input->dims().size(), 4, - "Input dim must be with 4, i.e. 
NCHW"); - - auto input_dims = input->dims(); - framework::DDim data_dims = - framework::slice_ddim(input_dims, 2, input_dims.size()); - - if (global_pooling) { - UpdateKsize(&ksize, data_dims); - } - - UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, data_dims, - strides, ksize); - - auto src_tz = paddle::framework::vectorize(input->dims()); - auto dst_tz = paddle::framework::vectorize(output->dims()); - - auto is_test = ctx.Attr("is_test"); - - platform::PoolingMKLDNNHandler handler( - src_tz, dst_tz, ksize, strides, paddings, pooling_type, - ctx.Attr("ceil_mode"), input->format(), - paddle::framework::ToMKLDNNDataType(input->type()), is_test, dev_ctx, - ctx.GetPlace(), ctx.OutputName("Out"), ctx.Attr("exclusive")); + platform::PoolingMKLDNNHandler handler(ctx, dev_ctx, mkldnn_engine, + ctx.GetPlace(), input, output, + ctx.OutputName("Out")); auto src_memory = handler.AcquireSrcMemory(input); auto dst_memory = handler.AcquireDstMemory(output); @@ -95,7 +53,8 @@ class PoolMKLDNNOpKernel : public paddle::framework::OpKernel { auto pool_p = handler.AcquireForwardPrimitive(); mkldnn::stream astream(dev_ctx.GetEngine()); - if ((is_test == false) && (pooling_type == "max")) { + if ((ctx.Attr("is_test") == false) && + (ctx.Attr("pooling_type") == "max")) { // Training auto workspace_memory = handler.AcquireWorkspaceMemory(); pool_p->execute(astream, {{MKLDNN_ARG_SRC, *src_memory}, @@ -117,9 +76,9 @@ template class PoolMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); - + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet( + "Operator DNNL PoolGrad must use CPUPlace")); const Tensor* in_x = ctx.Input("X"); const Tensor* out_grad = ctx.Input(framework::GradVarName("Out")); Tensor* in_x_grad = ctx.Output(framework::GradVarName("X")); diff --git a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc index 51bc534bff27c48d7f24c82057008a2367dd073a..4d825e4ee279bc2c505cfabff1917d1a5319d1dd 100644 --- a/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/softmax_mkldnn_op.cc @@ -129,9 +129,9 @@ template class SoftmaxMKLDNNGradKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); - + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet( + "Operator DNNL SoftmaxGrad must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); const Tensor* output = ctx.Input("Out"); auto* dout = ctx.template Input(framework::GradVarName("Out")); diff --git a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc index 0bee1a6c8b5d64acba1e6464066000b961af51a1..1e0e13abb7c641d441b9c6188f7b9103c4ec7292 100644 --- a/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/sum_mkldnn_op.cc @@ -49,8 +49,9 @@ template class SumMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); + 
PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet( + "Operator DNNL Sum must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); auto in_vars = ctx.MultiInputVar("X"); diff --git a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc index 9e9b2fb15827c4323abbe615148acd4ecb5da784..398bdb01b5c240f704982ec5a75e21677f1ef611 100644 --- a/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/transpose_mkldnn_op.cc @@ -28,8 +28,9 @@ template class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet( + "Operator DNNL Transpose must use CPUPlace")); auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); @@ -73,8 +74,9 @@ template class TransposeMKLDNNGradOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); + PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()), true, + paddle::platform::errors::PreconditionNotMet( + "Operator DNNL TransposeGrad must use CPUPlace")); auto* out_grad = ctx.Input(framework::GradVarName("Out")); auto* x_grad = ctx.Output(framework::GradVarName("X")); diff --git a/paddle/fluid/operators/nccl/nccl_gpu_common.cc b/paddle/fluid/operators/nccl/nccl_gpu_common.cc index 08b61765c2f0fb90056c97618c0ce345155a274c..70d80e26e5c6c51f4f0fbac304f48c5f4b3f62b2 100644 --- a/paddle/fluid/operators/nccl/nccl_gpu_common.cc +++ b/paddle/fluid/operators/nccl/nccl_gpu_common.cc @@ -51,7 +51,7 @@ void Communicator::InitAll(const std::vector& gpus) { for (size_t i = 0; i < gpus.size(); ++i) { (*comm_id_map)[gpus[i]] = i; } - PADDLE_ENFORCE( + PADDLE_ENFORCE_CUDA_SUCCESS( dynload::ncclCommInitAll(global_comms->data(), gpus.size(), gpus.data())); inited = true; } diff --git a/paddle/fluid/operators/nce_op.cc b/paddle/fluid/operators/nce_op.cc index 10ed364e37e88c72b8e6e195c51ade8ac72cbfd7..ce382389aa0b340bea256b425ec4371499eb5207 100644 --- a/paddle/fluid/operators/nce_op.cc +++ b/paddle/fluid/operators/nce_op.cc @@ -307,7 +307,7 @@ class NCEOpGradVarTypeInference : public framework::VarTypeInference { } }; -DECLARE_NO_NEED_BUFFER_VARS_INFERER(NCEGradOpNoNeedBufferVarInference, "Bias"); +DECLARE_NO_NEED_BUFFER_VARS_INFERER(NCEGradOpNoNeedBufferVarInferer, "Bias"); } // namespace operators } // namespace paddle @@ -317,7 +317,7 @@ REGISTER_OPERATOR(nce, ops::NCEOp, ops::NCEOpMaker, ops::NCEGradOpMaker, ops::NCEGradOpMaker); REGISTER_OPERATOR(nce_grad, ops::NCEOpGrad, ops::NCEOpGradVarTypeInference, - ops::NCEGradOpNoNeedBufferVarInference); + ops::NCEGradOpNoNeedBufferVarInferer); REGISTER_OP_CPU_KERNEL(nce, ops::NCEKernel, ops::NCEKernel); REGISTER_OP_CPU_KERNEL(nce_grad, diff --git a/paddle/fluid/operators/pad2d_op.cc b/paddle/fluid/operators/pad2d_op.cc index df3bb9c01012185afa4b063dedd2ffebcaf0015a..e50af02dcc4e0b53c95b27be0245ec76a7aed78e 100644 --- a/paddle/fluid/operators/pad2d_op.cc +++ b/paddle/fluid/operators/pad2d_op.cc @@ -656,7 +656,7 @@ class Pad2dOpGradMaker : 
public framework::SingleGradOpMaker { }; // TODO(zjl): Paddings can also be skipped! -DECLARE_NO_NEED_BUFFER_VARS_INFERER(Pad2dOpGradNoNeedBufferVarsInference, "X"); +DECLARE_NO_NEED_BUFFER_VARS_INFERER(Pad2dOpGradNoNeedBufferVarsInferer, "X"); } // namespace operators } // namespace paddle @@ -667,7 +667,7 @@ REGISTER_OPERATOR(pad2d, ops::Pad2dOp, ops::Pad2dOpMaker, ops::Pad2dOpGradMaker, ops::Pad2dOpGradMaker); REGISTER_OPERATOR(pad2d_grad, ops::Pad2dOpGrad, - ops::Pad2dOpGradNoNeedBufferVarsInference); + ops::Pad2dOpGradNoNeedBufferVarsInferer); REGISTER_OP_CPU_KERNEL(pad2d, ops::Pad2dCPUKernel, ops::Pad2dCPUKernel, ops::Pad2dCPUKernel, ops::Pad2dCPUKernel); diff --git a/paddle/fluid/operators/pool_with_index_op.cc b/paddle/fluid/operators/pool_with_index_op.cc index d081ecb3a2bfb9107f1791688666d5e0e713f8ca..0371ea5b09bb30064af9dcc9f5a8a2d3a8a64fbf 100644 --- a/paddle/fluid/operators/pool_with_index_op.cc +++ b/paddle/fluid/operators/pool_with_index_op.cc @@ -316,7 +316,7 @@ class MaxPoolWithIndexGradOpMaker : public framework::SingleGradOpMaker { }; DECLARE_NO_NEED_BUFFER_VARS_INFERER( - MaxPoolWithIndexOpGradNoNeedBufferVarsInference, "X"); + MaxPoolWithIndexOpGradNoNeedBufferVarsInferer, "X"); } // namespace operators } // namespace paddle @@ -328,7 +328,7 @@ REGISTER_OPERATOR(max_pool2d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPoolWithIndexGradOpMaker, ops::MaxPoolWithIndexGradOpMaker); REGISTER_OPERATOR(max_pool2d_with_index_grad, ops::MaxPoolWithIndexOpGrad, - ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInference); + ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer); REGISTER_OP_CPU_KERNEL( max_pool2d_with_index, @@ -347,7 +347,7 @@ REGISTER_OPERATOR(max_pool3d_with_index, ops::MaxPoolWithIndexOp, ops::MaxPoolWithIndexGradOpMaker, ops::MaxPoolWithIndexGradOpMaker); REGISTER_OPERATOR(max_pool3d_with_index_grad, ops::MaxPoolWithIndexOpGrad, - ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInference); + ops::MaxPoolWithIndexOpGradNoNeedBufferVarsInferer); REGISTER_OP_CPU_KERNEL( max_pool3d_with_index, diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.cc b/paddle/fluid/operators/pull_box_extended_sparse_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..a3f9defc915f2623008642c3e59298dd459a772b --- /dev/null +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.cc @@ -0,0 +1,157 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
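// Aside on the concat_mkldnn change earlier in this diff: a negative axis
// is first validated against [-rank, rank) and then shifted into [0, rank).
// The canonicalization in isolation (an assumed helper, not the real code):
static int CanonicalizeConcatAxis(int axis, int rank) {
  // Precondition checked by the caller: -rank <= axis && axis < rank.
  return axis < 0 ? axis + rank : axis;
}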
+ +#include "paddle/fluid/operators/pull_box_extended_sparse_op.h" + +namespace paddle { +namespace operators { + +class PullBoxExtendedSparseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_GE( + ctx->Inputs("Ids").size(), 1UL, + platform::errors::InvalidArgument( + "Inputs(Ids) of PullBoxExtendedSparseOp should not be empty.")); + PADDLE_ENFORCE_GE( + ctx->Outputs("Out").size(), 1UL, + platform::errors::InvalidArgument( + "Outputs(Out) of PullBoxExtendedSparseOp should not be empty.")); + PADDLE_ENFORCE_GE(ctx->Outputs("OutExtend").size(), 1UL, + platform::errors::InvalidArgument( + "Outputs(OutExtend) of PullBoxExtendedSparseOp " + "should not be empty.")); + auto emb_size = static_cast(ctx->Attrs().Get("emb_size")); + auto emb_extended_size = + static_cast(ctx->Attrs().Get("emb_extended_size")); + auto all_ids_dim = ctx->GetInputsDim("Ids"); + const size_t n_ids = all_ids_dim.size(); + std::vector outs_dims; + std::vector outs_extended_dims; + outs_dims.resize(n_ids); + outs_extended_dims.resize(n_ids); + for (size_t i = 0; i < n_ids; ++i) { + const auto ids_dims = all_ids_dim[i]; + int ids_rank = ids_dims.size(); + PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, + platform::errors::InvalidArgument( + "Shape error in %lu id, the last dimension of the " + "'Ids' tensor must be 1.", + i)); + auto out_dim = framework::vectorize( + framework::slice_ddim(ids_dims, 0, ids_rank - 1)); + out_dim.push_back(emb_size); + outs_dims[i] = framework::make_ddim(out_dim); + + auto out_extended_dim = framework::vectorize( + framework::slice_ddim(ids_dims, 0, ids_rank - 1)); + out_extended_dim.push_back(emb_extended_size); + outs_extended_dims[i] = framework::make_ddim(out_extended_dim); + } + ctx->SetOutputsDim("Out", outs_dims); + ctx->SetOutputsDim("OutExtend", outs_extended_dims); + for (size_t i = 0; i < n_ids; ++i) { + ctx->ShareLoD("Ids", "Out", i, i); + ctx->ShareLoD("Ids", "OutExtend", i, i); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.device_context()); + } +}; + +class PullBoxExtendedSparseOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Ids", + "Input tensors with type int32 or int64 " + "contains the ids to be looked up in BoxPS. " + "The last dimension size must be 1.") + .AsDuplicable(); + AddOutput("Out", "The lookup results tensors.").AsDuplicable(); + AddOutput("OutExtend", "The lookup extended results tensors.") + .AsDuplicable(); + AddAttr("emb_size", "(int, the embedding hidden size").SetDefault(1); + AddAttr("emb_extended_size", + "(int, the extended_embedding hidden size") + .SetDefault(128); + AddComment(R"DOC( +Pull Box Extended Sparse Operator. + +This operator is used to perform lookups on the BoxPS, +then concatenated into a dense tensor. + +The input Ids can carry the LoD (Level of Details) information, +or not. And the output only shares the LoD information with input Ids. 
+ +)DOC"); + } +}; + +template +class PushBoxExtendedSparseOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("push_box_extended_sparse"); + op->SetInput("Ids", this->Input("Ids")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetInput(framework::GradVarName("OutExtend"), + this->OutputGrad("OutExtend")); + op->SetOutput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetAttrMap(this->Attrs()); + } +}; + +class PushBoxExtendedSparseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + pull_box_extended_sparse, ops::PullBoxExtendedSparseOp, + ops::PullBoxExtendedSparseOpMaker, + ops::PushBoxExtendedSparseOpMaker, + ops::PushBoxExtendedSparseOpMaker); + +REGISTER_OPERATOR(push_box_extended_sparse, ops::PushBoxExtendedSparseOp); + +REGISTER_OP_CPU_KERNEL(pull_box_extended_sparse, + ops::PullBoxExtendedSparseCPUKernel, + ops::PullBoxExtendedSparseCPUKernel); + +REGISTER_OP_CPU_KERNEL(push_box_extended_sparse, + ops::PushBoxExtendedSparseCPUKernel, + ops::PushBoxExtendedSparseCPUKernel); diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.cu b/paddle/fluid/operators/pull_box_extended_sparse_op.cu new file mode 100644 index 0000000000000000000000000000000000000000..5bde6bc2e5cbbd332847cb868806ca44616c40e1 --- /dev/null +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.cu @@ -0,0 +1,46 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/pull_box_extended_sparse_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +template +class PullBoxExtendedSparseCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PullBoxExtendedSparseFunctor(ctx); + } +}; + +template +class PushBoxExtendedSparseCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PushBoxExtendedSparseFunctor(ctx); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(pull_box_extended_sparse, + ops::PullBoxExtendedSparseCUDAKernel, + ops::PullBoxExtendedSparseCUDAKernel); +REGISTER_OP_CUDA_KERNEL(push_box_extended_sparse, + ops::PushBoxExtendedSparseCUDAKernel, + ops::PushBoxExtendedSparseCUDAKernel); diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.h b/paddle/fluid/operators/pull_box_extended_sparse_op.h new file mode 100644 index 0000000000000000000000000000000000000000..559c7eed84e6f85e1e6789ca49ce460c5a48a3bc --- /dev/null +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.h @@ -0,0 +1,119 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
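// The CUDA kernels registered above are deliberately thin: the CPU and GPU
// operator classes forward to the same functors from the header that
// follows, with BoxWrapper hiding the device specifics. The dispatch
// skeleton, reduced to illustrative names:
template <typename T>
static void PullFunctor(const paddle::framework::ExecutionContext& ctx);
template <typename T>
class PullShimKernel : public paddle::framework::OpKernel<T> {
 public:
  void Compute(const paddle::framework::ExecutionContext& ctx) const override {
    PullFunctor<T>(ctx);  // identical body for every device
  }
};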
+ +#pragma once +#include +#include +#include "paddle/fluid/framework/fleet/box_wrapper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { + +template +static void PullBoxExtendedSparseFunctor( + const framework::ExecutionContext &ctx) { + auto inputs = ctx.MultiInput("Ids"); + auto outputs = ctx.MultiOutput("Out"); + auto outputs_extend = ctx.MultiOutput("OutExtend"); + const auto slot_size = inputs.size(); + std::vector all_keys(slot_size); + // BoxPS only supports float now + std::vector all_values(slot_size * 2); + std::vector slot_lengths(slot_size); + for (size_t i = 0; i < slot_size; i++) { + const auto *slot = inputs[i]; + const uint64_t *single_slot_keys = + reinterpret_cast(slot->data()); + all_keys[i] = single_slot_keys; + slot_lengths[i] = slot->numel(); + auto *output = outputs[i]->mutable_data(ctx.GetPlace()); + auto *output_extend = outputs_extend[i]->mutable_data(ctx.GetPlace()); + all_values[i] = reinterpret_cast(output); + all_values[i + slot_size] = reinterpret_cast(output_extend); + } +#ifdef PADDLE_WITH_BOX_PS + auto emb_size = ctx.Attr("emb_size"); + auto emb_extended_size = ctx.Attr("emb_extended_size"); + auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); + box_ptr->PullSparse(ctx.GetPlace(), all_keys, all_values, slot_lengths, + emb_size, emb_extended_size); +#endif +} + +template +static void PushBoxExtendedSparseFunctor( + const framework::ExecutionContext &ctx) { + auto inputs = ctx.MultiInput("Ids"); + auto d_output = + ctx.MultiInput(framework::GradVarName("Out")); + auto d_output_extend = + ctx.MultiInput(framework::GradVarName("OutExtend")); + const auto slot_size = inputs.size(); + std::vector all_keys(slot_size); + std::vector all_grad_values(slot_size * 2); + std::vector slot_lengths(slot_size); + int batch_size = -1; + for (size_t i = 0; i < slot_size; i++) { + const auto *slot = inputs[i]; + const uint64_t *single_slot_keys = + reinterpret_cast(slot->data()); + all_keys[i] = single_slot_keys; + slot_lengths[i] = slot->numel(); + int cur_batch_size = + slot->lod().size() ? 
slot->lod()[0].size() - 1 : slot->dims()[0]; if (batch_size == -1) { + batch_size = cur_batch_size; + } else { + PADDLE_ENFORCE_EQ(batch_size, cur_batch_size, + platform::errors::PreconditionNotMet( + "The batch size of all input slots should be the same, " + "please check.")); + } + const float *grad_value = d_output[i]->data(); + const float *grad_value_extend = d_output_extend[i]->data(); + all_grad_values[i] = reinterpret_cast(grad_value); + all_grad_values[i + slot_size] = + reinterpret_cast(grad_value_extend); + } +#ifdef PADDLE_WITH_BOX_PS + auto emb_size = ctx.Attr("emb_size"); + auto emb_extended_size = ctx.Attr("emb_extended_size"); + auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); + box_ptr->PushSparseGrad(ctx.GetPlace(), all_keys, all_grad_values, + slot_lengths, emb_size, emb_extended_size, + batch_size); +#endif +} + +using LoDTensor = framework::LoDTensor; +template +class PullBoxExtendedSparseCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PullBoxExtendedSparseFunctor(ctx); + } +}; + +template +class PushBoxExtendedSparseCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PushBoxExtendedSparseFunctor(ctx); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/pull_box_sparse_op.h b/paddle/fluid/operators/pull_box_sparse_op.h index 1f8c7932c45f11eca5213a2a4a7319c591b985cc..3b48341368c99e5a4413410131e979f0c43e3b80 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.h +++ b/paddle/fluid/operators/pull_box_sparse_op.h @@ -44,7 +44,7 @@ static void PullBoxSparseFunctor(const framework::ExecutionContext &ctx) { auto hidden_size = ctx.Attr("size"); auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); box_ptr->PullSparse(ctx.GetPlace(), all_keys, all_values, slot_lengths, - hidden_size); + hidden_size, 0); #endif } @@ -81,7 +81,7 @@ static void PushBoxSparseFunctor(const framework::ExecutionContext &ctx) { auto hidden_size = ctx.Attr("size"); auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); box_ptr->PushSparseGrad(ctx.GetPlace(), all_keys, all_grad_values, - slot_lengths, hidden_size, batch_size); + slot_lengths, hidden_size, 0, batch_size); #endif } diff --git a/paddle/fluid/operators/push_dense_op.cc b/paddle/fluid/operators/push_dense_op.cc index 20af4dba0daa9efe51a433839efda693650d0b92..5b9f05bd126b8aa7f895af1f659f705617491643 100644 --- a/paddle/fluid/operators/push_dense_op.cc +++ b/paddle/fluid/operators/push_dense_op.cc @@ -56,7 +56,7 @@ The input gradients are all dense gradient tensors in a table.
} }; -DECLARE_NO_NEED_BUFFER_VARS_INFERER(PushDenseNoNeedBufferVarsInference, "Ids"); +DECLARE_NO_NEED_BUFFER_VARS_INFERER(PushDenseNoNeedBufferVarsInferer, "Ids"); } // namespace operators } // namespace paddle @@ -66,5 +66,5 @@ REGISTER_OPERATOR( push_dense, ops::PushDenseOp, ops::PushDenseOpMaker, paddle::framework::EmptyGradOpMaker, paddle::framework::EmptyGradOpMaker, - ops::PushDenseNoNeedBufferVarsInference); + ops::PushDenseNoNeedBufferVarsInferer); REGISTER_OP_CPU_KERNEL(push_dense, ops::PushDenseCPUKernel) diff --git a/paddle/fluid/operators/reader/blocking_queue.h b/paddle/fluid/operators/reader/blocking_queue.h index b8e2fca9ee082a1ba44edea26701217c31c4a6cb..4add9afdfd45b171edd8280b50e1ec13ed64637b 100644 --- a/paddle/fluid/operators/reader/blocking_queue.h +++ b/paddle/fluid/operators/reader/blocking_queue.h @@ -34,9 +34,11 @@ class BlockingQueue { public: explicit BlockingQueue(size_t capacity, bool speed_test_mode = false) : capacity_(capacity), speed_test_mode_(speed_test_mode) { - PADDLE_ENFORCE_GT( - capacity_, static_cast(0), - "The capacity of a reader::BlockingQueue must be greater than 0."); + PADDLE_ENFORCE_GT(capacity_, static_cast(0), + platform::errors::InvalidArgument( + "The capacity of a reader::BlockingQueue must be " + "greater than 0, but received capacity is %d.", + capacity_)); } bool Send(const T& elem) { @@ -49,7 +51,10 @@ class BlockingQueue { << "WARNING: Sending an element to a closed reader::BlockingQueue."; return false; } - PADDLE_ENFORCE_LT(queue_.size(), capacity_); + PADDLE_ENFORCE_LT( + queue_.size(), capacity_, + platform::errors::PermissionDenied( + "The queue size cannot exceed the set queue capacity.")); queue_.push_back(elem); receive_cv_.notify_one(); return true; @@ -65,7 +70,10 @@ class BlockingQueue { << "WARNING: Sending an element to a closed reader::BlockingQueue."; return false; } - PADDLE_ENFORCE_LT(queue_.size(), capacity_); + PADDLE_ENFORCE_LT( + queue_.size(), capacity_, + platform::errors::PermissionDenied( + "The queue size cannot exceed the set queue capacity.")); queue_.emplace_back(std::move(elem)); receive_cv_.notify_one(); return true; @@ -77,7 +85,9 @@ class BlockingQueue { [&] { return !queue_.empty() || closed_ || killed_; }); EnforceNotKilled(); if (!queue_.empty()) { - PADDLE_ENFORCE_NOT_NULL(elem); + PADDLE_ENFORCE_NOT_NULL( + elem, platform::errors::InvalidArgument( + "The holder to receive queue data is a null pointer.")); *elem = queue_.front(); if (LIKELY(!speed_test_mode_)) { queue_.pop_front(); @@ -85,7 +95,10 @@ class BlockingQueue { send_cv_.notify_one(); return true; } else { - PADDLE_ENFORCE(closed_); + PADDLE_ENFORCE_EQ(closed_, true, + platform::errors::PermissionDenied( + "Blocking queue status error: if the queue is empty " + "when popping data, it should be closed.")); VLOG(3) << "queue is closed!
return nothing."; return false; } @@ -136,9 +149,9 @@ class BlockingQueue { private: inline void EnforceNotKilled() { - PADDLE_ENFORCE_NE( - killed_, true, - "Blocking queue is killed because the data reader raises an exception"); + PADDLE_ENFORCE_NE(killed_, true, platform::errors::Fatal( + "Blocking queue is killed because the " + "data reader raises an exception.")); } private: diff --git a/paddle/fluid/operators/reader/buffered_reader.cc b/paddle/fluid/operators/reader/buffered_reader.cc index 2fb2fcc40fc2e1085da2c8e7406aa3e6bb85b5d1..4d79a7fcb267d736cf50659b9725661a3ee96fd8 100644 --- a/paddle/fluid/operators/reader/buffered_reader.cc +++ b/paddle/fluid/operators/reader/buffered_reader.cc @@ -62,7 +62,6 @@ BufferedReader::BufferedReader( } void BufferedReader::ReadTillBufferFullAsync() { - PADDLE_ENFORCE_EQ(position_.size(), 0U); for (size_t i = 0; i < buffer_size_; ++i) { ReadAsync(i); } @@ -87,8 +86,10 @@ void BufferedReader::ReadAsync(size_t i) { if (gpu.empty()) { gpu.resize(cpu.size()); } else { - PADDLE_ENFORCE_EQ(gpu.size(), cpu.size(), - "Input tensor number not matched"); + PADDLE_ENFORCE_EQ( + gpu.size(), cpu.size(), + platform::errors::InvalidArgument( + "Input tensor number on GPU and CPU devices are not matched.")); } std::vector gpu_ptrs; diff --git a/paddle/fluid/operators/reader/create_ctr_reader_op.cc b/paddle/fluid/operators/reader/create_ctr_reader_op.cc index 2a3e80c9152b5550631f8c5669283b782f975d4e..86fbddc0ec2cf10055d11b303a0d3e519b641587 100644 --- a/paddle/fluid/operators/reader/create_ctr_reader_op.cc +++ b/paddle/fluid/operators/reader/create_ctr_reader_op.cc @@ -36,8 +36,9 @@ class CreateCTRReaderOp : public framework::OperatorBase { auto* queue_holder_var = scope.FindVar(queue_name); PADDLE_ENFORCE_NOT_NULL( queue_holder_var, - "No LoDTensorBlockingQueueHolder variable with name %s found", - queue_name); + platform::errors::PreconditionNotMet( + "No LoDTensorBlockingQueueHolder variable with name %s found", + queue_name)); auto* queue_holder = queue_holder_var->template GetMutable(); diff --git a/paddle/fluid/operators/reader/create_custom_reader_op.cc b/paddle/fluid/operators/reader/create_custom_reader_op.cc index 1ba7228140b16562552b4a70336d07fbe2b0be3d..d5142ed6301b2b3370fcddb936caadd22f7bea38 100644 --- a/paddle/fluid/operators/reader/create_custom_reader_op.cc +++ b/paddle/fluid/operators/reader/create_custom_reader_op.cc @@ -96,11 +96,14 @@ class CreateCustomReaderOpMaker : public DecoratedReaderMakerBase { class CustomReaderInferShape : public framework::InferShapeBase { public: void operator()(framework::InferShapeContext* ctx) const override { - PADDLE_ENFORCE(!ctx->IsRuntime(), - "'CustomReaderInferShape' should only be invoked during " - "compile time."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "The output decorated reader should not be null."); + PADDLE_ENFORCE_NE( + ctx->IsRuntime(), true, + platform::errors::PreconditionNotMet( + "'CustomReaderInferShape' should only be invoked during " + "compile time.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + platform::errors::NotFound( + "The output decorated reader should not be null.")); const auto* sub_block = ctx->Attrs().Get("sub_block"); const auto sink_var_names = @@ -109,7 +112,9 @@ class CustomReaderInferShape : public framework::InferShapeBase { std::vector res_lod_levels; for (const std::string& var_name : sink_var_names) { auto* sink_var = sub_block->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(sink_var); + PADDLE_ENFORCE_NOT_NULL( + sink_var, platform::errors::NotFound( + "The 
sink variable is not found in CustomReader.")); res_dims.emplace_back(sink_var->GetShape()); res_lod_levels.push_back(sink_var->GetLoDLevel()); } @@ -124,7 +129,9 @@ class CustomReaderInferVarType : public framework::VarTypeInference { public: void operator()(framework::InferVarTypeContext* ctx) const override { auto& out_var_name = ctx->Output("Out")[0]; - PADDLE_ENFORCE(ctx->HasVar(out_var_name)); + PADDLE_ENFORCE_EQ(ctx->HasVar(out_var_name), true, + platform::errors::NotFound( + "The output reader variable should not be null.")); ctx->SetType(out_var_name, framework::proto::VarType::READER); auto sink_var_names = BOOST_GET_CONST(std::vector, @@ -134,7 +141,9 @@ class CustomReaderInferVarType : public framework::VarTypeInference { std::vector res_data_types; for (const std::string& var_name : sink_var_names) { framework::VarDesc* var = sub_block->FindVar(var_name); - PADDLE_ENFORCE_NOT_NULL(var); + PADDLE_ENFORCE_NOT_NULL( + var, platform::errors::NotFound( + "The sink variable is not found in CustomReader.")); res_data_types.emplace_back(var->GetDataType()); } ctx->SetDataTypes(out_var_name, res_data_types); @@ -149,11 +158,13 @@ void CustomReader::ReadNextImpl(std::vector* out) { // There is no next data. return; } - PADDLE_ENFORCE(source_var_names_.size() == underlying_outs.size(), - "The size of source_var_names(%d) and the size of " - "underlying_outs(%d) are not consistent. Each feeding element " - "must have its own source variable.", - source_var_names_.size(), underlying_outs.size()); + PADDLE_ENFORCE_EQ( + source_var_names_.size(), underlying_outs.size(), + platform::errors::InvalidArgument( + "The size of source_var_names(%d) and the size of " + "underlying_outs(%d) are not consistent. Each feeding element " + "must have its own source variable.", + source_var_names_.size(), underlying_outs.size())); // The scope for CustomReader's sub-block should be independent and shouldn't // be any other computation scope's child. Otherwise, data preprocessing and // computation cannot be concurrent.
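// The reader::BlockingQueue hardening earlier in this diff pins down three
// invariants: capacity > 0 at construction, size < capacity before every
// push, and "empty on receive implies the queue was closed". A condensed
// sketch of the first two (illustrative class, not the real one):
#include <cstddef>
#include <deque>
#include <stdexcept>
template <typename T>
class BoundedQueue {
 public:
  explicit BoundedQueue(std::size_t capacity) : capacity_(capacity) {
    if (capacity_ == 0) throw std::invalid_argument("capacity must be > 0");
  }
  void Push(T v) {
    // Callers must never grow the queue beyond its fixed capacity.
    if (queue_.size() >= capacity_) throw std::logic_error("over capacity");
    queue_.push_back(std::move(v));
  }
 private:
  std::deque<T> queue_;
  std::size_t capacity_;
};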
diff --git a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h index 2a0983d3bd0f33f7e8ee49731c8454105a7d8b19..6bbb643b40fe5e4a6b2fd50ade1fa6ca84a2e80d 100644 --- a/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h +++ b/paddle/fluid/operators/reader/lod_tensor_blocking_queue.h @@ -201,9 +201,10 @@ class OrderedMultiDeviceLoDTensorBlockingQueue { class LoDTensorBlockingQueueHolder { public: void InitOnce(size_t capacity, bool speed_test_mode = false) { - PADDLE_ENFORCE( - queue_ == nullptr, - "LoDTensorBlockingQueueHolder::InitOnce() can only be called once"); + PADDLE_ENFORCE_EQ( + queue_, nullptr, + platform::errors::AlreadyExists("LoDTensorBlockingQueueHolder::" + "InitOnce() can only be called once")); queue_.reset(new LoDTensorBlockingQueue(capacity, speed_test_mode)); } diff --git a/paddle/fluid/operators/reader/py_reader.cc b/paddle/fluid/operators/reader/py_reader.cc index 9aa18fb2f4c615dd494c3d33ef321cf43807739c..2100aeb7cf4d5ca5ae0ae557c5d131ca831c39f9 100644 --- a/paddle/fluid/operators/reader/py_reader.cc +++ b/paddle/fluid/operators/reader/py_reader.cc @@ -25,7 +25,9 @@ PyReader::PyReader( const std::vector& var_types, const std::vector& need_check_feed) : framework::FileReader(dims, var_types, need_check_feed) { - PADDLE_ENFORCE(queue != nullptr, "LoDTensorBlockingQueue must not be null"); + PADDLE_ENFORCE_NOT_NULL(queue, + platform::errors::PreconditionNotMet( + "LoDTensorBlockingQueue must not be null.")); queue_ = queue; } diff --git a/paddle/fluid/operators/reader/read_op.cc b/paddle/fluid/operators/reader/read_op.cc index ec2b2d5f4179b7f631ae12c670111dc0be86d0d3..d7f81dc24cced8c045223d3f62ea8055d1821aa5 100644 --- a/paddle/fluid/operators/reader/read_op.cc +++ b/paddle/fluid/operators/reader/read_op.cc @@ -78,7 +78,10 @@ class ReadInferVarType : public framework::StaticGraphVarTypeInference { std::string reader_name = Input(ctx, "Reader")[0]; auto& out_names = Output(ctx, "Out"); auto dtypes = GetDataTypes(ctx, reader_name); - PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size()); + PADDLE_ENFORCE_EQ(dtypes.size(), out_names.size(), + platform::errors::InvalidArgument( + "The number of input reader's dtypes do not match " + "the output variable number.")); for (size_t i = 0; i < dtypes.size(); ++i) { SetType(ctx, out_names[i], framework::proto::VarType::LOD_TENSOR); SetDataType(ctx, out_names[i], dtypes[i]); diff --git a/paddle/fluid/operators/reader/reader_op_registry.cc b/paddle/fluid/operators/reader/reader_op_registry.cc index eb6fa3c5e7e5838840f9a8db6429336850827ea6..952ed4662880053833384799916fe5435d867ed0 100644 --- a/paddle/fluid/operators/reader/reader_op_registry.cc +++ b/paddle/fluid/operators/reader/reader_op_registry.cc @@ -62,12 +62,14 @@ void FileReaderMakerBase::Make() { } void FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE( - !ctx->IsRuntime(), - "'FileReaderInferShape' should only be invoked during compile time."); - - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "The output file reader should not be null."); + PADDLE_ENFORCE_NE( + ctx->IsRuntime(), true, + platform::errors::PreconditionNotMet("'FileReaderInferShape' should only " + "be invoked during compile time.")); + + PADDLE_ENFORCE_EQ( + ctx->HasOutput("Out"), true, + platform::errors::NotFound("The output file reader should not be null.")); bool use_data_config = ctx->Attrs().Get("use_data_config"); if (use_data_config) { const auto shape_concat = @@ -77,21 +79,26 @@ void 
FileReaderInferShape::operator()(framework::InferShapeContext* ctx) const { ctx->SetReaderDims("Out", shapes); const auto lod_levels = ctx->Attrs().Get>("lod_levels"); - PADDLE_ENFORCE_EQ(lod_levels.size(), shapes.size(), - "The number of 'lod_levels'(%d) doesn't match the number " - "of 'shapes'(%d).", - lod_levels.size(), shapes.size()); + PADDLE_ENFORCE_EQ( + lod_levels.size(), shapes.size(), + platform::errors::InvalidArgument( + "The number of 'lod_levels'(%d) doesn't match the number " + "of 'shapes'(%d).", + lod_levels.size(), shapes.size())); const auto dtypes = ctx->Attrs().Get>("dtypes"); PADDLE_ENFORCE_EQ( dtypes.size(), shapes.size(), - "The number of 'dtypes'(%d) doesn't match the number of 'shapes'(%d).", - dtypes.size(), shapes.size()); + platform::errors::InvalidArgument("The number of 'dtypes'(%d) doesn't " + "match the number of 'shapes'(%d).", + dtypes.size(), shapes.size())); const auto need_check_feed = ctx->Attrs().Get>("need_check_feed"); - PADDLE_ENFORCE_EQ(need_check_feed.size(), shapes.size(), - "The number of 'need_check_feed'(%d) doesn't match the " - "number of 'shapes'(%d).", - need_check_feed.size(), shapes.size()); + PADDLE_ENFORCE_EQ( + need_check_feed.size(), shapes.size(), + platform::errors::InvalidArgument( + "The number of 'need_check_feed'(%d) doesn't match the " + "number of 'shapes'(%d).", + need_check_feed.size(), shapes.size())); framework::VarDesc* reader = BOOST_GET(framework::VarDesc*, ctx->GetOutputVarPtrs("Out")[0]); reader->SetLoDLevels(lod_levels); @@ -105,14 +112,18 @@ void FileReaderInferVarType::operator()( void DecoratedReaderInferShape::operator()( framework::InferShapeContext* ctx) const { - PADDLE_ENFORCE(!ctx->IsRuntime(), - "'DecoratedReaderInferShape' should only be invoked during " - "compile time."); - - PADDLE_ENFORCE(ctx->HasInput("UnderlyingReader"), - "Input(UnderlyingReader) should not be null."); - PADDLE_ENFORCE(ctx->HasOutput("Out"), - "The output decorated reader should not be null."); + PADDLE_ENFORCE_NE( + ctx->IsRuntime(), true, + platform::errors::PreconditionNotMet( + "'DecoratedReaderInferShape' should only be invoked during " + "compile time.")); + + PADDLE_ENFORCE_EQ(ctx->HasInput("UnderlyingReader"), true, + platform::errors::NotFound( + "Input(UnderlyingReader) should not be null.")); + PADDLE_ENFORCE_EQ(ctx->HasOutput("Out"), true, + platform::errors::NotFound( + "The output decorated reader should not be null.")); ctx->SetReaderDims("Out", ctx->GetReaderDims("UnderlyingReader")); framework::VarDesc* in_reader = BOOST_GET( diff --git a/paddle/fluid/operators/reshape_op.cc b/paddle/fluid/operators/reshape_op.cc index 8eba5b888ecc27de1f9e7c6f535098880aa25bfd..fee0f045825591d548350c289f3f290d5dd1d723 100644 --- a/paddle/fluid/operators/reshape_op.cc +++ b/paddle/fluid/operators/reshape_op.cc @@ -545,12 +545,12 @@ class Reshape2DoubleGradOp : public framework::OperatorWithKernel { } }; -DECLARE_INPLACE_OP_INFERER(ReshapeOpInplaceInToOut, {"X", "Out"}); -DECLARE_INPLACE_OP_INFERER(ReshapeGradInplaceInToOut, +DECLARE_INPLACE_OP_INFERER(ReshapeOpInplaceInferer, {"X", "Out"}); +DECLARE_INPLACE_OP_INFERER(ReshapeGradInplaceInferer, {framework::GradVarName("Out"), framework::GradVarName("X")}); -DECLARE_INPLACE_OP_INFERER(ReshapeDoubleGradInplaceInToOut, {"DDX", "DDOut"}); -DECLARE_NO_NEED_BUFFER_VARS_INFERER(ReshapeDoubleGradOpNoNeedBufferVarInference, +DECLARE_INPLACE_OP_INFERER(ReshapeDoubleGradInplaceInferer, {"DDX", "DDOut"}); +DECLARE_NO_NEED_BUFFER_VARS_INFERER(ReshapeDoubleGradOpNoNeedBufferVarInferer, "DOut"); 
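// FileReaderInferShape above now reports exactly which attribute list is
// out of step: lod_levels, dtypes and need_check_feed must each match
// shapes element-for-element. The shared invariant, condensed into an
// illustrative helper:
#include <cstddef>
static bool ReaderMetadataConsistent(std::size_t n_shapes,
                                     std::size_t n_lod_levels,
                                     std::size_t n_dtypes,
                                     std::size_t n_checks) {
  // One entry of every attribute per declared output shape.
  return n_lod_levels == n_shapes && n_dtypes == n_shapes &&
         n_checks == n_shapes;
}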
} // namespace operators @@ -562,9 +562,9 @@ REGISTER_OPERATOR( reshape, ops::ReshapeOp, ops::ReshapeOpMaker, paddle::framework::DefaultGradOpMaker, paddle::framework::DefaultGradOpMaker, - ops::ReshapeOpInplaceInToOut); + ops::ReshapeOpInplaceInferer); REGISTER_OPERATOR(reshape_grad, ops::ReshapeGradOp, - ops::ReshapeGradInplaceInToOut); + ops::ReshapeGradInplaceInferer); REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int, ops::ReshapeKernel, @@ -576,14 +576,14 @@ REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape_grad, float, ops::ReshapeGradKernel, REGISTER_OPERATOR(reshape2, ops::Reshape2Op, ops::Reshape2OpMaker, ops::Reshape2GradMaker, ops::Reshape2GradMaker, - ops::ReshapeOpInplaceInToOut); + ops::ReshapeOpInplaceInferer); REGISTER_OPERATOR(reshape2_grad, ops::Reshape2GradOp, ops::Reshape2DoubleGradMaker, ops::Reshape2DoubleGradMaker, - ops::ReshapeGradInplaceInToOut); + ops::ReshapeGradInplaceInferer); REGISTER_OPERATOR(reshape2_grad_grad, ops::Reshape2DoubleGradOp, - ops::ReshapeDoubleGradInplaceInToOut, - ops::ReshapeDoubleGradOpNoNeedBufferVarInference); + ops::ReshapeDoubleGradInplaceInferer, + ops::ReshapeDoubleGradOpNoNeedBufferVarInferer); REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double, ops::ReshapeKernel, int8_t, ops::ReshapeKernel, diff --git a/paddle/fluid/operators/scale_op.cc b/paddle/fluid/operators/scale_op.cc index 647e3cea99d3c1975d0da988d58dcab139ec1209..9d51f3e292fa2de114b971032c29f5e769ba617a 100644 --- a/paddle/fluid/operators/scale_op.cc +++ b/paddle/fluid/operators/scale_op.cc @@ -104,7 +104,7 @@ class ScaleGradMaker : public framework::SingleGradOpMaker { } }; -DECLARE_INPLACE_OP_INFERER(ScaleOpInplace, {"X", "Out"}); +DECLARE_INPLACE_OP_INFERER(ScaleOpInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle @@ -113,7 +113,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(scale, ops::ScaleOp, ops::ScaleOpMaker, ops::ScaleGradMaker, ops::ScaleGradMaker, - ops::ScaleOpVarTypeInference, ops::ScaleOpInplace); + ops::ScaleOpVarTypeInference, ops::ScaleOpInplaceInferer); REGISTER_OP_CPU_KERNEL( scale, ops::ScaleKernel, ops::ScaleKernel, diff --git a/paddle/fluid/operators/shape_op.h b/paddle/fluid/operators/shape_op.h index 0d510a505583c55e26a26bfc6e5d6192899b3d9e..1f90c041c095331db427ddd5f9a656e948947e46 100644 --- a/paddle/fluid/operators/shape_op.h +++ b/paddle/fluid/operators/shape_op.h @@ -20,15 +20,23 @@ namespace paddle { namespace operators { using Tensor = framework::Tensor; +using LoDTensor = framework::LoDTensor; +using SelectedRows = framework::SelectedRows; template class ShapeKernel : public framework::OpKernel { public: void Compute(const framework::ExecutionContext& ctx) const override { - auto* in_t = ctx.Input("Input"); + auto* in_var = ctx.InputVar("Input"); + framework::DDim in_dims; + if (in_var->IsType()) { + in_dims = in_var->Get().value().dims(); + } else { + in_dims = in_var->Get().dims(); + } auto* out_t = ctx.Output("Out"); + out_t->Resize({in_dims.size()}); auto out_data = out_t->mutable_data(platform::CPUPlace()); - auto in_dims = in_t->dims(); for (int i = 0; i < in_dims.size(); ++i) { out_data[i] = in_dims[i]; } diff --git a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc index b5b99d3a929b0e97e286931fc805e29241773933..946ede475ce68447db05f2ecd2bd624e90881376 100644 --- a/paddle/fluid/operators/softmax_with_cross_entropy_op.cc +++ 
b/paddle/fluid/operators/softmax_with_cross_entropy_op.cc @@ -287,10 +287,10 @@ class SoftmaxGradMaker : public framework::SingleGradOpMaker { } }; -DECLARE_INPLACE_OP_INFERER(SoftmaxWithCrossEntropyInplaceInference, +DECLARE_INPLACE_OP_INFERER(SoftmaxWithCrossEntropyInplaceInferer, {"Logits", "Softmax"}); -DECLARE_INPLACE_OP_INFERER(SoftmaxWithCrossEntropyGradInplaceInference, +DECLARE_INPLACE_OP_INFERER(SoftmaxWithCrossEntropyGradInplaceInferer, {"Softmax", framework::GradVarName("Logits")}); } // namespace operators @@ -302,10 +302,10 @@ REGISTER_OPERATOR(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyOp, ops::SoftmaxWithCrossEntropyOpMaker, ops::SoftmaxGradMaker, ops::SoftmaxGradMaker, - ops::SoftmaxWithCrossEntropyInplaceInference); + ops::SoftmaxWithCrossEntropyInplaceInferer); REGISTER_OPERATOR(softmax_with_cross_entropy_grad, ops::SoftmaxWithCrossEntropyOpGrad, - ops::SoftmaxWithCrossEntropyGradInplaceInference); + ops::SoftmaxWithCrossEntropyGradInplaceInferer); REGISTER_OP_CPU_KERNEL(softmax_with_cross_entropy, ops::SoftmaxWithCrossEntropyKernel, ops::SoftmaxWithCrossEntropyKernel); diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index 39d6a992043072ed98bf35a060b99233dd927bc7..b06e8202cc79f017e26e3c8339ad05951a5a2bf7 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -299,7 +299,7 @@ class SumGradOpBaseMaker : public imperative::GradOpBaseMakerBase { } }; -DECLARE_INPLACE_OP_INFERER(SumInplace, {"X", "Out"}); +DECLARE_INPLACE_OP_INFERER(SumInplaceInferer, {"X", "Out"}); } // namespace operators } // namespace paddle @@ -308,7 +308,7 @@ namespace ops = paddle::operators; REGISTER_OPERATOR(sum, ops::SumOp, ops::SumOpMaker, ops::SumGradDescMaker, ops::SumGradOpBaseMaker, ops::SumOpVarTypeInference, - ops::SumInplace); + ops::SumInplaceInferer); REGISTER_OP_CPU_KERNEL( sum, ops::SumKernel, diff --git a/paddle/fluid/platform/device_tracer.cc b/paddle/fluid/platform/device_tracer.cc index 4679b7e1da3a752c457cd7109d36e65194154792..6be8ed25e3fe4b817146b359da5e602d52192ab4 100644 --- a/paddle/fluid/platform/device_tracer.cc +++ b/paddle/fluid/platform/device_tracer.cc @@ -40,6 +40,9 @@ namespace { thread_local std::deque block_id_stack; // Tracking the nested event stacks. 
thread_local std::deque<Event *> annotation_stack; +// Stacks to store special main-thread events, such as PE runs +static std::deque<Event *> main_thread_annotation_stack{}; +static std::deque<std::string> main_thread_annotation_stack_name{}; std::map system_thread_id_map; @@ -638,15 +641,49 @@ DeviceTracer *GetDeviceTracer() { return tracer; } -void SetCurAnnotation(Event *event) { - if (!annotation_stack.empty()) { +std::string SetCurAnnotation(Event *event) { + std::string ret; + if (!annotation_stack.empty() && event->role() != EventRole::kSpecial) { event->set_parent(annotation_stack.back()); event->set_name(annotation_stack.back()->name() + "/" + event->name()); } + annotation_stack.push_back(event); + + if (!main_thread_annotation_stack_name.empty() && !annotation_stack.empty() && + main_thread_annotation_stack.back()->thread_id() != + annotation_stack.back()->thread_id()) { + ret = main_thread_annotation_stack_name.back() + "/" + event->name(); + } else { + ret = event->name(); + } + if (event->role() == EventRole::kSpecial) { + std::string name = event->name(); + if (!main_thread_annotation_stack_name.empty()) { + name = main_thread_annotation_stack_name.back() + "/" + event->name(); + } + main_thread_annotation_stack_name.push_back(name); + main_thread_annotation_stack.push_back(event); + } + + return ret; } -void ClearCurAnnotation() { annotation_stack.pop_back(); } +void ClearCurAnnotation() { + if (!main_thread_annotation_stack_name.empty() && !annotation_stack.empty() && + main_thread_annotation_stack.back()->thread_id() != + annotation_stack.back()->thread_id()) { + annotation_stack.back()->set_name(main_thread_annotation_stack_name.back() + + "/" + annotation_stack.back()->name()); + } + if (!main_thread_annotation_stack.empty() && + main_thread_annotation_stack.back()->name() == + annotation_stack.back()->name()) { + main_thread_annotation_stack_name.pop_back(); + main_thread_annotation_stack.pop_back(); + } + annotation_stack.pop_back(); +} Event *CurAnnotation() { if (annotation_stack.empty()) return nullptr; diff --git a/paddle/fluid/platform/device_tracer.h b/paddle/fluid/platform/device_tracer.h index 85168a046fb3fa4317956737871cde56e15bedfb..44b7af149efa9214fe5d9177755541fba4c70ab4 100644 --- a/paddle/fluid/platform/device_tracer.h +++ b/paddle/fluid/platform/device_tracer.h @@ -137,7 +137,7 @@ class DeviceTracer { DeviceTracer* GetDeviceTracer(); // Set a name for the cuda kernel operation being launched by the thread. -void SetCurAnnotation(Event* event); +std::string SetCurAnnotation(Event* event); // Clear the name after the operation is done. void ClearCurAnnotation(); // Current name of the operation being run in the thread. diff --git a/paddle/fluid/platform/event.h b/paddle/fluid/platform/event.h index ed2c05af453585323807ed1ac1455dc63bfa13ef..9a482a63f5e847450601dbd37bb87ff9c48852ae 100644 --- a/paddle/fluid/platform/event.h +++ b/paddle/fluid/platform/event.h @@ -29,6 +29,7 @@ enum class EventRole { kOrdinary, // only record op time with op type key kInnerOp, // record op detail time with op type key kUniqueOp, // record op detail time with op unique name key + kSpecial, // record events such as PE that live outside the thread-local scope }; class Event { diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 4248a2b859f63817291089524794804f6dfdcd04..2d475e7150a73c8e745f267fd60bb0c2bd1d1c8a 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -21,6 +21,7 @@ limitations under the License.
*/ #include "boost/optional.hpp" #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/operators/pool_op.h" #include "paddle/fluid/platform/mkldnn_helper.h" #include "paddle/fluid/platform/place.h" @@ -592,41 +593,100 @@ template class PoolingMKLDNNHandler : public MKLDNNHandlerT { public: - PoolingMKLDNNHandler( - const std::vector& src_dims, - const std::vector& dst_dims, const std::vector& ksize, - const std::vector& strides, const std::vector& paddings, - const std::string& pooling_type, bool ceil_mode, - const MKLDNNMemoryFormat fmt, mkldnn::memory::data_type dt, bool is_test, - const platform::MKLDNNDeviceContext& dev_ctx, platform::Place cpu_place, - const std::string& unique_name, bool exclude_padding) + PoolingMKLDNNHandler(const paddle::framework::ExecutionContext& ctx, + const MKLDNNDeviceContext& dev_ctx, + const mkldnn::engine mkldnn_engine, + platform::Place cpu_place, const Tensor* input, + Tensor* output, const std::string& unique_name) : platform::MKLDNNHandlerT( dev_ctx, dev_ctx.GetEngine(), cpu_place, - platform::CreateKey(src_dims, dt, unique_name)) { - auto src_md = mkldnn::memory::desc(src_dims, dt, fmt); - /* create memory descriptor for pooling without specified format - * ('any') which lets a primitive (pooling in this case) choose - * the memory format preferred for best performance - */ - auto dst_md = - platform::MKLDNNMemDesc(dst_dims, dt, MKLDNNMemoryFormat::any); + platform::CreateKey(framework::vectorize(input->dims()), + framework::ToMKLDNNDataType(input->type()), + unique_name)) { + if (!this->isCached()) { + PADDLE_ENFORCE_EQ(input->layout(), DataLayout::kMKLDNN, + platform::errors::InvalidArgument( + "Wrong layout set for Input tensor")); + PADDLE_ENFORCE_NE(input->format(), MKLDNNMemoryFormat::undef, + platform::errors::InvalidArgument( + "Wrong format set for Input tensor")); + + const std::string pooling_type = ctx.Attr("pooling_type"); + + std::vector ksize_temp = ctx.Attr>("ksize"); + std::vector ksize(begin(ksize_temp), end(ksize_temp)); + + std::vector strides_temp = ctx.Attr>("strides"); + std::vector strides(begin(strides_temp), end(strides_temp)); + + std::vector paddings_temp = ctx.Attr>("paddings"); + std::vector paddings(begin(paddings_temp), end(paddings_temp)); + + const bool global_pooling = ctx.Attr("global_pooling"); + const std::string padding_algorithm = + ctx.Attr("padding_algorithm"); + + // Only 2D pooling is supported now + PADDLE_ENFORCE_EQ(ksize.size(), 2, + platform::errors::InvalidArgument( + "ksize must be 2D, i.e. 2D pooling")); + PADDLE_ENFORCE_EQ(pooling_type == "max" || pooling_type == "avg", true, + platform::errors::InvalidArgument( + "pooling_type must be 'max' or 'avg'")); + PADDLE_ENFORCE_EQ(input->dims().size(), 4, + platform::errors::InvalidArgument( + "Input dim must be with 4, i.e. 
NCHW")); + + const auto input_dims = input->dims(); + framework::DDim data_dims = + framework::slice_ddim(input_dims, 2, input_dims.size()); + + if (global_pooling) { + operators::UpdateKsize(&ksize, data_dims); + } - auto mkldnn_paddings = ToMkldnnPadding(paddings); + operators::UpdatePadding(&paddings, global_pooling, 0, padding_algorithm, + data_dims, strides, ksize); + + const auto src_tz = paddle::framework::vectorize(input->dims()); + const auto dst_tz = paddle::framework::vectorize(output->dims()); + + const auto is_test = ctx.Attr("is_test"); + + const auto dt = framework::ToMKLDNNDataType(input->type()); + const auto fmt = input->format(); + + const auto exclude_padding = ctx.Attr("exclusive"); + + const auto src_md = mkldnn::memory::desc(src_tz, dt, fmt); + /* create memory descriptor for pooling without specified format + * ('any') which lets a primitive (pooling in this case) choose + * the memory format preferred for best performance + */ + + const auto dst_md = + platform::MKLDNNMemDesc(dst_tz, dt, MKLDNNMemoryFormat::any); - if (ceil_mode) { - CorrectOutputSize(src_dims, dst_dims, ksize, paddings, strides, - mkldnn_paddings[1]); + auto mkldnn_paddings = ToMkldnnPadding(paddings); + + const bool ceil_mode = ctx.Attr("ceil_mode"); + + if (ceil_mode) { + CorrectOutputSize(src_tz, dst_tz, ksize, paddings, strides, + mkldnn_paddings[1]); + } + this->AcquireForwardPrimitiveDescriptor( + is_test ? mkldnn::prop_kind::forward_inference + : mkldnn::prop_kind::forward_training, + pooling_type == "max" + ? mkldnn::algorithm::pooling_max + : (exclude_padding + ? mkldnn::algorithm::pooling_avg_exclude_padding + : mkldnn::algorithm::pooling_avg_include_padding), + src_md, dst_md, strides, ksize, mkldnn_paddings[0], + mkldnn_paddings[1]); } - this->AcquireForwardPrimitiveDescriptor( - is_test ? mkldnn::prop_kind::forward_inference - : mkldnn::prop_kind::forward_training, - pooling_type == "max" - ? mkldnn::algorithm::pooling_max - : (exclude_padding - ? mkldnn::algorithm::pooling_avg_exclude_padding - : mkldnn::algorithm::pooling_avg_include_padding), - src_md, dst_md, strides, ksize, mkldnn_paddings[0], mkldnn_paddings[1]); } PoolingMKLDNNHandler( @@ -1190,8 +1250,11 @@ static std::shared_ptr SetDstMemory( const std::shared_ptr& handler, std::vector* pipeline) { const T* residual_param_data = residual_param->data(); - PADDLE_ENFORCE(residual_param_data != nullptr, - "Provide data if you want MKLDNN conv+elementwise_add fusion"); + PADDLE_ENFORCE_NOT_NULL( + residual_param_data, + platform::errors::PreconditionNotMet("Residual parameter is required for " + "the DNNL conv+elementwise_add " + "fusion, but now it is missing")); std::shared_ptr user_residual_memory_p = handler->AcquireResidualDataMemory(user_residual_md, to_void_cast(residual_param_data)); diff --git a/paddle/fluid/platform/profiler.cc b/paddle/fluid/platform/profiler.cc index a36d8456eeaa5316f56e7fac649442c48355f9c1..be655255bd838a17fa0ffeba274d21c73fd3820a 100644 --- a/paddle/fluid/platform/profiler.cc +++ b/paddle/fluid/platform/profiler.cc @@ -73,8 +73,7 @@ RecordEvent::RecordEvent(const std::string &name, const EventRole role) { // lock is not needed, the code below is thread-safe Event *e = PushEvent(name, role); // Maybe need the same push/pop behavior. 
- SetCurAnnotation(e); - name_ = e->name(); + name_ = SetCurAnnotation(e); } RecordEvent::~RecordEvent() { @@ -86,7 +85,7 @@ RecordEvent::~RecordEvent() { BlockDepth(), g_thread_id); } ClearCurAnnotation(); - PopEvent(name_); + PopEvent(name_, role_); } void MemEvenRecorder::PushMemRecord(const void *ptr, const Place &place, @@ -187,8 +186,8 @@ Event *PushEvent(const std::string &name, const EventRole role) { return GetEventList().Record(EventType::kPushRange, name, g_thread_id, role); } -void PopEvent(const std::string &name) { - GetEventList().Record(EventType::kPopRange, name, g_thread_id); +void PopEvent(const std::string &name, const EventRole role) { + GetEventList().Record(EventType::kPopRange, name, g_thread_id, role); } void EnableProfiler(ProfilerState state) { PADDLE_ENFORCE_NE(state, ProfilerState::kDisabled, diff --git a/paddle/fluid/platform/profiler.h b/paddle/fluid/platform/profiler.h index dcc9f1eee104deb55e96619fade908f4d6532913..07844713eae47d15ba5a3aae5b2d4e877ef4f30d 100644 --- a/paddle/fluid/platform/profiler.h +++ b/paddle/fluid/platform/profiler.h @@ -197,7 +197,7 @@ void PushMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, void PopMemEvent(uint64_t start_ns, uint64_t end_ns, size_t bytes, const Place& place, const std::string& annotation); Event* PushEvent(const std::string& name, const EventRole role); -void PopEvent(const std::string& name); +void PopEvent(const std::string& name, const EventRole role); // Return the event list of all threads. Assumed the returned value calls // event_lists, event_lists[i][j] represents the j-th Event of i-th thread. std::vector> GetAllEvents(); diff --git a/paddle/fluid/platform/profiler_helper.h b/paddle/fluid/platform/profiler_helper.h index 367676e77efe08919e00efcc32ab222a7f578de8..9d99022f153ad46a5bee630bb47b7a16bfd79128 100644 --- a/paddle/fluid/platform/profiler_helper.h +++ b/paddle/fluid/platform/profiler_helper.h @@ -22,12 +22,12 @@ limitations under the License. 
*/ #include #include // NOLINT #include +#include #include #include #include #include #include - #ifdef PADDLE_WITH_CUDA #include #endif // PADDLE_WITH_CUDA @@ -283,7 +283,8 @@ std::function SetSortedFunc( void SetEvent(bool merge_thread, const Event &analyze_event, size_t *max_name_width, std::list<Event> *pushed_events, std::vector<EventItem> *event_items, - std::unordered_map<std::string, int> *event_idx) { + std::unordered_map<std::string, int> *event_idx, + const std::set<std::string> &main_thread_event_name) { if (analyze_event.type() == EventType::kPushRange) { pushed_events->push_back(analyze_event); } else if (analyze_event.type() == EventType::kPopRange) { @@ -313,8 +314,35 @@ void SetEvent(bool merge_thread, const Event &analyze_event, if (merge_thread) { event_name = rit->name(); } else { - event_name = - "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); + if (!main_thread_event_name.empty()) { + auto origin_name = rit->name(); + int index = 1; + int split_pos = 0; + while ((split_pos = FindNthReversePos(origin_name, '/', index)) != + -1) { + auto prefix_str = origin_name.substr(0, split_pos); + if (main_thread_event_name.count(prefix_str)) { + break; + } + index++; + } + if (split_pos == -1 && !main_thread_event_name.count(rit->name())) { + event_name = "thread" + std::to_string(rit->thread_id()) + "::" + + rit->name(); + } else { + if (!main_thread_event_name.count(rit->name())) { + event_name = + origin_name.substr(0, split_pos + 1) + "thread" + + std::to_string(rit->thread_id()) + "::" + + origin_name.substr(split_pos + 1, origin_name.length() - 1); + } else { + event_name = rit->name(); + } + } + } else { + event_name = + "thread" + std::to_string(rit->thread_id()) + "::" + rit->name(); + } } auto print_name_size = event_name.size(); int found_pos = 0; @@ -608,6 +636,16 @@ void AnalyzeEvent( std::function sorted_func, EventSortingKey sorted_by, size_t *max_name_width, OverHead *overhead, bool merge_thread) { + // In order to deal with special events in the main thread + std::set<std::string> main_thread_event_name; + for (size_t i = 0; i < (*analyze_events).size(); i++) { + for (size_t j = 0; j < (*analyze_events)[i].size(); j++) { + Event event = (*analyze_events)[i][j]; + if (event.role() == EventRole::kSpecial) { + main_thread_event_name.insert(event.name()); + } + } + } for (size_t i = 0; i < (*analyze_events).size(); i++) { double total = 0.; // the total time in one thread std::list<Event> pushed_events; @@ -618,8 +656,10 @@ void AnalyzeEvent( for (size_t j = 0; j < (*analyze_events)[i].size(); j++) { Event analyze_event = (*analyze_events)[i][j]; - SetEvent(merge_thread, analyze_event, max_name_width, &pushed_events, - &event_items, &event_idx); + if (!(analyze_event.role() == EventRole::kSpecial && !merge_thread)) { + SetEvent(merge_thread, analyze_event, max_name_width, &pushed_events, + &event_items, &event_idx, main_thread_event_name); + } } auto table_size = event_items.size(); diff --git a/paddle/fluid/platform/profiler_test.cc b/paddle/fluid/platform/profiler_test.cc index daa69a77b42c005649ffff76a16e6737fa7ec183..d8bfa5445cc53566ac72fb5600706478e97c5a76 100644 --- a/paddle/fluid/platform/profiler_test.cc +++ b/paddle/fluid/platform/profiler_test.cc @@ -59,7 +59,7 @@ TEST(RecordEvent, RecordEvent) { PushEvent(name, EventRole::kOrdinary); int counter = 1; while (counter != i * 1000) counter++; - PopEvent(name); + PopEvent(name, EventRole::kOrdinary); } } @@ -109,7 +109,7 @@ TEST(RecordEvent, RecordEvent) { // Bad Usage: PushEvent("event_without_pop", EventRole::kOrdinary); - PopEvent("event_without_push"); +
PopEvent("event_without_push", EventRole::kOrdinary); std::vector> events = paddle::platform::GetAllEvents(); int cuda_startup_count = 0; diff --git a/paddle/fluid/pybind/box_helper_py.cc b/paddle/fluid/pybind/box_helper_py.cc index 001f8a135f91a6fdfa25a8699ff7f7aa48c012d5..33c5cd65a7b05c0c9c685ba29aba8c83e9063637 100644 --- a/paddle/fluid/pybind/box_helper_py.cc +++ b/paddle/fluid/pybind/box_helper_py.cc @@ -54,6 +54,8 @@ void BindBoxHelper(py::module* m) { .def("preload_into_memory", &framework::BoxHelper::PreLoadIntoMemory, py::call_guard()) .def("load_into_memory", &framework::BoxHelper::LoadIntoMemory, + py::call_guard()) + .def("slots_shuffle", &framework::BoxHelper::SlotsShuffle, py::call_guard()); } // end BoxHelper @@ -61,9 +63,9 @@ void BindBoxHelper(py::module* m) { void BindBoxWrapper(py::module* m) { py::class_>( *m, "BoxWrapper") - .def(py::init([]() { + .def(py::init([](int embedx_dim, int expand_embed_dim) { // return std::make_shared(dataset); - return framework::BoxWrapper::GetInstance(); + return framework::BoxWrapper::SetInstance(embedx_dim, expand_embed_dim); })) .def("save_base", &framework::BoxWrapper::SaveBase, py::call_guard()) @@ -76,13 +78,15 @@ void BindBoxWrapper(py::module* m) { .def("initialize_gpu_and_load_model", &framework::BoxWrapper::InitializeGPUAndLoadModel, py::call_guard()) + .def("initialize_auc_runner", &framework::BoxWrapper::InitializeAucRunner, + py::call_guard()) .def("init_metric", &framework::BoxWrapper::InitMetric, py::call_guard()) .def("get_metric_msg", &framework::BoxWrapper::GetMetricMsg, py::call_guard()) .def("get_metric_name_list", &framework::BoxWrapper::GetMetricNameList, py::call_guard()) - .def("flip_pass_flag", &framework::BoxWrapper::FlipPassFlag, + .def("flip_phase", &framework::BoxWrapper::FlipPhase, py::call_guard()) .def("init_afs_api", &framework::BoxWrapper::InitAfsAPI, py::call_guard()) diff --git a/paddle/fluid/pybind/data_set_py.cc b/paddle/fluid/pybind/data_set_py.cc index 4b12f66c617282595fbcecf274424a78baf2c964..aa990e4712fefd3efb460a66968ff311ed3e5337 100644 --- a/paddle/fluid/pybind/data_set_py.cc +++ b/paddle/fluid/pybind/data_set_py.cc @@ -291,6 +291,8 @@ void BindDataset(py::module *m) { py::call_guard()) .def("set_fleet_send_sleep_seconds", &framework::Dataset::SetFleetSendSleepSeconds, + py::call_guard()) + .def("enable_pv_merge", &framework::Dataset::EnablePvMerge, py::call_guard()); py::class_(*m, "IterableDatasetWrapper") diff --git a/paddle/scripts/conda_build.py b/paddle/scripts/conda_build.py index b3956705db48b74b3b94f563a14472e0b2adf82f..05c988211b1d255b88b9d25d2e6ad3acb6300c42 100644 --- a/paddle/scripts/conda_build.py +++ b/paddle/scripts/conda_build.py @@ -116,7 +116,7 @@ python setup.py install """ self.cuda100 = r""" - cudatoolkit>=10.0, <10.1 - - cudnn>=7.3, <7.4 + - cudnn>=7.6, <7.7 """ self.cuda_info = [(self.cuda90, "cuda9.0", ".post97"), (self.cuda100, "cuda10.0", ".post107")] diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh index bd92727545647836c02931130a1ad528889ae2bb..ef23bca95326dd5b733cc3eae02d04a0a9b343ad 100755 --- a/paddle/scripts/paddle_build.sh +++ b/paddle/scripts/paddle_build.sh @@ -59,9 +59,9 @@ function init() { } function cmake_base() { - # build script will not fail if *.deb does not exist + # Build script will not fail if *.deb does not exist rm *.deb 2>/dev/null || true - # delete previous built whl packages + # Delete previous built whl packages rm -rf python/dist 2>/dev/null || true # Support build for all python versions, currently @@ -199,9 
+199,7 @@ function cmake_base() { -DWITH_DISTRIBUTE=${distibuted_flag} -DWITH_MKL=${WITH_MKL:-ON} -DWITH_AVX=${WITH_AVX:-OFF} - -DWITH_GOLANG=${WITH_GOLANG:-OFF} -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} - -DCUDA_ARCH_BIN=${CUDA_ARCH_BIN} -DWITH_PYTHON=${WITH_PYTHON:-ON} -DCUDNN_ROOT=/usr/ -DWITH_TESTING=${WITH_TESTING:-ON} @@ -231,9 +229,7 @@ EOF -DWITH_MKL=${WITH_MKL:-ON} \ -DWITH_AVX=${WITH_AVX:-OFF} \ -DNOAVX_CORE_FILE=${NOAVX_CORE_FILE:-""} \ - -DWITH_GOLANG=${WITH_GOLANG:-OFF} \ -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-All} \ - -DCUDA_ARCH_BIN=${CUDA_ARCH_BIN} \ -DWITH_PYTHON=${WITH_PYTHON:-ON} \ -DCUDNN_ROOT=/usr/ \ -DWITH_TESTING=${WITH_TESTING:-ON} \ @@ -1080,7 +1076,7 @@ EOF if [[ "$1" != "" ]]; then parallel_number=$1 fi - cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-Auto} -DCUDA_ARCH_BIN=${CUDA_ARCH_BIN} + cmake .. -DWITH_DISTRIBUTE=OFF -DON_INFER=ON -DCUDA_ARCH_NAME=${CUDA_ARCH_NAME:-Auto} make -j ${parallel_number} fluid_lib_dist make -j ${parallel_number} inference_lib_dist diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 3bb56edb9b718a73eeabcc2192b9ff4e67bd9e4e..273a669a1414e858920f6f5c2ad1fce8810eb829 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -34,7 +34,8 @@ __all__ = [ 'fused_elemwise_activation', 'sequence_topk_avg_pooling', 'var_conv_2d', 'match_matrix_tensor', 'tree_conv', 'fused_embedding_seq_pool', 'multiclass_nms2', 'search_pyramid_hash', 'shuffle_batch', 'partial_concat', - 'partial_sum', 'tdm_child', 'rank_attention', 'tdm_sampler', 'batch_fc' + 'partial_sum', 'tdm_child', 'rank_attention', 'tdm_sampler', 'batch_fc', + '_pull_box_extended_sparse' ] @@ -1361,3 +1362,50 @@ def batch_fc(input, param_size, param_attr, bias_size, bias_attr, act=None): "Bias": b}, outputs={"Out": pre_act}) return helper.append_activation(pre_act) + + +def _pull_box_extended_sparse(input, size, extend_size=64, dtype='float32'): + """ + **Pull Box Extended Sparse Layer** + This layer is used to look up embeddings of IDs, provided by :attr:`input`, in + the BoxPS lookup table. The result of this lookup is the embedding of each ID in the + :attr:`input`. + Args: + input(Variable|list of Variable): Input is a Tensor Variable, which + contains the IDs information. + size(int): The embedding size parameter, which indicates the size of + each embedding vector. + extend_size(int): The embedding size in the extended dimension, + which indicates the size of each extended embedding vector. + dtype(str): The data type of the output tensor; only float32 is + supported now. + Returns: + Variable|list of Variable: The tensor variable storing the embeddings of the \ + supplied inputs. + Examples: + ..
code-block:: python + import paddle.fluid as fluid + data = fluid.layers.data(name='sequence', shape=[1], dtype='int64', lod_level=1) + emb, emb_ex = fluid.contrib.layers._pull_box_extended_sparse(input=data, size=8, extend_size=128) + """ + helper = LayerHelper('pull_box_extended_sparse', **locals()) + helper.input_dtype() + inputs = helper.multiple_input() + outs = [ + helper.create_variable_for_type_inference(dtype) + for i in range(len(inputs)) + ] + outs_extend = [ + helper.create_variable_for_type_inference(dtype) + for i in range(len(inputs)) + ] + helper.append_op( + type='pull_box_extended_sparse', + inputs={'Ids': inputs}, + outputs={'Out': outs, + 'OutExtend': outs_extend}, + attrs={'emb_size': size, + 'emb_extended_size': extend_size}) + if len(outs) == 1: + return outs[0], outs_extend[0] + return outs, outs_extend diff --git a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py index a6ab2aa86d057e60586e54e9e0104b54d2e27191..fcf7a51113563667b6449f2764f4950c0150308d 100644 --- a/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py +++ b/python/paddle/fluid/contrib/slim/quantization/quantization_pass.py @@ -43,7 +43,7 @@ _fake_quant_dequant_op_list = [ _out_scale_op_list = [ "conv2d", "depthwise_conv2d", "mul", "matmul", "relu", "leaky_relu", "relu6", "sigmoid", "tanh", "prelu", "swish", "softmax", "batch_norm", - "elementwise_add", "pool2d", "reshape2", "transpose2" + "elementwise_add", "pool2d", "reshape2", "transpose2", "concat" ] # list op real input and output names, to avoid processing input such as AxisTensor. @@ -1156,14 +1156,13 @@ class OutScaleForTrainingPass(object): assert isinstance(graph, IrGraph), 'graph must be the instance of IrGraph.' self._is_test = graph.is_test() - ops = graph.all_op_nodes() - for op_node in ops: - name = op_node.name() - if name in self._teller_set: - if len(op_node.output_arg_names()) != 1: - continue - in_node = graph._find_node_by_name( - op_node.outputs, op_node.output_arg_names()[0]) + target_ops = [] + for op in graph.all_op_nodes(): + if op.name() in self._teller_set: + target_ops.append(op) + for op in target_ops: + for output_var_name in _get_op_output_var_names(op): + in_node = graph._find_node_by_name(op.outputs, output_var_name) out_node = graph.create_var_node_from_desc(in_node.var()) scale_node = graph.create_persistable_node( name=self._scale_name(in_node.name()), @@ -1263,13 +1262,13 @@ class OutScaleForInferencePass(object): """ assert isinstance(graph, IrGraph), 'graph must be the instance of IrGraph.' - ops = graph.all_op_nodes() - for op_node in ops: - name = op_node.name() - if name in self._teller_set: - if len(op_node.output_arg_names()) != 1: - continue - scale_name = self._scale_name(op_node.output_arg_names()[0]) + op_nodes = graph.all_op_nodes() + for op_node in op_nodes: + if op_node.name() in self._teller_set: + output_var_name = _get_op_output_var_names(op_node) + assert len(output_var_name) == 1, "Only support collecting " \ + "output for op that only has an activation output for now." 
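As a brief aside on what this pass produces (a hedged sketch; `program` here is an assumed handle to the program the pass ran on, not code from this diff): each matched op ends up carrying its recorded output scale as an `out_threshold` attribute, which is exactly what the `_set_attr` call just below writes.

```python
# Illustrative only: reading back what OutScaleForInferencePass wrote.
for op in program.global_block().ops:
    if op.has_attr("out_threshold"):
        # the recorded output scale of this quantizable op
        print(op.type, op.attr("out_threshold"))
```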
+ scale_name = self._scale_name(output_var_name[0]) scale_v = np.array( self._scope.find_var(scale_name).get_tensor())[0] op_node.op()._set_attr("out_threshold", float(scale_v)) diff --git a/python/paddle/fluid/dataset.py b/python/paddle/fluid/dataset.py index 10fd3d1a3f50496c8053ff9c6a72be2351d8a1ed..87b1ce2511e78714e066325b4d7c3b351b08cf13 100644 --- a/python/paddle/fluid/dataset.py +++ b/python/paddle/fluid/dataset.py @@ -1079,3 +1079,24 @@ class BoxPSDataset(InMemoryDataset): def _dynamic_adjust_after_train(self): pass + + def slots_shuffle(self, slots): + """ + Slots Shuffle + Slots Shuffle is a shuffle method at the slot level, usually used for + sparse features with a large number of instances. Shuffling one or + several slots and comparing the metric (e.g. AUC) against the baseline + evaluates the importance of those slots (features). + + Args: + slots(list[string]): the set of slots (string) to shuffle. + + Examples: + import paddle.fluid as fluid + dataset = fluid.DatasetFactory().create_dataset("InMemoryDataset") + dataset.set_merge_by_lineid() + # suppose there is a slot 0 + dataset.slots_shuffle(['0']) + """ + slots_set = set(slots) + self.boxps.slots_shuffle(slots_set) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py index b872ab723e31a0f4bc1a6c1d6483dedf8658cb78..4ba1d302576df695c5b2e867452b91b3d1d2844a 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/call_transformer.py @@ -32,12 +32,23 @@ class CallTransformer(gast.NodeTransformer): self.wrapper_root = wrapper_root self.root = wrapper_root.node - def _is_builtin_call(self, node): + def _no_need_convert_call(self, node): + """ + Determines whether a function needs to be transformed by `convert_call`. + A function doesn't need to be transformed when it satisfies either of the following conditions: + 1. It is a Paddle API + 2. It is a Python builtin function other than `len` + """ assert isinstance(node, gast.Call) + if is_paddle_api(node): + return True + func_str = ast_to_source_code(node.func).strip() try: - from paddle.fluid.dygraph.dygraph_to_static.convert_call_func import is_builtin - return eval("is_builtin({})".format(func_str)) + from paddle.fluid.dygraph.dygraph_to_static.convert_call_func import is_builtin_len, is_builtin + is_builtin = eval("is_builtin({})".format(func_str)) + is_builtin_len = eval("is_builtin_len({})".format(func_str)) + return is_builtin and not is_builtin_len except Exception: return False @@ -46,10 +57,8 @@ def visit_Call(self, node): self.generic_visit(node) - if is_paddle_api(node): - return node - if self._is_builtin_call(node): + if self._no_need_convert_call(node): return node func_str = ast_to_source_code(node.func).strip() diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_builtins_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_builtins_func.py new file mode 100644 index 0000000000000000000000000000000000000000..f612b9bfaeddf500cd343cc0ea3edde29c7f18e7 --- /dev/null +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_builtins_func.py @@ -0,0 +1,47 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +from paddle.fluid import framework +from paddle.fluid import core +from paddle.fluid.layers import nn +from paddle.fluid.layers import control_flow + + +def convert_len(var): + """ + return variable(length) from shape ops based on var.type + + Note: In addition to some ast transformations, some block-related + operations are added in `len` transformation, such as appending + `shape_op` in var.block. + """ + if isinstance(var, framework.Variable): + if var.type in [ + core.VarDesc.VarType.LOD_TENSOR, + core.VarDesc.VarType.SELECTED_ROWS + ]: + # Note: Length of var may be known ahead of time in dygraph, + # but it probably represents batch size which can be variant. + # so we return a variable dynamically inferred from var.shape. + return nn.shape(var)[0] + elif var.type == core.VarDesc.VarType.LOD_TENSOR_ARRAY: + return control_flow.array_length(var) + else: + raise TypeError( + 'len(var) only supports LoDTensor/LoDTensorArray/SelectedRows, but received %s.' + % type(var)) + else: + return len(var) diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py index 1532d5be3775309b2eb9062ecac144a7847a6ab8..dd6cf81732a7b1c4a7f3c9952b9c96e6274371ed 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/convert_call_func.py @@ -29,6 +29,7 @@ import six from paddle.fluid.dygraph.dygraph_to_static import ProgramTranslator from paddle.fluid.dygraph.layers import Layer +from paddle.fluid.dygraph.dygraph_to_static.convert_builtins_func import convert_len DECORATOR_NAMES = ['declarative', 'dygraph_to_static_func'] program_translator = ProgramTranslator() @@ -49,6 +50,12 @@ def is_builtin(func): return False +def is_builtin_len(func): + if isinstance(func, types.BuiltinFunctionType) and func.__name__ == 'len': + return True + return False + + def is_paddle_func(func): m = inspect.getmodule(func) return m is not None and m.__name__.startswith("paddle") @@ -91,10 +98,10 @@ def convert_call(func): func_self = None converted_call = None - if is_builtin(func): - return func + if is_builtin_len(func): + return convert_len - if is_paddle_func(func): + if is_builtin(func) or is_paddle_func(func): return func if inspect.isfunction(func): diff --git a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py index d4d1ff6ba2db46298270b3dba36748bd6f92d3e8..b9e6eff2f9b4a49cec5e5811339a8a5915f63f1d 100644 --- a/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py +++ b/python/paddle/fluid/dygraph/dygraph_to_static/loop_transformer.py @@ -166,13 +166,19 @@ class NameVisitor(gast.NodeVisitor): in_loop_vars = self.in_loop_vars[node] in_loop_name_strs = self._var_nodes_to_names(in_loop_vars) + before_loop_body_vars = self.before_loop_body_vars[node] + before_loop_body_vars = self._remove_target_vars_of_for( + before_loop_body_vars, node) before_loop_name_strs = self._var_nodes_to_names(before_loop_body_vars) + 
after_loop_vars = self.current_seen_vars - before_loop_body_vars - in_loop_vars + after_loop_vars = self._remove_target_vars_of_for(after_loop_vars, node) after_loop_name_strs = self._var_nodes_to_names(after_loop_vars, read_context) condition_vars = self.condition_vars[node] condition_names = self._var_nodes_to_names(condition_vars) + write_vars = self.write_in_loop[node] write_names = self._var_nodes_to_names(write_vars) @@ -203,6 +209,7 @@ class NameVisitor(gast.NodeVisitor): # vars out loop_var_names.add(name) create_var_names.add(name) + return loop_var_names, create_var_names def visit_Name(self, node): @@ -221,8 +228,8 @@ class NameVisitor(gast.NodeVisitor): self.in_loop_vars[loop_node].add(node) if type(node.ctx) in write_context: self.write_in_loop[loop_node].add(node) - if self.in_condition: - self.condition_vars[loop_node].add(node) + if self.in_condition: + self.condition_vars[loop_node].add(node) self.generic_visit(node) def visit_FunctionDef(self, node): @@ -309,11 +316,60 @@ class NameVisitor(gast.NodeVisitor): return False def _is_call_func_name_node(self, node): - parent_node = self.node_to_wrapper_map[node].parent.node + parent_node = self._get_parent_node(node) if isinstance(parent_node, gast.Call) and parent_node.func == node: return True return False + def _get_parent_node(self, node): + wrapper_node = self.node_to_wrapper_map.get(node) + if wrapper_node: + parent_node = wrapper_node.parent.node + return parent_node + return None + + def _remove_target_vars_of_for(self, before_or_after_loop_vars, loop_node): + """ + Remove target vars of gast.For from before_loop_vars or after_loop_vars. + :param before_or_after_loop_vars: before_loop_vars or after_loop_vars of loop_node. + :param loop_node: Current loop node. + """ + + removed_vars = set() + for name_node in before_or_after_loop_vars: + if not isinstance(name_node, gast.Name): + continue + + parent_node = self._get_parent_node(name_node) + + # NOTE: gast.For.target can be gast.Tuple. + # For example: `for i, j in enumerate(x)` has two target vars: i and j + if isinstance(parent_node, gast.Tuple): + parent_node = self._get_parent_node(parent_node) + + if isinstance(parent_node, + gast.For) and parent_node is not loop_node: + target_node = parent_node.target + + if isinstance(target_node, gast.Tuple): + target_vars = target_node.elts + else: + target_vars = [target_node] + + if name_node in target_vars: + removed_vars.add(name_node) + + removed_vars_name_strs = {var.id for var in removed_vars} + + for var in before_or_after_loop_vars: + if not isinstance(var, gast.Name): + continue + if var.id in removed_vars_name_strs and var not in self.condition_vars[ + loop_node]: + removed_vars.add(var) + + return before_or_after_loop_vars - removed_vars + class LoopTransformer(gast.NodeTransformer): """ diff --git a/python/paddle/fluid/dygraph/nn.py b/python/paddle/fluid/dygraph/nn.py index e798ec5fc1f60e617f5c5c424c5552b994046e46..7cb4702fedcbac9a3dd8fc7bb941735a1cfbe435 100644 --- a/python/paddle/fluid/dygraph/nn.py +++ b/python/paddle/fluid/dygraph/nn.py @@ -771,14 +771,19 @@ class Pool2D(layers.Layer): ceil_mode (bool, optional): Whether to use the ceil function to calculate output height and width. False is the default. If it is set to False, the floor function will be used. Default: False. exclusive (bool, optional): Whether to exclude padding points in average pooling mode. Default: True. + data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`. 
+ The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: + ``[batch_size, input_channels, input_height, input_width]``. When it is `"NHWC"`, the data is + stored in the order of: ``[batch_size, input_height, input_width, input_channels]`` Returns: None Raises: - ValueError: If 'pool_type' is not "max" nor "avg" - ValueError: If 'global_pooling' is False and 'pool_size' is -1 - ValueError: If 'use_cudnn' is not a bool value. + ValueError: If ``pool_type`` is neither "max" nor "avg". + ValueError: If ``global_pooling`` is False and ``pool_size`` is -1. + ValueError: If ``use_cudnn`` is not a bool value. + ValueError: If ``data_format`` is neither "NCHW" nor "NHWC". Examples: @@ -806,7 +811,10 @@ global_pooling=False, use_cudnn=True, ceil_mode=False, - exclusive=True): + exclusive=True, + data_format="NCHW"): + data_format = data_format.upper() # support NHWC, nhwc, etc. + pool_type = pool_type.lower() # support max, Max, etc. if pool_type not in ["max", "avg"]: raise ValueError( "Unknown pool_type: '%s'. It can only be 'max' or 'avg'.", @@ -820,6 +828,11 @@ if not isinstance(use_cudnn, bool): raise ValueError("use_cudnn should be True or False") + if data_format not in ["NCHW", "NHWC"]: + raise ValueError( + "Attr(data_format) should be 'NCHW' or 'NHWC'. Received " + "Attr(data_format): %s." % str(data_format)) + super(Pool2D, self).__init__() self._pool_type = pool_type @@ -831,6 +844,7 @@ self._use_cudnn = use_cudnn self._ceil_mode = ceil_mode self._exclusive = exclusive + self._data_format = data_format self._l_type = 'pool2d' def forward(self, input): @@ -839,7 +853,8 @@ 'global_pooling', self._global_pooling, 'strides', self._pool_stride, 'paddings', self._pool_padding, 'use_cudnn', self._use_cudnn, 'ceil_mode', self._ceil_mode, - 'use_mkldnn', False, 'exclusive', self._exclusive) + 'use_mkldnn', False, 'exclusive', self._exclusive, + 'data_format', self._data_format) return core.ops.pool2d(input, *attrs) check_variable_and_dtype( @@ -856,6 +871,7 @@ "ceil_mode": self._ceil_mode, "use_mkldnn": False, "exclusive": self._exclusive, + "data_format": self._data_format, } inputs = {"X": [input]} diff --git a/python/paddle/fluid/layers/loss.py b/python/paddle/fluid/layers/loss.py index abaa9888c8d7ed019628e708024bbc25b19e3299..805c8f81688ecf7feebe07ef4848a189ecb114d6 100644 --- a/python/paddle/fluid/layers/loss.py +++ b/python/paddle/fluid/layers/loss.py @@ -1536,9 +1536,11 @@ def teacher_student_sigmoid_loss(input, cost = fluid.layers.teacher_student_sigmoid_loss(input=similarity, label=label) """ - check_variable_and_dtype(input, "input", ['float32', 'float64'], + check_variable_and_dtype(input, "input", + ['float32', 'float64', 'int32', 'int64'], 'teacher_student_sigmoid_loss') - check_variable_and_dtype(label, "label", ['float32', 'float64'], + check_variable_and_dtype(label, "label", + ['float32', 'float64', 'int32', 'int64'], 'teacher_student_sigmoid_loss') helper = LayerHelper('teacher_student_sigmoid_loss', **locals()) diff --git a/python/paddle/fluid/layers/nn.py b/python/paddle/fluid/layers/nn.py index b333183257e32153722d6750d86e37f85c6916c2..9bedb03060713aa29cff753f9f40f6d8d4b8ebaa 100644 --- a/python/paddle/fluid/layers/nn.py +++ b/python/paddle/fluid/layers/nn.py @@ -1902,7 +1902,7 @@ def pool2d(input, None by default. exclusive (bool): Whether to exclude padding points in average pooling mode, default is `true`.
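A short usage sketch of the new dygraph `data_format` argument (it mirrors the unit tests added near the end of this diff; the shapes are illustrative):

```python
import numpy as np
import paddle.fluid as fluid

with fluid.dygraph.guard():
    data = np.random.random((3, 32, 32, 5)).astype('float32')  # NHWC input
    pool2d = fluid.dygraph.Pool2D(
        pool_size=2,
        pool_type='max',
        pool_stride=1,
        pool_padding=[0, 0],
        global_pooling=False,
        data_format='nhwc')  # normalized to 'NHWC' by the .upper() call above
    out = pool2d(fluid.dygraph.to_variable(data))
    print(out.shape)  # [3, 31, 31, 5] -- channels stay in the last axis
```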
- data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NDHW"`. + data_format (string): The data format of the input and output data. An optional string from: `"NCHW"`, `"NHWC"`. The default is `"NCHW"`. When it is `"NCHW"`, the data is stored in the order of: `[batch_size, input_channels, input_height, input_width]`. @@ -11045,8 +11045,26 @@ def shape(input): Get the shape of the input. + .. code-block:: text + + Case1: + Given N-D Tensor: + input = [ [1, 2, 3, 4], [5, 6, 7, 8] ] + + Then: + input.shape = [2, 4] + + Case2: + Given SelectedRows: + input.rows = [0, 4, 19] + input.height = 20 + input.value = [ [1, 2], [3, 4], [5, 6] ] # inner tensor + Then: + input.shape = [3, 2] + Args: - input (Variable): The input N-D Tensor. Datatype can be float32, float64, int32, int64. + input (Variable): The input can be N-D Tensor or SelectedRows with data type float32, float64, int32, int64. + If the input variable is a SelectedRows, this returns the shape of its inner tensor. Returns: Variable (Tensor): The shape of the input variable. @@ -11057,7 +11075,7 @@ import paddle.fluid as fluid import numpy as np - inputs = fluid.layers.data(name="x", shape=[3, 100, 100], dtype="float32") + inputs = fluid.data(name="x", shape=[3, 100, 100], dtype="float32") output = fluid.layers.shape(inputs) exe = fluid.Executor(fluid.CPUPlace()) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_utils.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_utils.py index 75ad7ae077fd8e1f9952561fab3743172ce859fe..a18bb34e18282af90773f6a032ddd280f529ebb7 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_utils.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/bert_utils.py @@ -49,10 +49,13 @@ def mask(batch_tokens, total_token_num, vocab_size, CLS=1, SEP=2, MASK=3): max_len = max([len(sent) for sent in batch_tokens]) mask_label = [] mask_pos = [] - np.random.seed(SEED) - prob_mask = np.random.rand(total_token_num) + # NOTE: numpy random is not thread-safe; for the async DataLoader, + # calling np.random.seed() directly is risky, so using a RandomState + # instance is a better way + self_random = np.random.RandomState(SEED) + prob_mask = self_random.rand(total_token_num) # Note: the first token is [CLS], so [low=1] - replace_ids = np.random.randint(1, high=vocab_size, size=total_token_num) + replace_ids = self_random.randint(1, high=vocab_size, size=total_token_num) pre_sent_len = 0 prob_index = 0 for sent_index, sent in enumerate(batch_tokens): @@ -85,7 +88,9 @@ # ensure at least mask one word in a sentence while not mask_flag: - token_index = int(np.random.randint(1, high=len(sent) - 1, size=1)) + token_index = int( + self_random.randint( + 1, high=len(sent) - 1, size=1)) if sent[token_index] != SEP and sent[token_index] != CLS: mask_label.append(sent[token_index]) sent[token_index] = MASK @@ -244,13 +249,16 @@ class DataReader(object): def build_fake_data(self): for _ in range(1000000): - random.seed(SEED) - sent0_len = random.randint(50, 100) - sent1_len = random.randint(50, 100) + # NOTE: python's random module has a bug in python2, + # so we should avoid it and + # use numpy.random instead + self_random = np.random.RandomState(SEED) + sent0_len = self_random.randint(50, 100) + sent1_len = self_random.randint(50, 100) token_ids = [1] \ - + [random.randint(0, 10000) for i in range(sent0_len-1)] \ - + [random.randint(0, 10000) for
i in range(sent1_len-1)] \ + + [self_random.randint(0, 10000) for i in range(sent0_len-1)] \ + + [self_random.randint(0, 10000) for i in range(sent1_len-1)] \ + [2] sent_ids = [0 for i in range(sent0_len) diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py new file mode 100644 index 0000000000000000000000000000000000000000..00a1b018376c67e769e6d4061e861dfced72ca4e --- /dev/null +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_len.py @@ -0,0 +1,122 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest + +import numpy as np +import paddle.fluid as fluid +from paddle.fluid.dygraph import declarative +from paddle.fluid.dygraph.dygraph_to_static import convert_call + +SEED = 2020 +np.random.seed(SEED) + + +def len_with_tensor(x): + x = fluid.dygraph.to_variable(x) + x_len = len(x) + return x_len + + +def len_with_lod_tensor_array(x): + x = fluid.dygraph.to_variable(x) + + i = fluid.layers.fill_constant(shape=[1], dtype='int64', value=0) + arr = fluid.layers.array_write(x, i=i) + arr_len = len(arr) + + return arr_len + + +class TestLen(unittest.TestCase): + def setUp(self): + self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda( + ) else fluid.CPUPlace() + self.x_data = np.random.random([10, 16]).astype('float32') + self.init_func() + + def init_func(self): + self.func = len_with_tensor + + def _run(self, to_static): + with fluid.dygraph.guard(self.place): + if to_static: + out = declarative(self.func)(self.x_data) + else: + out = self.func(self.x_data) + + if isinstance(out, fluid.core.VarBase): + out = out.numpy() + return out + + def test_len(self): + dygraph_res = self._run(to_static=False) + static_res = self._run(to_static=True) + self.assertTrue(np.allclose(dygraph_res, static_res)) + + +class TestLenWithTensorArray(TestLen): + def init_func(self): + self.func = len_with_lod_tensor_array + + +# Note: Variable(SelectedRows) is not exposed directly in dygraph. +# The unittest is used to test coverage by fake transformed code. 
+def len_with_selected_rows(place): + block = fluid.default_main_program().global_block() + # create selected_rows variable + var = block.create_var( + name="X", + dtype="float32", + persistable=True, + type=fluid.core.VarDesc.VarType.SELECTED_ROWS) + # y is Variable(SelectedRows) + y = fluid.layers.merge_selected_rows(var) + y_len = convert_call(len)(y) + + # z is inner tensor with shape [4, 2] + z = fluid.layers.get_tensor_from_selected_rows(y) + z_len = convert_call(len)(z) + + # set data for selected_rows + x_rows = [0, 2, 2, 4, 19] + row_numel = 2 + np_array = np.ones((len(x_rows), row_numel)).astype("float32") + + x_var = fluid.global_scope().var("X").get_selected_rows() + x_var.set_rows(x_rows) + x_var.set_height(20) + x_tensor = x_var.get_tensor() + x_tensor.set(np_array, place) + + exe = fluid.Executor(place=place) + result = exe.run(fluid.default_main_program(), fetch_list=[y_len, z_len]) + return result + + +class TestLenWithSelectedRows(unittest.TestCase): + def setUp(self): + self.place = fluid.CUDAPlace(0) if fluid.is_compiled_with_cuda( + ) else fluid.CPUPlace() + + def test_len(self): + selected_rows_var_len, var_tensor_len = len_with_selected_rows( + self.place) + self.assertEqual(selected_rows_var_len, var_tensor_len) + + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py index 66f153d9ef06c3721f86e2b4baff3788cacb43bd..08b1336152ccd9169dbdfa8fb608c021df9d8ea9 100644 --- a/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py +++ b/python/paddle/fluid/tests/unittests/dygraph_to_static/test_loop.py @@ -132,6 +132,19 @@ def var_create_in_for_loop(max_len): return ret +def nested_for_loop_dyfunc(): + two = fluid.layers.fill_constant(shape=[1], value=2, dtype="int32") + three = fluid.layers.fill_constant(shape=[1], value=3, dtype="int32") + for j in range(two): + for i in range(10): + a = 2 + + for i in range(three): + b = fluid.layers.zeros(shape=[1], dtype='float32') + + return b + + class TestNameVisitor(unittest.TestCase): def setUp(self): self.loop_funcs = [ @@ -142,6 +155,8 @@ class TestNameVisitor(unittest.TestCase): ] self.create_var_names = [set(), set(["ret"]), set()] + self.nested_for_loop_func = nested_for_loop_dyfunc + def test_loop_vars(self): for i in range(len(self.loop_funcs)): func = self.loop_funcs[i] @@ -155,6 +170,28 @@ class TestNameVisitor(unittest.TestCase): self.assertEqual(loop_var_names, self.loop_var_names[i]) self.assertEqual(create_var_names, self.create_var_names[i]) + def test_nested_loop_vars(self): + func = self.nested_for_loop_func + test_func = inspect.getsource(func) + gast_root = gast.parse(test_func) + name_visitor = NameVisitor(gast_root) + + self.loop_var_names = [ + set(["j", "two"]), + set(["i", "three", "b"]), + set(["i"]), + ] + self.create_var_names = [set(), set(["b"]), set()] + i = 0 + for node in gast.walk(gast_root): + if isinstance(node, (gast.While, gast.For)): + loop_var_names, create_var_names = name_visitor.get_loop_var_names( + node) + # print(loop_var_names) + self.assertEqual(loop_var_names, self.loop_var_names[i]) + self.assertEqual(create_var_names, self.create_var_names[i]) + i += 1 + class TestTransformWhileLoop(unittest.TestCase): def setUp(self): diff --git a/python/paddle/fluid/tests/unittests/test_boxps.py b/python/paddle/fluid/tests/unittests/test_boxps.py index 4403f99b610d5a54a9741ed169eb8fabd77b0b15..88a6e5e9be84f54545218c93de3189fb85ff7e35 100644 --- 
a/python/paddle/fluid/tests/unittests/test_boxps.py +++ b/python/paddle/fluid/tests/unittests/test_boxps.py @@ -172,6 +172,7 @@ class TestBoxPSPreload(unittest.TestCase): exe.run(fluid.default_startup_program()) datasets[0].load_into_memory() datasets[0].begin_pass() + datasets[0].slots_shuffle([]) datasets[1].preload_into_memory() exe.train_from_dataset( program=fluid.default_main_program(), diff --git a/python/paddle/fluid/tests/unittests/test_dataset.py b/python/paddle/fluid/tests/unittests/test_dataset.py index dcf0beba3046ee915560d356534784bb9d7bcc0a..ea40d9abb96f019616487d8cd316748240708fcd 100644 --- a/python/paddle/fluid/tests/unittests/test_dataset.py +++ b/python/paddle/fluid/tests/unittests/test_dataset.py @@ -125,6 +125,7 @@ class TestDataset(unittest.TestCase): dataset.set_trainer_num(4) dataset.set_hdfs_config("my_fs_name", "my_fs_ugi") dataset.set_download_cmd("./read_from_afs my_fs_name my_fs_ugi") + dataset.enable_pv_merge() thread_num = dataset.get_thread_num() self.assertEqual(thread_num, 12) @@ -231,7 +232,7 @@ class TestDataset(unittest.TestCase): dataset.set_pipe_command("cat") dataset.set_use_var(slots_vars) dataset.load_into_memory() - dataset.set_fea_eval(10000, True) + dataset.set_fea_eval(1, True) dataset.slots_shuffle(["slot1"]) dataset.local_shuffle() dataset.set_generate_unique_feasigns(True, 15) diff --git a/python/paddle/fluid/tests/unittests/test_dequantize_log_op.py b/python/paddle/fluid/tests/unittests/test_dequantize_log_op.py index 6c6f0811bbb8a1474a6c783feaf565a7877f9200..3ad1f05f92d58c9fa9e82611283a8bf0b7af237e 100644 --- a/python/paddle/fluid/tests/unittests/test_dequantize_log_op.py +++ b/python/paddle/fluid/tests/unittests/test_dequantize_log_op.py @@ -26,9 +26,9 @@ def dequantize_log(x, dict_data): output_data_f = output_data.flatten() for i in range(x_f.size): if x_f[i] < 0: - output_data_f[i] = -np.power(2, dict_data[x_f[i] + 128]) + output_data_f[i] = -dict_data[x_f[i] + 128] else: - output_data_f[i] = np.power(2, dict_data[x_f[i]]) + output_data_f[i] = dict_data[x_f[i]] return output_data_f.reshape(x.shape) diff --git a/python/paddle/fluid/tests/unittests/test_paddlebox_datafeed.py b/python/paddle/fluid/tests/unittests/test_paddlebox_datafeed.py index 35bc144989aa2406d58d57474b01e60261e435c5..94bc8ff28861b266015101707be12c6077379055 100644 --- a/python/paddle/fluid/tests/unittests/test_paddlebox_datafeed.py +++ b/python/paddle/fluid/tests/unittests/test_paddlebox_datafeed.py @@ -17,7 +17,6 @@ import paddle.fluid.core as core import os import unittest import paddle.fluid.layers as layers -from paddle.fluid.layers.nn import _pull_box_sparse class TestDataFeed(unittest.TestCase): @@ -57,9 +56,9 @@ class TestDataFeed(unittest.TestCase): lod_level=0, append_batch_size=False) - emb_x, emb_y = _pull_box_sparse([x, y], size=2) - emb_xp = _pull_box_sparse(x, size=2) - concat = layers.concat([emb_x, emb_y], axis=1) + emb_x, emb_y = fluid.contrib.layers._pull_box_extended_sparse( + [x, y], size=2, extend_size=128) + concat = layers.concat([emb_x[0], emb_x[1], emb_y[0], emb_y[1]], axis=1) fc = layers.fc(input=concat, name="fc", size=1, diff --git a/python/paddle/fluid/tests/unittests/test_pool2d_op.py b/python/paddle/fluid/tests/unittests/test_pool2d_op.py index e3b79fe9651aa20d5796085f0c0bfbba2ed978fd..a9fdcd55f74cd53824016765fe82a03190f23f89 100644 --- a/python/paddle/fluid/tests/unittests/test_pool2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_pool2d_op.py @@ -1295,6 +1295,78 @@ class TestDygraphPool2DAPIError(unittest.TestCase): name='x1', 
shape=[3, 32, 32, 5], dtype="int32") self.assertRaises(TypeError, pool2d, data2) + def test_data_format_error(self): + with program_guard(Program(), Program()): + # the data_format must be 'NCHW' or 'NHWC' + data1 = np.random.random((3, 32, 32, 5)).astype('float32') + self.assertRaises( + ValueError, + fluid.dygraph.Pool2D, + pool_size=2, + pool_type='max', + pool_stride=1, + global_pooling=False, + data_format='NWHC') + + +class TestDygraphPool2DAPI(unittest.TestCase): + def test_nhwc(self): + with fluid.dygraph.guard(): + data = np.random.random((3, 32, 32, 5)).astype('float32') + x = fluid.dygraph.to_variable(data) + pool2d = fluid.dygraph.Pool2D( + pool_size=2, + pool_type='max', + pool_stride=1, + pool_padding=[0, 0], + global_pooling=False, + data_format='NHWC') + out1 = pool2d(x) + out2 = pool2D_forward_naive( + data, [2, 2], [1, 1], + paddings=[0, 0], + pool_type='max', + data_format='NHWC') + self.assertTrue(np.allclose(out1.numpy(), out2)) + + def test_lower_case(self): + with fluid.dygraph.guard(): + data = np.random.random((3, 32, 32, 5)).astype('float32') + x = fluid.dygraph.to_variable(data) + pool2d = fluid.dygraph.Pool2D( + pool_size=2, + pool_type='max', + pool_stride=1, + pool_padding=[0, 0], + global_pooling=False, + data_format='nhwc') + out1 = pool2d(x) + out2 = pool2D_forward_naive( + data, [2, 2], [1, 1], + paddings=[0, 0], + pool_type='max', + data_format='NHWC') + self.assertTrue(np.allclose(out1.numpy(), out2)) + + def test_upper_case(self): + with fluid.dygraph.guard(): + data = np.random.random((3, 32, 32, 5)).astype('float32') + x = fluid.dygraph.to_variable(data) + pool2d = fluid.dygraph.Pool2D( + pool_size=2, + pool_type='MAX', + pool_stride=1, + pool_padding=[0, 0], + global_pooling=False, + data_format='nhwc') + out1 = pool2d(x) + out2 = pool2D_forward_naive( + data, [2, 2], [1, 1], + paddings=[0, 0], + pool_type='max', + data_format='NHWC') + self.assertTrue(np.allclose(out1.numpy(), out2)) + if __name__ == '__main__': unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_shape_op.py b/python/paddle/fluid/tests/unittests/test_shape_op.py index 02231ea943e1e92a08730e6e9f1aa3cefeb927c0..bada62e3239eadfb75da47eb85e73a3ac67e8e41 100644 --- a/python/paddle/fluid/tests/unittests/test_shape_op.py +++ b/python/paddle/fluid/tests/unittests/test_shape_op.py @@ -17,6 +17,8 @@ from __future__ import print_function import unittest import numpy as np from op_test import OpTest +from paddle.fluid import core +from paddle.fluid.op import Operator class TestShapeOp(OpTest): @@ -45,5 +47,41 @@ class case2(TestShapeOp): self.shape = [1, 2, 3] +class TestShapeWithSelectedRows(unittest.TestCase): + def get_places(self): + places = [core.CPUPlace()] + if core.is_compiled_with_cuda(): + places.append(core.CUDAPlace(0)) + return places + + def check_with_place(self, place): + scope = core.Scope() + x_rows = [0, 1, 5, 4, 19] + height = 20 + row_numel = 2 + + np_array = np.ones((len(x_rows), row_numel)).astype("float32") + + # initialize input variable X + x = scope.var('X').get_selected_rows() + x.set_rows(x_rows) + x.set_height(height) + x_tensor = x.get_tensor() + x_tensor.set(np_array, place) + + # initialize input variable Out + out_shape = scope.var("Out").get_tensor() + op = Operator("shape", Input="X", Out="Out") + + op.run(scope, place) + + out_shape = np.array(out_shape).tolist() + self.assertListEqual([5, 2], out_shape) + + def test_check_output(self): + for place in self.get_places(): + self.check_with_place(place) + + if __name__ == '__main__': 
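# Note: for a SelectedRows input, the shape op reports the shape of the
# underlying value tensor -- [len(x_rows), row_numel] == [5, 2] here --
# rather than [height, row_numel], even though height is set to 20.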
unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_var_base.py b/python/paddle/fluid/tests/unittests/test_var_base.py index 1c648ee1908fa74de11ddfc0340f04ec9a5a3ccb..ebd357106c3320e376861755d50632119a2602e9 100644 --- a/python/paddle/fluid/tests/unittests/test_var_base.py +++ b/python/paddle/fluid/tests/unittests/test_var_base.py @@ -50,7 +50,7 @@ class TestVarBase(unittest.TestCase): def test_tensor_to_variable(self): with fluid.dygraph.guard(): t = fluid.Tensor() - t.set(np.ndarray([5, 30]), fluid.CPUPlace()) + t.set(np.random.random((1024, 1024)), fluid.CPUPlace()) var = fluid.dygraph.to_variable(t) self.assertTrue(np.array_equal(t, var.numpy())) diff --git a/python/paddle/fluid/transpiler/collective.py b/python/paddle/fluid/transpiler/collective.py index 42623337de933ddcf2e3d4a036c3e79907ce6c21..ae4befa004c9e587a4a58d7f8df3f248a6fc277f 100644 --- a/python/paddle/fluid/transpiler/collective.py +++ b/python/paddle/fluid/transpiler/collective.py @@ -314,7 +314,8 @@ class LocalSGD(Collective): name=self.snapshot_name(param.name), shape=param.shape, persistable=True, - stop_gradient=True) + stop_gradient=True, + dtype=param.dtype) block._insert_op( idx + 1, diff --git a/tools/check_api_approvals.sh b/tools/check_api_approvals.sh index e6d41c11ce1ad380382182c8b759d3326fac37a2..ca698887c30318b25020b527d4a1f200604dec75 100644 --- a/tools/check_api_approvals.sh +++ b/tools/check_api_approvals.sh @@ -283,6 +283,16 @@ if [ "${ADDED_OP_USE_DEFAULT_GRAD_MAKER}" != "" ]; then check_approval 1 32832641 6836917 fi +# Get the list of PR authors with unresolved unit test issues +pip install PyGithub +# For getting PR related data +wget https://paddle-ci.gz.bcebos.com/blk/block.txt +HASUTFIXED=`python ${PADDLE_ROOT}/tools/check_ut.py | grep "has unit-test to be fixed" || true` +if [ "${HASUTFIXED}" != "" ]; then + echo_line="${HASUTFIXED} You must have one RD (chalsliu (Recommend) or kolinwei) approval.\n" + check_approval 1 45041955 22165420 +fi + if [ -n "${echo_list}" ];then echo "****************" echo -e "${echo_list[@]}" diff --git a/tools/check_ut.py b/tools/check_ut.py new file mode 100644 index 0000000000000000000000000000000000000000..6e507d6543fe4c17299b5fc657c4e37dcc371f17 --- /dev/null +++ b/tools/check_ut.py @@ -0,0 +1,54 @@ +#!/bin/env python +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +""" Get pull requests. """ + +import os +import time +import os.path +from github import Github + + +class PRChecker(object): + """ PR Checker. 
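+    Reads GIT_PR_ID and GITHUB_API_TOKEN from the environment and a
+    block.txt file (one GitHub login per line) from the working
+    directory; check() exits with status 1 when the PR author is listed.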
""" + + def __init__(self): + self.github = Github(os.getenv('GITHUB_API_TOKEN'), timeout=60) + self.repo = None + + def check(self): + """ check pr """ + filename = 'block.txt' + pr_id = os.getenv('GIT_PR_ID') + if not pr_id: + print('No PR ID') + exit(0) + print(pr_id) + if not os.path.isfile(filename): + print('No author to check') + exit(0) + self.repo = self.github.get_repo('PaddlePaddle/Paddle') + pr = self.repo.get_pull(int(pr_id)) + user = pr.user.login + with open(filename) as f: + for l in f: + if l.rstrip('\r\n') == user: + print('{} has UT to be fixed, so CI failed.'.format(user)) + exit(1) + exit(0) + + +if __name__ == '__main__': + pr_checker = PRChecker() + pr_checker.check() diff --git a/tools/count_invalid_enforce.sh b/tools/count_invalid_enforce.sh index 927d96e9a08d7f95973285fe6d0b49143963a88a..0294132e25dfcb4079fe290319d7fc7342e23533 100644 --- a/tools/count_invalid_enforce.sh +++ b/tools/count_invalid_enforce.sh @@ -45,7 +45,7 @@ function walk_dir(){ if [ $level -le 1 ]; then enforce_scan $1"/"$file total_check_cnt valid_check_cnt dir_name=$1 - echo "${dir_name#../}"/"$file - total: ${total_check_cnt}, valid: ${valid_check_cnt}, invalid: $(($total_check_cnt-$valid_check_cnt))" + echo "${dir_name#../}/"$file" | ${total_check_cnt} | ${valid_check_cnt} | $(($total_check_cnt-$valid_check_cnt))" ALL_PADDLE_CHECK_CNT=$(($ALL_PADDLE_CHECK_CNT+$total_check_cnt)) VALID_PADDLE_CHECK_CNT=$(($VALID_PADDLE_CHECK_CNT+$valid_check_cnt)) walk_dir $1"/"$file $level diff --git a/tools/file_invalid_enforce.sh b/tools/file_invalid_enforce.sh index f2c1630a1d43e4ee14b73904d535e7b475f6528e..3feb10cfb97961b1186f8cd2294aff7be25e2d92 100644 --- a/tools/file_invalid_enforce.sh +++ b/tools/file_invalid_enforce.sh @@ -29,6 +29,15 @@ ROOT_DIR=../paddle/fluid/operators +white_list_str = "\ + layer_norm_op.cc \ + box_clip_op.cc \ + box_clip_op.h \ + random_crop_op.h \ + elementwise_op_function.cu.h \ + fused_elemwise_activation_op.cc \ + auc_op.cu" + function enforce_scan(){ paddle_check=`grep -r -zoE "(PADDLE_ENFORCE[A-Z_]{0,9}|PADDLE_THROW)\(.[^,\);]*.[^;]*\);\s" $1 || true` total_check_cnt=`echo "$paddle_check" | grep -cE "(PADDLE_ENFORCE|PADDLE_THROW)" || true` @@ -45,14 +54,16 @@ function walk_dir(){ for file in `ls $1` do if [ -f $1"/"$file ];then - enforce_scan $1"/"$file file_total_check_cnt file_valid_check_cnt - file_invalid_check_cnt=$(($total_check_cnt-$valid_check_cnt)) - if [ $file_invalid_check_cnt -gt 0 ];then - echo "- $file | ${file_total_check_cnt} | ${file_valid_check_cnt} | ${file_invalid_check_cnt}" + in_white_list=$(echo $white_list_str | grep "${file}") + if [[ "$in_white_list" == "" ]];then + enforce_scan $1"/"$file file_total_check_cnt file_valid_check_cnt + file_invalid_check_cnt=$(($total_check_cnt-$valid_check_cnt)) + if [ $file_invalid_check_cnt -gt 0 ];then + echo "- $file | ${file_total_check_cnt} | ${file_valid_check_cnt} | ${file_invalid_check_cnt}" + fi fi fi if [ -d $1"/"$file ];then - dir_array[$i]=$1"/"$file ((i++)) fi diff --git a/tools/manylinux1/Dockerfile.CI35-GCC4.8 b/tools/manylinux1/Dockerfile.CI35-GCC4.8 deleted file mode 120000 index 6f5de91a12b94868f315e56d9d349b3356091073..0000000000000000000000000000000000000000 --- a/tools/manylinux1/Dockerfile.CI35-GCC4.8 +++ /dev/null @@ -1 +0,0 @@ -Dockerfile.cuda9_cudnn7_gcc48_py35_centos6 \ No newline at end of file diff --git a/tools/manylinux1/Dockerfile.cuda9_cudnn7_gcc8_py35_centos6 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_py35_centos6 similarity index 99% rename from 
tools/manylinux1/Dockerfile.cuda9_cudnn7_gcc8_py35_centos6 rename to tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_py35_centos6 index d51356b29fd18ad775607230bfed388dd7fbb848..fa80ae72c39ed4a389b0b7b895c7b90fe8c6f744 100644 --- a/tools/manylinux1/Dockerfile.cuda9_cudnn7_gcc8_py35_centos6 +++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_py35_centos6 @@ -3,7 +3,7 @@ # which requires some headers and symbols not present on CentOS-5 (e.g., # signalfd.h, pipe2, O_NONBLOCK, SOCK_NONBLOCK, etc.). See # https://github.com/sandstorm-io/capnproto/issues/350. -FROM nvidia/cuda:9.0-cudnn7-devel-centos6 +FROM nvidia/cuda:10.1-cudnn7-devel-centos6 MAINTAINER Numenta, based on the ManyLinux project ENV LC_ALL en_US.UTF-8 diff --git a/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 new file mode 100644 index 0000000000000000000000000000000000000000..242c071c0f828423ac4b09b228c280c55f8c8afa --- /dev/null +++ b/tools/manylinux1/Dockerfile.cuda10_cudnn7_gcc8_ubuntu16 @@ -0,0 +1,248 @@ +# A image for building paddle binaries +# Use cuda devel base image for both cpu and gpu environment +# When you modify it, please be aware of cudnn-runtime version +FROM nvidia/cuda:10.1-cudnn7-devel-ubuntu16.04 +MAINTAINER PaddlePaddle Authors + +ARG UBUNTU_MIRROR +RUN /bin/bash -c 'if [[ -n ${UBUNTU_MIRROR} ]]; then sed -i 's#http://archive.ubuntu.com/ubuntu#${UBUNTU_MIRROR}#g' /etc/apt/sources.list; fi' + +# ENV variables +ARG WITH_GPU +ARG WITH_AVX + +ENV WOBOQ OFF +ENV WITH_GPU=${WITH_GPU:-ON} +ENV WITH_AVX=${WITH_AVX:-ON} + +ENV HOME /root +# Add bash enhancements +COPY ./paddle/scripts/docker/root/ /root/ + +# gcc8.2 +RUN wget -q https://paddle-docker-tar.bj.bcebos.com/home/users/tianshuo/bce-python-sdk-0.8.27/gcc-8.2.0.tar.xz && \ + tar -xvf gcc-8.2.0.tar.xz && \ + cd gcc-8.2.0 && \ + sed -i 's#ftp://gcc.gnu.org/pub/gcc/infrastructure/#https://paddle-ci.gz.bcebos.com/#g' ./contrib/download_prerequisites && \ + unset LIBRARY_PATH CPATH C_INCLUDE_PATH PKG_CONFIG_PATH CPLUS_INCLUDE_PATH INCLUDE && \ + ./contrib/download_prerequisites && \ + cd .. 
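+# Note: this gcc-8.2.0 source build fetches its prerequisites through
+# ./contrib/download_prerequisites (mirrored at paddle-ci.gz.bcebos.com)
+# and installs into /usr/local/gcc-8.2, which is prepended to PATH right
+# after the build.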
&& mkdir temp_gcc82 && cd temp_gcc82 && \ + ../gcc-8.2.0/configure --prefix=/usr/local/gcc-8.2 --enable-threads=posix --disable-checking --disable-multilib && \ + make -j8 && make install + +ENV PATH=/usr/local/gcc-8.2/bin:$PATH +RUN rm -rf /temp_gcc82 && rm -rf /gcc-8.2.0.tar.xz && rm -rf /gcc-8.2.0 + +# Prepare packages for Python +RUN apt-get update && \ + apt-get install -y make build-essential libssl-dev zlib1g-dev libbz2-dev \ + libreadline-dev libsqlite3-dev wget curl llvm libncurses5-dev libncursesw5-dev \ + xz-utils tk-dev libffi-dev liblzma-dev + +# Downgrade gcc&&g++ +RUN apt-get update +WORKDIR /usr/bin +RUN apt install -y gcc-4.8 g++-4.8 +RUN cp gcc gcc.bak +RUN cp g++ g++.bak +RUN rm gcc +RUN rm g++ +RUN ln -s gcc-4.8 gcc +RUN ln -s g++-4.8 g++ + +# Install Python3.6 +RUN mkdir -p /root/python_build/ && wget -q https://www.sqlite.org/2018/sqlite-autoconf-3250300.tar.gz && \ + tar -zxf sqlite-autoconf-3250300.tar.gz && cd sqlite-autoconf-3250300 && \ + ./configure -prefix=/usr/local && make -j8 && make install && cd ../ && rm sqlite-autoconf-3250300.tar.gz && \ + wget -q https://www.python.org/ftp/python/3.6.0/Python-3.6.0.tgz && \ + tar -xzf Python-3.6.0.tgz && cd Python-3.6.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null + +# Install Python3.7 +RUN wget -q https://www.python.org/ftp/python/3.7.0/Python-3.7.0.tgz && \ + tar -xzf Python-3.7.0.tgz && cd Python-3.7.0 && \ + CFLAGS="-Wformat" ./configure --prefix=/usr/local/ --enable-shared > /dev/null && \ + make -j8 > /dev/null && make altinstall > /dev/null +RUN rm -r /root/python_build + +RUN apt-get update && \ + apt-get install -y --allow-downgrades --allow-change-held-packages \ + patchelf python3 python3-dev python3-pip \ + git python-pip python-dev python-opencv openssh-server bison \ + wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \ + curl sed grep graphviz libjpeg-dev zlib1g-dev \ + python-matplotlib gcc-4.8 g++-4.8 \ + automake locales clang-format swig \ + liblapack-dev liblapacke-dev \ + clang-3.8 llvm-3.8 libclang-3.8-dev \ + net-tools libtool && \ + apt-get clean -y + +# install cmake +WORKDIR /home +RUN wget -q https://cmake.org/files/v3.16/cmake-3.16.0-Linux-x86_64.tar.gz +RUN tar -zxvf cmake-3.16.0-Linux-x86_64.tar.gz +RUN rm cmake-3.16.0-Linux-x86_64.tar.gz +ENV PATH=/home/cmake-3.16.0-Linux-x86_64/bin:$PATH + +# Install Python2.7.15 to replace original python +WORKDIR /home +ENV version=2.7.15 +RUN wget https://www.python.org/ftp/python/$version/Python-$version.tgz +RUN tar -xvf Python-$version.tgz +WORKDIR /home/Python-$version +RUN ./configure --enable-unicode=ucs4 --enable-shared CFLAGS=-fPIC --prefix=/usr/local/python2.7.15 +RUN make && make install + +RUN echo "export PATH=/usr/local/python2.7.15/include:${PATH}" >> ~/.bashrc +RUN echo "export PATH=/usr/local/python2.7.15/bin:${PATH}" >> ~/.bashrc +RUN echo "export LD_LIBRARY_PATH=/usr/local/python2.7.15/lib:${LD_LIBRARY_PATH}" >> ~/.bashrc +RUN echo "export CPLUS_INCLUDE_PATH=/usr/local/python2.7.15/include/python2.7:$CPLUS_INCLUDE_PATH" >> ~/.bashrc +ENV PATH=/usr/local/python2.7.15/include:${PATH} +ENV PATH=/usr/local/python2.7.15/bin:${PATH} +ENV LD_LIBRARY_PATH=/usr/local/python2.7.15/lib:${LD_LIBRARY_PATH} +ENV CPLUS_INCLUDE_PATH=/usr/local/python2.7.15/include/python2.7:$CPLUS_INCLUDE_PATH +RUN mv /usr/bin/python /usr/bin/python.bak +RUN ln -s /usr/local/python2.7.15/bin/python2.7 /usr/local/bin/python +RUN ln -s 
/usr/local/python2.7.15/bin/python2.7 /usr/bin/python
+WORKDIR /home
+RUN wget https://files.pythonhosted.org/packages/b0/d1/8acb42f391cba52e35b131e442e80deffbb8d0676b93261d761b1f0ef8fb/setuptools-40.6.2.zip
+RUN apt-get -y install unzip
+RUN unzip setuptools-40.6.2.zip
+WORKDIR /home/setuptools-40.6.2
+RUN python setup.py build
+RUN python setup.py install
+WORKDIR /home
+RUN wget https://files.pythonhosted.org/packages/69/81/52b68d0a4de760a2f1979b0931ba7889202f302072cc7a0d614211bc7579/pip-18.0.tar.gz
+RUN tar -zxvf pip-18.0.tar.gz
+WORKDIR pip-18.0
+RUN python setup.py install
+
+WORKDIR /home
+RUN rm Python-$version.tgz setuptools-40.6.2.zip pip-18.0.tar.gz && \
+    rm -r Python-$version setuptools-40.6.2 pip-18.0
+
+# Install Go and glide
+RUN wget -qO- https://paddle-ci.cdn.bcebos.com/go1.8.1.linux-amd64.tar.gz | \
+    tar -xz -C /usr/local && \
+    mkdir /root/gopath && \
+    mkdir /root/gopath/bin && \
+    mkdir /root/gopath/src
+ENV GOROOT=/usr/local/go GOPATH=/root/gopath
+# PATH must not be set in the same ENV line as GOROOT, otherwise docker build cannot resolve GOROOT.
+ENV PATH=${PATH}:${GOROOT}/bin:${GOPATH}/bin
+# install glide
+RUN curl -s -q https://glide.sh/get | sh
+
+# Install TensorRT
+# The following TensorRT.tar.gz is not the official default one; we make two minor changes:
+# 1. Remove unnecessary files to make the library small. TensorRT.tar.gz only contains include and lib now,
+#    and its size is only one-third of the official one.
+# 2. Manually add ~IPluginFactory() in the IPluginFactory class of NvInfer.h; otherwise it does not work with paddle.
+# See https://github.com/PaddlePaddle/Paddle/issues/10129 for details.
+
+RUN wget -q https://paddlepaddledeps.cdn.bcebos.com/TensorRT-6.0.1.5.Ubuntu-16.04.x86_64-gnu.cuda-10.1.cudnn7.6.tar.gz --no-check-certificate && \
+    tar -zxf TensorRT-6.0.1.5.Ubuntu-16.04.x86_64-gnu.cuda-10.1.cudnn7.6.tar.gz -C /usr/local && \
+    cp -rf /usr/local/TensorRT-6.0.1.5/include/* /usr/include/ && cp -rf /usr/local/TensorRT-6.0.1.5/lib/* /usr/lib/
+
+# git credential to skip password typing
+RUN git config --global credential.helper store
+
+# Fix locales to en_US.UTF-8
+RUN localedef -i en_US -f UTF-8 en_US.UTF-8
+
+# FIXME: due to a temporary ipykernel dependency issue, pin the ipykernel and
+# jupyter versions until jupyter fixes this issue.
+
+# Pin sphinx to 1.5.6 and drop the -U option from [pip install -U
+# sphinx-rtd-theme], since -U would update sphinx to the newest version
+# (1.7.1 for now), which causes documentation builds to fail.
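+# The pip blocks below install the same package set once per interpreter
+# (python2.7 via pip, and the python3 builds via pip3/pip3.6/pip3.7), since
+# each CPython build in this image keeps its own site-packages.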
+RUN pip3 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
+    pip3 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
+    pip3.6 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
+    pip3.6 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3.6 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
+    pip3.7 --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
+    pip3.7 --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
+    pip3.7 --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark && \
+    pip --no-cache-dir install -U wheel py-cpuinfo==5.0.0 && \
+    pip --no-cache-dir install -U docopt PyYAML sphinx==1.5.6 && \
+    pip --no-cache-dir install sphinx-rtd-theme==0.1.9 recommonmark
+
+RUN pip3 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip3 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3 --no-cache-dir install opencv-python && \
+    pip3.6 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip3.6 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3.6 --no-cache-dir install opencv-python && \
+    pip3.7 --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip3.7 --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip3.7 --no-cache-dir install opencv-python && \
+    pip --no-cache-dir install 'pre-commit==1.10.4' 'ipython==5.3.0' && \
+    pip --no-cache-dir install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
+    pip --no-cache-dir install opencv-python
+
+# For docstring checker
+RUN pip3 --no-cache-dir install pylint pytest astroid isort
+RUN pip3.6 --no-cache-dir install pylint pytest astroid isort
+RUN pip3.7 --no-cache-dir install pylint pytest astroid isort
+RUN pip --no-cache-dir install pylint pytest astroid isort LinkChecker
+
+RUN pip3 --no-cache-dir install coverage
+RUN pip3.6 --no-cache-dir install coverage
+RUN pip3.7 --no-cache-dir install coverage
+RUN pip --no-cache-dir install coverage
+
+COPY ./python/requirements.txt /root/
+RUN pip3 --no-cache-dir install -r /root/requirements.txt
+RUN pip3.6 --no-cache-dir install -r /root/requirements.txt
+RUN pip3.7 --no-cache-dir install -r /root/requirements.txt
+RUN pip --no-cache-dir install -r /root/requirements.txt
+
+# To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
+# the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
+RUN apt-get install -y libssl-dev libffi-dev && apt-get clean -y
+RUN pip3 --no-cache-dir install certifi urllib3[secure]
+RUN pip3.6 --no-cache-dir install certifi urllib3[secure]
+RUN pip3.7 --no-cache-dir install certifi urllib3[secure]
+RUN pip --no-cache-dir install certifi urllib3[secure]
+
+
+# Install woboq_codebrowser to /woboq
+RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
+    (cd /woboq && \
+     cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
+           -DCMAKE_BUILD_TYPE=Release . && \
+     make)
+
+# ar mishandles 4GB files
+# https://sourceware.org/bugzilla/show_bug.cgi?id=14625
+# remove this workaround once apt-get ships binutils 2.27 or higher
+RUN wget -q https://paddle-ci.gz.bcebos.com/binutils_2.27.orig.tar.gz && \
+    tar -xzf binutils_2.27.orig.tar.gz && \
+    cd binutils-2.27 && \
+    ./configure && make -j && make install && cd .. && rm -rf binutils-2.27 binutils_2.27.orig.tar.gz
+
+RUN wget --no-check-certificate https://pslib.bj.bcebos.com/openmpi-1.4.5.tar.gz && tar -xzf openmpi-1.4.5.tar.gz && \
+    cd openmpi-1.4.5 && ./configure --prefix=/usr/local && make all -j8 && make install -j8 && \
+    export LD_LIBRARY_PATH=/usr/local/lib/:$LD_LIBRARY_PATH && export PATH=/usr/local/bin:$PATH && cd .. && \
+    rm -rf openmpi-1.4.5.tar.gz && pip --no-cache-dir install mpi4py && ln -fs /bin/bash /bin/sh && \
+    apt-get install libprotobuf-dev -y
+RUN pip --no-cache-dir install -U netifaces==0.10.9
+
+# ccache 3.7.9
+RUN wget https://paddle-ci.gz.bcebos.com/ccache-3.7.9.tar.gz && \
+    tar xf ccache-3.7.9.tar.gz && mkdir /usr/local/ccache-3.7.9 && cd ccache-3.7.9 && \
+    ./configure -prefix=/usr/local/ccache-3.7.9 && \
+    make -j8 && make install && \
+    ln -s /usr/local/ccache-3.7.9/bin/ccache /usr/local/bin/ccache
+
+# Configure OpenSSH server. c.f. https://docs.docker.com/engine/examples/running_ssh_service
+RUN mkdir /var/run/sshd
+RUN echo 'root:root' | chpasswd
+RUN sed -ri 's/^PermitRootLogin\s+.*/PermitRootLogin yes/' /etc/ssh/sshd_config
+RUN sed -ri 's/UsePAM yes/#UsePAM yes/g' /etc/ssh/sshd_config
+CMD source ~/.bashrc
+EXPOSE 22
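Two consistency notes on the tooling changes above, plus a usage sketch. First, in file_invalid_enforce.sh, POSIX shell assignments take no spaces around `=`, so the whitelist presumably needs to be written as `white_list_str="layer_norm_op.cc box_clip_op.cc box_clip_op.h random_crop_op.h elementwise_op_function.cu.h fused_elemwise_activation_op.cc auc_op.cu"` for the later `echo $white_list_str | grep "${file}"` check to see a non-empty value. Second, the grep pattern in check_api_approvals.sh ("has unit-test to be fixed") and the message check_ut.py prints ("has UT to be fixed") do not match, so HASUTFIXED would stay empty until one of the two strings is aligned.

For reference, a minimal sketch of how the new tools/check_ut.py is driven; the wrapper name run_ut_author_check is illustrative, while GIT_PR_ID, GITHUB_API_TOKEN, and block.txt come from the diff:

    import os
    import subprocess

    def run_ut_author_check(pr_id, token, workdir='.'):
        # check_ut.py reads GIT_PR_ID / GITHUB_API_TOKEN from the
        # environment, expects block.txt (one GitHub login per line) in
        # the working directory, and exits 1 when the PR author appears
        # in that list (0 otherwise).
        env = dict(os.environ, GIT_PR_ID=str(pr_id), GITHUB_API_TOKEN=token)
        return subprocess.call(['python', 'tools/check_ut.py'],
                               env=env, cwd=workdir)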