From 950892044ff21900b18eabf4a529ef4abd4b0fab Mon Sep 17 00:00:00 2001 From: ShenLiang <2282912238@qq.com> Date: Tue, 26 May 2020 13:51:16 +0800 Subject: [PATCH] fix conflict, test=develop (#24238) --- paddle/fluid/framework/fleet/box_wrapper.cc | 218 ++++++++---------- paddle/fluid/framework/fleet/box_wrapper.cu | 129 +++++++++-- paddle/fluid/framework/fleet/box_wrapper.h | 61 ++++- .../fluid/framework/fleet/box_wrapper_impl.h | 163 +++++++++++++ .../operators/pull_box_extended_sparse_op.cc | 157 +++++++++++++ .../operators/pull_box_extended_sparse_op.cu | 46 ++++ .../operators/pull_box_extended_sparse_op.h | 119 ++++++++++ paddle/fluid/operators/pull_box_sparse_op.h | 4 +- paddle/fluid/pybind/box_helper_py.cc | 4 +- python/paddle/fluid/contrib/layers/nn.py | 50 +++- .../unittests/test_paddlebox_datafeed.py | 7 +- 11 files changed, 790 insertions(+), 168 deletions(-) create mode 100644 paddle/fluid/framework/fleet/box_wrapper_impl.h create mode 100644 paddle/fluid/operators/pull_box_extended_sparse_op.cc create mode 100644 paddle/fluid/operators/pull_box_extended_sparse_op.cu create mode 100644 paddle/fluid/operators/pull_box_extended_sparse_op.h diff --git a/paddle/fluid/framework/fleet/box_wrapper.cc b/paddle/fluid/framework/fleet/box_wrapper.cc index 9cdae4c3cc..2d3e694382 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cc +++ b/paddle/fluid/framework/fleet/box_wrapper.cc @@ -28,6 +28,8 @@ std::shared_ptr BoxWrapper::s_instance_ = nullptr; cudaStream_t BoxWrapper::stream_list_[8]; std::shared_ptr BoxWrapper::boxps_ptr_ = nullptr; AfsManager* BoxWrapper::afs_manager = nullptr; +int BoxWrapper::embedx_dim_ = 8; +int BoxWrapper::expand_embed_dim_ = 0; void BasicAucCalculator::compute() { double* table[2] = {&_table[0][0], &_table[1][0]}; @@ -57,6 +59,94 @@ void BasicAucCalculator::compute() { _size = fp + tp; } +void BoxWrapper::CheckEmbedSizeIsValid(int embedx_dim, int expand_embed_dim) { + PADDLE_ENFORCE_EQ( + embedx_dim_, embedx_dim, + platform::errors::InvalidArgument("SetInstance(): invalid embedx_dim. " + "When embedx_dim = %d, but got %d.", + embedx_dim_, embedx_dim)); + PADDLE_ENFORCE_EQ(expand_embed_dim_, expand_embed_dim, + platform::errors::InvalidArgument( + "SetInstance(): invalid expand_embed_dim. When " + "expand_embed_dim = %d, but got %d.", + expand_embed_dim_, expand_embed_dim)); +} + +void BoxWrapper::PullSparse(const paddle::platform::Place& place, + const std::vector& keys, + const std::vector& values, + const std::vector& slot_lengths, + const int hidden_size, const int expand_embed_dim) { +#define EMBEDX_CASE(i, ...) \ + case i: { \ + constexpr size_t EmbedxDim = i; \ + switch (expand_embed_dim) { \ + __VA_ARGS__ \ + default: \ + PADDLE_THROW(platform::errors::InvalidArgument( \ + "Unsupport this expand embedding size [%d]", expand_embed_dim)); \ + } \ + } break + +#define PULLSPARSE_CASE(i, ...) 
\ + case i: { \ + constexpr size_t ExpandDim = i; \ + PullSparseCase(place, keys, values, slot_lengths, \ + hidden_size, expand_embed_dim); \ + } break + + CheckEmbedSizeIsValid(hidden_size - 3, expand_embed_dim); + switch (hidden_size - 3) { + EMBEDX_CASE(8, PULLSPARSE_CASE(0); PULLSPARSE_CASE(8); + PULLSPARSE_CASE(64);); + EMBEDX_CASE(16, PULLSPARSE_CASE(0);); + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupport this embedding size [%d]", hidden_size - 3)); + } +#undef PULLSPARSE_CASE +#undef EMBEDX_CASE +} + +void BoxWrapper::PushSparseGrad(const paddle::platform::Place& place, + const std::vector& keys, + const std::vector& grad_values, + const std::vector& slot_lengths, + const int hidden_size, + const int expand_embed_dim, + const int batch_size) { +#define EMBEDX_CASE(i, ...) \ + case i: { \ + constexpr size_t EmbedxDim = i; \ + switch (expand_embed_dim) { \ + __VA_ARGS__ \ + default: \ + PADDLE_THROW(platform::errors::InvalidArgument( \ + "Unsupport this expand embedding size [%d]", expand_embed_dim)); \ + } \ + } break + +#define PUSHSPARSE_CASE(i, ...) \ + case i: { \ + constexpr size_t ExpandDim = i; \ + PushSparseGradCase(place, keys, grad_values, \ + slot_lengths, hidden_size, \ + expand_embed_dim, batch_size); \ + } break + + CheckEmbedSizeIsValid(hidden_size - 3, expand_embed_dim); + switch (hidden_size - 3) { + EMBEDX_CASE(8, PUSHSPARSE_CASE(0); PUSHSPARSE_CASE(8); + PUSHSPARSE_CASE(64);); + EMBEDX_CASE(16, PUSHSPARSE_CASE(0);); + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupport this embedding size [%d]", hidden_size - 3)); + } +#undef PUSHSPARSE_CASE +#undef EMBEDX_CASE +} + void BasicAucCalculator::calculate_bucket_error() { double last_ctr = -1; double impression_sum = 0; @@ -128,134 +218,6 @@ void BoxWrapper::EndPass(bool need_save_delta) const { ret, 0, platform::errors::PreconditionNotMet("EndPass failed in BoxPS.")); } -void BoxWrapper::PullSparse(const paddle::platform::Place& place, - const std::vector& keys, - const std::vector& values, - const std::vector& slot_lengths, - const int hidden_size) { - VLOG(3) << "Begin PullSparse"; - platform::Timer all_timer; - platform::Timer pull_boxps_timer; - all_timer.Start(); - - int64_t total_length = - std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); - auto buf = - memory::AllocShared(place, total_length * sizeof(boxps::FeatureValueGpu)); - boxps::FeatureValueGpu* total_values_gpu = - reinterpret_cast(buf->ptr()); - - if (platform::is_cpu_place(place)) { - PADDLE_THROW(platform::errors::Unimplemented( - "Warning:: CPUPlace is not supported in PaddleBox now.")); - } else if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; - int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId(); - LoDTensor& total_keys_tensor = keys_tensor[device_id]; - uint64_t* total_keys = reinterpret_cast( - total_keys_tensor.mutable_data({total_length, 1}, place)); - - // construct slot_level lod info - auto slot_lengths_lod = slot_lengths; - for (size_t i = 1; i < slot_lengths_lod.size(); i++) { - slot_lengths_lod[i] += slot_lengths_lod[i - 1]; - } - auto buf_key = memory::AllocShared(place, keys.size() * sizeof(uint64_t*)); - auto buf_length = - memory::AllocShared(place, slot_lengths.size() * sizeof(int64_t)); - uint64_t** gpu_keys = reinterpret_cast(buf_key->ptr()); - int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); - cudaMemcpy(gpu_keys, keys.data(), keys.size() * 
sizeof(uint64_t*), - cudaMemcpyHostToDevice); - cudaMemcpy(gpu_len, slot_lengths_lod.data(), - slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); - - this->CopyKeys(place, gpu_keys, total_keys, gpu_len, - static_cast(slot_lengths.size()), - static_cast(total_length)); - VLOG(3) << "Begin call PullSparseGPU in BoxPS"; - pull_boxps_timer.Start(); - int ret = - boxps_ptr_->PullSparseGPU(total_keys, total_values_gpu, - static_cast(total_length), device_id); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "PullSparseGPU failed in BoxPS.")); - pull_boxps_timer.Pause(); - - VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length - << "]"; - this->CopyForPull(place, gpu_keys, values, total_values_gpu, gpu_len, - static_cast(slot_lengths.size()), hidden_size, - total_length); -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Please compile WITH_GPU option, because NCCL doesn't support " - "windows.")); -#endif - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddleBox: PullSparse Only Support CPUPlace or CUDAPlace Now.")); - } - all_timer.Pause(); - VLOG(1) << "PullSparse total costs: " << all_timer.ElapsedSec() - << " s, of which BoxPS costs: " << pull_boxps_timer.ElapsedSec() - << " s"; - VLOG(3) << "End PullSparse"; -} - -void BoxWrapper::PushSparseGrad(const paddle::platform::Place& place, - const std::vector& keys, - const std::vector& grad_values, - const std::vector& slot_lengths, - const int hidden_size, const int batch_size) { - VLOG(3) << "Begin PushSparseGrad"; - platform::Timer all_timer; - platform::Timer push_boxps_timer; - all_timer.Start(); - int64_t total_length = - std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); - auto buf = memory::AllocShared( - place, total_length * sizeof(boxps::FeaturePushValueGpu)); - boxps::FeaturePushValueGpu* total_grad_values_gpu = - reinterpret_cast(buf->ptr()); - if (platform::is_cpu_place(place)) { - PADDLE_THROW(platform::errors::Unimplemented( - "Warning:: CPUPlace is not supported in PaddleBox now.")); - } else if (platform::is_gpu_place(place)) { -#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) - int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId(); - LoDTensor& cached_total_keys_tensor = keys_tensor[device_id]; - uint64_t* total_keys = - reinterpret_cast(cached_total_keys_tensor.data()); - VLOG(3) << "Begin copy grad tensor to boxps struct"; - this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, - hidden_size, total_length, batch_size); - - VLOG(3) << "Begin call PushSparseGPU in BoxPS"; - push_boxps_timer.Start(); - int ret = boxps_ptr_->PushSparseGPU( - total_keys, total_grad_values_gpu, static_cast(total_length), - BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId()); - PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( - "PushSparseGPU failed in BoxPS.")); - push_boxps_timer.Pause(); -#else - PADDLE_THROW(platform::errors::PreconditionNotMet( - "Please compile WITH_GPU option, because NCCL doesn't support " - "windows.")); -#endif - } else { - PADDLE_THROW(platform::errors::PreconditionNotMet( - "PaddleBox: PushSparseGrad Only Support CPUPlace or CUDAPlace Now.")); - } - all_timer.Pause(); - VLOG(1) << "PushSparseGrad total cost: " << all_timer.ElapsedSec() - << " s, of which BoxPS cost: " << push_boxps_timer.ElapsedSec() - << " s"; - VLOG(3) << "End PushSparseGrad"; -} - void BoxWrapper::GetRandomReplace(const std::vector& pass_data) { VLOG(0) << "Begin GetRandomReplace"; size_t 
ins_num = pass_data.size(); diff --git a/paddle/fluid/framework/fleet/box_wrapper.cu b/paddle/fluid/framework/fleet/box_wrapper.cu index a24627c068..c315abd737 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.cu +++ b/paddle/fluid/framework/fleet/box_wrapper.cu @@ -27,9 +27,12 @@ namespace framework { for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < (n); \ i += blockDim.x * gridDim.x) -__global__ void PullCopy(float** dest, const boxps::FeatureValueGpu* src, - const int64_t* len, int hidden, int slot_num, - int total_len, uint64_t** keys) { +template +__global__ void PullCopy( + float** dest, + const boxps::FeatureValueGpu* src, + const int64_t* len, int hidden, int expand_dim, int slot_num, int total_len, + uint64_t** keys) { CUDA_KERNEL_LOOP(i, total_len) { int low = 0; int high = slot_num - 1; @@ -52,15 +55,28 @@ __global__ void PullCopy(float** dest, const boxps::FeatureValueGpu* src, *(dest[x] + y * hidden + 2) = (src + i)->embed_w; } if ((src + i)->embedding_size == 0 || *(keys[x] + y) == 0) { - for (int j = 0; j < 8; j++) { + for (int j = 0; j < hidden - 3; j++) { *(dest[x] + y * hidden + 3 + j) = 0; } } else { - for (int j = 0; j < 8; j++) { + for (int j = 0; j < hidden - 3; j++) { *(dest[x] + y * hidden + 3 + j) = (src + i)->embedx[1 + j]; } } - } + // process embed_expand + if (expand_dim > 0) { + int z = x + slot_num; + if ((src + i)->embed_expand_size[0] == 0 || *(keys[x] + y) == 0) { + for (int j = 0; j < expand_dim; j++) { + *(dest[z] + y * expand_dim + j) = 0; + } + } else { + for (int j = 0; j < expand_dim; j++) { + *(dest[z] + y * expand_dim + j) = (src + i)->embed_expand[1 + j]; + } + } + } + } // end kernel loop } __global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys, @@ -82,9 +98,11 @@ __global__ void CopyKeysKernel(uint64_t** src_keys, uint64_t* dest_total_keys, } } -__global__ void PushCopy(boxps::FeaturePushValueGpu* dest, float** src, - int64_t* len, int hidden, int slot_num, int total_len, - int bs, int* slot_vector) { +template +__global__ void PushCopy( + boxps::FeaturePushValueGpu* dest, float** src, + int64_t* len, int hidden, int expand_dim, int slot_num, int total_len, + int bs, int* slot_vector) { CUDA_KERNEL_LOOP(i, total_len) { int low = 0; int high = slot_num - 1; @@ -101,18 +119,25 @@ __global__ void PushCopy(boxps::FeaturePushValueGpu* dest, float** src, (dest + i)->show = *(src[x] + y * hidden); (dest + i)->clk = *(src[x] + y * hidden + 1); (dest + i)->embed_g = *(src[x] + y * hidden + 2) * -1. * bs; - for (int j = 0; j < 8; j++) { + for (int j = 0; j < hidden - 3; j++) { (dest + i)->embedx_g[j] = *(src[x] + y * hidden + 3 + j) * -1. * bs; } + if (expand_dim > 0) { + int z = x + slot_num; + for (int j = 0; j < expand_dim; j++) { + (dest + i)->embed_expand_g[j] = + *(src[z] + y * expand_dim + j) * -1. 
* bs; + } + } } } void BoxWrapper::CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, const std::vector& values, - const boxps::FeatureValueGpu* total_values_gpu, - const int64_t* gpu_len, const int slot_num, - const int hidden_size, + void* total_values_gpu, const int64_t* gpu_len, + const int slot_num, const int hidden_size, + const int expand_embed_dim, const int64_t total_length) { auto stream = dynamic_cast( platform::DeviceContextPool::Instance().Get( @@ -122,11 +147,40 @@ void BoxWrapper::CopyForPull(const paddle::platform::Place& place, float** gpu_values = reinterpret_cast(buf_value->ptr()); cudaMemcpy(gpu_values, values.data(), values.size() * sizeof(float*), cudaMemcpyHostToDevice); +#define EMBEDX_CASE(i, ...) \ + case i: { \ + constexpr size_t EmbedxDim = i; \ + switch (expand_embed_dim) { \ + __VA_ARGS__ \ + default: \ + PADDLE_THROW(platform::errors::InvalidArgument( \ + "Unsupport this expand embedding size [%d]", expand_embed_dim)); \ + } \ + } break + +#define EXPAND_EMBED_PULL_CASE(i, ...) \ + case i: { \ + constexpr size_t ExpandDim = i; \ + PullCopy<<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( \ + gpu_values, \ + reinterpret_cast*>( \ + total_values_gpu), \ + gpu_len, hidden_size, expand_embed_dim, slot_num, total_length, \ + gpu_keys); \ + } break - PullCopy<<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( - gpu_values, total_values_gpu, gpu_len, hidden_size, slot_num, - total_length, gpu_keys); + switch (hidden_size - 3) { + EMBEDX_CASE(8, EXPAND_EMBED_PULL_CASE(0); EXPAND_EMBED_PULL_CASE(8); + EXPAND_EMBED_PULL_CASE(64);); + EMBEDX_CASE(16, EXPAND_EMBED_PULL_CASE(0);); + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupport this embedding size [%d]", hidden_size - 3)); + } cudaStreamSynchronize(stream); +#undef EXPAND_EMBED_PULL_CASE +#undef EMBEDX_CASE } void BoxWrapper::CopyKeys(const paddle::platform::Place& place, @@ -143,10 +197,10 @@ void BoxWrapper::CopyKeys(const paddle::platform::Place& place, void BoxWrapper::CopyForPush(const paddle::platform::Place& place, const std::vector& grad_values, - boxps::FeaturePushValueGpu* total_grad_values_gpu, + void* total_grad_values_gpu, const std::vector& slot_lengths, - const int hidden_size, const int64_t total_length, - const int batch_size) { + const int hidden_size, const int expand_embed_dim, + const int64_t total_length, const int batch_size) { auto stream = dynamic_cast( platform::DeviceContextPool::Instance().Get( BOOST_GET_CONST(platform::CUDAPlace, place))) @@ -173,11 +227,42 @@ void BoxWrapper::CopyForPush(const paddle::platform::Place& place, cudaMemcpy(d_slot_vector, slot_vector_.data(), slot_lengths_lod.size() * sizeof(int), cudaMemcpyHostToDevice); - PushCopy<<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( - total_grad_values_gpu, gpu_values, gpu_len, hidden_size, - slot_lengths.size(), total_length, batch_size, d_slot_vector); +#define EMBEDX_CASE(i, ...) \ + case i: { \ + constexpr size_t EmbedxDim = i; \ + switch (expand_embed_dim) { \ + __VA_ARGS__ \ + default: \ + PADDLE_THROW(platform::errors::InvalidArgument( \ + "Unsupport this expand embedding size [%d]", expand_embed_dim)); \ + } \ + } break + +#define EXPAND_EMBED_PUSH_CASE(i, ...) 
\ + case i: { \ + constexpr size_t ExpandDim = i; \ + PushCopy<<<(total_length + 512 - 1) / 512, 512, 0, stream>>>( \ + reinterpret_cast*>( \ + total_grad_values_gpu), \ + gpu_values, gpu_len, hidden_size, expand_embed_dim, \ + slot_lengths.size(), total_length, batch_size, d_slot_vector); \ + } break + + switch (hidden_size - 3) { + EMBEDX_CASE(8, EXPAND_EMBED_PUSH_CASE(0); EXPAND_EMBED_PUSH_CASE(8); + EXPAND_EMBED_PUSH_CASE(64);); + EMBEDX_CASE(16, EXPAND_EMBED_PUSH_CASE(0);); + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Unsupport this embedding size [%d]", hidden_size - 3)); + } + cudaStreamSynchronize(stream); +#undef EXPAND_EMBED_PUSH_CASE +#undef EMBEDX_CASE } + } // end namespace framework } // end namespace paddle #endif diff --git a/paddle/fluid/framework/fleet/box_wrapper.h b/paddle/fluid/framework/fleet/box_wrapper.h index 85c8d14ed0..af533fe22e 100644 --- a/paddle/fluid/framework/fleet/box_wrapper.h +++ b/paddle/fluid/framework/fleet/box_wrapper.h @@ -341,30 +341,54 @@ class BoxWrapper { void BeginPass() const; void EndPass(bool need_save_delta) const; void SetTestMode(bool is_test) const; + + template + void PullSparseCase(const paddle::platform::Place& place, + const std::vector& keys, + const std::vector& values, + const std::vector& slot_lengths, + const int hidden_size, const int expand_embed_dim); + void PullSparse(const paddle::platform::Place& place, const std::vector& keys, const std::vector& values, const std::vector& slot_lengths, - const int hidden_size); + const int hidden_size, const int expand_embed_dim); + + template + void PushSparseGradCase(const paddle::platform::Place& place, + const std::vector& keys, + const std::vector& grad_values, + const std::vector& slot_lengths, + const int hidden_size, const int expand_embed_dim, + const int batch_size); + void PushSparseGrad(const paddle::platform::Place& place, const std::vector& keys, const std::vector& grad_values, const std::vector& slot_lengths, - const int hidden_size, const int batch_size); + const int hidden_size, const int expand_embed_dim, + const int batch_size); + void CopyForPull(const paddle::platform::Place& place, uint64_t** gpu_keys, - const std::vector& values, - const boxps::FeatureValueGpu* total_values_gpu, + const std::vector& values, void* total_values_gpu, const int64_t* gpu_len, const int slot_num, - const int hidden_size, const int64_t total_length); + const int hidden_size, const int expand_embed_dim, + const int64_t total_length); + void CopyForPush(const paddle::platform::Place& place, const std::vector& grad_values, - boxps::FeaturePushValueGpu* total_grad_values_gpu, + void* total_grad_values_gpu, const std::vector& slot_lengths, - const int hidden_size, const int64_t total_length, - const int batch_size); + const int hidden_size, const int expand_embed_dim, + const int64_t total_length, const int batch_size); + void CopyKeys(const paddle::platform::Place& place, uint64_t** origin_keys, uint64_t* total_keys, const int64_t* gpu_len, int slot_num, int total_len); + + void CheckEmbedSizeIsValid(int embedx_dim, int expand_embed_dim); + boxps::PSAgentBase* GetAgent() { return p_agent_; } void InitializeGPUAndLoadModel( const char* conf_file, const std::vector& slot_vector, @@ -442,6 +466,15 @@ class BoxWrapper { } static std::shared_ptr GetInstance() { + PADDLE_ENFORCE_EQ( + s_instance_ == nullptr, false, + platform::errors::PreconditionNotMet( + "GetInstance failed in BoxPs, you should use SetInstance firstly")); + return s_instance_; + } + + static std::shared_ptr 
SetInstance(int embedx_dim = 8, + int expand_embed_dim = 0) { if (nullptr == s_instance_) { // If main thread is guaranteed to init this, this lock can be removed static std::mutex mutex; @@ -449,8 +482,13 @@ class BoxWrapper { if (nullptr == s_instance_) { VLOG(3) << "s_instance_ is null"; s_instance_.reset(new paddle::framework::BoxWrapper()); - s_instance_->boxps_ptr_.reset(boxps::BoxPSBase::GetIns()); + s_instance_->boxps_ptr_.reset( + boxps::BoxPSBase::GetIns(embedx_dim, expand_embed_dim)); + embedx_dim_ = embedx_dim; + expand_embed_dim_ = expand_embed_dim; } + } else { + LOG(WARNING) << "You have already used SetInstance() before"; } return s_instance_; } @@ -776,6 +814,9 @@ class BoxWrapper { const int feedpass_thread_num_ = 30; // magic number static std::shared_ptr s_instance_; std::unordered_set slot_name_omited_in_feedpass_; + // EMBEDX_DIM and EXPAND_EMBED_DIM + static int embedx_dim_; + static int expand_embed_dim_; // Metric Related int phase_ = 1; @@ -1009,3 +1050,5 @@ class BoxHelper { } // end namespace framework } // end namespace paddle + +#include "paddle/fluid/framework/fleet/box_wrapper_impl.h" diff --git a/paddle/fluid/framework/fleet/box_wrapper_impl.h b/paddle/fluid/framework/fleet/box_wrapper_impl.h new file mode 100644 index 0000000000..b4e414dc83 --- /dev/null +++ b/paddle/fluid/framework/fleet/box_wrapper_impl.h @@ -0,0 +1,163 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#pragma once +#ifdef PADDLE_WITH_BOX_PS +#include +namespace paddle { +namespace framework { + +template +void BoxWrapper::PullSparseCase(const paddle::platform::Place& place, + const std::vector& keys, + const std::vector& values, + const std::vector& slot_lengths, + const int hidden_size, + const int expand_embed_dim) { + VLOG(3) << "Begin PullSparse"; + platform::Timer all_timer; + platform::Timer pull_boxps_timer; + all_timer.Start(); + + int64_t total_length = + std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); + auto buf = memory::AllocShared( + place, total_length * + sizeof(boxps::FeatureValueGpu)); + boxps::FeatureValueGpu* total_values_gpu = + reinterpret_cast*>( + buf->ptr()); + + if (platform::is_cpu_place(place)) { + PADDLE_THROW(platform::errors::Unimplemented( + "Warning:: CPUPlace is not supported in PaddleBox now.")); + } else if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + VLOG(3) << "Begin copy keys, key_num[" << total_length << "]"; + int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId(); + LoDTensor& total_keys_tensor = keys_tensor[device_id]; + uint64_t* total_keys = reinterpret_cast( + total_keys_tensor.mutable_data({total_length, 1}, place)); + + // construct slot_level lod info + auto slot_lengths_lod = slot_lengths; + for (size_t i = 1; i < slot_lengths_lod.size(); i++) { + slot_lengths_lod[i] += slot_lengths_lod[i - 1]; + } + auto buf_key = memory::AllocShared(place, keys.size() * sizeof(uint64_t*)); + auto buf_length = + memory::AllocShared(place, slot_lengths.size() * sizeof(int64_t)); + uint64_t** gpu_keys = reinterpret_cast(buf_key->ptr()); + int64_t* gpu_len = reinterpret_cast(buf_length->ptr()); + cudaMemcpy(gpu_keys, keys.data(), keys.size() * sizeof(uint64_t*), + cudaMemcpyHostToDevice); + cudaMemcpy(gpu_len, slot_lengths_lod.data(), + slot_lengths.size() * sizeof(int64_t), cudaMemcpyHostToDevice); + + this->CopyKeys(place, gpu_keys, total_keys, gpu_len, + static_cast(slot_lengths.size()), + static_cast(total_length)); + VLOG(3) << "Begin call PullSparseGPU in BoxPS"; + pull_boxps_timer.Start(); + int ret = boxps_ptr_->PullSparseGPU( + total_keys, reinterpret_cast(total_values_gpu), + static_cast(total_length), device_id); + PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( + "PullSparseGPU failed in BoxPS.")); + pull_boxps_timer.Pause(); + + VLOG(3) << "Begin Copy result to tensor, total_length[" << total_length + << "]"; + this->CopyForPull(place, gpu_keys, values, + reinterpret_cast(total_values_gpu), gpu_len, + static_cast(slot_lengths.size()), hidden_size, + expand_embed_dim, total_length); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Please compile WITH_GPU option, because NCCL doesn't support " + "windows.")); +#endif + } else { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddleBox: PullSparse Only Support CPUPlace or CUDAPlace Now.")); + } + all_timer.Pause(); + VLOG(1) << "PullSparse total costs: " << all_timer.ElapsedSec() + << " s, of which BoxPS costs: " << pull_boxps_timer.ElapsedSec() + << " s"; + VLOG(3) << "End PullSparse"; +} + +template +void BoxWrapper::PushSparseGradCase( + const paddle::platform::Place& place, + const std::vector& keys, + const std::vector& grad_values, + const std::vector& slot_lengths, const int hidden_size, + const int expand_embed_dim, const int batch_size) { + VLOG(3) << "Begin PushSparseGrad"; + platform::Timer all_timer; + platform::Timer push_boxps_timer; + all_timer.Start(); + int64_t 
total_length = + std::accumulate(slot_lengths.begin(), slot_lengths.end(), 0UL); + auto buf = memory::AllocShared( + place, + total_length * + sizeof(boxps::FeaturePushValueGpu)); + boxps::FeaturePushValueGpu* + total_grad_values_gpu = reinterpret_cast< + boxps::FeaturePushValueGpu*>( + buf->ptr()); + if (platform::is_cpu_place(place)) { + PADDLE_THROW(platform::errors::Unimplemented( + "Warning:: CPUPlace is not supported in PaddleBox now.")); + } else if (platform::is_gpu_place(place)) { +#if defined(PADDLE_WITH_CUDA) && !defined(_WIN32) + int device_id = BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId(); + LoDTensor& cached_total_keys_tensor = keys_tensor[device_id]; + uint64_t* total_keys = + reinterpret_cast(cached_total_keys_tensor.data()); + VLOG(3) << "Begin copy grad tensor to boxps struct"; + this->CopyForPush(place, grad_values, total_grad_values_gpu, slot_lengths, + hidden_size, expand_embed_dim, total_length, batch_size); + + VLOG(3) << "Begin call PushSparseGPU in BoxPS"; + push_boxps_timer.Start(); + int ret = boxps_ptr_->PushSparseGPU( + total_keys, reinterpret_cast(total_grad_values_gpu), + static_cast(total_length), + BOOST_GET_CONST(platform::CUDAPlace, place).GetDeviceId()); + PADDLE_ENFORCE_EQ(ret, 0, platform::errors::PreconditionNotMet( + "PushSparseGPU failed in BoxPS.")); + push_boxps_timer.Pause(); +#else + PADDLE_THROW(platform::errors::PreconditionNotMet( + "Please compile WITH_GPU option, because NCCL doesn't support " + "windows.")); +#endif + } else { + PADDLE_THROW(platform::errors::PreconditionNotMet( + "PaddleBox: PushSparseGrad Only Support CPUPlace or CUDAPlace Now.")); + } + all_timer.Pause(); + VLOG(1) << "PushSparseGrad total cost: " << all_timer.ElapsedSec() + << " s, of which BoxPS cost: " << push_boxps_timer.ElapsedSec() + << " s"; + VLOG(3) << "End PushSparseGrad"; +} + +} // namespace framework +} // namespace paddle +#endif diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.cc b/paddle/fluid/operators/pull_box_extended_sparse_op.cc new file mode 100644 index 0000000000..a3f9defc91 --- /dev/null +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.cc @@ -0,0 +1,157 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/pull_box_extended_sparse_op.h" + +namespace paddle { +namespace operators { + +class PullBoxExtendedSparseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + void InferShape(framework::InferShapeContext* ctx) const override { + PADDLE_ENFORCE_GE( + ctx->Inputs("Ids").size(), 1UL, + platform::errors::InvalidArgument( + "Inputs(Ids) of PullBoxExtendedSparseOp should not be empty.")); + PADDLE_ENFORCE_GE( + ctx->Outputs("Out").size(), 1UL, + platform::errors::InvalidArgument( + "Outputs(Out) of PullBoxExtendedSparseOp should not be empty.")); + PADDLE_ENFORCE_GE(ctx->Outputs("OutExtend").size(), 1UL, + platform::errors::InvalidArgument( + "Outputs(OutExtend) of PullBoxExtendedSparseOp " + "should not be empty.")); + auto emb_size = static_cast(ctx->Attrs().Get("emb_size")); + auto emb_extended_size = + static_cast(ctx->Attrs().Get("emb_extended_size")); + auto all_ids_dim = ctx->GetInputsDim("Ids"); + const size_t n_ids = all_ids_dim.size(); + std::vector outs_dims; + std::vector outs_extended_dims; + outs_dims.resize(n_ids); + outs_extended_dims.resize(n_ids); + for (size_t i = 0; i < n_ids; ++i) { + const auto ids_dims = all_ids_dim[i]; + int ids_rank = ids_dims.size(); + PADDLE_ENFORCE_EQ(ids_dims[ids_rank - 1], 1, + platform::errors::InvalidArgument( + "Shape error in %lu id, the last dimension of the " + "'Ids' tensor must be 1.", + i)); + auto out_dim = framework::vectorize( + framework::slice_ddim(ids_dims, 0, ids_rank - 1)); + out_dim.push_back(emb_size); + outs_dims[i] = framework::make_ddim(out_dim); + + auto out_extended_dim = framework::vectorize( + framework::slice_ddim(ids_dims, 0, ids_rank - 1)); + out_extended_dim.push_back(emb_extended_size); + outs_extended_dims[i] = framework::make_ddim(out_extended_dim); + } + ctx->SetOutputsDim("Out", outs_dims); + ctx->SetOutputsDim("OutExtend", outs_extended_dims); + for (size_t i = 0; i < n_ids; ++i) { + ctx->ShareLoD("Ids", "Out", i, i); + ctx->ShareLoD("Ids", "OutExtend", i, i); + } + } + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(framework::proto::VarType::FP32, + ctx.device_context()); + } +}; + +class PullBoxExtendedSparseOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override { + AddInput("Ids", + "Input tensors with type int32 or int64 " + "contains the ids to be looked up in BoxPS. " + "The last dimension size must be 1.") + .AsDuplicable(); + AddOutput("Out", "The lookup results tensors.").AsDuplicable(); + AddOutput("OutExtend", "The lookup extended results tensors.") + .AsDuplicable(); + AddAttr("emb_size", "(int, the embedding hidden size").SetDefault(1); + AddAttr("emb_extended_size", + "(int, the extended_embedding hidden size") + .SetDefault(128); + AddComment(R"DOC( +Pull Box Extended Sparse Operator. + +This operator is used to perform lookups on the BoxPS, +then concatenated into a dense tensor. + +The input Ids can carry the LoD (Level of Details) information, +or not. And the output only shares the LoD information with input Ids. 
+ +)DOC"); + } +}; + +template +class PushBoxExtendedSparseOpMaker : public framework::SingleGradOpMaker { + public: + using framework::SingleGradOpMaker::SingleGradOpMaker; + + protected: + void Apply(GradOpPtr op) const override { + op->SetType("push_box_extended_sparse"); + op->SetInput("Ids", this->Input("Ids")); + op->SetInput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetInput(framework::GradVarName("OutExtend"), + this->OutputGrad("OutExtend")); + op->SetOutput(framework::GradVarName("Out"), this->OutputGrad("Out")); + op->SetAttrMap(this->Attrs()); + } +}; + +class PushBoxExtendedSparseOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override {} + + protected: + framework::OpKernelType GetExpectedKernelType( + const framework::ExecutionContext& ctx) const override { + return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType( + ctx, framework::GradVarName("Out")), + ctx.device_context()); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR( + pull_box_extended_sparse, ops::PullBoxExtendedSparseOp, + ops::PullBoxExtendedSparseOpMaker, + ops::PushBoxExtendedSparseOpMaker, + ops::PushBoxExtendedSparseOpMaker); + +REGISTER_OPERATOR(push_box_extended_sparse, ops::PushBoxExtendedSparseOp); + +REGISTER_OP_CPU_KERNEL(pull_box_extended_sparse, + ops::PullBoxExtendedSparseCPUKernel, + ops::PullBoxExtendedSparseCPUKernel); + +REGISTER_OP_CPU_KERNEL(push_box_extended_sparse, + ops::PushBoxExtendedSparseCPUKernel, + ops::PushBoxExtendedSparseCPUKernel); diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.cu b/paddle/fluid/operators/pull_box_extended_sparse_op.cu new file mode 100644 index 0000000000..5bde6bc2e5 --- /dev/null +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.cu @@ -0,0 +1,46 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#include "paddle/fluid/operators/pull_box_extended_sparse_op.h" +#include "paddle/fluid/platform/cuda_primitives.h" +#include "paddle/fluid/platform/gpu_info.h" + +namespace paddle { +namespace operators { + +template +class PullBoxExtendedSparseCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PullBoxExtendedSparseFunctor(ctx); + } +}; + +template +class PushBoxExtendedSparseCUDAKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PushBoxExtendedSparseFunctor(ctx); + } +}; +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_CUDA_KERNEL(pull_box_extended_sparse, + ops::PullBoxExtendedSparseCUDAKernel, + ops::PullBoxExtendedSparseCUDAKernel); +REGISTER_OP_CUDA_KERNEL(push_box_extended_sparse, + ops::PushBoxExtendedSparseCUDAKernel, + ops::PushBoxExtendedSparseCUDAKernel); diff --git a/paddle/fluid/operators/pull_box_extended_sparse_op.h b/paddle/fluid/operators/pull_box_extended_sparse_op.h new file mode 100644 index 0000000000..559c7eed84 --- /dev/null +++ b/paddle/fluid/operators/pull_box_extended_sparse_op.h @@ -0,0 +1,119 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once +#include +#include +#include "paddle/fluid/framework/fleet/box_wrapper.h" +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/tensor.h" + +namespace paddle { +namespace operators { + +template +static void PullBoxExtendedSparseFunctor( + const framework::ExecutionContext &ctx) { + auto inputs = ctx.MultiInput("Ids"); + auto outputs = ctx.MultiOutput("Out"); + auto outputs_extend = ctx.MultiOutput("OutExtend"); + const auto slot_size = inputs.size(); + std::vector all_keys(slot_size); + // BoxPS only supports float now + std::vector all_values(slot_size * 2); + std::vector slot_lengths(slot_size); + for (size_t i = 0; i < slot_size; i++) { + const auto *slot = inputs[i]; + const uint64_t *single_slot_keys = + reinterpret_cast(slot->data()); + all_keys[i] = single_slot_keys; + slot_lengths[i] = slot->numel(); + auto *output = outputs[i]->mutable_data(ctx.GetPlace()); + auto *output_extend = outputs_extend[i]->mutable_data(ctx.GetPlace()); + all_values[i] = reinterpret_cast(output); + all_values[i + slot_size] = reinterpret_cast(output_extend); + } +#ifdef PADDLE_WITH_BOX_PS + auto emb_size = ctx.Attr("emb_size"); + auto emb_extended_size = ctx.Attr("emb_extended_size"); + auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); + box_ptr->PullSparse(ctx.GetPlace(), all_keys, all_values, slot_lengths, + emb_size, emb_extended_size); +#endif +} + +template +static void PushBoxExtendedSparseFunctor( + const framework::ExecutionContext &ctx) { + auto inputs = ctx.MultiInput("Ids"); + auto d_output = + ctx.MultiInput(framework::GradVarName("Out")); + auto d_output_extend = + ctx.MultiInput(framework::GradVarName("OutExtend")); + const auto slot_size = inputs.size(); + std::vector all_keys(slot_size); + std::vector all_grad_values(slot_size * 2); + std::vector slot_lengths(slot_size); + int batch_size = -1; + for (size_t i = 0; i < slot_size; i++) { + const auto *slot = inputs[i]; + const uint64_t *single_slot_keys = + reinterpret_cast(slot->data()); + all_keys[i] = single_slot_keys; + slot_lengths[i] = slot->numel(); + int cur_batch_size = + slot->lod().size() ? 
slot->lod()[0].size() - 1 : slot->dims()[0]; + if (batch_size == -1) { + batch_size = cur_batch_size; + } else { + PADDLE_ENFORCE_EQ(batch_size, cur_batch_size, + platform::errors::PreconditionNotMet( + "The batch size of all input slots should be same," + "please cheack")); + } + const float *grad_value = d_output[i]->data(); + const float *grad_value_extend = d_output_extend[i]->data(); + all_grad_values[i] = reinterpret_cast(grad_value); + all_grad_values[i + slot_size] = + reinterpret_cast(grad_value_extend); + } +#ifdef PADDLE_WITH_BOX_PS + auto emb_size = ctx.Attr("emb_size"); + auto emb_extended_size = ctx.Attr("emb_extended_size"); + auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); + box_ptr->PushSparseGrad(ctx.GetPlace(), all_keys, all_grad_values, + slot_lengths, emb_size, emb_extended_size, + batch_size); +#endif +} + +using LoDTensor = framework::LoDTensor; +template +class PullBoxExtendedSparseCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PullBoxExtendedSparseFunctor(ctx); + } +}; + +template +class PushBoxExtendedSparseCPUKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext &ctx) const override { + PushBoxExtendedSparseFunctor(ctx); + } +}; + +} // namespace operators +} // namespace paddle diff --git a/paddle/fluid/operators/pull_box_sparse_op.h b/paddle/fluid/operators/pull_box_sparse_op.h index 1f8c7932c4..3b48341368 100644 --- a/paddle/fluid/operators/pull_box_sparse_op.h +++ b/paddle/fluid/operators/pull_box_sparse_op.h @@ -44,7 +44,7 @@ static void PullBoxSparseFunctor(const framework::ExecutionContext &ctx) { auto hidden_size = ctx.Attr("size"); auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); box_ptr->PullSparse(ctx.GetPlace(), all_keys, all_values, slot_lengths, - hidden_size); + hidden_size, 0); #endif } @@ -81,7 +81,7 @@ static void PushBoxSparseFunctor(const framework::ExecutionContext &ctx) { auto hidden_size = ctx.Attr("size"); auto box_ptr = paddle::framework::BoxWrapper::GetInstance(); box_ptr->PushSparseGrad(ctx.GetPlace(), all_keys, all_grad_values, - slot_lengths, hidden_size, batch_size); + slot_lengths, hidden_size, 0, batch_size); #endif } diff --git a/paddle/fluid/pybind/box_helper_py.cc b/paddle/fluid/pybind/box_helper_py.cc index 706c27679e..33c5cd65a7 100644 --- a/paddle/fluid/pybind/box_helper_py.cc +++ b/paddle/fluid/pybind/box_helper_py.cc @@ -63,9 +63,9 @@ void BindBoxHelper(py::module* m) { void BindBoxWrapper(py::module* m) { py::class_>( *m, "BoxWrapper") - .def(py::init([]() { + .def(py::init([](int embedx_dim, int expand_embed_dim) { // return std::make_shared(dataset); - return framework::BoxWrapper::GetInstance(); + return framework::BoxWrapper::SetInstance(embedx_dim, expand_embed_dim); })) .def("save_base", &framework::BoxWrapper::SaveBase, py::call_guard()) diff --git a/python/paddle/fluid/contrib/layers/nn.py b/python/paddle/fluid/contrib/layers/nn.py index 3bb56edb9b..273a669a14 100644 --- a/python/paddle/fluid/contrib/layers/nn.py +++ b/python/paddle/fluid/contrib/layers/nn.py @@ -34,7 +34,8 @@ __all__ = [ 'fused_elemwise_activation', 'sequence_topk_avg_pooling', 'var_conv_2d', 'match_matrix_tensor', 'tree_conv', 'fused_embedding_seq_pool', 'multiclass_nms2', 'search_pyramid_hash', 'shuffle_batch', 'partial_concat', - 'partial_sum', 'tdm_child', 'rank_attention', 'tdm_sampler', 'batch_fc' + 'partial_sum', 'tdm_child', 'rank_attention', 'tdm_sampler', 'batch_fc', + 
'_pull_box_extended_sparse' ] @@ -1361,3 +1362,50 @@ def batch_fc(input, param_size, param_attr, bias_size, bias_attr, act=None): "Bias": b}, outputs={"Out": pre_act}) return helper.append_activation(pre_act) + + +def _pull_box_extended_sparse(input, size, extend_size=64, dtype='float32'): + """ + **Pull Box Extended Sparse Layer** + This layer is used to lookup embeddings of IDs, provided by :attr:`input`, in + BoxPS lookup table. The result of this lookup is the embedding of each ID in the + :attr:`input`. + Args: + input(Variable|list of Variable): Input is a Tensor Variable, which + contains the IDs information. + size(int): The embedding size parameter, which indicates the size of + each embedding vector respectively. + extend_size(int): The embedding size parameter in extended dim, + which indicates the size of each embedding vector respectively. + dtype(str): The dtype refers to the data type of output tensor. Only supports + float32 now. + Returns: + Variable|list of Variable: The tensor variable storing the embeddings of the \ + supplied inputs. + Examples: + .. code-block:: python + import paddle.fluid as fluid + data = fluid.layers.data(name='sequence', shape=[1], dtype='int64', lod_level=1) + emb, emb_ex = fluid.contrib.layers._pull_box_extended_sparse(input=data, size=8, extend_size=128) + """ + helper = LayerHelper('pull_box_extended_sparse', **locals()) + helper.input_dtype() + inputs = helper.multiple_input() + outs = [ + helper.create_variable_for_type_inference(dtype) + for i in range(len(inputs)) + ] + outs_extend = [ + helper.create_variable_for_type_inference(dtype) + for i in range(len(inputs)) + ] + helper.append_op( + type='pull_box_extended_sparse', + inputs={'Ids': inputs}, + outputs={'Out': outs, + 'OutExtend': outs_extend}, + attrs={'emb_size': size, + 'emb_extended_size': extend_size}) + if len(outs) == 1: + return outs[0], outs_extend[0] + return outs, outs_extend diff --git a/python/paddle/fluid/tests/unittests/test_paddlebox_datafeed.py b/python/paddle/fluid/tests/unittests/test_paddlebox_datafeed.py index 35bc144989..94bc8ff288 100644 --- a/python/paddle/fluid/tests/unittests/test_paddlebox_datafeed.py +++ b/python/paddle/fluid/tests/unittests/test_paddlebox_datafeed.py @@ -17,7 +17,6 @@ import paddle.fluid.core as core import os import unittest import paddle.fluid.layers as layers -from paddle.fluid.layers.nn import _pull_box_sparse class TestDataFeed(unittest.TestCase): @@ -57,9 +56,9 @@ class TestDataFeed(unittest.TestCase): lod_level=0, append_batch_size=False) - emb_x, emb_y = _pull_box_sparse([x, y], size=2) - emb_xp = _pull_box_sparse(x, size=2) - concat = layers.concat([emb_x, emb_y], axis=1) + emb_x, emb_y = fluid.contrib.layers._pull_box_extended_sparse( + [x, y], size=2, extend_size=128) + concat = layers.concat([emb_x[0], emb_x[1], emb_y[0], emb_y[1]], axis=1) fc = layers.fc(input=concat, name="fc", size=1, -- GitLab
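
Usage note (editor's addition): the sketch below shows how the new contrib layer introduced by this patch is meant to be wired into a network. It is assembled from the _pull_box_extended_sparse docstring and the updated test_paddlebox_datafeed.py hunk above; the slot names, the sizes, and the final fc head are illustrative assumptions, and actually executing the program requires a Paddle build with PADDLE_WITH_BOX_PS plus a BoxWrapper created through SetInstance with matching embedding dimensions.

    import paddle.fluid as fluid
    import paddle.fluid.layers as layers

    # Two sparse id slots; the last dimension of each 'Ids' tensor must be 1.
    x = fluid.layers.data(name='x', shape=[1], dtype='int64')
    y = fluid.layers.data(name='y', shape=[1], dtype='int64')

    # The layer returns two lists: the base embeddings (width `size`) and the
    # extended embeddings (width `extend_size`), one entry per input slot.
    embs, embs_extend = fluid.contrib.layers._pull_box_extended_sparse(
        [x, y], size=2, extend_size=128)

    # Concatenate base and extended lookups before the dense part of the net,
    # mirroring the updated test above.
    concat = layers.concat(
        [embs[0], embs[1], embs_extend[0], embs_extend[1]], axis=1)
    fc = layers.fc(input=concat, size=1)  # illustrative head, not part of the patch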