diff --git a/paddle/fluid/operators/sequence_expand_op.cu b/paddle/fluid/operators/sequence_expand_op.cu
index 26622d23afa1c703e237628bcb11db8f1da73210..6477af89f110a51fe2d494f1b54f4002fae3560d 100644
--- a/paddle/fluid/operators/sequence_expand_op.cu
+++ b/paddle/fluid/operators/sequence_expand_op.cu
@@ -15,6 +15,58 @@ limitations under the License. */
 #define EIGEN_USE_GPU
+#include <algorithm>
 #include "paddle/fluid/operators/sequence_expand_op.h"
 
+namespace paddle {
+namespace operators {
+
+using LoDTensor = framework::LoDTensor;
+
+template <typename T>
+__global__ void sequence_expand_kernel(const T* x_data, T* out_data,
+                                       const size_t* lod, size_t lod_size,
+                                       size_t element_len) {
+  // Row i of x is broadcast to output rows lod[i]..lod[i + 1] - 1. The y
+  // dimension of the grid walks the repeated rows, the x dimension the
+  // columns within a row.
+  for (size_t i = 0; i + 1 < lod_size; ++i) {
+    size_t scale = lod[i + 1] - lod[i];
+    for (size_t r = blockIdx.y * blockDim.y + threadIdx.y; r < scale;
+         r += blockDim.y * gridDim.y) {
+      for (size_t c = blockIdx.x * blockDim.x + threadIdx.x; c < element_len;
+           c += blockDim.x * gridDim.x) {
+        out_data[(lod[i] + r) * element_len + c] = x_data[i * element_len + c];
+      }
+    }
+  }
+}
+
+template <typename T>
+struct SequenceExpandFunctor<platform::CUDADeviceContext, T> {
+  void operator()(const platform::CUDADeviceContext& context,
+                  const LoDTensor& x, LoDTensor* out) {
+    auto x_dims = x.dims();
+    size_t element_len = framework::product(x_dims) / x_dims[0];
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    auto out_starts = out->lod().back();
+    int out_cols = element_len;        // columns of one expanded row
+    int out_rows = out_starts.back();  // total rows after expansion
+
+    const int kThreadsPerBlock = 1024;
+    int block_cols = kThreadsPerBlock;
+    if (out_cols < kThreadsPerBlock) {  // block_cols is aligned by 32.
+      block_cols = ((out_cols + 31) >> 5) << 5;
+    }
+    int block_rows = kThreadsPerBlock / block_cols;
+    dim3 block_size = dim3(block_cols, block_rows, 1);
+
+    int max_threads = context.GetMaxPhysicalThreadCount();
+    int max_blocks = std::max(max_threads / kThreadsPerBlock, 1);
+
+    int grid_cols =
+        std::min((out_cols + block_cols - 1) / block_cols, max_blocks);
+    int grid_rows =
+        std::min(max_blocks / grid_cols, std::max(out_rows / block_rows, 1));
+    dim3 grid_size = dim3(grid_cols, grid_rows, 1);
+
+    sequence_expand_kernel<T><<<grid_size, block_size, 0, context.stream()>>>(
+        x.data<T>(), out_data, out_starts.CUDAData(context.GetPlace()),
+        out_starts.size(), element_len);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
 namespace ops = paddle::operators;
 REGISTER_OP_CUDA_KERNEL(
     sequence_expand,
diff --git a/paddle/fluid/operators/sequence_expand_op.h b/paddle/fluid/operators/sequence_expand_op.h
index 76dde976db2d19e307ae7406be8280f9b4987187..12e4018b95bbf6b9d995e4d2933c6e81b7841d2b 100644
--- a/paddle/fluid/operators/sequence_expand_op.h
+++ b/paddle/fluid/operators/sequence_expand_op.h
@@ -16,13 +16,44 @@ limitations under the License.
 */
 
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/memory/memcpy.h"
-#include "unsupported/Eigen/CXX11/Tensor"
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/platform/device_context.h"
 
 namespace paddle {
 namespace operators {
 
 using LoDTensor = framework::LoDTensor;
 
+template <typename DeviceContext, typename T>
+struct SequenceExpandFunctor {
+  void operator()(const DeviceContext& ctx, const LoDTensor& x,
+                  LoDTensor* out);
+};
+
+// template <typename DeviceContext, typename T>
+// struct SequenceExpandGradFunctor {};
+
+template <typename T>
+struct SequenceExpandFunctor<platform::CPUDeviceContext, T> {
+  void operator()(const platform::CPUDeviceContext& context,
+                  const LoDTensor& x, LoDTensor* out) {
+    auto x_dims = x.dims();
+    size_t element_len = framework::product(x_dims) / x_dims[0];
+    const T* x_data = x.data<T>();
+    T* out_data = out->mutable_data<T>(context.GetPlace());
+    auto out_starts = out->lod().back();
+
+    // Broadcast row i of x to out_starts[i + 1] - out_starts[i] rows of out.
+    for (size_t i = 0; i < out_starts.size() - 1; i++) {
+      int scale = out_starts[i + 1] - out_starts[i];
+      Eigen::TensorMap<
+          Eigen::Tensor<const T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
+          x_t(x_data, 1, element_len);
+      Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
+          out_t(out_data, scale, element_len);
+      Eigen::array<int, 2> cast({{scale, 1}});
+      out_t.device(*context.eigen_device()) = x_t.broadcast(cast);
+      x_data += element_len;
+      out_data += element_len * scale;
+    }
+  }
+};
+
 template <typename DeviceContext, typename T>
 class SequenceExpandKernel : public framework::OpKernel<T> {
  public:
@@ -38,24 +69,8 @@ class SequenceExpandKernel : public framework::OpKernel<T> {
                       "The size of last lod level in Input(Y)"
                       "must be equal to dims[0] of Input(X).");
     out->set_lod(y->lod());
-    auto* place =
-        context.template device_context<DeviceContext>().eigen_device();
-    size_t element_len = framework::product(x_dims) / x_dims[0];
-    T* out_data = out->mutable_data<T>(context.GetPlace());
-    auto out_starts = out->lod().back();
-
-    for (size_t i = 0; i < out_starts.size() - 1; i++) {
-      int scale = out_starts[i + 1] - out_starts[i];
-      Eigen::TensorMap<
-          Eigen::Tensor<const T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
-          x_t(x_data, 1, element_len);
-      Eigen::TensorMap<Eigen::Tensor<T, 2, Eigen::RowMajor, Eigen::DenseIndex>>
-          out_t(out_data, scale, element_len);
-      Eigen::array<int, 2> cast({{scale, 1}});
-      out_t.device(*place) = x_t.broadcast(cast);
-      x_data += element_len;
-      out_data += element_len * scale;
-    }
+    SequenceExpandFunctor<DeviceContext, T> functor;
+    functor(context.template device_context<DeviceContext>(), *x, out);
   }
 };
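
Reviewer note: since the patch splits one code path into a CPU and a CUDA functor with identical semantics, a small device-independent reference is handy for sanity-checking both. The sketch below is illustrative only and uses nothing from Paddle; the helper name expand_by_lod and the std::vector interface are made up for the example.

    #include <cstddef>
    #include <vector>

    // Row i of x (element_len values) is repeated lod[i + 1] - lod[i] times,
    // landing at output rows lod[i]..lod[i + 1] - 1 -- the same mapping both
    // functors implement.
    template <typename T>
    std::vector<T> expand_by_lod(const std::vector<T>& x,
                                 const std::vector<std::size_t>& lod,
                                 std::size_t element_len) {
      std::vector<T> out(lod.back() * element_len);
      for (std::size_t i = 0; i + 1 < lod.size(); ++i) {
        for (std::size_t r = lod[i]; r < lod[i + 1]; ++r) {
          for (std::size_t c = 0; c < element_len; ++c) {
            out[r * element_len + c] = x[i * element_len + c];
          }
        }
      }
      return out;
    }

    // Example: x = {a, b, c, d} viewed as two rows of two, lod = {0, 2, 3}
    // gives {a, b, a, b, c, d}: row 0 expanded twice, row 1 once.

As a worked example of the CUDA launch arithmetic: with element_len = 10, block_cols rounds up to the next multiple of 32, i.e. 32, and block_rows = 1024 / 32 = 32, so each block covers 32 columns by 32 repeated rows of the expanded output.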