/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#pragma once
#include <algorithm>
#include <vector>
#include "lite/backends/cuda/math/transpose.h"
#include "lite/core/op_registry.h"
#include "lite/kernels/cuda/search_grnn_compute.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace cuda {

using Tensor = lite::Tensor;

template <typename Dtype>
__global__ void trans_map2out(
    Dtype* output, const Dtype* input, const int* map, int count, int lastdim) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < count) {
    int seq = tid / lastdim;
    // Scatter: word `seq` of the original layout goes to slot map[seq]
    // of the length-sorted layout.
    output[map[seq] * lastdim + tid % lastdim] = input[tid];
  }
}

template <typename Dtype>
__global__ void trans_map2in(
    Dtype* output, const Dtype* input, const int* map, int count, int lastdim) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < count) {
    int seq = tid / lastdim;
    // Gather: slot map[seq] of the length-sorted layout goes back to
    // word `seq` of the original layout.
    output[tid] = input[map[seq] * lastdim + tid % lastdim];
  }
}

template <typename Dtype>
void trans_map2out_cfunc(const Dtype* input,
                         Dtype* output,
                         int word_size,
                         int seq_sum,
                         cudaStream_t stream,
                         int* dev_map_vec) {
  int count = seq_sum * word_size;
  int block_dim = count;
  int grid_dim = 1;
  if (count > 1024) {
    block_dim = 256;
    grid_dim = (count + block_dim - 1) / block_dim;
  }
  trans_map2out<<<grid_dim, block_dim, 0, stream>>>(
      output, input, dev_map_vec, count, word_size);
}

template <typename Dtype>
void trans_map2in_cfunc(const Dtype* input,
                        Dtype* output,
                        int hidden_size,
                        int seq_sum,
                        cudaStream_t stream,
                        int* dev_map_vec) {
  int count = seq_sum * hidden_size;
  int block_dim = count;
  int grid_dim = 1;
  if (count > 1024) {
    block_dim = 256;
    grid_dim = (count + block_dim - 1) / block_dim;
  }
  trans_map2in<<<grid_dim, block_dim, 0, stream>>>(
      output, input, dev_map_vec, count, hidden_size);
}

template <typename Dtype>
void SeqSortedseqTranseUtil::seq_2_sorted_seq(const Dtype* input,
                                              Dtype* output,
                                              int word_size,
                                              cudaStream_t stream) {
  int seq_sum = _map_vec.size();
  trans_map2out_cfunc(input, output, word_size, seq_sum, stream, _dev_map_vec);
}

template <typename Dtype>
void SeqSortedseqTranseUtil::sorted_seq_2_seq(const Dtype* input,
                                              Dtype* output,
                                              int hidden_size,
                                              cudaStream_t stream) {
  int seq_sum = _map_vec.size();
  trans_map2in_cfunc(input, output, hidden_size, seq_sum, stream, _dev_map_vec);
}
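// get_sorted_map builds the permutation used by the two mapping kernels
// above: it sorts the batch by sequence length (longest first) and records,
// for every word in the original LoD layout, its slot in the length-sorted,
// time-major layout. _emit_offset_vec[t] marks where time step t starts in
// that layout. It returns false (no remapping needed) for a single sequence
// or when every sequence has length 1.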
bool SeqSortedseqTranseUtil::get_sorted_map(const std::vector<int>& offset_vec,
                                            cudaStream_t stream_id) {
  int batch_size = offset_vec.size() - 1;
  int word_sum = offset_vec[offset_vec.size() - 1];
  std::vector<int> length_vec(batch_size);
  _length_index.resize(batch_size);
  int emit_length = 0;

  if (batch_size == 1) {
    emit_length = offset_vec[1] - offset_vec[0];
    _emit_offset_vec.resize(emit_length + 1);
    for (int i = 0; i <= emit_length; ++i) {
      _emit_offset_vec[i] = i;
    }
    return false;
  }

  int max_len = 0;
  for (int i = 0; i < offset_vec.size() - 1; ++i) {
    int len = offset_vec[i + 1] - offset_vec[i];
    max_len = max_len > len ? max_len : len;
    length_vec[i] = len;
    _length_index[i] = i;
  }

  emit_length = max_len;
  if (max_len == 1) {
    _emit_offset_vec.resize(2);
    _emit_offset_vec[0] = 0;
    _emit_offset_vec[1] = emit_length * batch_size;
    return false;
  }

  // Sort sequence indices by length, longest first.
  std::stable_sort(_length_index.begin(),
                   _length_index.end(),
                   [&length_vec](int i1, int i2) {
                     return length_vec[i1] > length_vec[i2];
                   });

  _emit_offset_vec.resize(max_len + 1);
  _map_vec.resize(word_sum);

  if (word_sum > _dev_map_vec_length) {
    if (_dev_map_vec != nullptr) {
      TargetWrapperCuda::Free(static_cast<void*>(_dev_map_vec));
    }
    _dev_map_vec =
        static_cast<int*>(TargetWrapperCuda::Malloc(sizeof(int) * word_sum));
    _dev_map_vec_length = word_sum;
  }

  int target_word_id = 0;
  std::vector<int> length_vec_cnt = length_vec;
  int last_batch_size = batch_size;
  for (int word_id_in_seq = 0; word_id_in_seq < max_len; word_id_in_seq++) {
    _emit_offset_vec[word_id_in_seq] = target_word_id;
    for (int batch_id = 0; batch_id < last_batch_size; batch_id++) {
      int old_batch_id = _length_index[batch_id];
      if (length_vec_cnt[old_batch_id] > 0) {
        int inner_word_id_in_seq = word_id_in_seq;
        if (_is_reverse) {
          inner_word_id_in_seq = length_vec[old_batch_id] - 1 - word_id_in_seq;
        }
        int old_word_id = offset_vec[old_batch_id] + inner_word_id_in_seq;
        _map_vec[old_word_id] = target_word_id;
        length_vec_cnt[old_batch_id]--;
        target_word_id++;
      } else {
        // Sequences are visited longest-first, so once one is exhausted the
        // remaining ones at this time step are exhausted as well.
        last_batch_size--;
        break;
      }
    }
  }

  TargetWrapperCuda::MemcpyAsync(_dev_map_vec,
                                 _map_vec.data(),
                                 sizeof(int) * word_sum,
                                 IoDirection::HtoD,
                                 stream_id);
  _emit_offset_vec[max_len] = word_sum;
  _emit_length = emit_length;
  return true;
}

template <typename Dtype>
__global__ void transpose_2d(Dtype* output, const Dtype* input, int m, int n) {
  int tid = blockIdx.x * blockDim.x + threadIdx.x;
  if (tid < m * n) {
    int i = tid / n;
    int j = tid % m;
    output[tid] = input[j * n + i];
  }
}

void SearchGrnnCompute::WeightsPreprocess() {
  auto& param = this->Param<param_t>();
  auto& context = this->ctx_->template As<CUDAContext>();
  auto stream = context.exec_stream();

  DDim idims = param.wi->dims();
  DDim hdims = param.wh->dims();
  _wi.Resize({idims[2], idims[0], idims[1]});
  _wh.Resize({hdims[2], hdims[0], hdims[1]});
  lite::cuda::math::Transpose<float> trans;
  trans.transpose(_wi.mutable_data<float>(TARGET(kCUDA)),
                  param.wi->data<float>(),
                  idims.Vectorize(),
                  {2, 0, 1},
                  &stream);
  trans.transpose(_wh.mutable_data<float>(TARGET(kCUDA)) + hdims[1] * hdims[2],
                  param.wh->data<float>() + hdims[1] * hdims[2],
                  {hdims[0] - 1, hdims[1], hdims[2]},
                  {2, 0, 1},
                  &stream);
  trans.transpose(_wh.mutable_data<float>(TARGET(kCUDA)),
                  param.wh->data<float>(),
                  {hdims[1], hdims[2]},
                  {1, 0},
                  &stream);
  // int thread_num = 512;
  // int block_num = (hdims[1] * hdims[2] + thread_num - 1) / thread_num;
  // transpose_2d<float><<<block_num, thread_num, 0, stream>>>(
  //     _wh.mutable_data<float>(TARGET(kCUDA)),
  //     param.wh->data<float>(),
  //     hdims[1],
  //     hdims[2]);
}
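// PrepareForRun copies the (already transposed) hidden-to-hidden weights back
// to the host and repacks them into a single hidden_size x (3 * hidden_size)
// matrix whose per-row column blocks hold the three gates side by side, so a
// single GEMM per time step produces all gate pre-activations consumed by
// cal_cudnn_kernel in Run().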
void SearchGrnnCompute::PrepareForRun() {
  auto& param = this->Param<param_t>();
  auto& context = this->ctx_->template As<CUDAContext>();
  auto stream = context.exec_stream();

  gemm_impl_.reset(new lite::cuda::math::Gemm<float, float>);
  _seq_util = SeqSortedseqTranseUtil();

  WeightsPreprocess();

  int hidden_size = param.num_hidden;
  int word_size = param.num_input;
  int weights_h2h_size = hidden_size * hidden_size * 3;
  int weights_i2h_size = hidden_size * word_size * 3;

  lite::Tensor temp_weights_h2h_ori;
  lite::Tensor temp_weights_h2h_swarp;
  temp_weights_h2h_ori.Resize({weights_h2h_size});
  temp_weights_h2h_swarp.Resize({weights_h2h_size});

  TargetWrapperCuda::MemcpyAsync(temp_weights_h2h_ori.mutable_data<float>(),
                                 _wh.data<float>(),
                                 sizeof(float) * weights_h2h_size,
                                 IoDirection::DtoH,
                                 stream);
  cudaStreamSynchronize(stream);

  float* temp_tensor_ptr = temp_weights_h2h_swarp.mutable_data<float>();
  // The first hidden_size x hidden_size block is already contiguous;
  // copy it unchanged.
  memcpy(temp_tensor_ptr,
         temp_weights_h2h_ori.data<float>(),
         sizeof(float) * hidden_size * hidden_size);

  // De-interleave the remaining two gate blocks, stored row-interleaved as a
  // [hidden_size x 2 * hidden_size] matrix, into two contiguous
  // hidden_size x hidden_size blocks.
  float* rz_temp_tensor_ptr = temp_tensor_ptr + hidden_size * hidden_size;
  const float* rz_weights_tensor_ptr =
      temp_weights_h2h_ori.data<float>() + hidden_size * hidden_size;
  for (int row = 0; row < hidden_size; row++) {
    for (int block = 0; block < 2; block++) {
      int block_offset = block * hidden_size;
      for (int cow = 0; cow < hidden_size; cow++) {
        rz_temp_tensor_ptr[block * hidden_size * hidden_size +
                           row * hidden_size + cow] =
            rz_weights_tensor_ptr[row * (2 * hidden_size) + cow + block_offset];
      }
    }
  }

  // Re-pack the three per-gate blocks into one
  // [hidden_size x 3 * hidden_size] matrix used by the per-step GEMM.
  float* orz_temp_tensor_ptr = temp_tensor_ptr;
  float* orz_weights_tensor_ptr = temp_weights_h2h_ori.mutable_data<float>();
  for (int row = 0; row < hidden_size; row++) {
    for (int block = 0; block < 3; block++) {
      int block_offset = block * hidden_size;
      for (int cow = 0; cow < hidden_size; cow++) {
        orz_weights_tensor_ptr[row * (3 * hidden_size) + cow + block_offset] =
            orz_temp_tensor_ptr[block * hidden_size * hidden_size +
                                row * hidden_size + cow];
      }
    }
  }

  _temp_weights_h2h.Resize({weights_h2h_size});
  TargetWrapperCuda::MemcpyAsync(
      _temp_weights_h2h.mutable_data<float>(TARGET(kCUDA)),
      temp_weights_h2h_ori.data<float>(),
      sizeof(float) * weights_h2h_size,
      IoDirection::HtoD,
      stream);
  cudaStreamSynchronize(stream);
}

template <typename Dtype>
static inline __device__ Dtype Sigmoid(const Dtype a) {
  return static_cast<Dtype>(1.0) / (static_cast<Dtype>(1.0) + expf(-a));
}

template <typename Dtype>
static inline __device__ Dtype Tanh(const Dtype a) {
  Dtype tmp = static_cast<Dtype>(-2.0) * a;
  return (static_cast<Dtype>(2.0) / (static_cast<Dtype>(1.0) + expf(tmp))) -
         static_cast<Dtype>(1.0);
}

template <typename Dtype>
__global__ void cal_cudnn_kernel(const Dtype* w_x_r,
                                 const Dtype* w_x_z,
                                 const Dtype* w_x_o,
                                 const Dtype* w_h_r,
                                 const Dtype* w_h_z,
                                 const Dtype* w_h_o,
                                 int hidden_size,
                                 int batch_size,
                                 Dtype* output,
                                 const Dtype* hidden_pre) {
  const int thread_id = blockIdx.x * blockDim.x + threadIdx.x;
  const int batch_id = thread_id / hidden_size;
  const int index = thread_id % hidden_size;
  if (index < hidden_size && batch_id < batch_size) {
    int w_base_index = batch_id * hidden_size * 3 + index;
    int h_base_index = batch_id * hidden_size + index;
    Dtype hidden_pre_value = hidden_pre[h_base_index];
    Dtype r = Sigmoid(w_x_r[w_base_index] + w_h_r[w_base_index]);
    Dtype z = Sigmoid(w_x_z[w_base_index] + w_h_z[w_base_index]);
    Dtype _h = Tanh(w_x_o[w_base_index] + w_h_o[w_base_index] * r);
    output[h_base_index] =
        (static_cast<Dtype>(1.0) - z) * _h + z * hidden_pre_value;
  }
}
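// Per element, cal_cudnn_kernel evaluates the cuDNN-style GRU cell:
//   r = sigmoid(w_x_r + w_h_r)
//   z = sigmoid(w_x_z + w_h_z)
//   h~ = tanh(w_x_o + r * w_h_o)
//   h  = (1 - z) * h~ + z * h_prev
// where the w_x_* pre-activations come from one GEMM over the whole input and
// the w_h_* pre-activations from one GEMM per time step, computed in Run()
// below.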
void SearchGrnnCompute::Run() {
  auto& param = this->Param<param_t>();
  auto& context = this->ctx_->template As<CUDAContext>();
  auto stream = context.exec_stream();

  auto* x = param.x;
  LoD offset_vec_vec = x->lod();
  std::vector<int> offset(offset_vec_vec[offset_vec_vec.size() - 1].size());
  for (size_t i = 0; i < offset_vec_vec[offset_vec_vec.size() - 1].size();
       ++i) {
    offset[i] = static_cast<int>(offset_vec_vec[offset_vec_vec.size() - 1][i]);
  }
  const float* x_data = x->data<float>();
  auto* dout = param.out;
  std::vector<int64_t> out_dims_vec{x->dims()[0], param.num_hidden};
  dout->Resize(out_dims_vec);
  float* dout_data = dout->mutable_data<float>(TARGET(kCUDA));
  auto* wi = &_wi;
  auto* wh = &_wh;

  const float* weights_i2h = wi->data<float>();
  const float* weights_h2h = wh->data<float>();

  int batch_size = offset.size() - 1;
  int seq_sum = x->dims()[0];
  bool is_batched = offset.size() > 2;
  int hidden_size = param.num_hidden;
  int word_size = param.num_input;
  int o_offset = 0;
  int r_offset = 1;
  int z_offset = 2;

  is_batched = _seq_util.get_sorted_map(offset, stream);
  std::vector<int> emit_offset_vec = _seq_util.get_emit_offset_vec();
  int emit_length = emit_offset_vec.size() - 1;

  if (is_batched) {
    // Reorder the input into the length-sorted, time-major layout.
    std::vector<int64_t> seq_shape{1, 1, seq_sum, word_size};
    _temp_tensor_in.Resize(seq_shape);
    std::vector<int64_t> seq_out_shape{1, 1, seq_sum, hidden_size};
    _temp_tensor_out.Resize(seq_out_shape);
    _seq_util.seq_2_sorted_seq(
        x_data,
        _temp_tensor_in.mutable_data<float>(TARGET(kCUDA)),
        word_size,
        stream);
    x_data = _temp_tensor_in.data<float>();
    dout_data = _temp_tensor_out.mutable_data<float>(TARGET(kCUDA));
  }

  std::vector<int64_t> shape_wx({seq_sum, 1, 3, hidden_size});
  _temp_wx.Resize(shape_wx);
  std::vector<int64_t> shape_wh({1, batch_size, 3, hidden_size});
  _temp_wh.Resize(shape_wh);

  // One GEMM computes the input-to-hidden pre-activations for all words.
  gemm_impl_->init(false, false, seq_sum, 3 * hidden_size, word_size, &context);
  gemm_impl_->run(1.0f,
                  0.0f,
                  x_data,
                  weights_i2h,
                  _temp_wx.mutable_data<float>(TARGET(kCUDA)),
                  &context);

  std::vector<int64_t> shape_zero({batch_size * hidden_size});
  _temp_zero.Resize(shape_zero);
  TargetWrapperCuda::MemsetAsync(_temp_zero.mutable_data<float>(TARGET(kCUDA)),
                                 0,
                                 sizeof(float) * batch_size * hidden_size,
                                 stream);

  const float* h = _temp_zero.data<float>();
  for (int word_id = 0; word_id < emit_length; word_id++) {
    int real_word_id = word_id;
    int last_word_id = word_id - 1;
    int emit_word_id_start = emit_offset_vec[real_word_id];
    int emit_word_id_end = emit_offset_vec[real_word_id + 1];
    int emit_word_length = emit_word_id_end - emit_word_id_start;
    const float* hidden_in;
    float* hidden_out = dout_data + emit_offset_vec[real_word_id] * hidden_size;
    if (word_id == 0) {
      hidden_in = h;
    } else {
      hidden_in = dout_data + emit_offset_vec[last_word_id] * hidden_size;
    }

    // Per-gate column blocks of the precomputed i2h pre-activations for the
    // words emitted at this time step.
    float* w_x_r = _temp_wx.mutable_data<float>(TARGET(kCUDA)) +
                   r_offset * hidden_size +
                   emit_word_id_start * hidden_size * 3;
    float* w_x_z = _temp_wx.mutable_data<float>(TARGET(kCUDA)) +
                   z_offset * hidden_size +
                   emit_word_id_start * hidden_size * 3;
    float* w_x_o = _temp_wx.mutable_data<float>(TARGET(kCUDA)) +
                   o_offset * hidden_size +
                   emit_word_id_start * hidden_size * 3;
    float* w_h_r =
        _temp_wh.mutable_data<float>(TARGET(kCUDA)) + r_offset * hidden_size;
    float* w_h_z =
        _temp_wh.mutable_data<float>(TARGET(kCUDA)) + z_offset * hidden_size;
    float* w_h_o =
        _temp_wh.mutable_data<float>(TARGET(kCUDA)) + o_offset * hidden_size;

    // Hidden-to-hidden pre-activations for this time step.
    gemm_impl_->init(
        false, false, emit_word_length, 3 * hidden_size, hidden_size, &context);
    gemm_impl_->run(1.0f,
                    0.0f,
                    hidden_in,
                    _temp_weights_h2h.data<float>(),
                    _temp_wh.mutable_data<float>(TARGET(kCUDA)),
                    &context);

    const float* w_o = weights_h2h;
    const int block_dim = 512;
    const int grid_dim =
        (emit_word_length * hidden_size + block_dim - 1) / block_dim;
    cal_cudnn_kernel<float><<<grid_dim, block_dim, 0, stream>>>(
        w_x_r,
        w_x_z,
        w_x_o,
        w_h_r,
        w_h_z,
        w_h_o,
        hidden_size,
        emit_word_length,
        hidden_out,
        hidden_in);
  }

  if (is_batched) {
    // Restore the original LoD ordering of the output.
    _seq_util.sorted_seq_2_seq(_temp_tensor_out.data<float>(),
                               dout->mutable_data<float>(TARGET(kCUDA)),
                               hidden_size,
                               stream);
  }

  dout->set_lod(x->lod());
}

}  // namespace cuda
}  // namespace kernels
}  // namespace lite
}  // namespace paddle
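// Kernel registration: all inputs and outputs are bound as float NCHW CUDA
// tensors, matching the float-only implementation above.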
REGISTER_LITE_KERNEL(search_grnn,
                     kCUDA,
                     kFloat,
                     kNCHW,
                     paddle::lite::kernels::cuda::SearchGrnnCompute,
                     def)
    .BindInput("X",
               {LiteType::GetTensorTy(TARGET(kCUDA),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kNCHW))})
    .BindInput("Wi",
               {LiteType::GetTensorTy(TARGET(kCUDA),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kNCHW))})
    .BindInput("Wh",
               {LiteType::GetTensorTy(TARGET(kCUDA),
                                      PRECISION(kFloat),
                                      DATALAYOUT(kNCHW))})
    .BindOutput("Out",
                {LiteType::GetTensorTy(TARGET(kCUDA),
                                       PRECISION(kFloat),
                                       DATALAYOUT(kNCHW))})
    .BindOutput("tmp_buffer",
                {LiteType::GetTensorTy(TARGET(kCUDA),
                                       PRECISION(kFloat),
                                       DATALAYOUT(kNCHW))})
    .BindOutput("idx_sorted_by_width",
                {LiteType::GetTensorTy(TARGET(kCUDA),
                                       PRECISION(kFloat),
                                       DATALAYOUT(kNCHW))})
    .BindOutput("layout_input",
                {LiteType::GetTensorTy(TARGET(kCUDA),
                                       PRECISION(kFloat),
                                       DATALAYOUT(kNCHW))})
    .Finalize();