// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/phi/kernels/rnn_kernel.h"

#include "paddle/fluid/framework/generator.h"
#include "paddle/fluid/operators/utils.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/gpu/rnn_functor.h"

namespace phi {

template <typename T>
void RNNInferece(bool has_seq_length,
                 const gpuDnnHandle_t &handle,
                 int seq_length,
                 RNNDescriptors *rnn,
                 const T *x_data,
                 const T *init_h_data,
                 const T *init_c_data,
                 const T *w_data,
                 T *out_data,
                 T *last_h_data,
                 T *last_c_data,
                 DenseTensor *workspace_data,
                 size_t workspace_size) {
  if (!has_seq_length) {
    // for inference
    // This interface is used when the input/output is unpadded.
#ifdef PADDLE_WITH_HIP
    PADDLE_ENFORCE_GPU_SUCCESS(
        paddle::platform::dynload::miopenRNNForwardInference(
            handle,
            rnn->rnn_desc(),
            seq_length,
            rnn->x_descs(),
            x_data,
            rnn->init_h_desc(),
            init_h_data,
            rnn->init_c_desc(),
            init_c_data,
            rnn->weight_desc(),
            w_data,
            rnn->y_descs(),
            out_data,
            rnn->last_h_desc(),
            last_h_data,
            rnn->last_c_desc(),
            last_c_data,
            workspace_data->data<uint8_t>(),
            workspace_size));
#else
    PADDLE_ENFORCE_GPU_SUCCESS(
        paddle::platform::dynload::cudnnRNNForwardInference(
            handle,
            rnn->rnn_desc(),
            seq_length,
            rnn->x_descs(),
            x_data,
            rnn->init_h_desc(),
            init_h_data,
            rnn->init_c_desc(),
            init_c_data,
            rnn->weight_desc(),
            w_data,
            rnn->y_descs(),
            out_data,
            rnn->last_h_desc(),
            last_h_data,
            rnn->last_c_desc(),
            last_c_data,
            workspace_data->data<uint8_t>(),
            workspace_size));
#endif
  } else {
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201
    // for inference
    // This interface is used when the input/output is padded.
    PADDLE_ENFORCE_GPU_SUCCESS(
        paddle::platform::dynload::cudnnRNNForwardInferenceEx(
            handle,
            rnn->rnn_desc(),
            rnn->x_seq_desc(),
            x_data,
            rnn->init_h_desc(),
            init_h_data,
            rnn->init_c_desc(),
            init_c_data,
            rnn->weight_desc(),
            w_data,
            rnn->y_seq_desc(),
            out_data,
            rnn->last_h_desc(),
            last_h_data,
            rnn->last_c_desc(),
            last_c_data,
            nullptr,
            nullptr,
            nullptr,
            nullptr,
            nullptr,
            nullptr,
            nullptr,
            nullptr,
            workspace_data->data<uint8_t>(),
            workspace_size));
#else
    // CUDNN VERSION has to >=7.2.1
    PADDLE_THROW(phi::errors::Unavailable(
        "The padded input is supported by "
        "cudnnRNNForwardInferenceEx, but it only works when "
        "the version of cudnn is larger than 7.2.1"));
#endif
  }
}

template <typename T, typename Context>
void RnnKernel(const Context &dev_ctx,
               const DenseTensor &x,
               const std::vector<const DenseTensor *> &pre_state,
               const std::vector<const DenseTensor *> &weight_list,
               const paddle::optional<DenseTensor> &sequence_length,
               float dropout_prob,
               bool is_bidirec,
               int input_size,
               int hidden_size,
               int num_layers,
               const std::string &mode,
               int seed,
               bool is_test,
               DenseTensor *out,
               DenseTensor *dropout_state,
               std::vector<DenseTensor *> state,
               DenseTensor *reserve) {
#ifdef PADDLE_WITH_HIP
  gpuRNNMode_t rnn_mode = miopenLSTM;
  if (mode == "LSTM")
    rnn_mode = miopenLSTM;
  else if (mode == "GRU")
    rnn_mode = miopenGRU;
  else if (mode == "RNN_RELU")
    rnn_mode = miopenRNNRELU;
  else if (mode == "RNN_TANH")
    rnn_mode = miopenRNNTANH;
#else
  gpuRNNMode_t rnn_mode = CUDNN_LSTM;
  if (mode == "LSTM")
    rnn_mode = CUDNN_LSTM;
  else if (mode == "GRU")
    rnn_mode = CUDNN_GRU;
  else if (mode == "RNN_RELU")
    rnn_mode = CUDNN_RNN_RELU;
  else if (mode == "RNN_TANH")
    rnn_mode = CUDNN_RNN_TANH;
#endif
  else
    PADDLE_THROW(phi::errors::InvalidArgument(
        "rnn_mode should be LSTM, GRU, RNN_RELU or RNN_TANH, but received: "
        "%s.",
        mode));

  if (!is_test) {
    if (seed == 0) {
      // If not specify seed, use global Generator to generate seed.
      int device_id = dev_ctx.GetPlace().GetDeviceId();
      auto gen_cuda = paddle::framework::DefaultCUDAGenerator(device_id);
      seed = static_cast<int>(gen_cuda->Random64());
    }
    // else use `ctx.Attr<int>("seed")` specified seed
  }

  const T *x_data = x.data<T>();
  const T *init_h_data = pre_state[0]->data<T>();
  const T *init_c_data = nullptr;
  T *out_data = dev_ctx.template Alloc<T>(out);
  T *last_h_data = dev_ctx.template Alloc<T>(state[0]);
  T *last_c_data = nullptr;
#ifdef PADDLE_WITH_HIP
  if (rnn_mode == miopenLSTM) {
#else
  if (rnn_mode == CUDNN_LSTM) {
#endif
    init_c_data = pre_state[1]->data<T>();
    last_c_data = dev_ctx.template Alloc<T>(state[1]);
  }

  bool has_seq_length = sequence_length.is_initialized();
#ifdef PADDLE_WITH_HIP
  PADDLE_ENFORCE_EQ(
      has_seq_length,
      false,
      phi::errors::InvalidArgument("ROCm do not support SequenceLength yet."));
#endif
  std::vector<int> SequenceLength;
  if (has_seq_length) {
    SequenceLength =
        paddle::operators::GetDataFromTensor<int>(sequence_length.get_ptr());
  }

  auto handle = dev_ctx.cudnn_handle();

  int seq_length = x.dims()[0];
  int batch_size = x.dims()[1];
  int input_size_local = x.dims()[2];

  size_t workspace_size;
  size_t reserve_size;
  DenseTensor weight_whole;
  T *w_data = nullptr;
  auto place = dev_ctx.GetPlace();
  auto stream = dev_ctx.stream();
  auto weight_numel = std::accumulate(
      weight_list.begin(),
      weight_list.end(),
      0,
      [](int64_t num, const DenseTensor *t) { return num + t->numel(); });
  bool continuous =
      IsContinuous<T, std::vector<const DenseTensor *>>(weight_list);
#ifdef PADDLE_WITH_HIP
  // Need to permute weight, set continuous to false
  continuous = false;
#endif

  if (!continuous) {
    LOG_FIRST_N(WARNING, 2)
        << "If the memory space of the Input WeightList is not continuous, "
           "less efficient calculation will be called. Please call "
           "flatten_parameters() to make the input memory continuous.";
    weight_whole.Resize({weight_numel});
    dev_ctx.template Alloc<T>(&weight_whole);
#ifdef PADDLE_WITH_HIP
    // MIOPEN need to permute weight for miopenLSTM or miopenGRU
    std::vector<const DenseTensor *> weight_list_tmp = weight_list;
    WeightToPermutedTensor<T>(
        place, stream, &weight_list_tmp, &weight_whole, rnn_mode, is_bidirec);
#else
    WeightToTensor<T>(place, stream, weight_list, &weight_whole);
#endif
    w_data = weight_whole.data<T>();
#ifndef PADDLE_WITH_HIP
    // MIOPEN need to permute weight, do not share with weight_grad
    if (is_test) {  // maybe also reset small weights' ptr for training
      int offset = 0;
      for (size_t i = 0; i < weight_list.size(); ++i) {
        size_t len = weight_list[i]->numel();
        auto dim = weight_list[i]->dims();
        const_cast<DenseTensor *>(weight_list[i])
            ->ShareDataWith(
                weight_whole.Slice(static_cast<int64_t>(offset),
                                   static_cast<int64_t>(offset + len)))
            .Resize(dim);
        offset += len;
      }
    }
#endif
  } else {
    w_data = const_cast<T *>(weight_list[0]->data<T>());
  }

  RNNDescriptors rnn(seq_length,
                     batch_size,
                     input_size_local,
                     hidden_size,
                     num_layers,
                     dropout_prob,
                     seed,
                     weight_numel,
                     rnn_mode,
                     is_bidirec,
                     is_test);
  rnn.Create(handle,
             dev_ctx.GetPlace(),
             SequenceLength,
             &workspace_size,
             &reserve_size,
             dropout_state);

  DenseTensor workspace_data_ =
      Empty<uint8_t>(dev_ctx, {static_cast<int64_t>(workspace_size)});

  reserve->Resize({static_cast<int64_t>(reserve_size)});
  auto *reserve_data = dev_ctx.template Alloc<uint8_t>(reserve);

  if (is_test) {
    RNNInferece(has_seq_length,
                handle,
                seq_length,
                &rnn,
                x_data,
                init_h_data,
                init_c_data,
                w_data,
                out_data,
                last_h_data,
                last_c_data,
                &workspace_data_,
                workspace_size);
  } else {
    if (!has_seq_length) {
      // for train
      // This interface is used when the input/output is unpadded.
#ifdef PADDLE_WITH_HIP
      PADDLE_ENFORCE_GPU_SUCCESS(
          paddle::platform::dynload::miopenRNNForwardTraining(
              handle,
              rnn.rnn_desc(),
              seq_length,
              rnn.x_descs(),
              x_data,
              rnn.init_h_desc(),
              init_h_data,
              rnn.init_c_desc(),
              init_c_data,
              rnn.weight_desc(),
              w_data,
              rnn.y_descs(),
              out_data,
              rnn.last_h_desc(),
              last_h_data,
              rnn.last_c_desc(),
              last_c_data,
              workspace_data_.data<uint8_t>(),
              workspace_size,
              reserve_data,
              reserve_size));
#else
      PADDLE_ENFORCE_GPU_SUCCESS(
          paddle::platform::dynload::cudnnRNNForwardTraining(
              handle,
              rnn.rnn_desc(),
              seq_length,
              rnn.x_descs(),
              x_data,
              rnn.init_h_desc(),
              init_h_data,
              rnn.init_c_desc(),
              init_c_data,
              rnn.weight_desc(),
              w_data,
              rnn.y_descs(),
              out_data,
              rnn.last_h_desc(),
              last_h_data,
              rnn.last_c_desc(),
              last_c_data,
              workspace_data_.data<uint8_t>(),
              workspace_size,
              reserve_data,
              reserve_size));
#endif
    } else {
#if defined(PADDLE_WITH_CUDA) && CUDNN_VERSION >= 7201
      // for train
      // This interface is used when the input/output is padded.
      PADDLE_ENFORCE_GPU_SUCCESS(
          paddle::platform::dynload::cudnnRNNForwardTrainingEx(
              handle,
              rnn.rnn_desc(),
              rnn.x_seq_desc(),
              x_data,
              rnn.init_h_desc(),
              init_h_data,
              rnn.init_c_desc(),
              init_c_data,
              rnn.weight_desc(),
              w_data,
              rnn.y_seq_desc(),
              out_data,
              rnn.last_h_desc(),
              last_h_data,
              rnn.last_c_desc(),
              last_c_data,
              nullptr,
              nullptr,
              nullptr,
              nullptr,
              nullptr,
              nullptr,
              nullptr,
              nullptr,
              workspace_data_.data<uint8_t>(),
              workspace_size,
              reserve_data,
              reserve_size));
#else
      PADDLE_THROW(phi::errors::Unavailable(
          "The padded input is supported by "
          "cudnnRNNForwardTrainingEx, but it only works when "
          "the version of cudnn is larger than 7.2.1"));
#endif
    }
  }
}

}  // namespace phi

#ifdef PADDLE_WITH_HIP
// MIOPEN do not support double
PD_REGISTER_KERNEL(rnn, GPU, ALL_LAYOUT, phi::RnnKernel, float) {}
#else
PD_REGISTER_KERNEL(rnn, GPU, ALL_LAYOUT, phi::RnnKernel, float, double) {}
#endif