/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at

    http://www.apache.org/licenses/LICENSE-2.0

Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

#include <memory>
#include <string>
#include <vector>

#include "dnnl.hpp"
#include "paddle/fluid/framework/operator.h"
#include "paddle/fluid/operators/fused/multi_gru_op.h"
#include "paddle/fluid/platform/errors.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"

namespace paddle {
namespace operators {

using paddle::framework::LoDTensor;
using paddle::framework::Tensor;
using paddle::platform::CPUDeviceContext;
using paddle::platform::CreateKey;
using paddle::platform::MKLDNNGetDataType;
using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast;
using framework::vectorize;
using Direction = dnnl::rnn_direction;

namespace {

// oneDNN RNN dimensions
const int64_t D = 1;  // Directions
const int64_t L = 1;  // Layers (PP supports only 1 stacked layer)
const int64_t G = 3;  // Number of Gates, 3 for GRU

constexpr Direction L2R = Direction::unidirectional_left2right;
constexpr Direction R2L = Direction::unidirectional_right2left;

constexpr const char* dir2str(Direction dir) {
  return dir == L2R ? "LR" : "RL";
}

}  // namespace

template <typename T, typename T_out = T>
class MultiGRUHandler {
 public:
  MultiGRUHandler(const paddle::framework::ExecutionContext& ctx,
                  const platform::MKLDNNDeviceContext& dev_ctx)
      : dev_ctx_(dev_ctx),
        engine_(dev_ctx.GetEngine()),
        place_(ctx.GetPlace()),
        origin_mode_(ctx.Attr<bool>("origin_mode")),
        layers_(ctx.Attr<int>("layers")),
        concat_pds_(layers_, std::shared_ptr<dnnl::concat::primitive_desc>()),
        x_(ctx.Input<LoDTensor>("X")),
        weights_x_(ctx.MultiInput<Tensor>("WeightX")),
        weights_h_(ctx.MultiInput<Tensor>("WeightH")),
        biases_(ctx.MultiInput<Tensor>("Bias")),
        hidden_(ctx.Output<LoDTensor>("Hidden")),
        x_lod_(x_->lod()[0]) {
    PADDLE_ENFORCE_EQ(
        weights_x_.size(), layers_ * 2,
        platform::errors::InvalidArgument("The number of WeightX inputs does "
                                          "not match the number of layers."));
    PADDLE_ENFORCE_EQ(
        weights_h_.size(), layers_ * 2,
        platform::errors::InvalidArgument("The number of WeightH inputs does "
                                          "not match the number of layers."));
    if (biases_.size() > 0)
      PADDLE_ENFORCE_EQ(
          biases_.size(), layers_ * 2,
          platform::errors::InvalidArgument("The number of Bias inputs does "
                                            "not match the number of layers."));
    // oneDNN kernel has hardcoded activation functions
    PADDLE_ENFORCE_EQ(
        ctx.Attr<std::string>("gate_activation"), "sigmoid",
        platform::errors::Unimplemented(
            "oneDNN fusion_gru supports only sigmoid as a gate activation."));
    PADDLE_ENFORCE_EQ(
        ctx.Attr<std::string>("activation"), "tanh",
        platform::errors::Unimplemented(
            "oneDNN fusion_gru supports only tanh as an activation."));

    N_ = x_lod_.size() - 1;  // Number of sentences (batches)
    Ti_ =                    // Max length of the sentence in a batch
        [this]() {
          size_t res = 0;
          for (size_t i = 0; i < (x_lod_.size() - 1); ++i) {
            res = std::max(res, x_lod_[i + 1] - x_lod_[i]);
          }
          return res;
        }();

    // Weights come in pairs, with the same dimensions within a pair
    for (int layer = 0; layer < layers_; ++layer) {
      ICs.push_back(vectorize(weights_x_[2 * layer]->dims())[0]);
      OCs.push_back(vectorize(weights_h_[2 * layer]->dims())[0]);
    }
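
    // Example: with x_lod_ = {0, 2, 5, 6} the batch holds N_ = 3 sentences of
    // lengths 2, 3 and 1, so Ti_ = 3 and every per-sentence buffer handed to
    // oneDNN below is padded to Ti_ time steps.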

    const std::string unique_name = ctx.OutputName("Hidden");
    // Create memory key without Ti because weights, bias and h0 memories
    // do not depend on Ti size but primitive and input/output memory do
    if (platform::MKLDNNDeviceContext::tls().get_cur_mkldnn_session_id() !=
        platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default) {
      memory_key_ = CreateKey(unique_name, MKLDNNGetDataType<T>());
    } else {
      memory_key_ = CreateKey(unique_name, MKLDNNGetDataType<T>(), "-t:",
                              platform::ThreadIDasStr());
    }
    key_ = memory_key_;
    key_.append("T").append(std::to_string(Ti_));

    // Is it int8 kernel
    const bool is_int8 = std::is_same<T, uint8_t>::value;

    // Create attributes for each oneDNN gru
    for (int i = 0; i < 2 * layers_; ++i) {
      attrs_.push_back(dnnl::primitive_attr());
    }

    if (is_int8) {
      // Add int8 attributes
      const auto scale_weights = ctx.MultiInput<Tensor>("Scale_weights");
      PADDLE_ENFORCE_EQ(
          scale_weights.size(), layers_ * 2,
          platform::errors::InvalidArgument(
              "The number of weight scale inputs does "
              "not match the number of layers. Expected: %d. Actual: %d",
              layers_ * 2, scale_weights.size()));
      const float scale_data = ctx.Attr<float>("Scale_data");
      const float shift_data = ctx.Attr<float>("Shift_data");

      const int weights_scale_mask =
          0 +
          (1 << 3)  // bit, indicating the unique scales for `g` dim in `ldigo`
          +
          (1 << 4);  // bit, indicating the unique scales for `o` dim in `ldigo`

      int w_scale_num = scale_weights.size();
      for (int i = 0; i < w_scale_num; ++i) {
        attrs_[i].set_rnn_data_qparams(scale_data, shift_data);
        const auto scale_weights_data = std::vector<float>(
            scale_weights[i]->data<float>(),
            scale_weights[i]->data<float>() + scale_weights[i]->numel());
        attrs_[i].set_rnn_weights_qparams(weights_scale_mask,
                                          scale_weights_data);
      }
    }

    for (int layer = 0; layer < layers_; ++layer) {
      AcquireGruPrimitiveDescriptor(layer, L2R);
      AcquireGruPrimitiveDescriptor(layer, R2L);
      AcquireConcatPrimitiveDescriptor(layer);
    }
  }

  void AcquireGruPrimitiveDescriptor(int layer, Direction dir) {
    auto pd_key = key_;
    pd_key.append("@gru_pd").append(dir2str(dir)).append(std::to_string(layer));
    auto pd = std::static_pointer_cast<dnnl::gru_forward::primitive_desc>(
        dev_ctx_.GetBlob(pd_key));
    if (pd == nullptr) {
      const bool is_int8 = std::is_same<T, uint8_t>::value;
      // Weights for int8 kernel are of a type s8
      const auto weights_dt =
          is_int8 ? dnnl::memory::data_type::s8 : dnnl::memory::data_type::f32;

      auto x_md = MKLDNNMemDesc({Ti_, N_, ICs[layer]}, MKLDNNGetDataType<T>(),
                                MKLDNNMemoryFormat::ntc);
      auto h0_md = MKLDNNMemDesc({L, D, N_, OCs[layer]}, MKLDNNGetDataType<T>(),
                                 MKLDNNMemoryFormat::ldnc);
      auto wx_md = MKLDNNMemDesc({L, D, ICs[layer], G, OCs[layer]}, weights_dt,
                                 MKLDNNMemoryFormat::any);
      auto wh_md = MKLDNNMemDesc({L, D, OCs[layer], G, OCs[layer]}, weights_dt,
                                 MKLDNNMemoryFormat::any);
      auto b_md = MKLDNNMemDesc({L, D, G, OCs[layer]},
                                MKLDNNGetDataType<float>(),
                                MKLDNNMemoryFormat::ldgo);
      auto h_md = MKLDNNMemDesc(
          {Ti_, N_, OCs[layer]},
          (layer == layers_ - 1) ? MKLDNNGetDataType<T_out>()
                                 : MKLDNNGetDataType<T>(),
          MKLDNNMemoryFormat::ntc);

      auto desc = std::make_shared<dnnl::gru_forward::desc>(
          dnnl::prop_kind::forward_inference, dir, x_md, h0_md, wx_md, wh_md,
          b_md, h_md, dnnl::memory::desc());
      pd = std::make_shared<dnnl::gru_forward::primitive_desc>(
          *desc, attrs_[2 * layer + (dir == R2L)], engine_);
      PADDLE_ENFORCE_NOT_NULL(
          pd, platform::errors::InvalidArgument(
                  "Primitive descriptor for gru_forward cannot be null."));
      dev_ctx_.SetBlob(pd_key, pd);
    }
    gru_pds_[{layer, dir}] = pd;
  }
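
  // Note: each layer owns two GRU primitives (L2R and R2L) whose attributes
  // live in attrs_[2 * layer] and attrs_[2 * layer + 1]. The same attr is also
  // passed to the weight/h0 reorders below, so for the int8 kernel the f32
  // user weights are expected to be quantized by oneDNN during that reorder.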

  void AcquireConcatPrimitiveDescriptor(int layer) {
    auto pd_key = key_;
    pd_key.append("@c_pd").append(std::to_string(layer));
    auto pd = std::static_pointer_cast<dnnl::concat::primitive_desc>(
        dev_ctx_.GetBlob(pd_key));
    if (pd == nullptr) {
      const int axis = 2;
      auto in_md = MKLDNNMemDesc(
          {Ti_, N_, OCs[layer]},
          (layer == layers_ - 1) ? MKLDNNGetDataType<T_out>()
                                 : MKLDNNGetDataType<T>(),
          MKLDNNMemoryFormat::ntc);
      std::vector<dnnl::memory::desc> src_mds{in_md, in_md};
      pd = std::make_shared<dnnl::concat::primitive_desc>(axis, src_mds,
                                                          engine_);
      dev_ctx_.SetBlob(pd_key, pd);
    }
    concat_pds_[layer] = pd;
  }

  std::shared_ptr<dnnl::memory> AcquireInputMemoryWithReorder() {
    auto key = key_;
    key.append("@x_m");
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(key));
    if (!memory_p) {
      memory_p = std::make_shared<dnnl::memory>(gru_pds_[{0, L2R}]->src_desc(),
                                                engine_);
      dev_ctx_.SetBlob(key, memory_p);
    }

    auto* x_data = to_void_cast(x_->data<T>());

    auto* x_onednn_data = memory_p->get_data_handle();
    memset(x_onednn_data, 0, sizeof(T) * N_ * Ti_ * ICs[0]);

    if (platform::GetMKLDNNFormat(gru_pds_[{0, L2R}]->src_desc()) ==
        dnnl::memory::format_tag::ntc) {
      reorderPPtoNTC(x_data, x_onednn_data, x_lod_, 0, L2R);
    } else {
      reorderPPtoTNC(x_data, x_onednn_data, x_lod_, 0, L2R);
    }
    return memory_p;
  }

  // Reorder input memory [WORDS, C] + LoD -> [N, T, C]
  void reorderPPtoNTC(void* input_data, void* output_data,
                      std::vector<size_t> lod, int layer, Direction dir) {
    auto* input_data_iter = reinterpret_cast<T*>(input_data);
    auto* output_data_iter = reinterpret_cast<T*>(output_data);
    for (int n = 0; n < N_; ++n) {
      const auto num_elements = (lod[n + 1] - lod[n]) * ICs[layer];
      const auto offset = dir == R2L ? (Ti_ * ICs[layer] - num_elements) : 0;
      memcpy(output_data_iter + n * Ti_ * ICs[layer] + offset, input_data_iter,
             sizeof(T) * num_elements);
      input_data_iter += num_elements;
    }
  }
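
  // Example: with Ti_ = 5 and a sentence of length 3, the L2R layout stores
  // [x0 x1 x2 0 0] while the R2L layout stores [0 0 x0 x1 x2]; keeping the
  // real tokens at the end lets the right-to-left primitive visit them before
  // it reaches the zero padding.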

  // Reorder input memory [WORDS, C] + LoD -> [T, N, C]
  void reorderPPtoTNC(void* input_data, void* output_data,
                      std::vector<size_t> lod, int layer, Direction dir) {
    auto* input_data_iter = reinterpret_cast<T*>(input_data);
    auto* output_data_iter = reinterpret_cast<T*>(output_data);
    for (int n = 0; n < N_; ++n) {
      const auto num_elements = (lod[n + 1] - lod[n]);
      const auto offset = dir == R2L ? (Ti_ - num_elements) : 0;
      for (size_t t = 0; t < num_elements; ++t) {
        memcpy(
            output_data_iter + (t + offset) * N_ * ICs[layer] + n * ICs[layer],
            input_data_iter, sizeof(T) * ICs[layer]);
        input_data_iter += ICs[layer];
      }
    }
  }

  std::shared_ptr<dnnl::memory> executeSingleGru(
      std::shared_ptr<dnnl::memory> input_mem, int layer, Direction dir) {
    auto h0_mem = AcquireH0Memory(layer, dir);
    auto wx_mem = AcquireWeightXMemory(layer, dir);
    auto wh_mem = AcquireWeightHMemory(layer, dir);
    auto b_mem = AcquireBiasMemory(layer, dir);
    auto out_mem = AcquireGruOutputMemory(layer, dir);

    std::unordered_map<int, dnnl::memory> gru_args = {
        {DNNL_ARG_SRC_LAYER, *input_mem},  {DNNL_ARG_SRC_ITER, *h0_mem},
        {DNNL_ARG_WEIGHTS_LAYER, *wx_mem}, {DNNL_ARG_WEIGHTS_ITER, *wh_mem},
        {DNNL_ARG_BIAS, *b_mem},           {DNNL_ARG_DST_LAYER, *out_mem}};

    auto gru_forward_p0 = AcquireGruPrimitive(layer, dir);

    dnnl::stream astream(engine_);
    gru_forward_p0->execute(astream, gru_args);
    astream.wait();

    return out_mem;
  }

  // TODO(grygielski) H0 is for now persistable
  std::shared_ptr<dnnl::memory> AcquireH0Memory(int layer, Direction dir) {
    auto key = memory_key_;
    key.append("@h0").append(dir2str(dir)).append(std::to_string(layer));
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(key));
    if (!memory_p) {
      auto user_h0_memory = dnnl::memory();
      user_h0_memory = dnnl::memory({{1, 1, N_, OCs[layer]},
                                     MKLDNNGetDataType<float>(),
                                     MKLDNNMemoryFormat::ldnc},
                                    engine_);
      memset(user_h0_memory.get_data_handle(), 0,
             sizeof(float) * N_ * OCs[layer]);
      memory_p = std::make_shared<dnnl::memory>(
          gru_pds_[{layer, dir}]->src_iter_desc(), engine_);

      dnnl::stream astream(engine_);
      dnnl::reorder(user_h0_memory, *memory_p, attrs_[2 * layer + (dir == R2L)])
          .execute(astream, user_h0_memory, *memory_p);

      dev_ctx_.SetBlob(key, memory_p);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquireWeightXMemory(int layer, Direction dir) {
    auto key = memory_key_;
    key.append("@wx").append(dir2str(dir)).append(std::to_string(layer));
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(key));
    if (!memory_p) {
      auto user_md =
          MKLDNNMemDesc({1, 1, ICs[layer], 3, OCs[layer]},
                        MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo);
      auto user_memory = dnnl::memory(user_md, engine_);

      auto* weight_x_data =
          reinterpret_cast<float*>(user_memory.get_data_handle());
      int idx = layer * 2 + (dir == R2L);
      memcpy(weight_x_data, weights_x_[idx]->data<float>(),
             sizeof(float) * ICs[layer] * 3 * OCs[layer]);

      if (origin_mode_ == false) {
        for (int64_t i = 0; i < ICs[layer]; ++i) {
          for (int64_t j = 0; j < OCs[layer]; ++j) {
            weight_x_data[j] *= -1;
          }
          weight_x_data += 3 * OCs[layer];
        }
      }

      memory_p = std::make_shared<dnnl::memory>(
          gru_pds_[{layer, dir}]->weights_layer_desc(), engine_);

      dnnl::stream astream(engine_);
      dnnl::reorder(user_memory, *memory_p, attrs_[2 * layer + (dir == R2L)])
          .execute(astream, user_memory, *memory_p);

      dev_ctx_.SetBlob(key, memory_p);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquireWeightHMemory(int layer, Direction dir) {
    auto key = memory_key_;
    key.append("@wh").append(dir2str(dir)).append(std::to_string(layer));
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(key));
    if (!memory_p) {
      auto user_md =
          MKLDNNMemDesc({1, 1, OCs[layer], 3, OCs[layer]},
                        MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo);
      auto user_memory = dnnl::memory(user_md, engine_);

      // Reorder weights_h from PP format [OC, 2OC] + [OC, OC] to
      // oneDNN format [OC, 3OC]
      auto* weight_h_data =
          reinterpret_cast<float*>(user_memory.get_data_handle());
      int idx = layer * 2 + (dir == R2L);
      auto* user_weight_h_data = weights_h_[idx]->data<float>();

      auto src1_iter = user_weight_h_data;
      auto src2_iter = user_weight_h_data + 2 * OCs[layer] * OCs[layer];
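
      // Each oneDNN row is assembled from the two PP blocks: 2*OC update/reset
      // weights from the first block followed by OC candidate weights from the
      // second one, giving one [3*OC] row per output channel.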
      for (int64_t c = 0; c < OCs[layer]; ++c) {
        memcpy(weight_h_data, src1_iter, 2 * OCs[layer] * sizeof(float));
        memcpy(weight_h_data + 2 * OCs[layer], src2_iter,
               OCs[layer] * sizeof(float));

        src1_iter += 2 * OCs[layer];
        src2_iter += OCs[layer];
        weight_h_data += 3 * OCs[layer];
      }

      weight_h_data = reinterpret_cast<float*>(user_memory.get_data_handle());

      if (origin_mode_ == false) {
        for (int64_t i = 0; i < OCs[layer]; ++i) {
          for (int64_t j = 0; j < OCs[layer]; ++j) {
            weight_h_data[j] *= -1;
          }
          weight_h_data += 3 * OCs[layer];
        }
      }

      memory_p = std::make_shared<dnnl::memory>(
          gru_pds_[{layer, dir}]->weights_iter_desc(), engine_);

      dnnl::stream astream(engine_);
      dnnl::reorder(user_memory, *memory_p, attrs_[2 * layer + (dir == R2L)])
          .execute(astream, user_memory, *memory_p);

      dev_ctx_.SetBlob(key, memory_p);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquireBiasMemory(int layer, Direction dir) {
    auto key = memory_key_;
    key.append("@b").append(dir2str(dir)).append(std::to_string(layer));
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(key));
    if (!memory_p) {
      memory_p = std::make_shared<dnnl::memory>(
          gru_pds_[{layer, dir}]->bias_desc(), engine_);
      auto* bias_data = reinterpret_cast<float*>(memory_p->get_data_handle());

      int idx = layer * 2 + (dir == R2L);
      if (biases_.size() > 0 && biases_[idx]) {
        const float* user_bias_data =
            biases_[idx]->data<float>();  // Bias in oneDNN is always float
        memcpy(bias_data, user_bias_data, sizeof(float) * 3 * OCs[layer]);
      } else {
        // oneDNN always need bias memory, if it's not provided in PP, let
        // oneDNN allocate memory and set it to 0
        memset(bias_data, 0, sizeof(float) * 3 * OCs[layer]);
      }

      if (origin_mode_ == false && biases_.size() && biases_[idx]) {
        for (int64_t i = 0; i < OCs[layer]; ++i) {
          bias_data[i] *= -1;
        }
      }
      dev_ctx_.SetBlob(key, memory_p);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquireGruOutputMemory(int layer,
                                                       Direction dir) {
    auto key = key_;
    key.append("@h_m").append(dir2str(dir)).append(std::to_string(layer));
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(key));
    if (!memory_p) {
      memory_p = std::make_shared<dnnl::memory>(
          gru_pds_[{layer, dir}]->dst_desc(), engine_);
      dev_ctx_.SetBlob(key, memory_p);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::gru_forward> AcquireGruPrimitive(int layer,
                                                         Direction dir) {
    auto key = key_;
    key.append("@gru_p").append(dir2str(dir)).append(std::to_string(layer));
    auto prim =
        std::static_pointer_cast<dnnl::gru_forward>(dev_ctx_.GetBlob(key));
    if (prim == nullptr) {
      prim = std::make_shared<dnnl::gru_forward>(*gru_pds_[{layer, dir}]);
      dev_ctx_.SetBlob(key, prim);
    }
    return prim;
  }

  void reorderInputL2RtoR2L(std::shared_ptr<dnnl::memory> mem, int layer) {
    auto* data = mem->get_data_handle();
    auto* data_iter = reinterpret_cast<T*>(data);
    for (int n = 0; n < N_; ++n) {
      const auto num_elements = (x_lod_[n + 1] - x_lod_[n]) * ICs[layer];
      const auto offset = Ti_ * ICs[layer] - num_elements;
      memmove(data_iter + offset, data_iter, sizeof(T) * num_elements);
      memset(data_iter, 0, sizeof(T) * offset);
      data_iter += Ti_ * ICs[layer];
    }
  }

  template <typename K>
  void reorderOutputR2LtoL2R(std::shared_ptr<dnnl::memory> mem, int layer) {
    auto* data = mem->get_data_handle();
    auto* data_iter = reinterpret_cast<K*>(data);
    for (int n = 0; n < N_; ++n) {
      const auto num_elements = (x_lod_[n + 1] - x_lod_[n]) * OCs[layer];
      const auto offset = Ti_ * OCs[layer] - num_elements;
      memmove(data_iter, data_iter + offset, sizeof(K) * num_elements);
      memset(data_iter + num_elements, 0, sizeof(K) * offset);
      data_iter += Ti_ * OCs[layer];
    }
  }
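
  // The two helpers above handle the end-alignment used for the R2L pass:
  // reorderInputL2RtoR2L shifts every sentence of the shared input buffer to
  // the end of its Ti_ window, while reorderOutputR2LtoL2R shifts the R2L
  // output back to the front so it can be concatenated with the L2R output.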

  std::shared_ptr<dnnl::memory> executeConcat(
      std::shared_ptr<dnnl::memory> mem1, std::shared_ptr<dnnl::memory> mem2,
      int layer) {
    auto out_mem = AcquireConcatOutputMemory(layer);

    std::unordered_map<int, dnnl::memory> concat_args{
        {DNNL_ARG_MULTIPLE_SRC, *mem1},
        {DNNL_ARG_MULTIPLE_SRC + 1, *mem2},
        {DNNL_ARG_DST, *out_mem}};

    auto concat_p = AcquireConcatPrimitive(layer);

    dnnl::stream astream(engine_);
    concat_p->execute(astream, concat_args);
    astream.wait();

    return out_mem;
  }

  std::shared_ptr<std::vector<dnnl::memory>> AcquireConcatInputMemories(
      int layer) {
    auto key = key_;
    key.append("@ci_m").append(std::to_string(layer));
    auto memory_p = std::static_pointer_cast<std::vector<dnnl::memory>>(
        dev_ctx_.GetBlob(key));
    if (!memory_p) {
      std::vector<dnnl::memory> src_mems{
          dnnl::memory(concat_pds_[layer]->src_desc(0), engine_),
          dnnl::memory(concat_pds_[layer]->src_desc(1), engine_)};
      memory_p = std::make_shared<std::vector<dnnl::memory>>(src_mems);
      dev_ctx_.SetBlob(key, memory_p);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::memory> AcquireConcatOutputMemory(int layer) {
    auto key = key_;
    key.append("@co_m").append(std::to_string(layer));
    auto memory_p =
        std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(key));
    if (!memory_p) {
      memory_p = std::make_shared<dnnl::memory>(concat_pds_[layer]->dst_desc(),
                                                engine_);
      dev_ctx_.SetBlob(key, memory_p);
    }
    return memory_p;
  }

  std::shared_ptr<dnnl::concat> AcquireConcatPrimitive(int layer) {
    auto key = key_;
    key.append("@c_p").append(std::to_string(layer));
    auto prim = std::static_pointer_cast<dnnl::concat>(dev_ctx_.GetBlob(key));
    if (prim == nullptr) {
      prim = std::make_shared<dnnl::concat>(*concat_pds_[layer]);
      dev_ctx_.SetBlob(key, prim);
    }
    return prim;
  }

  template <typename Tout>
  void reorderOutput(std::shared_ptr<dnnl::memory> mem, int layer) {
    auto* data = mem->get_data_handle();
    auto* hidden_data = to_void_cast(hidden_->mutable_data<Tout>(place_));
    if (isNTC(layers_ - 1)) {
      reorderNTCtoPP(data, hidden_data, layers_ - 1);
    } else {
      reorderTNCtoPP(data, hidden_data, layers_ - 1);
    }
  }

  bool isNTC(int layer) {
    return (platform::GetMKLDNNFormat(gru_pds_[{layer, L2R}]->dst_desc()) ==
            dnnl::memory::format_tag::ntc);
  }

  int getLayers() const { return layers_; }

  // Reorder output values to PP format [N, T, C] -> [WORDS, C]
  void reorderNTCtoPP(void* input_data, void* output_data, int layer) {
    auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
    auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
    auto oc = OCs[layer] * 2;
    for (int n = 0; n < N_; ++n) {
      const auto num_elements = (x_lod_[n + 1] - x_lod_[n]) * oc;
      memcpy(output_data_iter, input_data_iter + n * Ti_ * oc,
             sizeof(T_out) * num_elements);
      output_data_iter += num_elements;
    }
  }

  // Reorder output values to PP format [T, N, C] -> [WORDS, C]
  void reorderTNCtoPP(void* input_data, void* output_data, int layer) {
    auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
    auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
    for (int n = 0; n < N_; ++n) {
      const auto num_elements = x_lod_[n + 1] - x_lod_[n];
      for (size_t t = 0; t < num_elements; ++t) {
        memcpy(output_data_iter,
               input_data_iter + t * N_ * OCs[layer] + n * OCs[layer],
               sizeof(T_out) * OCs[layer]);
        output_data_iter += OCs[layer];
      }
    }
  }

 private:
  // RNN dimensions
  // N - Batch Size
  // Ti - Max sentence length
  // ICs - Input Channels
  // OCs - Output Channels
  int64_t N_, Ti_;
  std::vector<int64_t> ICs, OCs;

  const platform::MKLDNNDeviceContext& dev_ctx_;
  const dnnl::engine engine_;
  const platform::Place place_;
  const bool origin_mode_;
  const int layers_;

  std::map<std::pair<int, Direction>,
           std::shared_ptr<dnnl::gru_forward::primitive_desc>>
      gru_pds_;
  std::vector<std::shared_ptr<dnnl::concat::primitive_desc>> concat_pds_;

  std::string key_;
  // Memory size of weights, bias and h0 does not depend
  // on Ti size, thus we need another key to cache them
  std::string memory_key_;

  const LoDTensor* x_;
  const std::vector<const Tensor*> weights_x_;
  const std::vector<const Tensor*> weights_h_;
  const std::vector<const Tensor*> biases_;
  LoDTensor* hidden_;
  std::vector<dnnl::primitive_attr> attrs_;
  const paddle::framework::Vector<size_t>& x_lod_;
};
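
// The kernel below runs, for every layer, a left-to-right and a right-to-left
// GRU over the same (re-aligned) input buffer, concatenates both outputs along
// the channel axis, feeds the concat result to the next layer, and finally
// writes the last concat back to the Hidden LoDTensor.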

template <typename T>
class MultiGRUMKLDNNKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    const bool force_fp32_output =
        ctx.HasAttr("force_fp32_output") && ctx.Attr<bool>("force_fp32_output");

    if (force_fp32_output) {
      RunKernel<float>(ctx);
    } else {
      RunKernel<T>(ctx);
    }
  }

  template <typename Tout = T>
  void RunKernel(const framework::ExecutionContext& ctx) const {
    auto& dev_ctx =
        ctx.template device_context<platform::MKLDNNDeviceContext>();
    MultiGRUHandler<T, Tout> handler(ctx, dev_ctx);

    int layers = handler.getLayers();
    auto input_mem = handler.AcquireInputMemoryWithReorder();
    for (int layer = 0; layer < layers; ++layer) {
      auto gru_out_L2R = handler.executeSingleGru(input_mem, layer, L2R);
      handler.reorderInputL2RtoR2L(input_mem, layer);
      auto gru_out_R2L = handler.executeSingleGru(input_mem, layer, R2L);
      if (layer < layers - 1)
        handler.template reorderOutputR2LtoL2R<T>(gru_out_R2L, layer);
      else
        handler.template reorderOutputR2LtoL2R<Tout>(gru_out_R2L, layer);
      input_mem = handler.executeConcat(gru_out_L2R, gru_out_R2L, layer);
    }
    handler.template reorderOutput<Tout>(input_mem, layers - 1);
  }
};

}  // namespace operators
}  // namespace paddle

namespace ops = paddle::operators;
REGISTER_OP_KERNEL(multi_gru, MKLDNN, paddle::platform::CPUPlace,
                   ops::MultiGRUMKLDNNKernel<float>,
                   ops::MultiGRUMKLDNNKernel<uint8_t>);