diff --git a/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..b7fd40f78ff9d3fb829c1a0d5c2cc91a62a8455c
--- /dev/null
+++ b/paddle/fluid/operators/fused/mkldnn/multi_gru_mkldnn_op.cc
@@ -0,0 +1,694 @@
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <memory>
+#include <string>
+#include <vector>
+#include "dnnl.hpp"
+#include "paddle/fluid/framework/operator.h"
+#include "paddle/fluid/operators/fused/multi_gru_op.h"
+#include "paddle/fluid/platform/errors.h"
+#include "paddle/fluid/platform/mkldnn_reuse.h"
+
+namespace paddle {
+namespace operators {
+
+using paddle::framework::LoDTensor;
+using paddle::framework::Tensor;
+using paddle::platform::CPUDeviceContext;
+using paddle::platform::CreateKey;
+using paddle::platform::MKLDNNGetDataType;
+using paddle::platform::MKLDNNMemDesc;
+using platform::to_void_cast;
+using framework::vectorize;
+using Direction = dnnl::rnn_direction;
+
+namespace {
+
+// oneDNN RNN dimensions
+const int64_t D = 1;  // Directions
+const int64_t L = 1;  // Layers (PP supports only 1 stacked layer)
+const int64_t G = 3;  // Number of Gates, 3 for GRU
+
+constexpr Direction L2R = Direction::unidirectional_left2right;
+constexpr Direction R2L = Direction::unidirectional_right2left;
+
+constexpr const char* dir2str(Direction dir) {
+  return dir == L2R ?
"LR" : "RL"; +} + +} // namespace + +template +class MultiGRUHandler { + public: + MultiGRUHandler(const paddle::framework::ExecutionContext& ctx, + const platform::MKLDNNDeviceContext& dev_ctx) + : dev_ctx_(dev_ctx), + engine_(dev_ctx.GetEngine()), + place_(ctx.GetPlace()), + origin_mode_(ctx.Attr("origin_mode")), + layers_(ctx.Attr("layers")), + concat_pds_(layers_, std::shared_ptr()), + x_(ctx.Input("X")), + weights_x_(ctx.MultiInput("WeightX")), + weights_h_(ctx.MultiInput("WeightH")), + biases_(ctx.MultiInput("Bias")), + hidden_(ctx.Output("Hidden")), + x_lod_(x_->lod()[0]) { + PADDLE_ENFORCE_EQ( + weights_x_.size(), layers_ * 2, + platform::errors::InvalidArgument("The number of WeightX inputs does " + "not match the number of layers.")); + PADDLE_ENFORCE_EQ( + weights_h_.size(), layers_ * 2, + platform::errors::InvalidArgument("The number of WeightH inputs does " + "not match the number of layers.")); + if (biases_.size() > 0) + PADDLE_ENFORCE_EQ( + biases_.size(), layers_ * 2, + platform::errors::InvalidArgument("The number of Bias inputs does " + "not match the number of layers.")); + // oneDNN kernel has hardcoded activation functions + PADDLE_ENFORCE_EQ( + ctx.Attr("gate_activation"), "sigmoid", + platform::errors::Unimplemented( + "oneDNN fusion_gru supports only sigmoid as a gate activation.")); + PADDLE_ENFORCE_EQ( + ctx.Attr("activation"), "tanh", + platform::errors::Unimplemented( + "oneDNN fusion_gru supports only tanh as an activation.")); + + N_ = x_lod_.size() - 1; // Number of sentences (batches) + Ti_ = // Max length of the sentence in a batch + [this]() { + size_t res = 0; + for (size_t i = 0; i < (x_lod_.size() - 1); ++i) { + res = std::max(res, x_lod_[i + 1] - x_lod_[i]); + } + return res; + }(); + + // Weights come in pairs, with the same dimensions within a pair + for (int layer = 0; layer < layers_; ++layer) { + ICs.push_back(vectorize(weights_x_[2 * layer]->dims())[0]); + OCs.push_back(vectorize(weights_h_[2 * layer]->dims())[0]); + } + + const std::string unique_name = ctx.OutputName("Hidden"); + // Create memory key without Ti because weights, bias and h0 memories + // do not depend on Ti size but primitive and input/output memory do + if (platform::MKLDNNDeviceContext::tls().get_cur_mkldnn_session_id() != + platform::MKLDNNDeviceContextThreadLocals::kMKLDNNSessionID_Default) { + memory_key_ = CreateKey(unique_name, MKLDNNGetDataType()); + } else { + memory_key_ = CreateKey(unique_name, MKLDNNGetDataType(), "-t:", + platform::ThreadIDasStr()); + } + key_ = memory_key_; + key_.append("T").append(std::to_string(Ti_)); + + // Is it int8 kernel + const bool is_int8 = std::is_same::value; + + // Create attributes for each oneDNN gru + for (int i = 0; i < 2 * layers_; ++i) { + attrs_.push_back(dnnl::primitive_attr()); + } + + if (is_int8) { + // Add int8 attributes + const auto scale_weights = ctx.MultiInput("Scale_weights"); + PADDLE_ENFORCE_EQ( + scale_weights.size(), layers_ * 2, + platform::errors::InvalidArgument( + "The number of weight scale inputs does " + "not match the number of layers. Expected: %d. 
Actual: %d", + layers_ * 2, scale_weights.size())); + const float scale_data = ctx.Attr("Scale_data"); + const float shift_data = ctx.Attr("Shift_data"); + + const int weights_scale_mask = + 0 + + (1 << 3) // bit, indicating the unique scales for `g` dim in `ldigo` + + + (1 << 4); // bit, indicating the unique scales for `o` dim in `ldigo` + + int w_scale_num = scale_weights.size(); + for (int i = 0; i < w_scale_num; ++i) { + attrs_[i].set_rnn_data_qparams(scale_data, shift_data); + const auto scale_weights_data = std::vector( + scale_weights[i]->data(), + scale_weights[i]->data() + scale_weights[i]->numel()); + attrs_[i].set_rnn_weights_qparams(weights_scale_mask, + scale_weights_data); + } + } + + for (int layer = 0; layer < layers_; ++layer) { + AcquireGruPrimitiveDescriptor(layer, L2R); + AcquireGruPrimitiveDescriptor(layer, R2L); + AcquireConcatPrimitiveDescriptor(layer); + } + } + + void AcquireGruPrimitiveDescriptor(int layer, Direction dir) { + auto pd_key = key_; + pd_key.append("@gru_pd").append(dir2str(dir)).append(std::to_string(layer)); + auto pd = std::static_pointer_cast( + dev_ctx_.GetBlob(pd_key)); + if (pd == nullptr) { + const bool is_int8 = std::is_same::value; + // Weights for int8 kernel are of a type s8 + const auto weights_dt = + is_int8 ? dnnl::memory::data_type::s8 : dnnl::memory::data_type::f32; + + auto x_md = MKLDNNMemDesc({Ti_, N_, ICs[layer]}, MKLDNNGetDataType(), + MKLDNNMemoryFormat::ntc); + auto h0_md = MKLDNNMemDesc({L, D, N_, OCs[layer]}, MKLDNNGetDataType(), + MKLDNNMemoryFormat::ldnc); + auto wx_md = MKLDNNMemDesc({L, D, ICs[layer], G, OCs[layer]}, weights_dt, + MKLDNNMemoryFormat::any); + auto wh_md = MKLDNNMemDesc({L, D, OCs[layer], G, OCs[layer]}, weights_dt, + MKLDNNMemoryFormat::any); + auto b_md = + MKLDNNMemDesc({L, D, G, OCs[layer]}, MKLDNNGetDataType(), + MKLDNNMemoryFormat::ldgo); + auto h_md = + MKLDNNMemDesc({Ti_, N_, OCs[layer]}, + (layer == layers_ - 1) ? MKLDNNGetDataType() + : MKLDNNGetDataType(), + MKLDNNMemoryFormat::ntc); + + auto desc = std::make_shared( + dnnl::prop_kind::forward_inference, dir, x_md, h0_md, wx_md, wh_md, + b_md, h_md, dnnl::memory::desc()); + pd = std::make_shared( + *desc, attrs_[2 * layer + (dir == R2L)], engine_); + PADDLE_ENFORCE_NOT_NULL( + pd, platform::errors::InvalidArgument( + "Primitive descriptor for gru_forward cannot be null.")); + dev_ctx_.SetBlob(pd_key, pd); + } + gru_pds_[{layer, dir}] = pd; + } + + void AcquireConcatPrimitiveDescriptor(int layer) { + auto pd_key = key_; + pd_key.append("@c_pd").append(std::to_string(layer)); + auto pd = std::static_pointer_cast( + dev_ctx_.GetBlob(pd_key)); + if (pd == nullptr) { + const int axis = 2; + auto in_md = + MKLDNNMemDesc({Ti_, N_, OCs[layer]}, + (layer == layers_ - 1) ? 
MKLDNNGetDataType() + : MKLDNNGetDataType(), + MKLDNNMemoryFormat::ntc); + + std::vector src_mds{in_md, in_md}; + pd = std::make_shared(axis, src_mds, + engine_); + dev_ctx_.SetBlob(pd_key, pd); + } + concat_pds_[layer] = pd; + } + + std::shared_ptr AcquireInputMemoryWithReorder() { + auto key = key_; + key.append("@x_m"); + auto memory_p = + std::static_pointer_cast(dev_ctx_.GetBlob(key)); + + if (!memory_p) { + memory_p = std::make_shared(gru_pds_[{0, L2R}]->src_desc(), + engine_); + dev_ctx_.SetBlob(key, memory_p); + } + + auto* x_data = to_void_cast(x_->data()); + + auto* x_onednn_data = memory_p->get_data_handle(); + memset(x_onednn_data, 0, sizeof(T) * N_ * Ti_ * ICs[0]); + + if (platform::GetMKLDNNFormat(gru_pds_[{0, L2R}]->src_desc()) == + dnnl::memory::format_tag::ntc) { + reorderPPtoNTC(x_data, x_onednn_data, x_lod_, 0, L2R); + } else { + reorderPPtoTNC(x_data, x_onednn_data, x_lod_, 0, L2R); + } + return memory_p; + } + + // Reorder input memory [WORDS, C] + LoD -> [N, T, C] + void reorderPPtoNTC(void* input_data, void* output_data, + std::vector lod, int layer, Direction dir) { + auto* input_data_iter = reinterpret_cast(input_data); + auto* output_data_iter = reinterpret_cast(output_data); + for (int n = 0; n < N_; ++n) { + const auto num_elements = (lod[n + 1] - lod[n]) * ICs[layer]; + const auto offset = dir == R2L ? (Ti_ * ICs[layer] - num_elements) : 0; + memcpy(output_data_iter + n * Ti_ * ICs[layer] + offset, input_data_iter, + sizeof(T) * num_elements); + input_data_iter += num_elements; + } + } + + // Reorder input memory [WORDS, C] + LoD -> [T, N, C] + void reorderPPtoTNC(void* input_data, void* output_data, + std::vector lod, int layer, Direction dir) { + auto* input_data_iter = reinterpret_cast(input_data); + auto* output_data_iter = reinterpret_cast(output_data); + for (int n = 0; n < N_; ++n) { + const auto num_elements = (lod[n + 1] - lod[n]); + const auto offset = dir == R2L ? 
(Ti_ - num_elements) : 0; + for (size_t t = 0; t < num_elements; ++t) { + memcpy( + output_data_iter + (t + offset) * N_ * ICs[layer] + n * ICs[layer], + input_data_iter, sizeof(T) * ICs[layer]); + input_data_iter += ICs[layer]; + } + } + } + + std::shared_ptr executeSingleGru( + std::shared_ptr input_mem, int layer, Direction dir) { + auto h0_mem = AcquireH0Memory(layer, dir); + auto wx_mem = AcquireWeightXMemory(layer, dir); + auto wh_mem = AcquireWeightHMemory(layer, dir); + auto b_mem = AcquireBiasMemory(layer, dir); + auto out_mem = AcquireGruOutputMemory(layer, dir); + + std::unordered_map gru_args = { + {DNNL_ARG_SRC_LAYER, *input_mem}, {DNNL_ARG_SRC_ITER, *h0_mem}, + {DNNL_ARG_WEIGHTS_LAYER, *wx_mem}, {DNNL_ARG_WEIGHTS_ITER, *wh_mem}, + {DNNL_ARG_BIAS, *b_mem}, {DNNL_ARG_DST_LAYER, *out_mem}}; + + auto gru_forward_p0 = AcquireGruPrimitive(layer, dir); + + dnnl::stream astream(engine_); + gru_forward_p0->execute(astream, gru_args); + astream.wait(); + return out_mem; + } + + // TODO(grygielski) H0 is for now persistable + std::shared_ptr AcquireH0Memory(int layer, Direction dir) { + auto key = memory_key_; + key.append("@h0").append(dir2str(dir)).append(std::to_string(layer)); + auto memory_p = + std::static_pointer_cast(dev_ctx_.GetBlob(key)); + if (!memory_p) { + auto user_h0_memory = dnnl::memory(); + user_h0_memory = dnnl::memory({{1, 1, N_, OCs[layer]}, + MKLDNNGetDataType(), + MKLDNNMemoryFormat::ldnc}, + engine_); + memset(user_h0_memory.get_data_handle(), 0, + sizeof(float) * N_ * OCs[layer]); + memory_p = std::make_shared( + gru_pds_[{layer, dir}]->src_iter_desc(), engine_); + + dnnl::stream astream(engine_); + dnnl::reorder(user_h0_memory, *memory_p, attrs_[2 * layer + (dir == R2L)]) + .execute(astream, user_h0_memory, *memory_p); + + dev_ctx_.SetBlob(key, memory_p); + } + return memory_p; + } + + std::shared_ptr AcquireWeightXMemory(int layer, Direction dir) { + auto key = memory_key_; + key.append("@wx").append(dir2str(dir)).append(std::to_string(layer)); + auto memory_p = + std::static_pointer_cast(dev_ctx_.GetBlob(key)); + + if (!memory_p) { + auto user_md = + MKLDNNMemDesc({1, 1, ICs[layer], 3, OCs[layer]}, + MKLDNNGetDataType(), MKLDNNMemoryFormat::ldigo); + auto user_memory = dnnl::memory(user_md, engine_); + + auto* weight_x_data = + reinterpret_cast(user_memory.get_data_handle()); + int idx = layer * 2 + (dir == R2L); + memcpy(weight_x_data, weights_x_[idx]->data(), + sizeof(float) * ICs[layer] * 3 * OCs[layer]); + + if (origin_mode_ == false) { + for (int64_t i = 0; i < ICs[layer]; ++i) { + for (int64_t j = 0; j < OCs[layer]; ++j) { + weight_x_data[j] *= -1; + } + weight_x_data += 3 * OCs[layer]; + } + } + + memory_p = std::make_shared( + gru_pds_[{layer, dir}]->weights_layer_desc(), engine_); + + dnnl::stream astream(engine_); + dnnl::reorder(user_memory, *memory_p, attrs_[2 * layer + (dir == R2L)]) + .execute(astream, user_memory, *memory_p); + + dev_ctx_.SetBlob(key, memory_p); + } + return memory_p; + } + + std::shared_ptr AcquireWeightHMemory(int layer, Direction dir) { + auto key = memory_key_; + key.append("@wh").append(dir2str(dir)).append(std::to_string(layer)); + auto memory_p = + std::static_pointer_cast(dev_ctx_.GetBlob(key)); + + if (!memory_p) { + auto user_md = + MKLDNNMemDesc({1, 1, OCs[layer], 3, OCs[layer]}, + MKLDNNGetDataType(), MKLDNNMemoryFormat::ldigo); + auto user_memory = dnnl::memory(user_md, engine_); + + // Reorder weights_h from PP format [OC, 2OC] + [OC, OC] to + // oneDNN format [OC, 3OC] + auto* weight_h_data = + 
reinterpret_cast(user_memory.get_data_handle()); + + int idx = layer * 2 + (dir == R2L); + auto* user_weight_h_data = weights_h_[idx]->data(); + + auto src1_iter = user_weight_h_data; + auto src2_iter = user_weight_h_data + 2 * OCs[layer] * OCs[layer]; + + for (int64_t c = 0; c < OCs[layer]; ++c) { + memcpy(weight_h_data, src1_iter, 2 * OCs[layer] * sizeof(float)); + memcpy(weight_h_data + 2 * OCs[layer], src2_iter, + OCs[layer] * sizeof(float)); + + src1_iter += 2 * OCs[layer]; + src2_iter += OCs[layer]; + weight_h_data += 3 * OCs[layer]; + } + + weight_h_data = reinterpret_cast(user_memory.get_data_handle()); + + if (origin_mode_ == false) { + for (int64_t i = 0; i < OCs[layer]; ++i) { + for (int64_t j = 0; j < OCs[layer]; ++j) { + weight_h_data[j] *= -1; + } + weight_h_data += 3 * OCs[layer]; + } + } + + memory_p = std::make_shared( + gru_pds_[{layer, dir}]->weights_iter_desc(), engine_); + + dnnl::stream astream(engine_); + dnnl::reorder(user_memory, *memory_p, attrs_[2 * layer + (dir == R2L)]) + .execute(astream, user_memory, *memory_p); + + dev_ctx_.SetBlob(key, memory_p); + } + return memory_p; + } + + std::shared_ptr AcquireBiasMemory(int layer, Direction dir) { + auto key = memory_key_; + key.append("@b").append(dir2str(dir)).append(std::to_string(layer)); + auto memory_p = + std::static_pointer_cast(dev_ctx_.GetBlob(key)); + + if (!memory_p) { + memory_p = std::make_shared( + gru_pds_[{layer, dir}]->bias_desc(), engine_); + auto* bias_data = reinterpret_cast(memory_p->get_data_handle()); + + int idx = layer * 2 + (dir == R2L); + if (biases_.size() > 0 && biases_[idx]) { + const float* user_bias_data = + biases_[idx]->data(); // Bias in oneDNN is always float + memcpy(bias_data, user_bias_data, sizeof(float) * 3 * OCs[layer]); + } else { + // oneDNN always need bias memory, if it's not provided in PP, let + // oneDNN allocate memory and set it to 0 + memset(bias_data, 0, sizeof(float) * 3 * OCs[layer]); + } + + if (origin_mode_ == false && biases_.size() && biases_[idx]) { + for (int64_t i = 0; i < OCs[layer]; ++i) { + bias_data[i] *= -1; + } + } + dev_ctx_.SetBlob(key, memory_p); + } + return memory_p; + } + + std::shared_ptr AcquireGruOutputMemory(int layer, + Direction dir) { + auto key = key_; + key.append("@h_m").append(dir2str(dir)).append(std::to_string(layer)); + auto memory_p = + std::static_pointer_cast(dev_ctx_.GetBlob(key)); + + if (!memory_p) { + memory_p = std::make_shared( + gru_pds_[{layer, dir}]->dst_desc(), engine_); + dev_ctx_.SetBlob(key, memory_p); + } + return memory_p; + } + + std::shared_ptr AcquireGruPrimitive(int layer, + Direction dir) { + auto key = key_; + key.append("@gru_p").append(dir2str(dir)).append(std::to_string(layer)); + auto prim = + std::static_pointer_cast(dev_ctx_.GetBlob(key)); + if (prim == nullptr) { + prim = std::make_shared(*gru_pds_[{layer, dir}]); + dev_ctx_.SetBlob(key, prim); + } + return prim; + } + + void reorderInputL2RtoR2L(std::shared_ptr mem, int layer) { + auto* data = mem->get_data_handle(); + auto* data_iter = reinterpret_cast(data); + for (int n = 0; n < N_; ++n) { + const auto num_elements = (x_lod_[n + 1] - x_lod_[n]) * ICs[layer]; + const auto offset = Ti_ * ICs[layer] - num_elements; + memmove(data_iter + offset, data_iter, sizeof(T) * num_elements); + memset(data_iter, 0, sizeof(T) * offset); + data_iter += Ti_ * ICs[layer]; + } + } + + template + void reorderOutputR2LtoL2R(std::shared_ptr mem, int layer) { + auto* data = mem->get_data_handle(); + auto* data_iter = reinterpret_cast(data); + for (int n = 0; n < N_; 
++n) { + const auto num_elements = (x_lod_[n + 1] - x_lod_[n]) * OCs[layer]; + const auto offset = Ti_ * OCs[layer] - num_elements; + memmove(data_iter, data_iter + offset, sizeof(K) * num_elements); + memset(data_iter + num_elements, 0, sizeof(K) * offset); + data_iter += Ti_ * OCs[layer]; + } + } + + std::shared_ptr executeConcat( + std::shared_ptr mem1, std::shared_ptr mem2, + int layer) { + auto out_mem = AcquireConcatOutputMemory(layer); + + std::unordered_map concat_args{ + {DNNL_ARG_MULTIPLE_SRC, *mem1}, + {DNNL_ARG_MULTIPLE_SRC + 1, *mem2}, + {DNNL_ARG_DST, *out_mem}}; + + auto concat_p = AcquireConcatPrimitive(layer); + + dnnl::stream astream(engine_); + concat_p->execute(astream, concat_args); + astream.wait(); + return out_mem; + } + + std::shared_ptr> AcquireConcatInputMemories( + int layer) { + auto key = key_; + key.append("@ci_m").append(std::to_string(layer)); + auto memory_p = std::static_pointer_cast>( + dev_ctx_.GetBlob(key)); + + if (!memory_p) { + std::vector src_mems{ + dnnl::memory(concat_pds_[layer]->src_desc(0), engine_), + dnnl::memory(concat_pds_[layer]->src_desc(1), engine_)}; + memory_p = std::make_shared>(src_mems); + dev_ctx_.SetBlob(key, memory_p); + } + return memory_p; + } + + std::shared_ptr AcquireConcatOutputMemory(int layer) { + auto key = key_; + key.append("@co_m").append(std::to_string(layer)); + auto memory_p = + std::static_pointer_cast(dev_ctx_.GetBlob(key)); + + if (!memory_p) { + memory_p = std::make_shared(concat_pds_[layer]->dst_desc(), + engine_); + dev_ctx_.SetBlob(key, memory_p); + } + return memory_p; + } + + std::shared_ptr AcquireConcatPrimitive(int layer) { + auto key = key_; + key.append("@c_p").append(std::to_string(layer)); + auto prim = std::static_pointer_cast(dev_ctx_.GetBlob(key)); + if (prim == nullptr) { + prim = std::make_shared(*concat_pds_[layer]); + dev_ctx_.SetBlob(key, prim); + } + return prim; + } + + template + void reorderOutput(std::shared_ptr mem, int layer) { + auto* data = mem->get_data_handle(); + auto* hidden_data = to_void_cast(hidden_->mutable_data(place_)); + if (isNTC(layers_ - 1)) { + reorderNTCtoPP(data, hidden_data, layers_ - 1); + } else { + reorderTNCtoPP(data, hidden_data, layers_ - 1); + } + } + + bool isNTC(int layer) { + return (platform::GetMKLDNNFormat(gru_pds_[{layer, L2R}]->dst_desc()) == + dnnl::memory::format_tag::ntc); + } + + int getLayers() const { return layers_; } + + // Reorder output values to PP format [N, T, C] -> [WORDS, C] + void reorderNTCtoPP(void* input_data, void* output_data, int layer) { + auto* input_data_iter = reinterpret_cast(input_data); + auto* output_data_iter = reinterpret_cast(output_data); + auto oc = OCs[layer] * 2; + for (int n = 0; n < N_; ++n) { + const auto num_elements = (x_lod_[n + 1] - x_lod_[n]) * oc; + memcpy(output_data_iter, input_data_iter + n * Ti_ * oc, + sizeof(T_out) * num_elements); + output_data_iter += num_elements; + } + } + + // Reorder output values to PP format [T, N, C] -> [WORDS, C] + void reorderTNCtoPP(void* input_data, void* output_data, int layer) { + auto* input_data_iter = reinterpret_cast(input_data); + auto* output_data_iter = reinterpret_cast(output_data); + for (int n = 0; n < N_; ++n) { + const auto num_elements = x_lod_[n + 1] - x_lod_[n]; + for (size_t t = 0; t < num_elements; ++t) { + memcpy(output_data_iter, + input_data_iter + t * N_ * OCs[layer] + n * OCs[layer], + sizeof(T_out) * OCs[layer]); + output_data_iter += OCs[layer]; + } + } + } + + private: + // RNN dimensions + // N - Batch Size + // Ti - Max sentence length + // 
ICs - Input Channels + // OCs - Output Channels + int64_t N_, Ti_; + std::vector ICs, OCs; + + const platform::MKLDNNDeviceContext& dev_ctx_; + const dnnl::engine engine_; + const platform::Place place_; + const bool origin_mode_; + const int layers_; + + std::map, + std::shared_ptr> + gru_pds_; + std::vector> concat_pds_; + + std::string key_; + // Memory size of weights, bias and h0 does not depend + // on Ti size, thus we need another key to cache them + std::string memory_key_; + + const LoDTensor* x_; + const std::vector weights_x_; + const std::vector weights_h_; + const std::vector biases_; + LoDTensor* hidden_; + std::vector attrs_; + const paddle::framework::Vector& x_lod_; +}; + +template +class MultiGRUMKLDNNKernel : public framework::OpKernel { + public: + void Compute(const framework::ExecutionContext& ctx) const override { + const bool force_fp32_output = + ctx.HasAttr("force_fp32_output") && ctx.Attr("force_fp32_output"); + + if (force_fp32_output) { + RunKernel(ctx); + } else { + RunKernel(ctx); + } + } + + template + void RunKernel(const framework::ExecutionContext& ctx) const { + auto& dev_ctx = + ctx.template device_context(); + MultiGRUHandler handler(ctx, dev_ctx); + + int layers = handler.getLayers(); + auto input_mem = handler.AcquireInputMemoryWithReorder(); + for (int layer = 0; layer < layers; ++layer) { + auto gru_out_L2R = handler.executeSingleGru(input_mem, layer, L2R); + handler.reorderInputL2RtoR2L(input_mem, layer); + auto gru_out_R2L = handler.executeSingleGru(input_mem, layer, R2L); + if (layer < layers - 1) + handler.template reorderOutputR2LtoL2R(gru_out_R2L, layer); + else + handler.template reorderOutputR2LtoL2R(gru_out_R2L, layer); + input_mem = handler.executeConcat(gru_out_L2R, gru_out_R2L, layer); + } + handler.template reorderOutput(input_mem, layers - 1); + } +}; + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OP_KERNEL(multi_gru, MKLDNN, paddle::platform::CPUPlace, + ops::MultiGRUMKLDNNKernel, + ops::MultiGRUMKLDNNKernel); diff --git a/paddle/fluid/operators/fused/multi_gru_op.cc b/paddle/fluid/operators/fused/multi_gru_op.cc new file mode 100644 index 0000000000000000000000000000000000000000..922b8496441bc9f73e8db980c7d18589b09f69d3 --- /dev/null +++ b/paddle/fluid/operators/fused/multi_gru_op.cc @@ -0,0 +1,203 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#include "paddle/fluid/operators/fused/multi_gru_op.h" +// #include "paddle/fluid/operators/fused/fusion_gru_op.h" +#include // for memcpy +#include +#include +#include "paddle/fluid/operators/jit/kernels.h" +#include "paddle/fluid/operators/math/blas.h" +#include "paddle/fluid/operators/math/fc.h" +#include "paddle/fluid/operators/math/sequence2batch.h" +#ifdef PADDLE_WITH_MKLDNN +#include "paddle/fluid/platform/mkldnn_helper.h" +#endif + +namespace paddle { +namespace operators { + +void MultiGRUOp::InferShape(framework::InferShapeContext* ctx) const { + OP_INOUT_CHECK(ctx->HasInput("X"), "Input", "X", "multi_gru"); + OP_INOUT_CHECK(ctx->HasInputs("WeightX"), "Input", "WeightX", "multi_gru"); + OP_INOUT_CHECK(ctx->HasInputs("WeightH"), "Input", "WeightH", "multi_gru"); + OP_INOUT_CHECK(ctx->HasOutput("Hidden"), "Output", "Hidden", "multi_gru"); + auto x_dims = ctx->GetInputDim("X"); + auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1) + ? framework::flatten_to_2d(x_dims, 1) + : x_dims; + PADDLE_ENFORCE_EQ( + x_mat_dims.size(), 2, + platform::errors::InvalidArgument("The size of input X dims should be 2, " + "or 3 with second dimension equal to " + "1, but now Input X dim is:[%s] ", + x_dims)); + + auto layers = ctx->Attrs().Get("layers"); + auto wx_dims = ctx->GetInputsDim("WeightX"); + for (int i : {0, 1}) { + PADDLE_ENFORCE_EQ( + wx_dims[i][0], x_mat_dims[1], + platform::errors::InvalidArgument( + "The first dimension of flattened WeightX #%d" + "should equal to last dimension of flattened input X, but " + "received fattened WeightX dimension is:%d, flattened X dimension " + "is:%d", + i, wx_dims[i][0], x_mat_dims[1])); + } + + auto wh_dims = ctx->GetInputsDim("WeightH"); + for (int i = 0; i < 2 * layers; ++i) { + PADDLE_ENFORCE_EQ(wx_dims[i].size(), 2, + platform::errors::InvalidArgument( + "The rank of WeightX #%d should be 2, but received " + "WeightX dim size is:%d, WeightX dim is:[%s] ", + i, wx_dims[i].size(), wx_dims[i])); + PADDLE_ENFORCE_EQ(wh_dims[i].size(), 2, + platform::errors::InvalidArgument( + "The rank of WeightH #%d should be 2, but received " + "WeightH dim size is:%d, WeightH dim is:[%s] ", + i, wh_dims[i].size(), wh_dims[i])); + int frame_size = wh_dims[i][0]; + PADDLE_ENFORCE_EQ( + wh_dims[i][1], 3 * frame_size, + platform::errors::InvalidArgument( + "The second dimension of WeightH #%d " + "should equal to 3 * frame_size, but received WeightH's " + "second dimension is: %d, frame size is:%d", + i, wh_dims[1], frame_size)); + PADDLE_ENFORCE_EQ( + wx_dims[i][1], 3 * frame_size, + platform::errors::InvalidArgument( + "The second dimension of WeightX #%d " + "should equal to 3 * frame_size, but received WeightX's " + "second dimension is: %d, frame size is:%d", + i, wx_dims[i][1], frame_size)); + } + + if (ctx->HasInputs("Bias")) { + auto b_dims = ctx->GetInputsDim("Bias"); + for (int i = 0; i < 2 * layers; ++i) { + int frame_size = wh_dims[i][0]; + PADDLE_ENFORCE_EQ(b_dims[i].size(), 2, + platform::errors::InvalidArgument( + "The rank of Bias #%d should be 2, but received " + "Bias rank is:%d, Bias dim is:[%s]", + i, b_dims[i].size(), b_dims[i])); + PADDLE_ENFORCE_EQ(b_dims[i][0], 1, + platform::errors::InvalidArgument( + "The first dimension of Bias #%d should be 1, but " + "received Bias first dim is:%d, Bias dim is:[%s]", + i, b_dims[i][0], b_dims[i])); + PADDLE_ENFORCE_EQ( + b_dims[i][1], frame_size * 3, + platform::errors::InvalidArgument( + "The shape of Bias #%d must be [1, frame_size * 3], but " + "received bias dim is:[%s], frame size is:%d", + 
i, b_dims[i], frame_size)); + } + } + + int last_frame_size = wh_dims.back()[0]; + framework::DDim out_dims({x_mat_dims[0], 2 * last_frame_size}); + ctx->SetOutputDim("Hidden", out_dims); + ctx->ShareLoD("X", "Hidden"); +} + +framework::OpKernelType MultiGRUOp::GetExpectedKernelType( + const framework::ExecutionContext& ctx) const { + framework::LibraryType library = framework::LibraryType::kMKLDNN; + framework::DataLayout layout = framework::DataLayout::kMKLDNN; + + return framework::OpKernelType( + OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(), layout, + library); +} + +void MultiGRUOpMaker::Make() { + AddInput("X", + "(LoDTensor) the input is an LodTensor, which support " + "variable-time length input sequence. The underlying tensor in " + "this LoDTensor is a matrix with shape (T X M), where T is the " + "total time steps in this mini-batch, M is the dim size of x."); + AddInput("WeightX", + "(MultiTensor) The FC weight with shape (M x 3D)," + "where M is the dim size of x, D is the hidden size. ") + .AsDuplicable(); + AddInput("WeightH", + "(MultiTensor) (D x 3D) Same as GRUOp, where D is the hidden size. " + "This weight is not exactly D x 3D as: {W_update, W_reset, W_state}" + "Acutally they are D x 2D and D x D two part weights." + "{W_update, W_reset; W_state}" + "{D x (D + D); D x D}") + .AsDuplicable(); + AddInput("Bias", + "(MultiTensor, optional) (1 x 3D)." + "Almost same as GRUOp." + "Note: if have FC bias it should be added on this bias.") + .AsDuplicable() + .AsDispensable(); + AddInput( + "Scale_weights", + "(MultiTensor, optional) Scale_weights to be used for int8 weights data." + "Only used with MKL-DNN INT8.") + .AsDuplicable() + .AsDispensable(); + AddOutput("Hidden", "(LoDTensor) (T x D) Same as GRUOp"); + AddAttr("activation", + "(string, default tanh) " + "The activation type used for output candidate {h}_t.") + .SetDefault("tanh"); + AddAttr( + "gate_activation", + "(string, default sigmoid) " + "The activation type used in update gate and reset gate.") + .SetDefault("sigmoid"); + AddAttr("layers", + "(int, default: 1) " + "Number of stacked GRU layers.") + .SetDefault(1); + AddAttr("origin_mode", + "bool" + "use origin mode in article https://arxiv.org/abs/1412.3555") + .SetDefault(false); + AddAttr( + "mkldnn_data_type", + "(string, default \"float32\"). Data type of mkldnn kernel") + .SetDefault("float32") + .InEnum({"float32", "int8", "bfloat16"}); + AddAttr("Scale_data", + "Scales to be used for int8 input/output data." + "Only used with MKL-DNN INT8.") + .SetDefault({1.f}); + AddAttr("Shift_data", + "Shifts to be used for int8 input/output data." + "Only used with MKL-DNN INT8.") + .SetDefault({0.f}); + AddAttr("force_fp32_output", + "(bool, default: false) Force INT8 kernel output FP32, only " + "used in MKL-DNN INT8") + .SetDefault(false); + AddComment(R"DOC( +The Fusion complete GRU Operator. +This operator fuse the fully-connected operator into GRU, +more details can refer to GRU op. +)DOC"); +} + +} // namespace operators +} // namespace paddle + +namespace ops = paddle::operators; +REGISTER_OPERATOR(multi_gru, ops::MultiGRUOp, ops::MultiGRUOpMaker); diff --git a/paddle/fluid/operators/fused/multi_gru_op.h b/paddle/fluid/operators/fused/multi_gru_op.h new file mode 100644 index 0000000000000000000000000000000000000000..ebd3faf44a84b74194191df8d3e73a4d12a00436 --- /dev/null +++ b/paddle/fluid/operators/fused/multi_gru_op.h @@ -0,0 +1,43 @@ +/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once +#include "paddle/fluid/framework/op_registry.h" +#include "paddle/fluid/framework/operator.h" + +namespace paddle { +namespace operators { + +using framework::LoDTensor; +using framework::Tensor; +using framework::ExecutionContext; + +class MultiGRUOp : public framework::OperatorWithKernel { + public: + using framework::OperatorWithKernel::OperatorWithKernel; + + void InferShape(framework::InferShapeContext* ctx) const override; + + protected: + framework::OpKernelType GetExpectedKernelType( + const ExecutionContext& ctx) const override; +}; + +class MultiGRUOpMaker : public framework::OpProtoAndCheckerMaker { + public: + void Make() override; +}; + +} // namespace operators +} // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_multi_gru_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_multi_gru_mkldnn_op.py new file mode 100644 index 0000000000000000000000000000000000000000..04941ef22ac3b78442154d2bd63fc207b1c79814 --- /dev/null +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_multi_gru_mkldnn_op.py @@ -0,0 +1,248 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
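[Editor's note — not part of the patch] The test below feeds the operator a packed, LoD-annotated input of shape [total_words, C], while the oneDNN primitives in the C++ kernel above consume a zero-padded [N, T, C] batch; `reorderPPtoNTC` does that padding and `reorderInputL2RtoR2L` right-aligns each sentence for the reversed-direction GRU. A minimal numpy sketch of that reorder, with an illustrative helper name and toy shapes (nothing here is part of the operator's API):

```python
import numpy as np

def pack_to_ntc(x, lod, Ti, right_align=False):
    """x: [sum(lod), C] packed words; lod: per-sentence lengths.
    Returns a zero-padded [N, Ti, C] batch. With right_align=True the valid
    words are shifted to the end of the time axis, which is what the kernel
    does before running the right-to-left GRU."""
    N, C = len(lod), x.shape[1]
    out = np.zeros((N, Ti, C), dtype=x.dtype)
    offset = 0
    for n, length in enumerate(lod):
        start = Ti - length if right_align else 0
        out[n, start:start + length, :] = x[offset:offset + length, :]
        offset += length
    return out

x = np.arange(9 * 3, dtype=np.float32).reshape(9, 3)  # 9 words, C = 3
lod = [2, 4, 3]                                       # same LoD as the test below
ntc_l2r = pack_to_ntc(x, lod, Ti=max(lod))                    # shape (3, 4, 3)
ntc_r2l = pack_to_ntc(x, lod, Ti=max(lod), right_align=True)  # right-aligned copy
```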
+ +import unittest +import numpy as np +from paddle.fluid.tests.unittests.op_test import OpTest +from paddle.fluid.tests.unittests.test_fusion_gru_op import fusion_gru, ACTIVATION +from paddle.fluid.dygraph.base import disable_dygraph + + +def multi_gru( + x, # T x M + lod, # 1 x N + h0, # N x D + wx, # M x 3D + wh, # D x 3D + bias, # 1 x 3D + origin_mode, + layers): + act_state = ACTIVATION['tanh'] + act_gate = ACTIVATION['sigmoid'] + input = x + for i in range(0, layers * 2, 2): + _, _, _, gru1_out = fusion_gru(input, lod, h0[i], wx[i], wh[i], bias[i], + False, origin_mode, act_state, act_gate) + _, _, _, gru2_out = fusion_gru(input, lod, h0[i + 1], wx[i + 1], + wh[i + 1], bias[i + 1], True, + origin_mode, act_state, act_gate) + input = np.concatenate((gru1_out, gru2_out), axis=1) + return input + + +class TestMultiGruMkldnnOp(OpTest): + def set_confs(self): + pass + + def set_dtype(self): + pass + + def set_force_fp32_output(self): + pass + + def setUp(self): + self.op_type = "multi_gru" + self.lod = [[2, 4, 3]] + self.ICs = [3] + self.OCs = [5] + self.with_bias = True + self.layers = 1 + self.origin_mode = False + self._cpu_only = True + self.error_margin = 1e-5 + self.set_confs() + self.dtype = "float32" + self.set_dtype() + self.force_fp32_output = False + self.set_force_fp32_output() + + is_int8 = self.dtype == 'int8' + scale_data = 63 + shift_data = 64 + + T = sum(self.lod[0]) + N = len(self.lod[0]) + + self.inputs = {} + if is_int8: + x_f32 = np.random.rand(T, self.ICs[0]).astype('float32') * 2 - 1 + x_u8 = np.rint(x_f32 * scale_data + shift_data).astype(np.uint8) + self.inputs['X'] = (x_u8, self.lod) + + else: + x_f32 = np.random.rand(T, self.ICs[0]).astype('float32') + self.inputs['X'] = (x_f32, self.lod) + + wx = [] + wh = [] + bias = [] + h0 = [] + + for layer in range(self.layers): + IC = self.ICs[layer] + OC = self.OCs[layer] + for j in range(2): + wx.append(np.random.rand(IC, 3 * OC).astype('float32')) + wh.append(np.random.rand(OC, 3 * OC).astype('float32')) + bias.append( + np.random.rand(1, 3 * OC).astype('float32') + if self.with_bias else np.zeros( + (1, 3 * OC), dtype='float32')) + h0.append(np.zeros((N, OC), dtype='float32')) + + self.inputs['WeightX'] = [('wx' + str(i), wx[i]) + for i in range(self.layers * 2)] + self.inputs['WeightH'] = [('wh' + str(i), wh[i]) + for i in range(self.layers * 2)] + if self.with_bias: + self.inputs['Bias'] = [('b' + str(i), bias[i]) + for i in range(self.layers * 2)] + + if is_int8: + s8_max = 127.0 + scale_weights = [] + for layer in range(self.layers): + OC = self.OCs[layer] + for j in range(2): + scale_ur = s8_max / np.max(np.abs( + np.concatenate( + [ + wx[2 * layer + j][:, :2 * OC], wh[2 * layer + j] + .flatten()[:2 * OC * OC].reshape(OC, 2 * OC) + ], + axis=0)), + axis=0) + scale_o = s8_max / np.max(np.abs( + np.concatenate( + [ + wx[2 * layer + j][:, 2 * OC:], wh[2 * layer + j] + .flatten()[2 * OC * OC:].reshape(OC, OC) + ], + axis=0)), + axis=0) + + scale_weights.append( + np.concatenate([scale_ur, scale_o]).astype('float32')) + self.inputs['Scale_weights'] = [('w_scale' + str(i), + scale_weights[i]) + for i in range(self.layers * 2)] + self.error_margin = 1e-1 if self.force_fp32_output else 1 + + hidden_f32 = multi_gru(x_f32, self.lod, h0, wx, wh, bias, + self.origin_mode, self.layers) + + if self.dtype == 'float32' or self.force_fp32_output: + self.outputs = {'Hidden': (hidden_f32, self.lod)} + else: + hidden_u8 = np.rint(hidden_f32 * scale_data + shift_data).astype( + np.uint8) + self.outputs = {'Hidden': (hidden_u8, 
self.lod)} + + self.attrs = { + 'activation': 'tanh', + 'gate_activation': 'sigmoid', + 'layers': self.layers, + 'origin_mode': self.origin_mode, + 'use_mkldnn': True, + } + + if is_int8: + self.attrs['force_fp32_output'] = self.force_fp32_output + self.attrs['Scale_data'] = scale_data + self.attrs['Shift_data'] = shift_data + + def test_check_output(self): + self.check_output(check_dygraph=False, atol=self.error_margin) + + +class TestMultiGruMkldnnOpNoBias(TestMultiGruMkldnnOp): + def set_confs(self): + self.with_bias = False + + +class TestMultiGruMkldnnOpLayers2(TestMultiGruMkldnnOp): + def set_confs(self): + self.layers = 2 + self.ICs = [2, 6] + self.OCs = [3, 8] + + +class TestMultiGruMkldnnOpLayers3(TestMultiGruMkldnnOp): + def set_confs(self): + self.layers = 3 + self.ICs = [2, 6, 12] + self.OCs = [3, 6, 14] + + +class TestMultiGruMkldnnOpOriginMode(TestMultiGruMkldnnOp): + def set_confs(self): + self.origin_mode = True + + +class TestMultiGruMkldnnInt8Op(TestMultiGruMkldnnOp): + def set_dtype(self): + self.dtype = 'int8' + + +class TestMultiGruMkldnnInt8OpForceFP32Output(TestMultiGruMkldnnInt8Op): + def set_force_fp32_output(self): + self.force_fp32_output = True + + +class TestMultiGruMkldnnInt8OpNoBias(TestMultiGruMkldnnOpNoBias): + def set_dtype(self): + self.dtype = 'int8' + + +class TestMultiGruMkldnnInt8OpNoBiasForceFP32Output( + TestMultiGruMkldnnInt8OpNoBias): + def set_force_fp32_output(self): + self.force_fp32_output = True + + +class TestMultiGruMkldnnInt8OpLayers2(TestMultiGruMkldnnOpLayers2): + def set_dtype(self): + self.dtype = 'int8' + + +class TestMultiGruMkldnnInt8OpLayers2ForceFP32Output( + TestMultiGruMkldnnInt8OpLayers2): + def set_force_fp32_output(self): + self.force_fp32_output = True + + +class TestMultiGruMkldnnInt8OpLayers3(TestMultiGruMkldnnOpLayers3): + def set_dtype(self): + self.dtype = 'int8' + + +class TestMultiGruMkldnnInt8OpLayers3ForceFP32Output( + TestMultiGruMkldnnInt8OpLayers3): + def set_force_fp32_output(self): + self.force_fp32_output = True + + +class TestMultiGruMkldnnInt8OpOriginMode(TestMultiGruMkldnnOpOriginMode): + def set_dtype(self): + self.dtype = 'int8' + + +class TestMultiGruMkldnnInt8OpOriginModeForceFP32Output( + TestMultiGruMkldnnInt8OpOriginMode): + def set_force_fp32_output(self): + self.force_fp32_output = True + + +if __name__ == "__main__": + unittest.main() diff --git a/tools/static_mode_white_list.py b/tools/static_mode_white_list.py index 5fe1cc722e8753a8475cd7dfe663de514f542735..7f2ee9cb170329a441fc629b12bf9136fdde5902 100644 --- a/tools/static_mode_white_list.py +++ b/tools/static_mode_white_list.py @@ -598,6 +598,7 @@ STATIC_MODE_TESTING_LIST = [ 'test_lrn_mkldnn_op', 'test_matmul_mkldnn_op', 'test_mul_int8_mkldnn_op', + 'test_multi_gru_mkldnn_op', 'test_pool2d_int8_mkldnn_op', 'test_pool2d_mkldnn_op', 'test_quantize_mkldnn_op',
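[Editor's note — not part of the patch] The least obvious data movement in the C++ kernel is `AcquireWeightHMemory`: PaddlePaddle stores the hidden weights as an [OC, 2*OC] block ({W_update, W_reset}) followed by an [OC, OC] block (W_state), while oneDNN expects a single [OC, 3*OC] `ldigo` tensor; when `origin_mode` is false the update-gate columns (and the corresponding bias entries) are additionally negated. A standalone numpy sketch of both steps, with assumed helper names, mirroring the per-row memcpy loops and the `*= -1` loops in the kernel:

```python
import numpy as np

def pp_weight_h_to_onednn(w_h_flat, oc):
    """w_h_flat holds the PP layout: an (oc, 2*oc) {W_update, W_reset} block
    followed by an (oc, oc) W_state block, stored contiguously.
    Returns the (oc, 3*oc) layout the oneDNN GRU consumes."""
    w_ur = w_h_flat[:2 * oc * oc].reshape(oc, 2 * oc)
    w_s = w_h_flat[2 * oc * oc:].reshape(oc, oc)
    return np.concatenate([w_ur, w_s], axis=1)  # each row: [update | reset | state]

def flip_update_gate(w, oc):
    """origin_mode == False: negate the update-gate columns (the first oc of
    each 3*oc row), as AcquireWeightXMemory/AcquireWeightHMemory do in place."""
    w = w.copy()
    w[:, :oc] *= -1.0
    return w

oc = 4
w_h_flat = np.random.rand(2 * oc * oc + oc * oc).astype(np.float32)
w_onednn = flip_update_gate(pp_weight_h_to_onednn(w_h_flat, oc), oc)  # (4, 12)
```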