未验证 提交 f8da5536 编写于 作者: J jakpiase 提交者: GitHub

REUPLOAD Added vanilla LSTM and LSTM with peepholes oneDNN fp32 kernel (#30719)

* added external reorder to profiler

* resolved conflict

* added enable_static

* initial version of lstm, not working yet

* added lstm to operators.cmake

* added vanilla lstm mkldnn op

* added peephole weights integration

* minor changes

* added formatting

* added fusion_lstm_mkldnn to static_whitelist

* added formatting

* removed comment

* moved use_peepholes attribute inside is_cached block

* reverted wrong changes

* minor formatting change

* minor changes

* changed stream handling

* minor change

* added datatype to GetExpectedKernelType()

* added reading stream from TLS
上级 67abfc15
...@@ -197,7 +197,7 @@ function(op_library TARGET) ...@@ -197,7 +197,7 @@ function(op_library TARGET)
"tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op" "tensor_array_read_write_op" "tensorrt_engine_op" "conv_fusion_op"
"fusion_transpose_flatten_concat_op" "fusion_conv_inception_op" "fusion_transpose_flatten_concat_op" "fusion_conv_inception_op"
"sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op" "sync_batch_norm_op" "dgc_op" "fused_fc_elementwise_layernorm_op"
"skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" "skip_layernorm_op" "multihead_matmul_op" "fusion_group_op" "fused_bn_activation_op" "fused_embedding_eltwise_layernorm_op" "fusion_gru_op" "fusion_lstm_op"
"fused_bn_add_activation_op") "fused_bn_add_activation_op")
if ("${TARGET}" STREQUAL "${manual_pybind_op}") if ("${TARGET}" STREQUAL "${manual_pybind_op}")
set(pybind_flag 1) set(pybind_flag 1)
......
...@@ -14,11 +14,15 @@ register_operators(EXCLUDES ...@@ -14,11 +14,15 @@ register_operators(EXCLUDES
fused_embedding_eltwise_layernorm_op fused_embedding_eltwise_layernorm_op
fusion_group_op fusion_group_op
fusion_gru_op fusion_gru_op
fusion_lstm_op
fused_bn_add_activation_op) fused_bn_add_activation_op)
# fusion_gru_op does not have CUDA kernel # fusion_gru_op does not have CUDA kernel
op_library(fusion_gru_op) op_library(fusion_gru_op)
op_library(fusion_lstm_op)
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(fusion_gru);\n") file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(fusion_gru);\n")
file(APPEND ${pybind_file} "USE_CPU_ONLY_OP(fusion_lstm);\n")
if (WITH_GPU) if (WITH_GPU)
# fused_bn_activation_op needs cudnn 7.4.1 above # fused_bn_activation_op needs cudnn 7.4.1 above
......
...@@ -18,6 +18,9 @@ limitations under the License. */ ...@@ -18,6 +18,9 @@ limitations under the License. */
#include "paddle/fluid/operators/math/blas.h" #include "paddle/fluid/operators/math/blas.h"
#include "paddle/fluid/operators/math/fc.h" #include "paddle/fluid/operators/math/fc.h"
#include "paddle/fluid/operators/math/sequence2batch.h" #include "paddle/fluid/operators/math/sequence2batch.h"
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -145,8 +148,16 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const { ...@@ -145,8 +148,16 @@ void FusionLSTMOp::InferShape(framework::InferShapeContext* ctx) const {
framework::OpKernelType FusionLSTMOp::GetExpectedKernelType( framework::OpKernelType FusionLSTMOp::GetExpectedKernelType(
const framework::ExecutionContext& ctx) const { const framework::ExecutionContext& ctx) const {
return framework::OpKernelType( framework::LibraryType library = framework::LibraryType::kPlain;
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.device_context()); framework::DataLayout layout = framework::DataLayout::kAnyLayout;
auto data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
#ifdef PADDLE_WITH_MKLDNN
if (this->CanMKLDNNBeUsed(ctx, data_type)) {
library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType(data_type, ctx.GetPlace(), layout, library);
} }
void FusionLSTMOpMaker::Make() { void FusionLSTMOpMaker::Make() {
...@@ -235,6 +246,9 @@ void FusionLSTMOpMaker::Make() { ...@@ -235,6 +246,9 @@ void FusionLSTMOpMaker::Make() {
"`tanh` by default.") "`tanh` by default.")
.SetDefault("tanh") .SetDefault("tanh")
.InEnum({"sigmoid", "tanh", "relu", "identity"}); .InEnum({"sigmoid", "tanh", "relu", "identity"});
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Fusion Long-Short Term Memory (LSTM) Operator. Fusion Long-Short Term Memory (LSTM) Operator.
This operator fuse the X into LSTM, more details can refer to LSTM op. This operator fuse the X into LSTM, more details can refer to LSTM op.
......
...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_gru_op.h" #include "paddle/fluid/operators/fused/fusion_gru_op.h"
#include "paddle/fluid/platform/mkldnn_reuse.h" #include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -27,7 +27,7 @@ using paddle::platform::MKLDNNMemDesc; ...@@ -27,7 +27,7 @@ using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast; using platform::to_void_cast;
template <typename T, typename T_out = T> template <typename T, typename T_out = T>
class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
public: public:
GRUMKLDNNHandler(const paddle::framework::ExecutionContext& ctx, GRUMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
const platform::MKLDNNDeviceContext& dev_ctx, const platform::MKLDNNDeviceContext& dev_ctx,
...@@ -37,37 +37,12 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -37,37 +37,12 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
const bool is_reverse, const int64_t N, const int64_t Ti, const bool is_reverse, const int64_t N, const int64_t Ti,
const int64_t IC, const int64_t OC, const int64_t IC, const int64_t OC,
const std::string& unique_name) const std::string& unique_name)
: platform::MKLDNNHandlerT<T, dnnl::gru_forward>( : RNNMKLDNNHandler<T, dnnl::gru_forward, T_out>(
dev_ctx, dev_ctx.GetEngine(), cpu_place, ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0,
CreateKey(dev_ctx, unique_name, MKLDNNGetDataType<T>(), Ti)), is_reverse, N, Ti, IC, OC, 3,
N(N), ctx.InputName("X") + ctx.InputName("WeightH")) {
Ti(Ti),
IC(IC),
OC(OC) {
// Create memory key without Ti because weights, bias and h0 memories
// do not depend on Ti size but primitive and input/output memory do
memory_key_ = platform::ExtendKeyWithThreadInfoIfNeeded(
dev_ctx, CreateKey(dev_ctx, unique_name, MKLDNNGetDataType<T>()));
// Is it int8 kernel
const bool is_INT8 = std::is_same<T, uint8_t>::value; const bool is_INT8 = std::is_same<T, uint8_t>::value;
if (is_INT8) {
// Int8 attributes
const float scale_data = ctx.Attr<float>("Scale_data");
const float shift_data = ctx.Attr<float>("Shift_data");
const auto scale_weights = ctx.Attr<std::vector<float>>("Scale_weights");
const int weights_scale_mask =
0 +
(1 << 3) // bit, indicating the unique scales for `g` dim in `ldigo`
+
(1 << 4); // bit, indicating the unique scales for `o` dim in `ldigo`
attr_.set_rnn_data_qparams(scale_data, shift_data);
attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights);
}
if (!this->isCached()) { if (!this->isCached()) {
// oneDNN kernel has hardcoded activation functions // oneDNN kernel has hardcoded activation functions
PADDLE_ENFORCE_EQ( PADDLE_ENFORCE_EQ(
...@@ -108,176 +83,35 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -108,176 +83,35 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
: dnnl::rnn_direction::unidirectional_left2right; : dnnl::rnn_direction::unidirectional_left2right;
this->AcquireForwardPrimitiveDescriptor( this->AcquireForwardPrimitiveDescriptor(
attr_, dnnl::prop_kind::forward_inference, direction, input_md, h0_md, this->attr_, dnnl::prop_kind::forward_inference, direction, input_md,
weight_x_md, weight_h_md, bias_md, hidden_md, dnnl::memory::desc()); h0_md, weight_x_md, weight_h_md, bias_md, hidden_md,
} dnnl::memory::desc());
} }
bool is_NTC() {
return (platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc()) ==
dnnl::memory::format_tag::ntc);
}
void reorderRNNdata(void* input_data, void* output_data,
std::vector<size_t> lod, const bool is_reverse,
platform::RNNReorderType reorder_type) {
switch (reorder_type) {
// Reorder input memory [WORDS, C] + LoD -> [N, T, C]
case platform::RNNReorderType::PP_NTC: {
auto* input_data_iter = reinterpret_cast<T*>(input_data);
auto* output_data_iter = reinterpret_cast<T*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]) * IC;
const auto offset = is_reverse ? (Ti * IC - num_elements) : 0;
memcpy(output_data_iter + n * Ti * IC + offset, input_data_iter,
sizeof(T) * num_elements);
input_data_iter += num_elements;
}
} break;
// Reorder input memory [WORDS, C] + LoD -> [T, N, C]
case platform::RNNReorderType::PP_TNC: {
auto* input_data_iter = reinterpret_cast<T*>(input_data);
auto* output_data_iter = reinterpret_cast<T*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]);
const auto offset = is_reverse ? (Ti - num_elements) : 0;
for (size_t t = 0; t < num_elements; ++t) {
memcpy(output_data_iter + (t + offset) * N * IC + n * IC,
input_data_iter, sizeof(T) * IC);
input_data_iter += IC;
}
}
} break;
// Reorder output values to PP format [N, T, C] -> [WORDS, C]
case platform::RNNReorderType::NTC_PP: {
auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]) * OC;
const auto offset = is_reverse ? (Ti * OC - num_elements) : 0;
memcpy(output_data_iter, input_data_iter + n * Ti * OC + offset,
sizeof(T_out) * num_elements);
output_data_iter += num_elements;
}
} break;
// Reorder output values to PP format [T, N, C] -> [WORDS, C]
case platform::RNNReorderType::TNC_PP: {
auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = lod[n + 1] - lod[n];
const auto offset = is_reverse ? (Ti - num_elements) : 0;
for (size_t t = 0; t < num_elements; ++t) {
memcpy(output_data_iter,
input_data_iter + (t + offset) * N * OC + n * OC,
sizeof(T_out) * OC);
output_data_iter += OC;
}
}
} break;
}
}
std::shared_ptr<dnnl::memory> AcquireInputMemoryWithReorder(
const LoDTensor* input, const bool is_reverse) {
const auto name = this->key_ + "@input_mem";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));
if (!memory_p) {
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_desc(),
this->engine_);
this->dev_ctx_.SetBlob(name, memory_p);
}
const auto& input_lod = input->lod()[0];
auto* x_data = to_void_cast(input->data<T>());
auto* x_onednn_data = memory_p->get_data_handle();
memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC);
if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) ==
dnnl::memory::format_tag::ntc) {
reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
platform::RNNReorderType::PP_NTC);
} else {
reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
platform::RNNReorderType::PP_TNC);
}
return memory_p;
}
std::shared_ptr<dnnl::memory> AcquireOutputMemory() {
const auto name = this->key_ + "@output_mem";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));
if (!memory_p) {
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->dst_desc(),
this->engine_);
this->dev_ctx_.SetBlob(name, memory_p);
}
return memory_p;
}
// TODO(grygielski) H0 is for now persistable
// TODO(jczaja) H0 should be updated each iter and of T type (Fusion pass does
// not support in yet)
std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) {
const std::string h0_key = memory_key_ + "@h0";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(h0_key));
if (!memory_p) {
auto user_h0_memory = dnnl::memory();
if (h0) {
user_h0_memory =
dnnl::memory({{1, 1, N, OC},
MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldnc},
this->engine_, to_void_cast(h0->data<float>()));
} else {
user_h0_memory = dnnl::memory({{1, 1, N, OC},
MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldnc},
this->engine_);
memset(user_h0_memory.get_data_handle(), 0, sizeof(float) * N * OC);
}
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
this->engine_);
auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
dnnl::reorder(user_h0_memory, *memory_p, attr_)
.execute(astream, user_h0_memory, *memory_p);
this->dev_ctx_.SetBlob(h0_key, memory_p);
}
return memory_p;
} }
std::shared_ptr<dnnl::memory> AcquireWeightXMemory(const Tensor* weight_x, std::shared_ptr<dnnl::memory> AcquireWeightXMemory(const Tensor* weight_x,
const bool origin_mode) { const bool origin_mode) {
const std::string wx_key = memory_key_ + "@weight_x"; const std::string wx_key = this->memory_key_ + "@weight_x";
auto memory_p = auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wx_key)); std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wx_key));
if (!memory_p) { if (!memory_p) {
auto user_md = auto user_md =
MKLDNNMemDesc({1, 1, IC, 3, OC}, MKLDNNGetDataType<float>(), MKLDNNMemDesc({1, 1, this->IC, this->G, this->OC},
MKLDNNMemoryFormat::ldigo); MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo);
auto user_memory = dnnl::memory(user_md, this->engine_); auto user_memory = dnnl::memory(user_md, this->engine_);
auto* weight_x_data = auto* weight_x_data =
reinterpret_cast<float*>(user_memory.get_data_handle()); reinterpret_cast<float*>(user_memory.get_data_handle());
memcpy(weight_x_data, weight_x->data<float>(), memcpy(weight_x_data, weight_x->data<float>(),
sizeof(float) * IC * 3 * OC); sizeof(float) * this->IC * this->G * this->OC);
if (origin_mode == false) { if (origin_mode == false) {
for (int64_t i = 0; i < IC; ++i) { for (int64_t i = 0; i < this->IC; ++i) {
for (int64_t j = 0; j < OC; ++j) { for (int64_t j = 0; j < this->OC; ++j) {
weight_x_data[j] *= -1; weight_x_data[j] *= -1;
} }
weight_x_data += 3 * OC; weight_x_data += 3 * this->OC;
} }
} }
...@@ -285,7 +119,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -285,7 +119,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
this->fwd_pd_->weights_layer_desc(), this->engine_); this->fwd_pd_->weights_layer_desc(), this->engine_);
auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
dnnl::reorder(user_memory, *memory_p, attr_) dnnl::reorder(user_memory, *memory_p, this->attr_)
.execute(astream, user_memory, *memory_p); .execute(astream, user_memory, *memory_p);
this->dev_ctx_.SetBlob(wx_key, memory_p); this->dev_ctx_.SetBlob(wx_key, memory_p);
...@@ -295,14 +129,14 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -295,14 +129,14 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
std::shared_ptr<dnnl::memory> AcquireWeightHMemory(const Tensor* weight_h, std::shared_ptr<dnnl::memory> AcquireWeightHMemory(const Tensor* weight_h,
const bool origin_mode) { const bool origin_mode) {
const std::string wh_key = memory_key_ + "@weight_h"; const std::string wh_key = this->memory_key_ + "@weight_h";
auto memory_p = auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wh_key)); std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wh_key));
if (!memory_p) { if (!memory_p) {
auto user_md = auto user_md =
MKLDNNMemDesc({1, 1, OC, 3, OC}, MKLDNNGetDataType<float>(), MKLDNNMemDesc({1, 1, this->OC, this->G, this->OC},
MKLDNNMemoryFormat::ldigo); MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo);
auto user_memory = dnnl::memory(user_md, this->engine_); auto user_memory = dnnl::memory(user_md, this->engine_);
// Reorder weights_h from PP format [OC, 2OC] + [OC, OC] to // Reorder weights_h from PP format [OC, 2OC] + [OC, OC] to
...@@ -312,25 +146,26 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -312,25 +146,26 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
auto* user_weight_h_data = weight_h->data<float>(); auto* user_weight_h_data = weight_h->data<float>();
auto src1_iter = user_weight_h_data; auto src1_iter = user_weight_h_data;
auto src2_iter = user_weight_h_data + 2 * OC * OC; auto src2_iter = user_weight_h_data + 2 * this->OC * this->OC;
for (int64_t c = 0; c < OC; ++c) { for (int64_t c = 0; c < this->OC; ++c) {
memcpy(weight_h_data, src1_iter, 2 * OC * sizeof(float)); memcpy(weight_h_data, src1_iter, 2 * this->OC * sizeof(float));
memcpy(weight_h_data + 2 * OC, src2_iter, OC * sizeof(float)); memcpy(weight_h_data + 2 * this->OC, src2_iter,
this->OC * sizeof(float));
src1_iter += 2 * OC; src1_iter += 2 * this->OC;
src2_iter += OC; src2_iter += this->OC;
weight_h_data += 3 * OC; weight_h_data += 3 * this->OC;
} }
weight_h_data = reinterpret_cast<float*>(user_memory.get_data_handle()); weight_h_data = reinterpret_cast<float*>(user_memory.get_data_handle());
if (origin_mode == false) { if (origin_mode == false) {
for (int64_t i = 0; i < OC; ++i) { for (int64_t i = 0; i < this->OC; ++i) {
for (int64_t j = 0; j < OC; ++j) { for (int64_t j = 0; j < this->OC; ++j) {
weight_h_data[j] *= -1; weight_h_data[j] *= -1;
} }
weight_h_data += 3 * OC; weight_h_data += 3 * this->OC;
} }
} }
...@@ -338,7 +173,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -338,7 +173,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
this->fwd_pd_->weights_iter_desc(), this->engine_); this->fwd_pd_->weights_iter_desc(), this->engine_);
auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
dnnl::reorder(user_memory, *memory_p, attr_) dnnl::reorder(user_memory, *memory_p, this->attr_)
.execute(astream, user_memory, *memory_p); .execute(astream, user_memory, *memory_p);
this->dev_ctx_.SetBlob(wh_key, memory_p); this->dev_ctx_.SetBlob(wh_key, memory_p);
...@@ -348,7 +183,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -348,7 +183,7 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
std::shared_ptr<dnnl::memory> AcquireBiasMemory(const Tensor* bias, std::shared_ptr<dnnl::memory> AcquireBiasMemory(const Tensor* bias,
const bool origin_mode) { const bool origin_mode) {
const std::string bias_key = memory_key_ + "@bias"; const std::string bias_key = this->memory_key_ + "@bias";
auto memory_p = std::static_pointer_cast<dnnl::memory>( auto memory_p = std::static_pointer_cast<dnnl::memory>(
this->dev_ctx_.GetBlob(bias_key)); this->dev_ctx_.GetBlob(bias_key));
...@@ -359,15 +194,15 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -359,15 +194,15 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
if (bias) { if (bias) {
const float* user_bias_data = const float* user_bias_data =
bias->data<float>(); // Bias in oneDNN is always float bias->data<float>(); // Bias in oneDNN is always float
memcpy(bias_data, user_bias_data, sizeof(float) * 3 * OC); memcpy(bias_data, user_bias_data, sizeof(float) * this->G * this->OC);
} else { } else {
// oneDNN always need bias memory, if it's not provided in PP, let // oneDNN always need bias memory, if it's not provided in PP, let
// oneDNN allocate memory and set it to 0 // oneDNN allocate memory and set it to 0
memset(bias_data, 0, sizeof(float) * 3 * OC); memset(bias_data, 0, sizeof(float) * this->G * this->OC);
} }
if (origin_mode == false && bias) { if (origin_mode == false && bias) {
for (int64_t i = 0; i < OC; ++i) { for (int64_t i = 0; i < this->OC; ++i) {
bias_data[i] *= -1; bias_data[i] *= -1;
} }
} }
...@@ -375,19 +210,6 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> { ...@@ -375,19 +210,6 @@ class GRUMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::gru_forward> {
} }
return memory_p; return memory_p;
} }
private:
// RNN dimensions
// N - Batch Size
// Ti - Max sentence length
// IC - Input Channels
// OC - Output Channels
const int64_t N, Ti, IC, OC;
// Memory size of weights, bias and h0 does not depend
// on Ti size, thus we need another key to cache them
std::string memory_key_;
dnnl::primitive_attr attr_;
}; };
template <typename T> template <typename T>
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/fused/fusion_lstm_op.h"
#include "paddle/fluid/operators/fused/mkldnn/fusion_rnn_mkldnn.h"
namespace paddle {
namespace operators {
using paddle::framework::LoDTensor;
using paddle::framework::Tensor;
using paddle::platform::CPUDeviceContext;
using paddle::platform::CreateKey;
using paddle::platform::MKLDNNGetDataType;
using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast;
template <typename T, typename T_out = T>
class LSTMMKLDNNHandler
: public RNNMKLDNNHandler<T, dnnl::lstm_forward, T_out> {
public:
LSTMMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
const platform::MKLDNNDeviceContext& dev_ctx,
const mkldnn::engine mkldnn_engine,
platform::Place cpu_place, const LoDTensor* input,
const Tensor* weight_h, const Tensor* h0, const Tensor* c0,
const bool is_reverse, const int64_t N, const int64_t Ti,
const int64_t IC, const int64_t OC,
const std::string& unique_name)
: RNNMKLDNNHandler<T, dnnl::lstm_forward, T_out>(
ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0,
is_reverse, N, Ti, IC, OC, 4,
ctx.InputName("X") + ctx.InputName("WeightH")) {
if (!this->isCached()) {
const bool is_INT8 = std::is_same<T, uint8_t>::value;
const bool use_peepholes = ctx.Attr<bool>("use_peepholes");
// oneDNN kernel has hardcoded activation functions
PADDLE_ENFORCE_EQ(
ctx.Attr<std::string>("gate_activation"), "sigmoid",
platform::errors::Unimplemented("oneDNN fusion_lstm supports only "
"sigmoid as a gate activation."));
PADDLE_ENFORCE_EQ(
ctx.Attr<std::string>("cell_activation"), "tanh",
platform::errors::Unimplemented(
"oneDNN fusion_lstm supports only tanh as a cell activation."));
PADDLE_ENFORCE_EQ(
ctx.Attr<std::string>("candidate_activation"), "tanh",
platform::errors::Unimplemented(
"oneDNN fusion_lstm supports only tanh a candidate activation."));
// Weights for int8 kernel are of a type s8
const auto weights_dt =
is_INT8 ? dnnl::memory::data_type::s8 : MKLDNNGetDataType<T>();
// oneDNN RNN dimensions
const int64_t D = 1; // Directions
const int64_t L = 1; // Layers (PP supports only 1 stacked layer)
const int64_t G = 4; // Number of Gates, 4 for LSTM
// Create memory descriptors
auto input_md = MKLDNNMemDesc({Ti, N, IC}, MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::tnc);
auto weight_x_md =
MKLDNNMemDesc({L, D, IC, G, OC}, weights_dt, MKLDNNMemoryFormat::any);
auto weight_h_md =
MKLDNNMemDesc({L, D, OC, G, OC}, weights_dt, MKLDNNMemoryFormat::any);
auto bias_md = MKLDNNMemDesc({L, D, G, OC}, MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldgo);
auto hidden_md = MKLDNNMemDesc({Ti, N, OC}, MKLDNNGetDataType<T_out>(),
MKLDNNMemoryFormat::tnc);
auto h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::ldnc);
auto c0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::ldnc);
// Create LSTM oneDNN primitive
const auto direction =
is_reverse ? dnnl::rnn_direction::unidirectional_right2left
: dnnl::rnn_direction::unidirectional_left2right;
if (!use_peepholes) {
this->AcquireForwardPrimitiveDescriptor(
this->attr_, dnnl::prop_kind::forward_inference, direction,
input_md, h0_md, c0_md, weight_x_md, weight_h_md, bias_md,
hidden_md, dnnl::memory::desc(), dnnl::memory::desc());
} else {
auto weight_peephole_md =
MKLDNNMemDesc({L, D, 3, OC}, MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldgo);
this->AcquireForwardPrimitiveDescriptor(
this->attr_, dnnl::prop_kind::forward_inference, direction,
input_md, h0_md, c0_md, weight_x_md, weight_h_md,
weight_peephole_md, bias_md, hidden_md, dnnl::memory::desc(),
dnnl::memory::desc());
}
}
}
// PaddlePaddle has different order of weights than oneDNN, so a reorder is
// needed
// PaddlePaddle: {c, i, f, o}
// oneDNN: {i, f, c, o}
void ReorderGates(float* weights, int64_t I) {
size_t inner_block_size = this->OC;
size_t block_size = inner_block_size * this->G;
for (size_t i = 0; i < (size_t)I; ++i) {
size_t offset = i * block_size;
float* base_pos = weights + offset;
std::swap_ranges(base_pos, base_pos + inner_block_size,
base_pos + inner_block_size); // c <-> i
std::swap_ranges(base_pos + inner_block_size,
base_pos + 2 * inner_block_size,
base_pos + 2 * inner_block_size); // c <-> f
}
}
std::shared_ptr<dnnl::memory> AcquireWeightXMemory(const Tensor* weight_x) {
const std::string wx_key = this->memory_key_ + "@weight_x";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wx_key));
if (!memory_p) {
auto user_md =
MKLDNNMemDesc({1, 1, this->IC, this->G, this->OC},
MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo);
auto user_memory = dnnl::memory(user_md, this->engine_);
auto* weight_x_data =
reinterpret_cast<float*>(user_memory.get_data_handle());
memcpy(weight_x_data, weight_x->data<float>(),
sizeof(float) * this->IC * this->G * this->OC);
ReorderGates(weight_x_data, this->IC);
memory_p = std::make_shared<dnnl::memory>(
this->fwd_pd_->weights_layer_desc(), this->engine_);
auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
dnnl::reorder(user_memory, *memory_p, this->attr_)
.execute(astream, user_memory, *memory_p);
this->dev_ctx_.SetBlob(wx_key, memory_p);
}
return memory_p;
}
std::shared_ptr<dnnl::memory> AcquireWeightHMemory(const Tensor* weight_h) {
const std::string wh_key = this->memory_key_ + "@weight_h";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(wh_key));
if (!memory_p) {
auto user_md =
MKLDNNMemDesc({1, 1, this->OC, this->G, this->OC},
MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo);
auto user_memory = dnnl::memory(user_md, this->engine_);
auto* weight_h_data =
reinterpret_cast<float*>(user_memory.get_data_handle());
memcpy(weight_h_data, weight_h->data<float>(),
sizeof(float) * this->OC * this->G * this->OC);
ReorderGates(weight_h_data, this->OC);
memory_p = std::make_shared<dnnl::memory>(
this->fwd_pd_->weights_iter_desc(), this->engine_);
auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
dnnl::reorder(user_memory, *memory_p, this->attr_)
.execute(astream, user_memory, *memory_p);
this->dev_ctx_.SetBlob(wh_key, memory_p);
}
return memory_p;
}
std::shared_ptr<dnnl::memory> AcquireBiasMemory(const Tensor* bias) {
const std::string bias_key = this->memory_key_ + "@bias";
auto memory_p = std::static_pointer_cast<dnnl::memory>(
this->dev_ctx_.GetBlob(bias_key));
if (!memory_p) {
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->bias_desc(),
this->engine_);
auto* bias_data = reinterpret_cast<float*>(memory_p->get_data_handle());
if (bias) {
const float* user_bias_data =
bias->data<float>(); // Bias in oneDNN is always float
memcpy(bias_data, user_bias_data, sizeof(float) * this->G * this->OC);
ReorderGates(bias_data, 1);
} else {
// oneDNN always need bias memory, if it's not provided in PP, let
// oneDNN allocate memory and set it to 0
memset(bias_data, 0, sizeof(float) * this->G * this->OC);
}
this->dev_ctx_.SetBlob(bias_key, memory_p);
}
return memory_p;
}
std::shared_ptr<dnnl::memory> AcquirePeepholeWeights(const Tensor* bias) {
const std::string peepholes_key = this->memory_key_ + "@peepholes_weights";
auto memory_p = std::static_pointer_cast<dnnl::memory>(
this->dev_ctx_.GetBlob(peepholes_key));
if (!memory_p) {
auto user_md =
MKLDNNMemDesc({1, 1, 3, this->OC}, MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldgo);
auto user_memory = dnnl::memory(user_md, this->engine_);
memory_p = std::make_shared<dnnl::memory>(
this->fwd_pd_->weights_peephole_desc(), this->engine_);
auto* peephole_weights_data =
reinterpret_cast<float*>(memory_p->get_data_handle());
const float* user_bias_data =
bias->data<float>(); // Bias in oneDNN is always float
memcpy(peephole_weights_data, user_bias_data + 4 * this->OC,
sizeof(float) * 3 * this->OC);
this->dev_ctx_.SetBlob(peepholes_key, memory_p);
}
return memory_p;
}
std::shared_ptr<dnnl::memory> AcquireC0Memory(const Tensor* c0) {
const std::string c0_key = this->memory_key_ + "@c0";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(c0_key));
if (!memory_p) {
auto user_c0_memory = dnnl::memory();
if (c0) {
user_c0_memory =
dnnl::memory({{1, 1, this->N, this->OC},
MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldnc},
this->engine_, to_void_cast(c0->data<float>()));
} else {
user_c0_memory = dnnl::memory({{1, 1, this->N, this->OC},
MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldnc},
this->engine_);
memset(user_c0_memory.get_data_handle(), 0,
sizeof(float) * this->N * this->OC);
}
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
this->engine_);
auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
dnnl::reorder(user_c0_memory, *memory_p, this->attr_)
.execute(astream, user_c0_memory, *memory_p);
this->dev_ctx_.SetBlob(c0_key, memory_p);
}
return memory_p;
}
};
template <typename T>
class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
RunKernel<float>(ctx);
}
template <typename Tout = T>
void RunKernel(const framework::ExecutionContext& ctx) const {
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
// Get Tensors
const auto* input = ctx.Input<LoDTensor>("X");
const auto* h0 = ctx.Input<Tensor>("H0");
const auto* c0 = ctx.Input<Tensor>("C0");
const auto* weight_x = ctx.Input<Tensor>("WeightX");
const auto* weight_h = ctx.Input<Tensor>("WeightH");
const auto* bias = ctx.Input<Tensor>("Bias");
auto* hidden = ctx.Output<LoDTensor>("Hidden");
auto* cell = ctx.Output<LoDTensor>("Cell");
cell = cell;
auto x_dims = input->dims();
auto x_mat_dims = (x_dims.size() == 3 && x_dims[1] == 1)
? framework::flatten_to_2d(x_dims, 1)
: x_dims;
// Get attributes
const bool is_reverse = ctx.Attr<bool>("is_reverse");
const bool use_peepholes = ctx.Attr<bool>("use_peepholes");
// Get tensor dimensions
const auto x_mat_dims_vec = framework::vectorize(x_mat_dims);
const auto weight_h_dims = framework::vectorize(weight_h->dims());
const auto& input_lod = input->lod()[0];
// Calculate RNN dimensions
const int64_t N = input_lod.size() - 1; // Number of sentences (batches)
const int64_t Ti = // Max length of the sentence in a batch
[&input_lod]() {
size_t res = 0;
for (size_t i = 0; i < (input_lod.size() - 1); ++i) {
res = std::max(res, input_lod[i + 1] - input_lod[i]);
}
return res;
}();
const int64_t IC = x_mat_dims_vec[1]; // Input channels
const int64_t OC = weight_h_dims[0]; // Output channels
LSTMMKLDNNHandler<T, Tout> handler(
ctx, dev_ctx, mkldnn_engine, ctx.GetPlace(), input, weight_h, h0, c0,
is_reverse, N, Ti, IC, OC,
ctx.InputName("X") + ctx.InputName("WeightH"));
auto input_memory_p =
handler.AcquireInputMemoryWithReorder(input, is_reverse);
auto h0_memory_p = handler.AcquireH0Memory(h0);
auto c0_memory_p = handler.AcquireC0Memory(c0);
auto weight_x_memory_p = handler.AcquireWeightXMemory(weight_x);
auto weight_h_memory_p = handler.AcquireWeightHMemory(weight_h);
auto bias_memory_p = handler.AcquireBiasMemory(bias);
auto hidden_onednn_memory_p = handler.AcquireOutputMemory();
std::unordered_map<int, dnnl::memory> lstm_args = {
{DNNL_ARG_SRC_LAYER, *input_memory_p},
{DNNL_ARG_SRC_ITER, *h0_memory_p},
{DNNL_ARG_SRC_ITER_C, *c0_memory_p},
{DNNL_ARG_WEIGHTS_LAYER, *weight_x_memory_p},
{DNNL_ARG_WEIGHTS_ITER, *weight_h_memory_p},
{DNNL_ARG_BIAS, *bias_memory_p},
{DNNL_ARG_DST_LAYER, *hidden_onednn_memory_p}};
if (use_peepholes) {
auto peephole_weight_p = handler.AcquirePeepholeWeights(bias);
std::pair<int, dnnl::memory> peepholes_weights(DNNL_ARG_WEIGHTS_PEEPHOLE,
*peephole_weight_p);
lstm_args.insert(peepholes_weights);
}
auto lstm_forward_p = handler.AcquireForwardPrimitive();
auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
lstm_forward_p->execute(astream, lstm_args);
astream.wait();
auto* hidden_onednn_data = hidden_onednn_memory_p->get_data_handle();
auto* hidden_data =
to_void_cast(hidden->mutable_data<Tout>(ctx.GetPlace()));
if (handler.is_NTC()) {
handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod,
is_reverse, platform::RNNReorderType::NTC_PP);
} else {
handler.reorderRNNdata(hidden_onednn_data, hidden_data, input_lod,
is_reverse, platform::RNNReorderType::TNC_PP);
}
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(fusion_lstm, MKLDNN, paddle::platform::CPUPlace,
ops::FusionLSTMMKLDNNKernel<float>);
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
namespace operators {
using paddle::framework::LoDTensor;
using paddle::framework::Tensor;
using paddle::platform::CPUDeviceContext;
using paddle::platform::CreateKey;
using paddle::platform::MKLDNNGetDataType;
using paddle::platform::MKLDNNMemDesc;
using platform::to_void_cast;
template <typename T, typename T_alg, typename T_out = T>
class RNNMKLDNNHandler : public platform::MKLDNNHandlerT<T, T_alg> {
public:
RNNMKLDNNHandler(const paddle::framework::ExecutionContext& ctx,
const platform::MKLDNNDeviceContext& dev_ctx,
const mkldnn::engine mkldnn_engine,
platform::Place cpu_place, const LoDTensor* input,
const Tensor* weight_h, const Tensor* h0,
const bool is_reverse, const int64_t N, const int64_t Ti,
const int64_t IC, const int64_t OC, const int64_t G,
const std::string& unique_name)
: platform::MKLDNNHandlerT<T, T_alg>(
dev_ctx, dev_ctx.GetEngine(), cpu_place,
CreateKey(dev_ctx, unique_name, MKLDNNGetDataType<T>(), Ti)),
N(N),
Ti(Ti),
IC(IC),
OC(OC),
G(G) {
// Create memory key without Ti because weights, bias and h0 memories
// do not depend on Ti size but primitive and input/output memory do
memory_key_ = platform::ExtendKeyWithThreadInfoIfNeeded(
dev_ctx, CreateKey(dev_ctx, unique_name, MKLDNNGetDataType<T>()));
// Is it int8 kernel
const bool is_INT8 = std::is_same<T, uint8_t>::value;
if (is_INT8) {
// Int8 attributes
const float scale_data = ctx.Attr<float>("Scale_data");
const float shift_data = ctx.Attr<float>("Shift_data");
const auto scale_weights = ctx.Attr<std::vector<float>>("Scale_weights");
const int weights_scale_mask =
0 +
(1 << 3) // bit, indicating the unique scales for `g` dim in `ldigo`
+
(1 << 4); // bit, indicating the unique scales for `o` dim in `ldigo`
attr_.set_rnn_data_qparams(scale_data, shift_data);
attr_.set_rnn_weights_qparams(weights_scale_mask, scale_weights);
}
}
bool is_NTC() {
return (platform::GetMKLDNNFormat(this->fwd_pd_->dst_desc()) ==
dnnl::memory::format_tag::ntc);
}
void reorderRNNdata(void* input_data, void* output_data,
std::vector<size_t> lod, const bool is_reverse,
platform::RNNReorderType reorder_type) {
switch (reorder_type) {
// Reorder input memory [WORDS, C] + LoD -> [N, T, C]
case platform::RNNReorderType::PP_NTC: {
auto* input_data_iter = reinterpret_cast<T*>(input_data);
auto* output_data_iter = reinterpret_cast<T*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]) * IC;
const auto offset = is_reverse ? (Ti * IC - num_elements) : 0;
memcpy(output_data_iter + n * Ti * IC + offset, input_data_iter,
sizeof(T) * num_elements);
input_data_iter += num_elements;
}
} break;
// Reorder input memory [WORDS, C] + LoD -> [T, N, C]
case platform::RNNReorderType::PP_TNC: {
auto* input_data_iter = reinterpret_cast<T*>(input_data);
auto* output_data_iter = reinterpret_cast<T*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]);
const auto offset = is_reverse ? (Ti - num_elements) : 0;
for (size_t t = 0; t < num_elements; ++t) {
memcpy(output_data_iter + (t + offset) * N * IC + n * IC,
input_data_iter, sizeof(T) * IC);
input_data_iter += IC;
}
}
} break;
// Reorder output values to PP format [N, T, C] -> [WORDS, C]
case platform::RNNReorderType::NTC_PP: {
auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = (lod[n + 1] - lod[n]) * OC;
const auto offset = is_reverse ? (Ti * OC - num_elements) : 0;
memcpy(output_data_iter, input_data_iter + n * Ti * OC + offset,
sizeof(T_out) * num_elements);
output_data_iter += num_elements;
}
} break;
// Reorder output values to PP format [T, N, C] -> [WORDS, C]
case platform::RNNReorderType::TNC_PP: {
auto* input_data_iter = reinterpret_cast<T_out*>(input_data);
auto* output_data_iter = reinterpret_cast<T_out*>(output_data);
for (int n = 0; n < N; ++n) {
const auto num_elements = lod[n + 1] - lod[n];
const auto offset = is_reverse ? (Ti - num_elements) : 0;
for (size_t t = 0; t < num_elements; ++t) {
memcpy(output_data_iter,
input_data_iter + (t + offset) * N * OC + n * OC,
sizeof(T_out) * OC);
output_data_iter += OC;
}
}
} break;
}
}
std::shared_ptr<dnnl::memory> AcquireInputMemoryWithReorder(
const LoDTensor* input, const bool is_reverse) {
const auto name = this->key_ + "@input_mem";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));
if (!memory_p) {
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_desc(),
this->engine_);
this->dev_ctx_.SetBlob(name, memory_p);
}
const auto& input_lod = input->lod()[0];
auto* x_data = to_void_cast(input->data<T>());
auto* x_onednn_data = memory_p->get_data_handle();
memset(x_onednn_data, 0, sizeof(T) * N * Ti * IC);
if (platform::GetMKLDNNFormat(this->fwd_pd_->src_desc()) ==
dnnl::memory::format_tag::ntc) {
reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
platform::RNNReorderType::PP_NTC);
} else {
reorderRNNdata(x_data, x_onednn_data, input_lod, is_reverse,
platform::RNNReorderType::PP_TNC);
}
return memory_p;
}
std::shared_ptr<dnnl::memory> AcquireOutputMemory() {
const auto name = this->key_ + "@output_mem";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(name));
if (!memory_p) {
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->dst_desc(),
this->engine_);
this->dev_ctx_.SetBlob(name, memory_p);
}
return memory_p;
}
// TODO(grygielski) H0 is for now persistable
// TODO(jczaja) H0 should be updated each iter and of T type (Fusion pass does
// not support in yet)
std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) {
const std::string h0_key = memory_key_ + "@h0";
auto memory_p =
std::static_pointer_cast<dnnl::memory>(this->dev_ctx_.GetBlob(h0_key));
if (!memory_p) {
auto user_h0_memory = dnnl::memory();
if (h0) {
user_h0_memory =
dnnl::memory({{1, 1, N, OC},
MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldnc},
this->engine_, to_void_cast(h0->data<float>()));
} else {
user_h0_memory = dnnl::memory({{1, 1, N, OC},
MKLDNNGetDataType<float>(),
MKLDNNMemoryFormat::ldnc},
this->engine_);
memset(user_h0_memory.get_data_handle(), 0, sizeof(float) * N * OC);
}
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
this->engine_);
auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
dnnl::reorder(user_h0_memory, *memory_p, attr_)
.execute(astream, user_h0_memory, *memory_p);
this->dev_ctx_.SetBlob(h0_key, memory_p);
}
return memory_p;
}
protected:
// RNN dimensions
// N - Batch Size
// Ti - Max sentence length
// IC - Input Channels
// OC - Output Channels
// G - Number of gates
const int64_t N, Ti, IC, OC, G;
// Memory size of weights, bias and h0 does not depend
// on Ti size, thus we need another key to cache them
std::string memory_key_;
dnnl::primitive_attr attr_;
};
} // namespace operators
} // namespace paddle
...@@ -75,4 +75,6 @@ class TestFusionGRUMKLDNNOpBS1(TestFusionGRUOp): ...@@ -75,4 +75,6 @@ class TestFusionGRUMKLDNNOpBS1(TestFusionGRUOp):
if __name__ == "__main__": if __name__ == "__main__":
from paddle import enable_static
enable_static()
unittest.main() unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from paddle.fluid.tests.unittests.test_fusion_lstm_op import TestFusionLSTMOp
class TestFusionLSTMONEDNNOp(TestFusionLSTMOp):
def set_conf(self):
self.use_mkldnn = True
def test_check_output(self):
for use_seq in {True, False}:
self.attrs['use_seq'] = use_seq
self.check_output(check_dygraph=False, no_check_set=["Cell"])
class TestFusionLSTMONEDNNOpReverse(TestFusionLSTMONEDNNOp):
def set_conf(self):
self.is_reverse = True
self.use_mkldnn = True
class TestFusionLSTMONEDNNOpInitReverse(TestFusionLSTMONEDNNOp):
def set_conf(self):
self.has_initial_state = True
self.is_reverse = True
self.use_mkldnn = True
class TestFusionLSTMONEDNNOpMD1(TestFusionLSTMONEDNNOp):
def set_conf(self):
self.M = 36
self.D = 8
self.use_mkldnn = True
class TestFusionLSTMONEDNNOpMD2(TestFusionLSTMONEDNNOp):
def set_conf(self):
self.M = 8
self.D = 8
self.use_mkldnn = True
class TestFusionLSTMONEDNNOpMD3(TestFusionLSTMONEDNNOp):
def set_conf(self):
self.M = 15
self.D = 3
self.use_mkldnn = True
class TestFusionLSTMONEDNNOpBS1(TestFusionLSTMONEDNNOp):
def set_conf(self):
self.lod = [[3]]
self.D = 16
self.use_mkldnn = True
class TestFusionLSTMONEDNNOpPeepholesInit(TestFusionLSTMONEDNNOp):
def set_conf(self):
self.use_peepholes = True
self.has_initial_state = True
self.use_mkldnn = True
if __name__ == '__main__':
from paddle import enable_static
enable_static()
unittest.main()
...@@ -144,4 +144,6 @@ class TestFusionGRUOpBS1(TestFusionGRUOp): ...@@ -144,4 +144,6 @@ class TestFusionGRUOpBS1(TestFusionGRUOp):
if __name__ == "__main__": if __name__ == "__main__":
from paddle import enable_static
enable_static()
unittest.main() unittest.main()
...@@ -58,6 +58,7 @@ class TestFusionLSTMOp(OpTest): ...@@ -58,6 +58,7 @@ class TestFusionLSTMOp(OpTest):
self.act_gate = 'sigmoid' self.act_gate = 'sigmoid'
self.act_cell = 'tanh' self.act_cell = 'tanh'
self.act_cand = 'tanh' self.act_cand = 'tanh'
self.use_mkldnn = False
self.set_conf() self.set_conf()
T = sum(self.lod[0]) T = sum(self.lod[0])
...@@ -110,7 +111,8 @@ class TestFusionLSTMOp(OpTest): ...@@ -110,7 +111,8 @@ class TestFusionLSTMOp(OpTest):
'is_reverse': self.is_reverse, 'is_reverse': self.is_reverse,
'gate_activation': self.act_gate, 'gate_activation': self.act_gate,
'cell_activation': self.act_cell, 'cell_activation': self.act_cell,
'candidate_activation': self.act_cand 'candidate_activation': self.act_cand,
'use_mkldnn': self.use_mkldnn
} }
def test_check_output(self): def test_check_output(self):
...@@ -191,4 +193,6 @@ class TestFusionLSTMOpPeepholesBS1(TestFusionLSTMOp): ...@@ -191,4 +193,6 @@ class TestFusionLSTMOpPeepholesBS1(TestFusionLSTMOp):
if __name__ == '__main__': if __name__ == '__main__':
from paddle import enable_static
enable_static()
unittest.main() unittest.main()
...@@ -29,4 +29,5 @@ no_check_set_white_list = [ ...@@ -29,4 +29,5 @@ no_check_set_white_list = [
'update_loss_scaling', 'update_loss_scaling',
'cudnn_lstm', 'cudnn_lstm',
'rnn', 'rnn',
'fusion_lstm',
] ]
...@@ -601,6 +601,7 @@ STATIC_MODE_TESTING_LIST = [ ...@@ -601,6 +601,7 @@ STATIC_MODE_TESTING_LIST = [
'test_bilinear_interp_mkldnn_op', 'test_bilinear_interp_mkldnn_op',
'test_fusion_gru_int8_mkldnn_op', 'test_fusion_gru_int8_mkldnn_op',
'test_fusion_gru_mkldnn_op', 'test_fusion_gru_mkldnn_op',
'test_fusion_lstm_mkldnn_op',
'test_gaussian_random_mkldnn_op', 'test_gaussian_random_mkldnn_op',
'test_lrn_mkldnn_op', 'test_lrn_mkldnn_op',
'test_matmul_mkldnn_op', 'test_matmul_mkldnn_op',
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册