未验证 提交 5b4f8aac 编写于 作者: J jakpiase 提交者: GitHub

Added LSTM BF16 and fixed GRU BF16 (#31234)

上级 7cdf6ea7
...@@ -249,6 +249,10 @@ void FusionLSTMOpMaker::Make() { ...@@ -249,6 +249,10 @@ void FusionLSTMOpMaker::Make() {
AddAttr<bool>("use_mkldnn", AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel") "(bool, default false) Only used in mkldnn kernel")
.SetDefault(false); .SetDefault(false);
AddAttr<bool>("force_fp32_output",
"(bool, default false) Force INT8 kernel output FP32, only "
"used in MKL-DNN INT8")
.SetDefault(false);
AddComment(R"DOC( AddComment(R"DOC(
Fusion Long-Short Term Memory (LSTM) Operator. Fusion Long-Short Term Memory (LSTM) Operator.
This operator fuse the X into LSTM, more details can refer to LSTM op. This operator fuse the X into LSTM, more details can refer to LSTM op.
......
...@@ -89,6 +89,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> { ...@@ -89,6 +89,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
} }
} }
template <typename U>
std::shared_ptr<dnnl::memory> AcquireWeightXMemory(const Tensor* weight_x, std::shared_ptr<dnnl::memory> AcquireWeightXMemory(const Tensor* weight_x,
const bool origin_mode) { const bool origin_mode) {
const std::string wx_key = this->memory_key_ + "@weight_x"; const std::string wx_key = this->memory_key_ + "@weight_x";
...@@ -98,18 +99,18 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> { ...@@ -98,18 +99,18 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
if (!memory_p) { if (!memory_p) {
auto user_md = auto user_md =
MKLDNNMemDesc({1, 1, this->IC, this->G, this->OC}, MKLDNNMemDesc({1, 1, this->IC, this->G, this->OC},
MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo); MKLDNNGetDataType<U>(), MKLDNNMemoryFormat::ldigo);
auto user_memory = dnnl::memory(user_md, this->engine_); auto user_memory = dnnl::memory(user_md, this->engine_);
auto* weight_x_data = auto* weight_x_data = reinterpret_cast<U*>(user_memory.get_data_handle());
reinterpret_cast<float*>(user_memory.get_data_handle()); memcpy(weight_x_data, weight_x->data<U>(),
memcpy(weight_x_data, weight_x->data<float>(), sizeof(U) * this->IC * this->G * this->OC);
sizeof(float) * this->IC * this->G * this->OC);
if (origin_mode == false) { if (origin_mode == false) {
for (int64_t i = 0; i < this->IC; ++i) { for (int64_t i = 0; i < this->IC; ++i) {
for (int64_t j = 0; j < this->OC; ++j) { for (int64_t j = 0; j < this->OC; ++j) {
weight_x_data[j] *= -1; U minus_one(-1.0f);
weight_x_data[j] = minus_one * weight_x_data[j];
} }
weight_x_data += 3 * this->OC; weight_x_data += 3 * this->OC;
} }
...@@ -127,6 +128,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> { ...@@ -127,6 +128,7 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
return memory_p; return memory_p;
} }
template <typename U>
std::shared_ptr<dnnl::memory> AcquireWeightHMemory(const Tensor* weight_h, std::shared_ptr<dnnl::memory> AcquireWeightHMemory(const Tensor* weight_h,
const bool origin_mode) { const bool origin_mode) {
const std::string wh_key = this->memory_key_ + "@weight_h"; const std::string wh_key = this->memory_key_ + "@weight_h";
...@@ -136,34 +138,33 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> { ...@@ -136,34 +138,33 @@ class GRUMKLDNNHandler : public RNNMKLDNNHandler<T, dnnl::gru_forward, T_out> {
if (!memory_p) { if (!memory_p) {
auto user_md = auto user_md =
MKLDNNMemDesc({1, 1, this->OC, this->G, this->OC}, MKLDNNMemDesc({1, 1, this->OC, this->G, this->OC},
MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo); MKLDNNGetDataType<U>(), MKLDNNMemoryFormat::ldigo);
auto user_memory = dnnl::memory(user_md, this->engine_); auto user_memory = dnnl::memory(user_md, this->engine_);
// Reorder weights_h from PP format [OC, 2OC] + [OC, OC] to // Reorder weights_h from PP format [OC, 2OC] + [OC, OC] to
// oneDNN format [OC, 3OC] // oneDNN format [OC, 3OC]
auto* weight_h_data = auto* weight_h_data = reinterpret_cast<U*>(user_memory.get_data_handle());
reinterpret_cast<float*>(user_memory.get_data_handle()); auto* user_weight_h_data = weight_h->data<U>();
auto* user_weight_h_data = weight_h->data<float>();
auto src1_iter = user_weight_h_data; auto src1_iter = user_weight_h_data;
auto src2_iter = user_weight_h_data + 2 * this->OC * this->OC; auto src2_iter = user_weight_h_data + 2 * this->OC * this->OC;
for (int64_t c = 0; c < this->OC; ++c) { for (int64_t c = 0; c < this->OC; ++c) {
memcpy(weight_h_data, src1_iter, 2 * this->OC * sizeof(float)); memcpy(weight_h_data, src1_iter, 2 * this->OC * sizeof(U));
memcpy(weight_h_data + 2 * this->OC, src2_iter, memcpy(weight_h_data + 2 * this->OC, src2_iter, this->OC * sizeof(U));
this->OC * sizeof(float));
src1_iter += 2 * this->OC; src1_iter += 2 * this->OC;
src2_iter += this->OC; src2_iter += this->OC;
weight_h_data += 3 * this->OC; weight_h_data += 3 * this->OC;
} }
weight_h_data = reinterpret_cast<float*>(user_memory.get_data_handle()); weight_h_data = reinterpret_cast<U*>(user_memory.get_data_handle());
if (origin_mode == false) { if (origin_mode == false) {
for (int64_t i = 0; i < this->OC; ++i) { for (int64_t i = 0; i < this->OC; ++i) {
for (int64_t j = 0; j < this->OC; ++j) { for (int64_t j = 0; j < this->OC; ++j) {
weight_h_data[j] *= -1; U minus_one(-1.0f);
weight_h_data[j] = minus_one * weight_h_data[j];
} }
weight_h_data += 3 * this->OC; weight_h_data += 3 * this->OC;
} }
...@@ -273,11 +274,34 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> { ...@@ -273,11 +274,34 @@ class FusionGRUMKLDNNKernel : public framework::OpKernel<T> {
auto input_memory_p = auto input_memory_p =
handler.AcquireInputMemoryWithReorder(input, is_reverse); handler.AcquireInputMemoryWithReorder(input, is_reverse);
auto h0_memory_p = handler.AcquireH0Memory(h0);
auto weight_x_memory_p = std::shared_ptr<dnnl::memory> h0_memory_p, weight_h_memory_p,
handler.AcquireWeightXMemory(weight_x, origin_mode); weight_x_memory_p;
auto weight_h_memory_p =
handler.AcquireWeightHMemory(weight_h, origin_mode); if (weight_h->type() == paddle::framework::proto::VarType_Type_FP32) {
h0_memory_p = handler.template AcquireH0Memory<float>(h0);
weight_x_memory_p =
handler.template AcquireWeightXMemory<float>(weight_x, origin_mode);
weight_h_memory_p =
handler.template AcquireWeightHMemory<float>(weight_h, origin_mode);
} else if (weight_h->type() ==
paddle::framework::proto::VarType_Type_BF16) {
h0_memory_p =
handler.template AcquireH0Memory<paddle::platform::bfloat16>(h0);
weight_x_memory_p =
handler.template AcquireWeightXMemory<paddle::platform::bfloat16>(
weight_x, origin_mode);
weight_h_memory_p =
handler.template AcquireWeightHMemory<paddle::platform::bfloat16>(
weight_h, origin_mode);
} else {
h0_memory_p = handler.template AcquireH0Memory<uint8_t>(h0);
weight_x_memory_p =
handler.template AcquireWeightXMemory<int8_t>(weight_x, origin_mode);
weight_h_memory_p =
handler.template AcquireWeightHMemory<int8_t>(weight_h, origin_mode);
}
auto bias_memory_p = handler.AcquireBiasMemory(bias, origin_mode); auto bias_memory_p = handler.AcquireBiasMemory(bias, origin_mode);
auto hidden_onednn_memory_p = handler.AcquireOutputMemory(); auto hidden_onednn_memory_p = handler.AcquireOutputMemory();
......
...@@ -81,8 +81,11 @@ class LSTMMKLDNNHandler ...@@ -81,8 +81,11 @@ class LSTMMKLDNNHandler
MKLDNNMemoryFormat::tnc); MKLDNNMemoryFormat::tnc);
auto h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(), auto h0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(),
MKLDNNMemoryFormat::ldnc); MKLDNNMemoryFormat::ldnc);
auto c0_md = MKLDNNMemDesc({L, D, N, OC}, MKLDNNGetDataType<T>(), auto c0_md = MKLDNNMemDesc(
MKLDNNMemoryFormat::ldnc); {L, D, N, OC}, MKLDNNGetDataType<float>(), // Vanilla LSTM and LSTM
// with peepholes have c0 as
// fp32
MKLDNNMemoryFormat::ldnc);
// Create LSTM oneDNN primitive // Create LSTM oneDNN primitive
const auto direction = const auto direction =
...@@ -110,13 +113,14 @@ class LSTMMKLDNNHandler ...@@ -110,13 +113,14 @@ class LSTMMKLDNNHandler
// needed // needed
// PaddlePaddle: {c, i, f, o} // PaddlePaddle: {c, i, f, o}
// oneDNN: {i, f, c, o} // oneDNN: {i, f, c, o}
void ReorderGates(float* weights, int64_t I) { template <typename U>
void ReorderGates(U* weights, int64_t I) {
size_t inner_block_size = this->OC; size_t inner_block_size = this->OC;
size_t block_size = inner_block_size * this->G; size_t block_size = inner_block_size * this->G;
for (size_t i = 0; i < (size_t)I; ++i) { for (size_t i = 0; i < (size_t)I; ++i) {
size_t offset = i * block_size; size_t offset = i * block_size;
float* base_pos = weights + offset; U* base_pos = weights + offset;
std::swap_ranges(base_pos, base_pos + inner_block_size, std::swap_ranges(base_pos, base_pos + inner_block_size,
base_pos + inner_block_size); // c <-> i base_pos + inner_block_size); // c <-> i
std::swap_ranges(base_pos + inner_block_size, std::swap_ranges(base_pos + inner_block_size,
...@@ -125,6 +129,7 @@ class LSTMMKLDNNHandler ...@@ -125,6 +129,7 @@ class LSTMMKLDNNHandler
} }
} }
template <typename U>
std::shared_ptr<dnnl::memory> AcquireWeightXMemory(const Tensor* weight_x) { std::shared_ptr<dnnl::memory> AcquireWeightXMemory(const Tensor* weight_x) {
const std::string wx_key = this->memory_key_ + "@weight_x"; const std::string wx_key = this->memory_key_ + "@weight_x";
auto memory_p = auto memory_p =
...@@ -133,13 +138,12 @@ class LSTMMKLDNNHandler ...@@ -133,13 +138,12 @@ class LSTMMKLDNNHandler
if (!memory_p) { if (!memory_p) {
auto user_md = auto user_md =
MKLDNNMemDesc({1, 1, this->IC, this->G, this->OC}, MKLDNNMemDesc({1, 1, this->IC, this->G, this->OC},
MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo); MKLDNNGetDataType<U>(), MKLDNNMemoryFormat::ldigo);
auto user_memory = dnnl::memory(user_md, this->engine_); auto user_memory = dnnl::memory(user_md, this->engine_);
auto* weight_x_data = auto* weight_x_data = reinterpret_cast<U*>(user_memory.get_data_handle());
reinterpret_cast<float*>(user_memory.get_data_handle()); memcpy(weight_x_data, weight_x->data<U>(),
memcpy(weight_x_data, weight_x->data<float>(), sizeof(U) * this->IC * this->G * this->OC);
sizeof(float) * this->IC * this->G * this->OC);
ReorderGates(weight_x_data, this->IC); ReorderGates(weight_x_data, this->IC);
...@@ -155,6 +159,7 @@ class LSTMMKLDNNHandler ...@@ -155,6 +159,7 @@ class LSTMMKLDNNHandler
return memory_p; return memory_p;
} }
template <typename U>
std::shared_ptr<dnnl::memory> AcquireWeightHMemory(const Tensor* weight_h) { std::shared_ptr<dnnl::memory> AcquireWeightHMemory(const Tensor* weight_h) {
const std::string wh_key = this->memory_key_ + "@weight_h"; const std::string wh_key = this->memory_key_ + "@weight_h";
auto memory_p = auto memory_p =
...@@ -163,13 +168,12 @@ class LSTMMKLDNNHandler ...@@ -163,13 +168,12 @@ class LSTMMKLDNNHandler
if (!memory_p) { if (!memory_p) {
auto user_md = auto user_md =
MKLDNNMemDesc({1, 1, this->OC, this->G, this->OC}, MKLDNNMemDesc({1, 1, this->OC, this->G, this->OC},
MKLDNNGetDataType<float>(), MKLDNNMemoryFormat::ldigo); MKLDNNGetDataType<U>(), MKLDNNMemoryFormat::ldigo);
auto user_memory = dnnl::memory(user_md, this->engine_); auto user_memory = dnnl::memory(user_md, this->engine_);
auto* weight_h_data = auto* weight_h_data = reinterpret_cast<U*>(user_memory.get_data_handle());
reinterpret_cast<float*>(user_memory.get_data_handle()); memcpy(weight_h_data, weight_h->data<U>(),
memcpy(weight_h_data, weight_h->data<float>(), sizeof(U) * this->OC * this->G * this->OC);
sizeof(float) * this->OC * this->G * this->OC);
ReorderGates(weight_h_data, this->OC); ReorderGates(weight_h_data, this->OC);
...@@ -258,8 +262,8 @@ class LSTMMKLDNNHandler ...@@ -258,8 +262,8 @@ class LSTMMKLDNNHandler
memset(user_c0_memory.get_data_handle(), 0, memset(user_c0_memory.get_data_handle(), 0,
sizeof(float) * this->N * this->OC); sizeof(float) * this->N * this->OC);
} }
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(), memory_p = std::make_shared<dnnl::memory>(
this->engine_); this->fwd_pd_->src_iter_c_desc(), this->engine_);
auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream(); auto& astream = paddle::platform::MKLDNNDeviceContext::tls().get_stream();
dnnl::reorder(user_c0_memory, *memory_p, this->attr_) dnnl::reorder(user_c0_memory, *memory_p, this->attr_)
...@@ -275,7 +279,15 @@ template <typename T> ...@@ -275,7 +279,15 @@ template <typename T>
class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> { class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
public: public:
void Compute(const framework::ExecutionContext& ctx) const override { void Compute(const framework::ExecutionContext& ctx) const override {
RunKernel<float>(ctx); const bool is_bf16 = std::is_same<T, paddle::platform::bfloat16>::value;
const bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
// BF16 does not support force output
if (!is_bf16 && force_fp32_output) {
RunKernel<float>(ctx);
} else {
RunKernel<T>(ctx);
}
} }
template <typename Tout = T> template <typename Tout = T>
...@@ -327,10 +339,29 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> { ...@@ -327,10 +339,29 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
auto input_memory_p = auto input_memory_p =
handler.AcquireInputMemoryWithReorder(input, is_reverse); handler.AcquireInputMemoryWithReorder(input, is_reverse);
auto h0_memory_p = handler.AcquireH0Memory(h0);
auto c0_memory_p = handler.AcquireC0Memory(c0); auto c0_memory_p = handler.AcquireC0Memory(c0);
auto weight_x_memory_p = handler.AcquireWeightXMemory(weight_x);
auto weight_h_memory_p = handler.AcquireWeightHMemory(weight_h); std::shared_ptr<dnnl::memory> h0_memory_p, weight_h_memory_p,
weight_x_memory_p;
if (weight_h->type() == paddle::framework::proto::VarType_Type_FP32) {
h0_memory_p = handler.template AcquireH0Memory<float>(h0);
weight_x_memory_p =
handler.template AcquireWeightXMemory<float>(weight_x);
weight_h_memory_p =
handler.template AcquireWeightHMemory<float>(weight_h);
} else if (weight_h->type() ==
paddle::framework::proto::VarType_Type_BF16) {
h0_memory_p =
handler.template AcquireH0Memory<paddle::platform::bfloat16>(h0);
weight_x_memory_p =
handler.template AcquireWeightXMemory<paddle::platform::bfloat16>(
weight_x);
weight_h_memory_p =
handler.template AcquireWeightHMemory<paddle::platform::bfloat16>(
weight_h);
}
auto bias_memory_p = handler.AcquireBiasMemory(bias); auto bias_memory_p = handler.AcquireBiasMemory(bias);
auto hidden_onednn_memory_p = handler.AcquireOutputMemory(); auto hidden_onednn_memory_p = handler.AcquireOutputMemory();
...@@ -374,4 +405,5 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> { ...@@ -374,4 +405,5 @@ class FusionLSTMMKLDNNKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators; namespace ops = paddle::operators;
REGISTER_OP_KERNEL(fusion_lstm, MKLDNN, paddle::platform::CPUPlace, REGISTER_OP_KERNEL(fusion_lstm, MKLDNN, paddle::platform::CPUPlace,
ops::FusionLSTMMKLDNNKernel<float>); ops::FusionLSTMMKLDNNKernel<float>,
ops::FusionLSTMMKLDNNKernel<paddle::platform::bfloat16>);
...@@ -179,6 +179,7 @@ class RNNMKLDNNHandler : public platform::MKLDNNHandlerT<T, T_alg> { ...@@ -179,6 +179,7 @@ class RNNMKLDNNHandler : public platform::MKLDNNHandlerT<T, T_alg> {
// TODO(grygielski) H0 is for now persistable // TODO(grygielski) H0 is for now persistable
// TODO(jczaja) H0 should be updated each iter and of T type (Fusion pass does // TODO(jczaja) H0 should be updated each iter and of T type (Fusion pass does
// not support in yet) // not support in yet)
template <typename U>
std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) { std::shared_ptr<dnnl::memory> AcquireH0Memory(const Tensor* h0) {
const std::string h0_key = memory_key_ + "@h0"; const std::string h0_key = memory_key_ + "@h0";
auto memory_p = auto memory_p =
...@@ -187,17 +188,14 @@ class RNNMKLDNNHandler : public platform::MKLDNNHandlerT<T, T_alg> { ...@@ -187,17 +188,14 @@ class RNNMKLDNNHandler : public platform::MKLDNNHandlerT<T, T_alg> {
if (!memory_p) { if (!memory_p) {
auto user_h0_memory = dnnl::memory(); auto user_h0_memory = dnnl::memory();
if (h0) { if (h0) {
user_h0_memory = user_h0_memory = dnnl::memory(
dnnl::memory({{1, 1, N, OC}, {{1, 1, N, OC}, MKLDNNGetDataType<U>(), MKLDNNMemoryFormat::ldnc},
MKLDNNGetDataType<float>(), this->engine_, to_void_cast(h0->data<U>()));
MKLDNNMemoryFormat::ldnc},
this->engine_, to_void_cast(h0->data<float>()));
} else { } else {
user_h0_memory = dnnl::memory({{1, 1, N, OC}, user_h0_memory = dnnl::memory(
MKLDNNGetDataType<float>(), {{1, 1, N, OC}, MKLDNNGetDataType<U>(), MKLDNNMemoryFormat::ldnc},
MKLDNNMemoryFormat::ldnc}, this->engine_);
this->engine_); memset(user_h0_memory.get_data_handle(), 0, sizeof(U) * N * OC);
memset(user_h0_memory.get_data_handle(), 0, sizeof(float) * N * OC);
} }
memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(), memory_p = std::make_shared<dnnl::memory>(this->fwd_pd_->src_iter_desc(),
this->engine_); this->engine_);
......
...@@ -30,6 +30,11 @@ class TestFusionGRUBF16MKLDNNOp(OpTest): ...@@ -30,6 +30,11 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
def set_confs(self): def set_confs(self):
self.mkldnn_data_type = False self.mkldnn_data_type = False
def test_check_output(self):
for use_seq in {True, False}:
self.attrs['use_seq'] = use_seq
self.check_output(check_dygraph=False)
def setUp(self): def setUp(self):
self.op_type = "fusion_gru" self.op_type = "fusion_gru"
self.lod = [[2, 4, 3]] self.lod = [[2, 4, 3]]
...@@ -45,6 +50,7 @@ class TestFusionGRUBF16MKLDNNOp(OpTest): ...@@ -45,6 +50,7 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
self.origin_mode = False self.origin_mode = False
self.use_mkldnn = True self.use_mkldnn = True
self.force_fp32_output = False self.force_fp32_output = False
self.weights_dtype = 'fp32'
self.set_confs() self.set_confs()
T = sum(self.lod[0]) T = sum(self.lod[0])
...@@ -58,6 +64,9 @@ class TestFusionGRUBF16MKLDNNOp(OpTest): ...@@ -58,6 +64,9 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
wx_fp32 = np.random.rand(self.M, 3 * self.D).astype('float32') wx_fp32 = np.random.rand(self.M, 3 * self.D).astype('float32')
wh_fp32 = np.random.rand(self.D, 3 * self.D).astype('float32') wh_fp32 = np.random.rand(self.D, 3 * self.D).astype('float32')
wx_bf16 = convert_float_to_uint16(wx_fp32)
wh_bf16 = convert_float_to_uint16(wh_fp32)
# bias is fp32 despite other inputs being in bf16 # bias is fp32 despite other inputs being in bf16
bias = np.random.rand( bias = np.random.rand(
1, 3 * self.D).astype('float32') if self.with_bias else np.zeros( 1, 3 * self.D).astype('float32') if self.with_bias else np.zeros(
...@@ -74,20 +83,30 @@ class TestFusionGRUBF16MKLDNNOp(OpTest): ...@@ -74,20 +83,30 @@ class TestFusionGRUBF16MKLDNNOp(OpTest):
hidden_bf16 = convert_float_to_uint16(hidden) hidden_bf16 = convert_float_to_uint16(hidden)
self.inputs = { if self.weights_dtype == 'bf16':
'X': (x_bf16, self.lod), self.inputs = {
'WeightX': wx_fp32, 'X': (x_bf16, self.lod),
'WeightH': wh_fp32 'WeightX': wx_bf16,
} 'WeightH': wh_bf16
}
elif self.weights_dtype == 'fp32':
self.inputs = {
'X': (x_bf16, self.lod),
'WeightX': wx_fp32,
'WeightH': wh_fp32
}
if self.with_bias: if self.with_bias:
self.inputs['Bias'] = bias self.inputs['Bias'] = bias
if self.with_h0: if self.with_h0:
self.inputs['H0'] = h0_bf16 if self.weights_dtype == 'bf16':
self.inputs['H0'] = h0_bf16
elif self.weights_dtype == 'fp32':
self.inputs['H0'] = h0_fp32
h0_bf16 = convert_float_to_uint16(h0_fp32) h0_bf16 = convert_float_to_uint16(h0_fp32)
self.outputs = {'Hidden': (hidden_bf16, self.lod)} self.outputs = {'Hidden': (hidden, self.lod)}
self.attrs = { self.attrs = {
'activation': self.act_state, 'activation': self.act_state,
...@@ -109,6 +128,11 @@ class TestFusionGRUINT8MKLDNNOp3(TestFusionGRUBF16MKLDNNOp): ...@@ -109,6 +128,11 @@ class TestFusionGRUINT8MKLDNNOp3(TestFusionGRUBF16MKLDNNOp):
self.with_bias = False self.with_bias = False
class TestFusionGRUINT8MKLDNNBF16WeightsOp(TestFusionGRUBF16MKLDNNOp):
def set_confs(self):
self.weights_dtype = 'bf16'
if __name__ == "__main__": if __name__ == "__main__":
from paddle import enable_static from paddle import enable_static
enable_static() enable_static()
......
...@@ -146,4 +146,6 @@ class TestFusionGRUINT8MKLDNNOp5(TestFusionGRUINT8MKLDNNOp): ...@@ -146,4 +146,6 @@ class TestFusionGRUINT8MKLDNNOp5(TestFusionGRUINT8MKLDNNOp):
if __name__ == "__main__": if __name__ == "__main__":
from paddle import enable_static
enable_static()
unittest.main() unittest.main()
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import struct
import paddle.fluid.core as core
from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16, convert_uint16_to_float
from paddle.fluid.tests.unittests.test_fusion_lstm_op import TestFusionLSTMOp, fc, ACTIVATION, fusion_lstm
from paddle.fluid.tests.unittests.test_fusion_gru_op import fusion_gru
@unittest.skipIf(not core.supports_bfloat16(),
                 "place does not support BF16 evaluation")
class TestFusionLSTMBF16ONEDNNOp(OpTest):
    """Checks the fusion_lstm op with bf16 input X against the fp32 reference.

    The expected ``Hidden``/``Cell`` values come from the ``fusion_lstm``
    reference helper evaluated on fp32 data.  Subclasses override
    ``set_confs`` to change the attributes that ``setUp`` initialises.
    """

    def set_confs(self):
        # Hook for subclasses; called by setUp after the defaults are set.
        self.mkldnn_data_type = False

    def test_check_output(self):
        # Run the output check with use_seq both enabled and disabled.
        # "Cell" is excluded from the comparison.
        for use_seq in {True, False}:
            self.attrs['use_seq'] = use_seq
            self.check_output(check_dygraph=False, no_check_set=["Cell"])

    def setUp(self):
        # Default configuration; set_confs() may override any of these.
        self.op_type = 'fusion_lstm'
        self.lod = [[2, 3, 5, 4]]
        self.M = 8
        self.D = 16
        self.has_initial_state = False
        self.use_peepholes = False
        self.is_reverse = False
        self._cpu_only = True
        self.act_gate = 'sigmoid'
        self.act_cell = 'tanh'
        self.act_cand = 'tanh'
        self.use_mkldnn = True
        self.force_fp32_output = False
        self.weights_dtype = 'fp32'
        self.set_confs()

        T = sum(self.lod[0])
        bs = len(self.lod[0])

        # fp32 X input for the reference implementation and the
        # corresponding bf16 data as input to the LSTM oneDNN bf16 kernel
        x = np.random.normal(size=(T, self.M)).astype('float32')
        x_bf16 = convert_float_to_uint16(x)

        if self.has_initial_state:
            h0 = np.random.normal(size=(bs, self.D)).astype('float32')
            c0 = np.random.normal(size=(bs, self.D)).astype('float32')
        else:
            h0 = np.zeros((bs, self.D)).astype('float32')
            c0 = np.zeros((bs, self.D)).astype('float32')

        wh = np.random.normal(size=(self.D, 4 * self.D)).astype('float32')

        h0_bf16 = convert_float_to_uint16(h0)

        # With peepholes the bias carries 3 * D extra columns after the
        # 4 * D gate columns.
        if self.use_peepholes:
            b = np.random.normal(size=(1, 7 * self.D)).astype('float32')
        else:
            b = np.random.normal(size=(1, 4 * self.D)).astype('float32')

        # w_b is copied before bx is folded into b below, so the reference
        # call receives the gate bias without bx.
        w_b = np.copy(b[:, 0:4 * self.D])
        w_c = b[:, 4 * self.D:] if self.use_peepholes else None

        wx = np.random.normal(size=(self.M, 4 * self.D)).astype('float32')

        wx_bf16 = convert_float_to_uint16(wx)
        wh_bf16 = convert_float_to_uint16(wh)

        bx = np.random.normal(size=(1, 4 * self.D)).astype('float32')
        b[0, 0:4 * self.D] += bx[0, :]

        hidden, c = fusion_lstm(x, self.lod, wx, bx, h0, c0, wh, w_b, w_c,
                                self.is_reverse, ACTIVATION[self.act_gate],
                                ACTIVATION[self.act_cell],
                                ACTIVATION[self.act_cand])

        hidden = hidden.astype('float32')

        # Pick the weight / H0 representation matching weights_dtype.
        # X is always bf16 and Bias is always fp32.
        if self.weights_dtype == 'bf16':
            weight_x, weight_h, initial_h = wx_bf16, wh_bf16, h0_bf16
        elif self.weights_dtype == 'fp32':
            weight_x, weight_h, initial_h = wx, wh, h0
        else:
            # Fail here instead of leaving self.inputs undefined, which
            # would surface later as an unrelated AttributeError.
            raise ValueError("Unsupported weights_dtype: %r" %
                             (self.weights_dtype, ))

        self.inputs = {
            'X': (x_bf16, self.lod),
            'WeightX': weight_x,
            'WeightH': weight_h,
            'Bias': b
        }

        if self.has_initial_state:
            self.inputs['H0'] = initial_h
            self.inputs['C0'] = c0

        self.outputs = {
            'Hidden': (hidden, self.lod),
            'Cell': (c, self.lod),
        }

        self.attrs = {
            'use_peepholes': self.use_peepholes,
            'is_reverse': self.is_reverse,
            'gate_activation': self.act_gate,
            'cell_activation': self.act_cell,
            'candidate_activation': self.act_cand,
            'force_fp32_output': self.force_fp32_output,
            'use_mkldnn': self.use_mkldnn
        }
class TestFusionLSTMBF16ONEDNNPeepholesOp(TestFusionLSTMBF16ONEDNNOp):
    # Variant of the base test with use_peepholes switched on.
    def set_confs(self):
        self.use_peepholes = True
class TestFusionLSTMBF16ONEDNNInitializedStateOp(TestFusionLSTMBF16ONEDNNOp):
    # Variant of the base test that supplies H0/C0 initial states.
    def set_confs(self):
        self.has_initial_state = True
class TestFusionLSTMBF16ONEDNNReverseOp(TestFusionLSTMBF16ONEDNNOp):
    # Variant of the base test with is_reverse switched on.
    def set_confs(self):
        self.is_reverse = True
class TestFusionLSTMBF16ONEDNNBF16WeightsOp(TestFusionLSTMBF16ONEDNNOp):
    # Variant of the base test that feeds WeightX/WeightH as bf16.
    def set_confs(self):
        self.weights_dtype = 'bf16'
if __name__ == "__main__":
    # Switch Paddle to static mode before running the tests in this module.
    from paddle import enable_static
    enable_static()
    unittest.main()
...@@ -235,6 +235,19 @@ def convert_float_to_uint16(float_list, data_format="NCHW"): ...@@ -235,6 +235,19 @@ def convert_float_to_uint16(float_list, data_format="NCHW"):
return new_output return new_output
def copy_bits_from_uint16_to_float(i):
    """Reinterpret a 16-bit pattern as the upper half of a 32-bit float.

    The value is shifted left by 16 bits, packed as a little-endian
    unsigned 32-bit integer, and the same four bytes are unpacked as a
    little-endian float.  Returns a Python float.
    """
    widened = np.uint32(i) << 16
    raw_bytes = struct.pack('<I', widened)
    (value, ) = struct.unpack('<f', raw_bytes)
    return value
def convert_uint16_to_float(uint16_list):
    """Convert an array of bf16 bit patterns (stored as uint16) to float32.

    Each 16-bit value becomes the upper 16 bits of a float32 whose lower
    16 bits are zero.  The result has the same shape as the input.

    The conversion is a vectorised bit cast, so it also handles empty
    arrays (np.nditer rejects zero-size operands by default) and avoids a
    per-element Python loop.

    Args:
        uint16_list: array-like of uint16 bit patterns.

    Returns:
        numpy.ndarray of dtype float32 with the input's shape.
    """
    # astype() always copies here, so the in-place shift never touches the
    # caller's data.
    bits = np.asarray(uint16_list).astype(np.uint32)
    np.left_shift(bits, 16, out=bits)
    # uint32 and float32 share an item size, so view() keeps the shape.
    return bits.view(np.float32)
class OpTest(unittest.TestCase): class OpTest(unittest.TestCase):
@classmethod @classmethod
def setUpClass(cls): def setUpClass(cls):
...@@ -1143,8 +1156,14 @@ class OpTest(unittest.TestCase): ...@@ -1143,8 +1156,14 @@ class OpTest(unittest.TestCase):
idx = find_actual(out_name, fetch_list) idx = find_actual(out_name, fetch_list)
actual = outs[idx] actual = outs[idx]
actual_t = np.array(actual) actual_t = np.array(actual)
expect = self.outputs[out_name] expect = self.outputs[out_name]
expect_t = expect[0] if isinstance(expect, tuple) else expect expect_t = expect[0] if isinstance(expect, tuple) else expect
if actual_t.dtype == np.uint16 and expect_t.dtype == np.float32:
actual_t = convert_uint16_to_float(actual_t)
atol = 0.03
self.assertTrue( self.assertTrue(
np.allclose( np.allclose(
actual_t, expect_t, atol=atol, equal_nan=equal_nan), actual_t, expect_t, atol=atol, equal_nan=equal_nan),
......
...@@ -602,8 +602,10 @@ STATIC_MODE_TESTING_LIST = [ ...@@ -602,8 +602,10 @@ STATIC_MODE_TESTING_LIST = [
'test_nearest_interp_mkldnn_op', 'test_nearest_interp_mkldnn_op',
'test_bilinear_interp_mkldnn_op', 'test_bilinear_interp_mkldnn_op',
'test_fusion_gru_int8_mkldnn_op', 'test_fusion_gru_int8_mkldnn_op',
'test_fusion_gru_bf16_mkldnn_op',
'test_fusion_gru_mkldnn_op', 'test_fusion_gru_mkldnn_op',
'test_fusion_lstm_mkldnn_op', 'test_fusion_lstm_mkldnn_op',
'test_fusion_lstm_bf16_mkldnn_op',
'test_gaussian_random_mkldnn_op', 'test_gaussian_random_mkldnn_op',
'test_lrn_mkldnn_op', 'test_lrn_mkldnn_op',
'test_matmul_mkldnn_op', 'test_matmul_mkldnn_op',
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册