Unverified · Commit 6d8d3d4c authored by Jacek Czaja, committed by GitHub

[oneDNN] Layer norm bf16 kernel (#28619)

Parent cdc4e662
...@@ -2102,8 +2102,8 @@ PDNode *patterns::Bfloat16Placement::operator()(
    const std::unordered_set<std::string> &bfloat16_enabled_op_types) {
  std::unordered_set<std::string> supported_op_types =
      std::unordered_set<std::string>({"concat", "conv2d", "fusion_gru", "gelu",
                                       "layer_norm", "reshape2", "softmax",
                                       "sum", "transpose2"});
  if (!bfloat16_enabled_op_types.empty()) {
    supported_op_types = bfloat16_enabled_op_types;
  }
......
...@@ -15,6 +15,10 @@ limitations under the License. */
#include "paddle/fluid/operators/layer_norm_op.h"
#include <memory>
#ifdef PADDLE_WITH_MKLDNN
#include "paddle/fluid/platform/mkldnn_helper.h"
#endif
namespace paddle {
namespace operators {
...@@ -91,6 +95,25 @@ class LayerNormOp : public framework::OperatorWithKernel {
    ctx->SetOutputDim("Variance", {left});
    ctx->ShareLoD("X", "Y");
  }
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const {
framework::LibraryType library = framework::LibraryType::kPlain;
framework::DataLayout layout = framework::DataLayout::kAnyLayout;
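    // Dispatch to the oneDNN (MKL-DNN) kernel when the op carries
    // use_mkldnn=true and the current place allows it; otherwise keep the
    // plain kernel selected below.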
#ifdef PADDLE_WITH_MKLDNN
if (library == framework::LibraryType::kPlain &&
platform::CanMKLDNNBeUsed(ctx)) {
library = framework::LibraryType::kMKLDNN;
layout = framework::DataLayout::kMKLDNN;
}
#endif
return framework::OpKernelType(
OperatorWithKernel::IndicateVarDataType(ctx, "X"), ctx.GetPlace(),
layout, library);
}
};

class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
...@@ -134,6 +157,18 @@ class LayerNormOpMaker : public framework::OpProtoAndCheckerMaker {
                              "greater than zero. But received [%d].",
                              begin_norm_axis));
        });
AddAttr<bool>("use_mkldnn",
"(bool, default false) Only used in mkldnn kernel")
.SetDefault(false);
AddAttr<std::string>(
"mkldnn_data_type",
"(string, default \"float32\"). Data type of mkldnn kernel")
.SetDefault("float32")
.InEnum({"float32", "bfloat16"});
AddAttr<bool>("is_test",
"(bool, default false) Set to true for inference only, false "
"for training. Some layers may run faster when this is true.")
.SetDefault(false);
    AddComment(R"DOC(
Assume feature vectors exist on dimensions
......
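For orientation: the attributes added above (use_mkldnn, mkldnn_data_type, is_test) are what the oneDNN passes and kernels read. Below is a minimal, hypothetical sketch (not part of the patch) that sets them by hand on a layer_norm op in a static program; in a real inference pipeline the bfloat16 placement pass shown earlier is what marks the op.

import paddle.fluid as fluid
from paddle import enable_static

enable_static()
prog = fluid.Program()
with fluid.program_guard(prog):
    block = prog.global_block()
    x = block.create_var(name='x', dtype='float32', shape=[2, 3, 4, 5])
    y = block.create_var(name='y', dtype='float32', shape=[2, 3, 4, 5])
    mean = block.create_var(name='mean', dtype='float32', shape=[24])
    variance = block.create_var(name='variance', dtype='float32', shape=[24])
    op = block.append_op(
        type="layer_norm",
        inputs={"X": x},
        outputs={"Y": y, "Mean": mean, "Variance": variance},
        attrs={
            "epsilon": 1e-5,
            "begin_norm_axis": 3,
            "use_mkldnn": True,              # ask for the oneDNN kernel
            "mkldnn_data_type": "bfloat16",  # normally set by the placement pass
            "is_test": True,
        })
    print(op.attr("mkldnn_data_type"))  # -> "bfloat16"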
/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/operators/layer_norm_op.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
namespace operators {
template <typename T>
class LayerNormMKLDNNHandler
: public platform::MKLDNNHandlerT<T, dnnl::layer_normalization_forward> {
public:
LayerNormMKLDNNHandler(const std::vector<int64_t>& dims, const float& epsilon,
const dnnl::normalization_flags& flags,
const bool& is_test, const MKLDNNMemoryFormat fmt,
const platform::MKLDNNDeviceContext& dev_ctx,
platform::Place cpu_place,
const std::string& uniq_name)
: platform::MKLDNNHandlerT<T, dnnl::layer_normalization_forward>(
dev_ctx, dev_ctx.GetEngine(), cpu_place,
platform::CreateKey(dims, uniq_name)) {
if (!this->isCached()) {
auto md = dnnl::memory::desc(dims, platform::MKLDNNGetDataType<T>(), fmt);
if (!is_test) {
// TODO(grygielski) Delete forcing stats_md after DNNL 1.2 is introduced
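        // Mean/variance are computed per normalized instance, so their
        // descriptor drops the last (normalized) dimension of `dims`.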
auto stats_md = dnnl::memory::desc(
{begin(dims), end(dims) - 1}, platform::MKLDNNGetDataType<float>(),
platform::MKLDNNFormatForSize(dims.size() - 1,
MKLDNNMemoryFormat::nchw));
this->AcquireForwardPrimitiveDescriptor(
dnnl::prop_kind::forward_training, md, stats_md, epsilon, flags);
} else {
this->AcquireForwardPrimitiveDescriptor(
dnnl::prop_kind::forward_inference, md, epsilon, flags);
}
}
}
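  // Lookup-only variant: returns the scale/shift memory already cached under
  // this handler's key, or nullptr if it has not been created yet.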
std::shared_ptr<dnnl::memory> AcquireScaleShiftMemory() {
return this->AcquireMemoryFromPrimitive("@scaleshift_mem_p");
}
std::shared_ptr<dnnl::memory> AcquireScaleShiftMemory(
std::vector<float>& scaleshift_data) {
    // scaleshift_data comes from a temporary buffer, so we need to copy it
    // into the created memory primitive
auto scaleshift_mem = this->AcquireMemoryFromPrimitive(
this->fwd_pd_->weights_desc(), "@scaleshift_mem_p");
auto data_ptr = scaleshift_mem->get_data_handle();
std::size_t num_bytes = scaleshift_data.size() * sizeof(float);
std::memcpy(data_ptr, scaleshift_data.data(), num_bytes);
return scaleshift_mem;
}
std::shared_ptr<dnnl::memory> AcquireMeanMemory(framework::Tensor* mean) {
T* mean_data = mean->mutable_data<T>(this->place_,
this->fwd_pd_->mean_desc().get_size());
return this->AcquireMemoryFromPrimitive(this->fwd_pd_->mean_desc(),
mean_data, "@mean_mem_p");
}
std::shared_ptr<dnnl::memory> AcquireVarianceMemory(
framework::Tensor* variance) {
T* variance_data = variance->mutable_data<T>(
this->place_, this->fwd_pd_->variance_desc().get_size());
return this->AcquireMemoryFromPrimitive(this->fwd_pd_->variance_desc(),
variance_data, "@variance_mem_p");
}
};
template <typename T>
class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
auto* x = ctx.Input<Tensor>("X");
auto* scale = ctx.Input<Tensor>("Scale");
auto* bias = ctx.Input<Tensor>("Bias");
auto* y = ctx.Output<Tensor>("Y");
const float epsilon = ctx.Attr<float>("epsilon");
const auto begin_norm_axis = ctx.Attr<int>("begin_norm_axis");
const bool is_test = ctx.Attr<bool>("is_test");
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
auto src_tz = paddle::framework::vectorize(x->dims());
PADDLE_ENFORCE_EQ(begin_norm_axis, (src_tz.size() - 1),
platform::errors::InvalidArgument(
"MKL-DNN Layer Norm supports only last logical "
"axis:%d as begin_norm_axis.",
(src_tz.size() - 1)));
y->mutable_data<T>(ctx.GetPlace());
const bool with_scaleshift = (scale && bias);
dnnl::normalization_flags flags{};
if (with_scaleshift) {
flags |= dnnl::normalization_flags::use_scale_shift;
}
LayerNormMKLDNNHandler<T> handler(src_tz, epsilon, flags, is_test,
x->format(), dev_ctx, ctx.GetPlace(),
ctx.OutputName("Y"));
auto src_memory = handler.AcquireSrcMemory(x);
auto dst_memory = handler.AcquireDstMemory(y);
auto layer_norm_p = handler.AcquireForwardPrimitive();
dnnl::stream astream(dev_ctx.GetEngine());
std::unordered_map<int, dnnl::memory> args;
args.insert({DNNL_ARG_SRC, *src_memory});
args.insert({DNNL_ARG_DST, *dst_memory});
if (!is_test) {
auto* mean = ctx.Output<Tensor>("Mean");
auto* var = ctx.Output<Tensor>("Variance");
mean->mutable_data<T>(ctx.GetPlace());
var->mutable_data<T>(ctx.GetPlace());
auto mean_memory = handler.AcquireMeanMemory(mean);
auto variance_memory = handler.AcquireVarianceMemory(var);
args.insert({DNNL_ARG_MEAN, *mean_memory});
args.insert({DNNL_ARG_VARIANCE, *variance_memory});
}
auto scaleshift_memory = handler.AcquireScaleShiftMemory();
if (with_scaleshift) {
if (scaleshift_memory == nullptr || !is_test) {
auto scale_tz = paddle::framework::vectorize(scale->dims());
const unsigned int C = scale_tz[0];
// MKLDNN requires a single piece of memory for scale and shift/bias
// data
std::vector<float> scaleshift_data;
scaleshift_data.reserve(2 * C);
scaleshift_data.insert(scaleshift_data.begin(), scale->data<float>(),
scale->data<float>() + C);
scaleshift_data.insert(scaleshift_data.end(), bias->data<float>(),
bias->data<float>() + C);
scaleshift_memory = handler.AcquireScaleShiftMemory(scaleshift_data);
}
args.insert({DNNL_ARG_SCALE_SHIFT, *scaleshift_memory});
}
layer_norm_p->execute(astream, args);
astream.wait();
y->set_layout(DataLayout::kMKLDNN);
y->set_format(platform::GetMKLDNNFormat(*dst_memory));
}
};
} // namespace operators
} // namespace paddle
// TODO(jczaja): Enable FP32 when performance is good
namespace ops = paddle::operators;
REGISTER_OP_KERNEL(layer_norm, MKLDNN, ::paddle::platform::CPUPlace,
ops::LayerNormMKLDNNOpKernel<paddle::platform::bfloat16>);
...@@ -190,6 +190,12 @@ class MKLDNNHandlerT {
    }
  }
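  // Lookup-only overload: fetches the memory blob cached under key_ + suffix
  // without creating it (returns nullptr if absent).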
std::shared_ptr<mkldnn::memory> AcquireMemoryFromPrimitive(
const std::string& suffix) {
return std::static_pointer_cast<mkldnn::memory>(
dev_ctx_.GetBlob(key_ + suffix));
}
  std::shared_ptr<mkldnn::memory> AcquireMemoryFromPrimitive(
      mkldnn::memory::desc md, void* ptr, const std::string& suffix) {
    const auto local_key = key_ + suffix;
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# from paddle.fluid.tests.unittests.test_layer_norm_op import *
from __future__ import print_function
import unittest
import numpy as np
from operator import mul
import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle import enable_static
from functools import reduce
from paddle.fluid.tests.unittests.mkldnn.test_layer_norm_mkldnn_op import TestLayerNormMKLDNNOp
from paddle.fluid.tests.unittests.mkldnn.test_layer_norm_mkldnn_op import _reference_layer_norm_naive
from paddle.fluid.tests.unittests.op_test import OpTest, convert_float_to_uint16
from paddle.fluid.tests.unittests.op_test import _set_use_system_allocator
np.random.seed(123)
_set_use_system_allocator(True)
@unittest.skipIf(not core.supports_bfloat16(),
"place does not support BF16 evaluation")
class TestLayerNormBF16MKLDNNOp(TestLayerNormMKLDNNOp):
def __assert_close(self, tensor, np_array, msg, rtol=2e-02, atol=2):
self.assertTrue(
np.allclose(
np.array(tensor), np_array, rtol=rtol, atol=atol), msg)
def check_forward(self,
shape,
begin_norm_axis,
with_scale_bias=True,
with_is_test=False):
# attr
epsilon = 0.00001
x_shape = shape
D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
scale_shape = [D]
np.random.seed(123)
x = np.random.random_sample(x_shape).astype(np.float32)
x_bf16 = convert_float_to_uint16(x)
if with_scale_bias:
scale = np.random.random_sample(scale_shape).astype(np.float32)
bias = np.random.random_sample(scale_shape).astype(np.float32)
else:
scale = np.array([])
bias = np.array([])
# reference forward & backward
y, mean, variance = _reference_layer_norm_naive(x, scale, bias, epsilon,
begin_norm_axis)
y_bf16 = convert_float_to_uint16(y)
var_dict = locals()
var_names = ['x_bf16', 'mean', 'variance', 'y_bf16']
if with_scale_bias:
var_names.append('scale')
var_names.append('bias')
ground_truth = {name: var_dict[name] for name in var_names}
program = fluid.Program()
with fluid.program_guard(program):
block = program.global_block()
            # x and y are bf16 (stored as uint16); scale, bias, mean and
            # variance stay fp32
for name in ground_truth:
if name == 'x_bf16' or name == 'y_bf16':
block.create_var(
name=name,
dtype='uint16',
shape=ground_truth[name].shape)
else:
block.create_var(
name=name,
dtype='float32',
shape=ground_truth[name].shape)
inputs = {"X": block.var('x_bf16')}
if with_scale_bias:
inputs["Scale"] = block.var('scale')
inputs["Bias"] = block.var('bias')
block.append_op(
type="layer_norm",
inputs=inputs,
outputs={
"Y": block.var('y_bf16'),
"Mean": block.var('mean'), # share the same memory
"Variance": block.var('variance'), # share the same memory
},
attrs={
"epsilon": epsilon,
"begin_norm_axis": begin_norm_axis,
"use_mkldnn": True,
"is_test": with_is_test
})
exe = fluid.Executor(core.CPUPlace())
input_list = ['x_bf16']
if with_scale_bias:
input_list.append('scale')
input_list.append('bias')
out = exe.run(program,
feed={name: var_dict[name]
for name in input_list},
fetch_list=['y_bf16', 'mean', 'variance'])
self.__assert_close(y_bf16, out[0], "y_bf16", 2)
if not with_is_test:
self.__assert_close(mean, out[1], "mean")
self.__assert_close(variance, out[2], "variance", 1e-3)
def test_check_forward_with_is_test(self):
self.check_forward(
shape=[2, 3, 4, 5], begin_norm_axis=3, with_is_test=True)
    # TODO(jczaja): Enable these tests once bf16 training is enabled
def test_check_forward_with_scale_and_bias(self):
pass
def test_check_forward_without_scale_and_bias(self):
pass
if __name__ == "__main__":
enable_static()
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# from paddle.fluid.tests.unittests.test_layer_norm_op import *
from __future__ import print_function
import unittest
import numpy as np
from operator import mul
import paddle.fluid.core as core
import paddle.fluid as fluid
from paddle import enable_static
from functools import reduce
from paddle.fluid.tests.unittests.op_test import _set_use_system_allocator
np.random.seed(123)
_set_use_system_allocator(True)
def _reference_layer_norm_naive(x, scale, beta, epsilon, begin_norm_axis=1):
x_shape = x.shape
N = reduce(mul, x_shape[0:begin_norm_axis], 1)
D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
x.shape = [N, D]
if scale.size == 0 and beta.size == 0:
scale = np.ones([1, D])
beta = np.zeros([1, D])
else:
scale = scale.reshape([1, D])
beta = beta.reshape([1, D])
mean = np.mean(x, axis=1)
var = np.var(x, axis=1) + epsilon
output = scale * np.divide((x - mean.reshape([N, 1])),
(np.sqrt(var)).reshape([N, 1])) + beta
x.shape, output.shape = x_shape, x_shape
return output, mean, var
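# The reference above flattens x to [N, D] (N = product of the leading axes,
# D = product of the normalized axes) and computes, per row,
#   y = scale * (x - mean) / sqrt(var + epsilon) + beta
# with mean/var taken over the D normalized elements of that row.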
class TestLayerNormMKLDNNOp(unittest.TestCase):
def setUp(self):
self.use_mkldnn = True
def __assert_close(self, tensor, np_array, msg, atol=1e-4):
self.assertTrue(np.allclose(np.array(tensor), np_array, atol=atol), msg)
def check_forward(self,
shape,
begin_norm_axis,
with_scale_bias=True,
with_is_test=False):
# attr
epsilon = 0.00001
x_shape = shape
D = reduce(mul, x_shape[begin_norm_axis:len(x_shape)], 1)
scale_shape = [D]
np.random.seed(123)
x = np.random.random_sample(x_shape).astype(np.float32)
if with_scale_bias:
scale = np.random.random_sample(scale_shape).astype(np.float32)
bias = np.random.random_sample(scale_shape).astype(np.float32)
else:
scale = np.array([])
bias = np.array([])
# reference forward & backward
y, mean, variance = _reference_layer_norm_naive(x, scale, bias, epsilon,
begin_norm_axis)
var_dict = locals()
var_names = ['x', 'mean', 'variance', 'y']
if with_scale_bias:
var_names.append('scale')
var_names.append('bias')
ground_truth = {name: var_dict[name] for name in var_names}
program = fluid.Program()
with fluid.program_guard(program):
block = program.global_block()
for name in ground_truth:
block.create_var(
name=name, dtype='float32', shape=ground_truth[name].shape)
inputs = {"X": block.var('x')}
if with_scale_bias:
inputs["Scale"] = block.var('scale')
inputs["Bias"] = block.var('bias')
block.append_op(
type="layer_norm",
inputs=inputs,
outputs={
"Y": block.var('y'),
"Mean": block.var('mean'), # share the same memory
"Variance": block.var('variance'), # share the same memory
},
attrs={
"epsilon": epsilon,
"begin_norm_axis": begin_norm_axis,
"use_mkldnn": True,
"is_test": with_is_test
})
exe = fluid.Executor(core.CPUPlace())
input_list = ['x']
if with_scale_bias:
input_list.append('scale')
input_list.append('bias')
out = exe.run(program,
feed={name: var_dict[name]
for name in input_list},
fetch_list=['y', 'mean', 'variance'])
self.__assert_close(y, out[0], "y")
if not with_is_test:
self.__assert_close(mean, out[1], "mean")
self.__assert_close(variance, out[2], "variance", 1e-3)
def test_check_forward_with_scale_and_bias(self):
self.check_forward(shape=[2, 3, 4, 5], begin_norm_axis=3)
def test_check_forward_without_scale_and_bias(self):
self.check_forward(
shape=[2, 3, 4, 5], begin_norm_axis=3, with_scale_bias=False)
def test_check_forward_with_is_test(self):
self.check_forward(
shape=[2, 3, 4, 5], begin_norm_axis=3, with_is_test=True)
if __name__ == "__main__":
enable_static()
unittest.main()
...@@ -25,7 +25,7 @@ import paddle.fluid.op as fluid_op
@unittest.skipIf(not core.supports_bfloat16(),
                 "place does not support BF16 evaluation")
class TestSumBF16MKLDNN(TestSumOp):
    def setUp(self):
        self.op_type = "sum"
        self.use_mkldnn = True
......
...@@ -117,8 +117,12 @@ class TestLayerNormOp(unittest.TestCase):
                               begin_norm_axis,
                               has_scale=True,
                               has_bias=True,
                               y_grad_scale=1.0,
                               use_mkldnn=False):
        def test_with_place(place,
                            shape,
                            begin_norm_axis,
                            use_mkldnn=use_mkldnn):
            # attr
            epsilon = 0.00001
            x_shape = shape
...@@ -181,7 +185,8 @@ class TestLayerNormOp(unittest.TestCase):
                },
                attrs={
                    "epsilon": epsilon,
                    "begin_norm_axis": begin_norm_axis,
                    "use_mkldnn": use_mkldnn
                })
            # generate backward op_desc
            grad_op_desc_list, op_grad_to_var = core.get_grad_op_desc(
......
...@@ -293,6 +293,8 @@ STATIC_MODE_TESTING_LIST = [
    'test_label_smooth_op',
    'test_lamb_op',
    'test_layer_norm_op',
    'test_layer_norm_mkldnn_op',
    'test_layer_norm_bf16_mkldnn_op',
    'test_layer_norm_op_v2',
    'test_learning_rate_scheduler',
    'test_linear_interp_op',
......
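Both new suites are registered for static mode above. As a rough illustration (assuming a oneDNN-enabled build and the test-module paths used in the tests), they can also be run directly with unittest:

import unittest
from paddle import enable_static
from paddle.fluid.tests.unittests.mkldnn import test_layer_norm_mkldnn_op as fp32_tests
from paddle.fluid.tests.unittests.mkldnn import test_layer_norm_bf16_mkldnn_op as bf16_tests

enable_static()  # the tests build static programs
loader = unittest.TestLoader()
suite = unittest.TestSuite()
suite.addTests(loader.loadTestsFromModule(fp32_tests))
suite.addTests(loader.loadTestsFromModule(bf16_tests))
unittest.TextTestRunner(verbosity=2).run(suite)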