Unverified commit 070cab11, authored by J jakpiase, committed by GitHub

Added slice BF16/FP32 FWD/BWD kernels (#34332)

* added slice FWD FP32

* added tests for slice FWD FP32

* added slice bwd

* added bf16 tests

* CI fix

* CI fix

* added reason to skip_if

* minor change

* temporary fix for failing test

* temporary fix

* changes after review

* CI rerun
Parent a647b80a
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
namespace operators {
using paddle::framework::Tensor;
template <typename T>
class SliceMKLDNNKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    this->RunKernel(ctx);
  }

  void RunKernel(const framework::ExecutionContext& ctx) const {
    const auto& dev_ctx =
        ctx.template device_context<platform::MKLDNNDeviceContext>();
    const auto& onednn_engine = dev_ctx.GetEngine();

    auto* x = ctx.Input<Tensor>("Input");
    auto* out = ctx.Output<Tensor>("Out");

    auto x_vec_dims = framework::vectorize(x->dims());
    auto out_vec_dims = framework::vectorize(out->dims());

    auto axes_int = ctx.Attr<std::vector<int>>("axes");
    auto starts_int = ctx.Attr<std::vector<int>>("starts");
    auto ends_int = ctx.Attr<std::vector<int>>("ends");

    std::vector<int64_t> axes(axes_int.begin(), axes_int.end());
    std::vector<int64_t> starts(starts_int.begin(), starts_int.end());
    std::vector<int64_t> ends(ends_int.begin(), ends_int.end());

    auto decrease_axis = ctx.Attr<std::vector<int>>("decrease_axis");

    // Normalize negative starts/ends (Python-style indexing), clamp ends to
    // the dimension size, and derive per-axis offsets and sliced extents.
    std::vector<int64_t> offsets(x_vec_dims.size(), 0);
    std::vector<int64_t> slice_dims(x_vec_dims);

    for (size_t i = 0; i < axes.size(); ++i) {
      starts[i] = starts[i] < 0 ? x_vec_dims[axes[i]] + starts[i] : starts[i];
      ends[i] = ends[i] < 0 ? x_vec_dims[axes[i]] + ends[i]
                            : std::min(ends[i], x_vec_dims[axes[i]]);
      offsets[axes[i]] = starts[i];
      slice_dims[axes[i]] = ends[i] - starts[i];
    }

    mkldnn::memory::data_type x_type = framework::ToMKLDNNDataType(x->type());
    auto key = platform::CreateKey(dev_ctx, x_vec_dims, axes, starts, ends,
                                   x->format(), x_type);

    // The slice is expressed as an oneDNN reorder: a sub-memory view of the
    // source covering [offsets, offsets + slice_dims) is copied into dst.
    platform::ReorderMKLDNNHandler reorder_handler(
        x_vec_dims, x->type(), x_type, dev_ctx, onednn_engine, key);

    auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory(
        x->format(), platform::to_void_cast(x->data<T>()));
    auto slice_mem_p = reorder_handler.AcquireSubmemory(slice_dims, offsets,
                                                        reorder_src_memory_p);
    auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory(
        out, slice_dims, 0, x->format(), ctx.GetPlace());

    auto reorder_p =
        reorder_handler.AcquireReorder(reorder_dst_memory_p, slice_mem_p);
    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
    reorder_p->execute(astream, *slice_mem_p, *reorder_dst_memory_p);
    astream.wait();

    out->set_layout(framework::DataLayout::kMKLDNN);
    out->set_format(platform::GetMKLDNNFormat(
        reorder_dst_memory_p->get_desc().reshape(out_vec_dims)));
  }
};
template <typename T>
class SliceGradMKLDNNKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& ctx) const override {
    this->RunKernel(ctx);
  }

  void RunKernel(const framework::ExecutionContext& ctx) const {
    const auto& dev_ctx =
        ctx.template device_context<platform::MKLDNNDeviceContext>();
    const auto& onednn_engine = dev_ctx.GetEngine();

    auto* dout = ctx.Input<Tensor>(framework::GradVarName("Out"));
    auto* dx = ctx.Output<Tensor>(framework::GradVarName("Input"));

    auto dx_vec_dims = framework::vectorize(dx->dims());
    auto dout_vec_dims = framework::vectorize(dout->dims());

    auto axes_int = ctx.Attr<std::vector<int>>("axes");
    auto starts_int = ctx.Attr<std::vector<int>>("starts");
    auto ends_int = ctx.Attr<std::vector<int>>("ends");

    std::vector<int64_t> axes(axes_int.begin(), axes_int.end());
    std::vector<int64_t> starts(starts_int.begin(), starts_int.end());
    std::vector<int64_t> ends(ends_int.begin(), ends_int.end());

    auto decrease_axis = ctx.Attr<std::vector<int>>("decrease_axis");

    std::vector<int64_t> offsets(dx_vec_dims.size(), 0);
    std::vector<int64_t> slice_dims(dx_vec_dims);

    for (size_t i = 0; i < axes.size(); ++i) {
      starts[i] = starts[i] < 0 ? dx_vec_dims[axes[i]] + starts[i] : starts[i];
      ends[i] = ends[i] < 0 ? dx_vec_dims[axes[i]] + ends[i]
                            : std::min(ends[i], dx_vec_dims[axes[i]]);
      offsets[axes[i]] = starts[i];
      slice_dims[axes[i]] = ends[i] - starts[i];
    }

    mkldnn::memory::data_type dout_type =
        framework::ToMKLDNNDataType(dout->type());
    mkldnn::memory::desc md(dout_vec_dims, platform::MKLDNNGetDataType<T>(),
                            dout->format());
    mkldnn::memory::format_tag reorder_format_tag =
        platform::GetMKLDNNFormat(md.reshape(slice_dims));

    auto key = platform::CreateKey(dev_ctx, dout_vec_dims, axes, starts, ends,
                                   reorder_format_tag, dout_type);

    platform::ReorderMKLDNNHandler reorder_handler(
        slice_dims, dout->type(), dout_type, dev_ctx, onednn_engine, key);

    auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory(
        reorder_format_tag, platform::to_void_cast(dout->data<T>()));
    auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory(
        dx, dx_vec_dims, 0, reorder_format_tag, ctx.GetPlace());

    // The gradient is a scatter: dx is zero-filled, then dout is reordered
    // into the sub-memory view of dx that corresponds to the sliced region.
    memset(dx->data<T>(), 0, reorder_dst_memory_p->get_desc().get_size());

    auto slice_mem_p = reorder_handler.AcquireSubmemory(slice_dims, offsets,
                                                        reorder_dst_memory_p);

    auto reorder_p =
        reorder_handler.AcquireReorder(slice_mem_p, reorder_src_memory_p);
    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
    reorder_p->execute(astream, *reorder_src_memory_p, *slice_mem_p);
    astream.wait();

    dx->set_layout(framework::DataLayout::kMKLDNN);
    dx->set_format(reorder_format_tag);
  }
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;

REGISTER_OP_KERNEL(slice, MKLDNN, paddle::platform::CPUPlace,
                   ops::SliceMKLDNNKernel<float>,
                   ops::SliceMKLDNNKernel<paddle::platform::bfloat16>);

REGISTER_OP_KERNEL(slice_grad, MKLDNN, paddle::platform::CPUPlace,
                   ops::SliceGradMKLDNNKernel<float>,
                   ops::SliceGradMKLDNNKernel<paddle::platform::bfloat16>);
\ No newline at end of file
...
@@ -105,7 +105,7 @@ class SplitMKLDNNKernel : public framework::OpKernel<T> {
     for (size_t i = 0; i < outs_number; ++i) {
       auto out_vec_dims = framework::vectorize(outs[i]->dims());
-      auto slice_mem_p = reorder_handler.AcquireSrcSubmemory(
+      auto slice_mem_p = reorder_handler.AcquireSubmemory(
           out_vec_dims, offset, reorder_src_memory_p, i);
       auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory(
...
...
@@ -132,6 +132,26 @@ class SliceOp : public framework::OperatorWithKernel {
       if (platform::is_cuda_pinned_place(in_tensor.place())) {
         return framework::OpKernelType(in_tensor.type(), ctx.device_context());
       }
+
+#ifdef PADDLE_WITH_MKLDNN
+      auto input_data_type =
+          framework::OperatorWithKernel::IndicateVarDataType(ctx, "Input");
+
+      if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+        // OneDNN uses blocking format, which cannot always be supported with
+        // reorders, because if the blocked dimension is not divisible by 8 or
+        // 16 (depending on which blocking format is used) the submemory cannot
+        // be created, so in that scenario a fallback is needed
+        auto tmp_md = dnnl::memory::desc(
+            framework::vectorize(ctx.Input<Tensor>("Input")->dims()),
+            dnnl::memory::data_type::f32, ctx.Input<Tensor>("Input")->format());
+        if (tmp_md.data.format_desc.blocking.inner_nblks == 0)
+          return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                         framework::DataLayout::kMKLDNN,
+                                         framework::LibraryType::kMKLDNN);
+      }
+#endif
+
       return framework::OpKernelType(in_tensor.type(), in_tensor.place());
     }
     return framework::OpKernelType(
...
@@ -216,6 +236,14 @@ class SliceOpMaker : public framework::OpProtoAndCheckerMaker {
         .SetDefault({});
     AddAttr<std::vector<int>>("decrease_axis", "(list<int>) decrease_axis")
         .SetDefault({});
+    AddAttr<bool>("use_mkldnn",
+                  "(bool, default false) Only used in mkldnn kernel")
+        .SetDefault(false);
+    AddAttr<std::string>(
+        "mkldnn_data_type",
+        "(string, default \"float32\"). Data type of mkldnn kernel")
+        .SetDefault("float32")
+        .InEnum({"float32", "bfloat16"});
     AddComment(R"DOC(
 Slice Operator.
...
@@ -278,12 +306,32 @@ class SliceOpGrad : public framework::OperatorWithKernel {
       ctx->SetOutputDim(x_grad_name, x_dims);
     }
   }

   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext &ctx) const override {
-    return framework::OpKernelType(OperatorWithKernel::IndicateVarDataType(
-                                       ctx, framework::GradVarName("Out")),
-                                   ctx.device_context());
+    auto input_data_type = framework::OperatorWithKernel::IndicateVarDataType(
+        ctx, framework::GradVarName("Out"));
+
+#ifdef PADDLE_WITH_MKLDNN
+    if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
+      // OneDNN uses blocking format, which cannot always be supported with
+      // reorders, because if the blocked dimension is not divisible by 8 or
+      // 16 (depending on which blocking format is used) the submemory cannot
+      // be created, so in that scenario a fallback is needed
+      auto tmp_md = dnnl::memory::desc(
+          framework::vectorize(
+              ctx.Input<Tensor>(framework::GradVarName("Out"))->dims()),
+          dnnl::memory::data_type::f32,
+          ctx.Input<Tensor>(framework::GradVarName("Out"))->format());
+      if (tmp_md.data.format_desc.blocking.inner_nblks == 0)
+        return framework::OpKernelType(input_data_type, ctx.GetPlace(),
+                                       framework::DataLayout::kMKLDNN,
+                                       framework::LibraryType::kMKLDNN);
+    }
+#endif
+    return framework::OpKernelType(input_data_type, ctx.GetPlace());
   }

   framework::OpKernelType GetKernelTypeForVar(
       const std::string &var_name, const Tensor &tensor,
       const framework::OpKernelType &expected_kernel_type) const override {
...
...
@@ -78,10 +78,9 @@ class SplitOp : public framework::OperatorWithKernel {
 #ifdef PADDLE_WITH_MKLDNN
     if (this->CanMKLDNNBeUsed(ctx, input_data_type)) {
-      // OneDNN uses blocking format, which cannot be always
-      // supported with reorders, because if blocked dimension is not divisible
-      // by
-      // 8 or 16(depending on which blocking format is used) submemory cannot be
+      // OneDNN uses blocking format, which cannot always be supported with
+      // reorders, because if the blocked dimension is not divisible by 8 or
+      // 16 (depending on which blocking format is used) the submemory cannot be
       // created, so in that scenario a fallback is needed
       auto tmp_md = dnnl::memory::desc(
           framework::vectorize(ctx.Input<Tensor>("X")->dims()),
...
...
@@ -1090,9 +1090,9 @@ class ReorderMKLDNNHandler : public MKLDNNHandler {
     return this->AcquireMemory(dims_, dtype_, fmt, ptr, "@user_src_mem_p");
   }

-  std::shared_ptr<mkldnn::memory> AcquireSrcSubmemory(
+  std::shared_ptr<mkldnn::memory> AcquireSubmemory(
       const std::vector<int64_t>& dims, const std::vector<int64_t>& offset,
-      const std::shared_ptr<mkldnn::memory>& mem_p, int submemory_number) {
+      const std::shared_ptr<mkldnn::memory>& mem_p, int submemory_number = 0) {
     std::string local_key = key_;
     local_key.append("@submem")
         .append(std::to_string(submemory_number))
...
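For context, the sketch below shows roughly what AcquireSubmemory builds underneath with the raw oneDNN C++ API. It is not part of this commit, and make_submemory is a hypothetical helper; it also illustrates where the blocking-format limitation mentioned in the comments above comes from.

// Hypothetical sketch, not Paddle code: build a sub-memory view in oneDNN.
#include "dnnl.hpp"

dnnl::memory make_submemory(const dnnl::memory& src,
                            const dnnl::memory::dims& dims,
                            const dnnl::memory::dims& offsets) {
  // A sub-memory descriptor is a strided view into an existing memory at the
  // given offsets. For blocked layouts it fails when the sliced dimension is
  // not a multiple of the block size - hence the inner_nblks == 0 fallback
  // in the kernel-type selection above.
  auto sub_md = src.get_desc().submemory_desc(dims, offsets);
  return dnnl::memory(sub_md, src.get_engine(), src.get_data_handle());
}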
# Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
from paddle.fluid.tests.unittests.op_test import OpTest, OpTestTool, convert_float_to_uint16
import paddle.fluid as fluid
import paddle.fluid.layers as layers
import paddle
@OpTestTool.skip_if(core.is_compiled_with_cuda(),
                    "CUDA required dygraph so oneDNN UT must be skipped")
class TestSliceOneDNNOp(OpTest):
    def setUp(self):
        self.op_type = "slice"
        self.config()
        self.set_inputs()
        self.outputs = {'Out': self.out}
        self.attrs = {
            'axes': self.axes,
            'starts': self.starts,
            'ends': self.ends,
            'infer_flags': self.infer_flags,
            'use_mkldnn': True
        }
        self.set_attrs()

    def set_inputs(self):
        self.inputs = {'Input': self.input}

    def set_attrs(self):
        pass

    def config(self):
        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
        self.starts = [1, 0, 2]
        self.ends = [3, 3, 4]
        self.axes = [0, 1, 2]
        self.infer_flags = [1, 1, 1]
        self.out = self.input[1:3, 0:3, 2:4, :]

    def test_check_output(self):
        self.check_output()

    def test_check_grad(self):
        self.check_grad(['Input'], 'Out')
class TestSliceOneDNNOp1(TestSliceOneDNNOp):
    def config(self):
        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
        self.starts = [-3, 0, 2]
        self.ends = [3, 100, -1]
        self.axes = [0, 1, 2]
        self.infer_flags = [1, 1, 1]
        self.out = self.input[-3:3, 0:100, 2:-1, :]


class TestSliceOneDNNOp2(TestSliceOneDNNOp):
    def config(self):
        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
        self.starts = [-3, 0, 2]
        self.ends = [3, 100, -1]
        self.axes = [0, 1, 3]
        self.infer_flags = [1, 1, 1]
        self.out = self.input[-3:3, 0:100, :, 2:-1]


class TestSliceDecrease1AxisOneDNNOp(TestSliceOneDNNOp):
    def set_attrs(self):
        self.attrs['decrease_axis'] = self.decrease_axis

    def config(self):
        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
        self.starts = [1, 0, 2]
        self.ends = [2, 3, 4]
        self.axes = [0, 1, 2]
        self.decrease_axis = [0]
        self.infer_flags = [1, 1, 1]
        self.out = self.input[1, 0:3, 2:4, :]


class TestSliceDecrease2AxesOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
    def config(self):
        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
        self.starts = [1, 0, 2]
        self.ends = [2, 1, 4]
        self.axes = [0, 1, 2]
        self.decrease_axis = [0, 1]
        self.infer_flags = [1, 1, 1]
        self.out = self.input[1, 0, 2:4, :]


class TestSliceDecrease3AxesOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
    def config(self):
        self.input = np.random.random([3, 4, 5, 6]).astype("float32")
        self.starts = [-1, 0, 2]
        self.ends = [1000000, 1, 4]
        self.axes = [0, 1, 2]
        self.decrease_axis = [0, 1]
        self.infer_flags = [1, 1, 1]
        self.out = self.input[-1, 0, 2:4, :]


class TestSliceDecrease4AxesOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
    def config(self):
        self.input = np.random.random([3, 4, 5, 7]).astype("float32")
        self.starts = [0, 1, 2, 3]
        self.ends = [1, 2, 3, 4]
        self.axes = [0, 1, 2, 3]
        self.decrease_axis = [0, 1, 2, 3]
        self.infer_flags = [1, 1, 1]
        self.out = self.input[0, 1, 2, 3:4]


class TestSlice5DOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
    def config(self):
        self.input = np.random.random([3, 4, 5, 6, 7]).astype("float32")
        self.starts = [-1]
        self.ends = [1000000]
        self.axes = [4]
        self.decrease_axis = [4]
        self.infer_flags = [1, 1, 1]
        self.out = self.input[:, :, :, :, -1]


class TestSlice3DOneDNNOp(TestSliceDecrease1AxisOneDNNOp):
    def config(self):
        self.input = np.random.random([5, 4, 5]).astype("float32")
        self.starts = [-1]
        self.ends = [1000000]
        self.axes = [2]
        self.decrease_axis = [2]
        self.infer_flags = [1, 1, 1]
        self.out = self.input[:, :, -1]
# BF16 TESTS
def create_bf16_test_class(parent):
    @OpTestTool.skip_if_not_cpu_bf16()
    class TestSliceBF16OneDNNOp(parent):
        def set_inputs(self):
            self.dtype = np.uint16
            self.inputs = {'Input': convert_float_to_uint16(self.input)}

        def calculate_grads(self):
            # Reference gradient: dx is zero everywhere except the sliced
            # window, which receives dout unchanged.
            self.dout = self.out
            self.dx = np.zeros(shape=self.input.shape)

            begin = [None] * self.input.ndim
            end = [None] * self.input.ndim

            for i in range(len(self.axes)):
                begin[self.axes[i]] = self.starts[i]
                end[self.axes[i]] = self.ends[i]

            self.dx[begin[0]:end[0], begin[1]:end[1], begin[2]:end[2],
                    begin[3]:end[3]] = self.dout

        def test_check_output(self):
            self.check_output_with_place(core.CPUPlace())

        def test_check_grad(self):
            self.calculate_grads()
            self.check_grad_with_place(
                core.CPUPlace(), ["Input"],
                "Out",
                user_defined_grads=[self.dx],
                user_defined_grad_outputs=[convert_float_to_uint16(self.dout)])

    cls_name = "{0}_{1}".format(parent.__name__, "BF16")
    TestSliceBF16OneDNNOp.__name__ = cls_name
    globals()[cls_name] = TestSliceBF16OneDNNOp
create_bf16_test_class(TestSliceOneDNNOp)
create_bf16_test_class(TestSliceOneDNNOp1)
create_bf16_test_class(TestSliceDecrease1AxisOneDNNOp)
create_bf16_test_class(TestSliceDecrease2AxesOneDNNOp)
create_bf16_test_class(TestSliceDecrease3AxesOneDNNOp)
create_bf16_test_class(TestSliceDecrease4AxesOneDNNOp)

if __name__ == '__main__':
    paddle.enable_static()
    unittest.main()
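For reference, a minimal usage sketch (not part of this commit): in static mode the new kernels are reached through the regular paddle.slice API, and routing to oneDNN is governed by the use_mkldnn attribute, for example by running the script with the global FLAGS_use_mkldnn=1 flag.

# Minimal sketch: exercise the slice op in static mode; run with
# FLAGS_use_mkldnn=1 to route it through the oneDNN kernel added above.
import numpy as np
import paddle
import paddle.fluid as fluid

paddle.enable_static()

x = fluid.data(name='x', shape=[3, 4, 5, 6], dtype='float32')
out = paddle.slice(x, axes=[0, 1, 2], starts=[1, 0, 2], ends=[3, 3, 4])

exe = fluid.Executor(fluid.CPUPlace())
x_np = np.random.random([3, 4, 5, 6]).astype('float32')
res, = exe.run(feed={'x': x_np}, fetch_list=[out])
assert res.shape == (2, 3, 2, 6)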
...
@@ -81,10 +81,13 @@ class TestSaveLoadBF16(unittest.TestCase):
         y_data = np.arange(1, 13).reshape(4, 3).astype('int64')
         x_data = x_data.reshape((-1, num_steps, 1))
         y_data = y_data.reshape((-1, 1))
+        # TODO: investigate initializing the model with "float32" instead of "uint16" as it was
+        # before the slice_op PR (datatypes in the model graph differ from runtime datatypes because of that)
         init_hidden_data = np.zeros(
-            (num_layers, batch_size, hidden_size), dtype='float32')
+            (num_layers, batch_size, hidden_size), dtype='uint16')
         init_cell_data = np.zeros(
-            (num_layers, batch_size, hidden_size), dtype='float32')
+            (num_layers, batch_size, hidden_size), dtype='uint16')
         fetch_list = [static_loss, static_last_hidden, static_last_cell]
         out = exe.run(fluid.default_main_program(),
                       feed={
...