未验证 提交 39a5424e 编写于 作者: J Jacek Czaja 提交者: GitHub

[oneDNN] elementwise add bf16 grad kernel with broadcasting (#31385)

上级 5f621321
......@@ -277,7 +277,10 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
#ifdef PADDLE_WITH_MKLDNN
// If broadcasting is needed, use native implementation
auto CanMKLDNNElementwiseAddGradBeUsed = [&]() {
  auto dx_dims = ctx.Input<Tensor>("X")->dims();
  auto dy_dims = ctx.Input<Tensor>("Y")->dims();
  // No broadcast or broadcasting of data on inner dims is supported:
  // the oneDNN grad kernel handles equal shapes or broadcasting over the
  // outer dims only, so the innermost (last) dims must match.
  return (dx_dims[dx_dims.size() - 1] == dy_dims[dy_dims.size() - 1]);
};
if (this->CanMKLDNNBeUsed(ctx, input_data_type) &&
......
......@@ -64,14 +64,29 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
}
if (dy) {
auto reorder_dst_memory_p =
handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace());
auto reorder_p =
handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
platform::RecordEvent record_reorder("int_reorder",
platform::EventRole::kUniqueOp);
reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
astream.wait();
// Direct copy
if (dout->dims() == dy->dims()) {
auto reorder_dst_memory_p =
handler.AcquireDstMemory(dy, dout->format(), ctx.GetPlace());
auto reorder_p =
handler.AcquireReorder(reorder_dst_memory_p, reorder_src_memory_p);
platform::RecordEvent record_reorder("int_reorder",
platform::EventRole::kUniqueOp);
reorder_p->execute(astream, *reorder_src_memory_p,
*reorder_dst_memory_p);
astream.wait();
} else {
// Broadcasting
platform::ReductionMKLDNNHandler<T> handler_sum(
dnnl::algorithm::reduction_sum, 0.0f, 0.0f, dev_ctx, onednn_engine,
ctx.GetPlace(), dout, dy,
ctx.InputName(framework::GradVarName("Out")));
auto dy_memory_p = handler_sum.AcquireDstMemory(dy);
auto reduction_p = handler_sum.AcquireForwardPrimitive();
reduction_p->execute(astream, {{DNNL_ARG_SRC, *reorder_src_memory_p},
{DNNL_ARG_DST, *dy_memory_p}});
astream.wait();
}
}
}
};
......
......@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include <memory>
#include <sstream>
#include <string>
......@@ -621,6 +622,49 @@ class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::binary> {
}
};
// oneDNN handler that reduces tensor X into the (smaller) shape of tensor Y,
// e.g. summing dOut over broadcast axes in elementwise_add grad. Primitive
// descriptors are cached in dev_ctx under a key built from X's dims,
// uniq_name and the algorithm id.
template <typename T>
class ReductionMKLDNNHandler
    : public platform::MKLDNNHandlerT<T, dnnl::reduction> {
 public:
  // algo: oneDNN reduction algorithm (e.g. dnnl::algorithm::reduction_sum).
  // p, eps: algorithm-specific parameters forwarded to oneDNN (callers here
  //         pass 0.0f for reduction_sum).
  // x: source tensor (must be in MKLDNN layout with a defined format).
  // y: tensor whose dims define the reduction output shape.
  ReductionMKLDNNHandler(const dnnl::algorithm algo, const float p,
                         const float eps, const MKLDNNDeviceContext& dev_ctx,
                         const mkldnn::engine engine, platform::Place cpu_place,
                         const Tensor* x, const Tensor* y,
                         const std::string& uniq_name)
      : platform::MKLDNNHandlerT<T, dnnl::reduction>(
            dev_ctx, engine, cpu_place,
            platform::CreateKey(dev_ctx, framework::vectorize(x->dims()),
                                uniq_name,
                                (std::to_string(static_cast<int>(algo))))) {
    // Only build memory/primitive descriptors on a cache miss.
    if (!this->isCached()) {
      PADDLE_ENFORCE_EQ(
          x->layout(), DataLayout::kMKLDNN,
          platform::errors::InvalidArgument("Wrong layout set for X tensor."));
      PADDLE_ENFORCE_NE(
          x->format(), MKLDNNMemoryFormat::undef,
          platform::errors::InvalidArgument("Wrong format set for X tensor."));

      const auto src_tz = framework::vectorize(x->dims());
      const auto dst_tz = framework::vectorize(y->dims());

      // For oneDNN dimensionality should match so we need to
      // extend Y tensor dims with values of 1 (before and after pattern),
      // e.g. src [2,3,4,100] with dst [100] -> dst_ex [1,1,1,100].
      // NOTE(review): assumes y's dims appear, in order, as a subsequence of
      // x's dims; if every y dim is consumed before the loop ends, dst_tz[j]
      // would read past the end of dst_tz -- confirm callers guarantee the
      // trailing dims always match (the grad-op check above enforces only the
      // last dim).
      int j = 0;
      std::vector<int64_t> dst_tz_ex(src_tz.size(), 1);
      for (size_t i = 0; i < src_tz.size(); ++i) {
        dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++];
      }

      const auto src_md = dnnl::memory::desc(
          src_tz, platform::MKLDNNGetDataType<T>(), x->format());
      const auto dst_md = memory::desc(
          dst_tz_ex, platform::MKLDNNGetDataType<T>(), x->format());

      this->AcquireForwardPrimitiveDescriptor(algo, src_md, dst_md, p, eps);
    }
  }
};
template <typename T>
class ActivationMKLDNNHandler
: public MKLDNNHandlerT<T, mkldnn::eltwise_forward,
......
......@@ -45,13 +45,13 @@ class TestElementwiseAddBf16MklDNNOp(OpTest):
    def test_check_output(self):
        # Forward output is verified on CPU only (oneDNN is a CPU library).
        self.check_output_with_place(core.CPUPlace())
# elementwise_add grad is just passing upper gradients to either X or Y or both
# elementwise_add grad (no braodcasting) is just passing upper gradients to either X or Y or both
def test_check_grad_normal(self):
self.check_grad_with_place(
core.CPUPlace(), ["X", "Y"],
"Out",
check_dygraph=False,
user_defined_grads=[self.x_bf16, self.x_bf16],
user_defined_grads=[self.x, self.x],
user_defined_grad_outputs=[self.x_bf16])
def test_check_grad_ingore_x(self):
......@@ -59,7 +59,7 @@ class TestElementwiseAddBf16MklDNNOp(OpTest):
core.CPUPlace(), ["Y"],
"Out",
check_dygraph=False,
user_defined_grads=[self.y_bf16],
user_defined_grads=[self.y],
user_defined_grad_outputs=[self.y_bf16])
def test_check_grad_ingore_y(self):
......@@ -67,7 +67,40 @@ class TestElementwiseAddBf16MklDNNOp(OpTest):
core.CPUPlace(), ["X"],
"Out",
check_dygraph=False,
user_defined_grads=[self.x_bf16],
user_defined_grads=[self.x],
user_defined_grad_outputs=[self.x_bf16])
class TestElementwiseAddBroadCastingBf16MklDNNOp(
        TestElementwiseAddBf16MklDNNOp):
    """Broadcasting variant: Y ([100]) is broadcast over X ([2, 3, 4, 100])."""

    def generate_data(self):
        self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(np.float32)
        self.y = np.random.uniform(1, 2, [100]).astype(np.float32)
        self.out = np.add(self.x, self.y)

    # Compute partial sums along all axes but last one.
    def compute_reduced_gradients(self, out_grads):
        # dY for a broadcast Y is dOut summed over every axis except the
        # innermost one. Generalized to any rank (the original hard-coded
        # exactly three reduceat calls for 4-D input).
        part_sum = out_grads
        for axis in range(out_grads.ndim - 1):
            part_sum = np.add.reduceat(part_sum, [0], axis=axis)
        return part_sum.flatten()

    def test_check_grad_normal(self):
        # dX keeps dOut's full shape; dY is dOut reduced over broadcast axes.
        self.check_grad_with_place(
            core.CPUPlace(), ["X", "Y"],
            "Out",
            check_dygraph=False,
            user_defined_grads=[
                self.x, self.compute_reduced_gradients(self.x)
            ],
            user_defined_grad_outputs=[self.x_bf16])

    def test_check_grad_ingore_x(self):
        self.check_grad_with_place(
            core.CPUPlace(), ["Y"],
            "Out",
            check_dygraph=False,
            user_defined_grads=[self.compute_reduced_gradients(self.x)],
            user_defined_grad_outputs=[self.x_bf16])
......
......@@ -17,6 +17,7 @@ import unittest
import numpy as np
from paddle.fluid.tests.unittests.op_test import skip_check_grad_ci
from paddle.fluid.tests.unittests.test_elementwise_add_op import TestElementwiseAddOp
from paddle import enable_static
class TestMKLDNNElementwiseAddOp(TestElementwiseAddOp):
......@@ -51,13 +52,17 @@ class TestMKLDNNElementwiseAddOp4(TestMKLDNNElementwiseAddOp):
    # Gradient checks are deliberately disabled for this case
    # (presumably unsupported input configuration for the oneDNN grad
    # kernel -- TODO confirm against the op's dims).
    def test_check_grad_normal(self):
        pass

    def test_check_grad_ingore_x(self):
        pass

    def test_check_grad_ingore_y(self):
        pass
class TestMKLDNNElementwiseAddOp5(TestMKLDNNElementwiseAddOp):
    """Add with broadcasting: y ([100]) broadcasts over x ([2, 3, 4, 100])."""

    def init_input_output(self):
        # Inputs drawn from [1, 2) keep results comfortably away from zero.
        self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype)
        self.y = np.random.uniform(1, 2, [100]).astype(self.dtype)
        self.out = self.x + self.y
class TestMKLDNNElementwiseAddOp_broadcast_3(TestMKLDNNElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype)
......@@ -150,4 +155,5 @@ class TestUint8Scales(TestInt8Scales):
if __name__ == '__main__':
    enable_static()  # these op tests run under static-graph mode
    unittest.main()
......@@ -50,8 +50,9 @@ class TestReshapeBf16Op(OpTest):
self.infered_shape = (10, 2, 3, -1)
def init_input_data(self):
self.input_data = convert_float_to_uint16(
np.random.random(self.ori_shape).astype(np.float32))
self.input_data_fp32 = np.random.random(self.ori_shape).astype(
np.float32)
self.input_data = convert_float_to_uint16(self.input_data_fp32)
    def test_check_output(self):
        # NOTE(review): 'XShape' appears to be an auxiliary shape output of
        # reshape -- it is excluded from the output comparison.
        self.check_output_with_place(core.CPUPlace(), no_check_set=['XShape'])
......@@ -61,7 +62,7 @@ class TestReshapeBf16Op(OpTest):
core.CPUPlace(), ["X"],
"Out",
check_dygraph=False,
user_defined_grads=[self.inputs["X"]],
user_defined_grads=[self.input_data_fp32],
user_defined_grad_outputs=[
self.inputs["X"].reshape(self.infered_shape)
])
......
......@@ -1452,6 +1452,16 @@ class OpTest(unittest.TestCase):
        # Analytic gradients the framework computed for inputs_to_check.
        analytic_grads = self._get_gradient(inputs_to_check, place,
                                            output_names, no_grad_set,
                                            user_defined_grad_outputs)

        # comparison of bf16 results will happen as fp32
        # loop over list of grads and convert bf16 to fp32
        fp32_grads = []
        for grad in analytic_grads:
            if grad.dtype == np.uint16:
                # uint16 is the storage dtype for bf16; widen to fp32 and
                # loosen the tolerance to absorb bf16 rounding error.
                grad = convert_uint16_to_float(grad)
                max_relative_error = 0.03
            fp32_grads.append(grad)
        analytic_grads = fp32_grads

        self._assert_is_close(numeric_grads, analytic_grads, inputs_to_check,
                              max_relative_error,
                              "Gradient Check On %s" % str(place))
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册