未验证 提交 30c7758f 编写于 作者: Piotr Paturej 提交者: GitHub

Fix oneDNN elementwise_sub dnnl_error in unit test (#47237)

* Fix dnnl errors in elementwise_sub tests

* Fix model accuracy attempt

* Add new fix

* Add proper fix

* Refactor by removing code repetition
上级 818132a0
......@@ -34,16 +34,84 @@ inline std::vector<int64_t> CalculateBroadcastedDims(
const auto src_tz = phi::vectorize(x->dims());
const auto dst_tz = phi::vectorize(y->dims());
size_t j = 0;
std::vector<int64_t> dst_tz_ex(src_tz.size(), 1);
for (size_t i = 0; i < src_tz.size(); ++i) {
dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++];
if (j == dst_tz.size()) break;
if (src_tz.size() == dst_tz.size()) {
for (size_t i = 0; i < src_tz.size(); i++) {
dst_tz_ex[i] = (src_tz[i] == dst_tz[i]) ? dst_tz[i] : 1;
}
} else {
size_t j = 0;
for (size_t i = 0; i < src_tz.size(); i++) {
dst_tz_ex[i] = (src_tz[i] != dst_tz[j]) ? 1 : dst_tz[j++];
if (j == dst_tz.size()) break;
}
}
return dst_tz_ex;
}
// Handles the non-broadcast dX/dY path of elementwise add/sub backward:
// copies dout into the gradient tensor with a single oneDNN reorder,
// applying the given output scales (e.g. a sign flip for subtraction).
// NOTE(review): grad_tensor is not referenced in this body — presumably kept
// for signature symmetry with BroadcastReduction; confirm before removing.
inline void AddSubNonBroadcast(platform::ReorderMKLDNNHandler* reorder_handler,
                               phi::DenseTensor* grad_tensor,
                               const std::shared_ptr<dnnl::memory>& src_memory,
                               const std::shared_ptr<dnnl::memory>& dst_memory,
                               const std::vector<float>& scales) {
  // Attach the scale so the reorder multiplies while it copies.
  dnnl::primitive_attr attr;
  attr.set_output_scales(0, scales);

  auto reorder_prim =
      reorder_handler->AcquireReorder(dst_memory, src_memory, attr);

  // Profiler marker for the reorder, mirroring the other int_reorder sites.
  platform::RecordEvent record_reorder("int_reorder",
                                       platform::TracerEventType::UserDefined,
                                       2,
                                       platform::EventRole::kUniqueOp);

  auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
  reorder_prim->execute(astream, *src_memory, *dst_memory);
}
// Backward path for the broadcast case: dout has more/larger dims than the
// gradient tensor, so the gradient is produced by summing dout over the
// broadcast axes with an oneDNN reduction primitive.
// grad_tensor  - output gradient (dx or dy); also supplies the target dims.
// dout         - upstream gradient, the reduction source.
// src_memory   - oneDNN memory holding dout's data.
// dst_memory   - passed BY VALUE: the reassignment below is local only; the
//                caller's handle is not updated (callers re-read the result
//                through grad_tensor's mem_desc instead).
// scales       - output scales from the caller; scales[0] is applied as a
//                post-op only when is_sub is true (sign flip for dY of sub).
template <typename T>
inline void BroadcastReduction(const framework::ExecutionContext& ctx,
                               const dnnl::engine& onednn_engine,
                               phi::DenseTensor* grad_tensor,
                               const phi::DenseTensor* dout,
                               const std::shared_ptr<dnnl::memory>& src_memory,
                               std::shared_ptr<dnnl::memory> dst_memory,
                               const std::vector<float>& scales,
                               const bool is_sub) {
  dnnl::primitive_attr broadcast_reduction_attr;

  // Broadcasting
  // For subtraction, fold the scale into the reduction via a linear
  // post-op (out = scales[0] * out + 0) instead of a separate reorder.
  if (is_sub) {
    dnnl::post_ops po;
    po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, scales[0], 0);
    broadcast_reduction_attr.set_post_ops(po);
  }

  // reduction_sum over the axes where grad_tensor's dims were broadcast;
  // CalculateBroadcastedDims maps grad_tensor->dims() onto dout's rank.
  platform::ReductionMKLDNNHandler<T> reduction_handler(
      dnnl::algorithm::reduction_sum,
      0.0f,
      0.0f,
      onednn_engine,
      ctx.GetPlace(),
      dout,
      grad_tensor,
      CalculateBroadcastedDims(dout, grad_tensor),
      broadcast_reduction_attr);
  dst_memory = reduction_handler.AcquireDstMemory(grad_tensor);
  auto reduction_p = reduction_handler.AcquireForwardPrimitive();
  auto astream = platform::MKLDNNDeviceContext::tls().get_stream();
  reduction_p->execute(astream,
                       {
                           {DNNL_ARG_SRC, *src_memory},
                           {DNNL_ARG_DST, *dst_memory},
                       });
  astream.wait();
  // The reduction emits dout-rank dims; reshape the descriptor back to the
  // gradient tensor's own dims so downstream kernels see the right shape.
  grad_tensor->set_mem_desc(dst_memory->get_desc().reshape(
      phi::vectorize<int64_t>(grad_tensor->dims())));
}
template <typename T, dnnl::algorithm BINARY_OP>
class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
private:
......@@ -155,7 +223,6 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel<T> {
auto* dx = ctx.Output<phi::DenseTensor>(framework::GradVarName("X"));
auto* dy = ctx.Output<phi::DenseTensor>(framework::GradVarName("Y"));
auto* dout = ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"));
VLOG(4) << "element sub: dx " << dx << " dy " << dy << " dout " << dout;
// oneDNN's binary is optimized for broadcasting y into x, so in other case
// we have to swap tensors to achieve optimal performance
......@@ -182,31 +249,23 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel<T> {
framework::ToMKLDNNDataType(proto_type_dout),
onednn_engine);
auto reorder_src_memory_p = reorder_handler.AcquireSrcMemory(
auto reorder_src_memory = reorder_handler.AcquireSrcMemory(
dout->mem_desc(), platform::to_void_cast(dout->data<T>()));
std::shared_ptr<dnnl::memory> dst_memory;
std::shared_ptr<dnnl::memory> broadcast_src_memory = reorder_src_memory;
auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
if (dx) {
std::shared_ptr<dnnl::memory> dst_memory;
// elementwise_add & elementwise_sub
if (BINARY_OP == dnnl::algorithm::binary_add ||
BINARY_OP == dnnl::algorithm::binary_sub) {
dst_memory = reorder_handler.AcquireDstMemory(
dx, dout->mem_desc(), ctx.GetPlace());
dnnl::primitive_attr reorder_attr;
reorder_attr.set_output_scales(0, scales);
auto reorder_p = reorder_handler.AcquireReorder(
dst_memory, reorder_src_memory_p, reorder_attr);
platform::RecordEvent record_reorder(
"int_reorder",
platform::TracerEventType::UserDefined,
2,
platform::EventRole::kUniqueOp);
reorder_p->execute(astream, *reorder_src_memory_p, *dst_memory);
if (dout->dims() == dx->dims()) {
dst_memory = reorder_handler.AcquireDstMemory(
dx, dout->mem_desc(), ctx.GetPlace());
AddSubNonBroadcast(
&reorder_handler, dx, reorder_src_memory, dst_memory, scales);
}
} else { // elementwise_mul & elementwise_div
platform::BinaryMKLDNNHandler<T> binary_handler(BINARY_OP,
axis,
......@@ -235,37 +294,29 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel<T> {
}
astream.wait();
dx->set_mem_desc(dst_memory->get_desc());
if (dout->dims() != dx->dims()) {
BroadcastReduction<T>(ctx,
onednn_engine,
dx,
dout,
broadcast_src_memory,
dst_memory,
scales,
BINARY_OP == dnnl::algorithm::binary_sub);
} else {
dx->set_mem_desc(dst_memory->get_desc());
}
}
if (dy) {
dnnl::primitive_attr broadcast_reduction_attr;
std::shared_ptr<dnnl::memory> broadcast_src_memory;
std::shared_ptr<dnnl::memory> dst_memory;
// elementwise_add & elementwise_sub
if (BINARY_OP == dnnl::algorithm::binary_add ||
BINARY_OP == dnnl::algorithm::binary_sub) {
if (dout->dims() == dy->dims()) {
auto reorder_dst_memory_p = reorder_handler.AcquireDstMemory(
dst_memory = reorder_handler.AcquireDstMemory(
dy, dout->mem_desc(), ctx.GetPlace());
dnnl::primitive_attr reorder_attr;
reorder_attr.set_output_scales(0, scales);
auto reorder_p = reorder_handler.AcquireReorder(
reorder_dst_memory_p, reorder_src_memory_p, reorder_attr);
platform::RecordEvent record_reorder(
"int_reorder",
platform::TracerEventType::UserDefined,
2,
platform::EventRole::kUniqueOp);
reorder_p->execute(
astream, *reorder_src_memory_p, *reorder_dst_memory_p);
dst_memory = reorder_dst_memory_p;
} else {
broadcast_src_memory = reorder_src_memory_p;
AddSubNonBroadcast(
&reorder_handler, dy, reorder_src_memory, dst_memory, scales);
}
} else { // elementwise_mul & elementwise_div
std::unordered_map<int, dnnl::memory> args;
......@@ -348,36 +399,14 @@ class EltwiseMKLDNNGradKernel : public ElemwiseGradKernel<T> {
astream.wait();
if (dout->dims() != dy->dims()) {
// Broadcasting
if (BINARY_OP == dnnl::algorithm::binary_sub) {
dnnl::post_ops po;
po.append_eltwise(
1.0f, dnnl::algorithm::eltwise_linear, scales[0], 0);
broadcast_reduction_attr.set_post_ops(po);
}
platform::ReductionMKLDNNHandler<T> reduction_handler(
dnnl::algorithm::reduction_sum,
0.0f,
0.0f,
onednn_engine,
ctx.GetPlace(),
dout,
dy,
CalculateBroadcastedDims(dout, dy),
broadcast_reduction_attr);
dst_memory = reduction_handler.AcquireDstMemory(dy);
auto reduction_p = reduction_handler.AcquireForwardPrimitive();
reduction_p->execute(astream,
{
{DNNL_ARG_SRC, *broadcast_src_memory},
{DNNL_ARG_DST, *dst_memory},
});
astream.wait();
dy->set_mem_desc(dst_memory->get_desc().reshape(
phi::vectorize<int64_t>(dy->dims())));
BroadcastReduction<T>(ctx,
onednn_engine,
dy,
dout,
broadcast_src_memory,
dst_memory,
scales,
BINARY_OP == dnnl::algorithm::binary_sub);
} else {
dy->set_mem_desc(dst_memory->get_desc());
}
......
......@@ -90,26 +90,24 @@ class TestMKLDNNElementwiseSubOp4(TestMKLDNNElementwiseSubOp):
self.out = np.subtract(self.x, self.y)
class TestMKLDNNElementwiseSubOp40(TestMKLDNNElementwiseSubOp):
class TestMKLDNNElementwiseSubOp5(TestMKLDNNElementwiseSubOp):
def init_input_output(self):
self.x = np.random.uniform(0.1, 2, [180, 1]).astype(self.dtype)
self.y = np.random.uniform(0.1, 1, [1, 256]).astype(self.dtype)
self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype)
self.y = np.random.uniform(1, 2, [100]).astype(self.dtype)
self.out = np.subtract(self.x, self.y)
def test_check_grad_normal(self):
self.check_grad(['X', 'Y'], 'Out')
def test_check_grad_ignore_x(self):
self.check_grad(['Y'], 'Out', no_grad_set=set("X"))
def test_check_grad_ignore_y(self):
self.check_grad(['X'], 'Out', no_grad_set=set('Y'))
class TestMKLDNNElementwiseSubOp6(TestMKLDNNElementwiseSubOp):
def init_input_output(self):
self.x = np.random.uniform(0.1, 2, [180, 1]).astype(self.dtype)
self.y = np.random.uniform(0.1, 1, [1, 256]).astype(self.dtype)
self.out = np.subtract(self.x, self.y)
class TestMKLDNNElementwiseSubOp5(TestMKLDNNElementwiseSubOp):
class TestMKLDNNElementwiseSubOp7(TestMKLDNNElementwiseSubOp):
def init_input_output(self):
self.x = np.random.uniform(1, 2, [2, 3, 4, 100]).astype(self.dtype)
self.y = np.random.uniform(1, 2, [100]).astype(self.dtype)
self.x = np.random.uniform(0.1, 2, [1, 180]).astype(self.dtype)
self.y = np.random.uniform(0.1, 1, [256, 1]).astype(self.dtype)
self.out = np.subtract(self.x, self.y)
......@@ -132,15 +130,6 @@ class TestElementwiseSubOp_xsize_lessthan_ysize_sub(TestMKLDNNElementwiseSubOp):
def init_axis(self):
self.axis = 2
def test_check_grad_normal(self):
pass
def test_check_grad_ignore_y(self):
pass
def test_check_grad_ignore_x(self):
pass
@OpTestTool.skip_if_not_cpu_bf16()
class TestBf16(TestMKLDNNElementwiseSubOp):
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册