diff --git a/paddle/fluid/operators/elementwise_add_op.h b/paddle/fluid/operators/elementwise_add_op.h
index 9e58cff01d3ccad482c7482644b25b0d5db518d4..0b19723720171a857c946880c246e2247a0023a7 100644
--- a/paddle/fluid/operators/elementwise_add_op.h
+++ b/paddle/fluid/operators/elementwise_add_op.h
@@ -145,13 +145,22 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> {
     auto* dx = ctx.Output<Tensor>(framework::GradVarName("X"));
     auto* dy = ctx.Output<Tensor>(framework::GradVarName("Y"));
 
-    if (dx != nullptr) dx->ShareDataWith(*dout);
+    if (dx != nullptr) {
+      // In fact, we can just share memory, but it may cause a bug of memory
+      // optimizer
+      // dx->ShareDataWith(*dout);
+      framework::TensorCopy(*dout, ctx.GetPlace(),
+                            ctx.template device_context<DeviceContext>(), dx);
+    }
+
     if (dy == nullptr) return;
 
     const framework::DDim& x_dim = dout->dims();
     framework::DDim y_dim = dy->dims();
     if (x_dim == y_dim) {
-      dy->ShareDataWith(*dout);
+      // dy->ShareDataWith(*dout);
+      framework::TensorCopy(*dout, ctx.GetPlace(),
+                            ctx.template device_context<DeviceContext>(), dy);
     } else {
       dy->mutable_data<T>(ctx.GetPlace());
       // Perform reduction to dout to calculate dy
@@ -160,15 +169,15 @@ class ElementwiseAddGradKernel : public framework::OpKernel<T> {
       y_dim = trim_trailing_singular_dims(y_dim);
       axis = (y_dim.size() == 0) ? x_dim.size() : axis;
 
-      auto* device =
-          ctx.template device_context<DeviceContext>().eigen_device();
+      auto& device =
+          *(ctx.template device_context<DeviceContext>().eigen_device());
       int pre, n, post;
       get_mid_dims(x_dim, y_dim, axis, &pre, &n, &post);
       auto eigen_dout = framework::EigenTensor<T, 3>::From(
           *dout, framework::make_ddim({pre, n, post}));
       auto eigen_dy =
           framework::EigenTensor<T, 1>::From(*dy, framework::make_ddim({n}));
-      eigen_dy.device(*device) = eigen_dout.sum(
+      eigen_dy.device(device) = eigen_dout.sum(
           framework::EigenDim<2>::From(framework::make_ddim({0, 2})));
     }
   }
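
For context (not part of the patch): ShareDataWith makes the gradient tensor alias dout's buffer, whereas framework::TensorCopy gives the gradient its own storage. If a memory-optimization pass later decides dout is dead and recycles its buffer, an aliased gradient is silently corrupted, which is the bug the in-code comment refers to. The snippet below is a minimal, self-contained sketch of that hazard; the Tensor struct and the two helper functions are simplified stand-ins for the framework types, not the real Paddle API.

// Sketch only: models aliasing (ShareDataWith) vs. deep copy (TensorCopy)
// and why a buffer-reusing memory optimizer breaks the aliased case.
#include <cassert>
#include <memory>
#include <vector>

struct Tensor {
  std::shared_ptr<std::vector<float>> buf;
};

// Aliasing: dst sees every later write to src's buffer.
void ShareDataWith(const Tensor& src, Tensor* dst) { dst->buf = src.buf; }

// Deep copy: dst owns its own storage; later writes to src do not affect it.
void TensorCopy(const Tensor& src, Tensor* dst) {
  dst->buf = std::make_shared<std::vector<float>>(*src.buf);
}

int main() {
  Tensor dout;
  dout.buf = std::make_shared<std::vector<float>>(
      std::vector<float>{1.f, 2.f, 3.f});

  Tensor dx_shared, dx_copied;
  ShareDataWith(dout, &dx_shared);  // old behaviour of the kernel
  TensorCopy(dout, &dx_copied);     // behaviour after this patch

  // A memory-optimization pass treats dout as dead here and recycles its
  // buffer for an unrelated variable, overwriting the contents.
  (*dout.buf)[0] = -42.f;

  assert((*dx_shared.buf)[0] == -42.f);  // aliased gradient is corrupted
  assert((*dx_copied.buf)[0] == 1.f);    // copied gradient stays correct
  return 0;
}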