Unverified commit 0ee76f92, authored by Chen Weihang, committed by GitHub

add double grad op example (#40963)

Parent b94cf842
...@@ -192,6 +192,7 @@ OpMetaInfoBuilder::OpMetaInfoBuilder(std::string&& name, size_t index) {
      break;
    case 2:
      name_ = name_ + "_grad_grad";
      break;
    default:
      PADDLE_THROW(phi::errors::InvalidArgument(
          "Not support index `%d` when construct OpMetaInfoBuilder, "
...
...@@ -17,6 +17,9 @@
#include "paddle/extension.h"
#define CHECK_CPU_INPUT(x) \
  PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.")
template <typename data_t>
void relu_cpu_forward_kernel(const data_t* x_data,
                             data_t* out_data,
...@@ -39,6 +42,17 @@ void relu_cpu_backward_kernel(const data_t* grad_out_data,
  }
}
template <typename data_t>
void relu_cpu_double_backward_kernel(const data_t* out_data,
                                     const data_t* ddx_data,
                                     data_t* ddout_data,
                                     int64_t ddout_numel) {
  for (int64_t i = 0; i < ddout_numel; ++i) {
    ddout_data[i] =
        ddx_data[i] * (out_data[i] > static_cast<data_t>(0) ? 1. : 0.);
  }
}
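The kernel above is the whole second-order story for ReLU: the first-order rule is grad_x = grad_out * 1[out > 0], and because the mask (out > 0) does not depend on the gradients, differentiating again simply reapplies it, giving ddout = ddx * 1[out > 0]. A minimal NumPy sketch of the same formula (illustrative only, not part of this commit):

import numpy as np

def relu_double_backward_ref(out, ddx):
    # Same rule as relu_cpu_double_backward_kernel: reuse the forward
    # activation mask (out > 0) and apply it to the incoming ddx.
    return ddx * (out > 0).astype(ddx.dtype)

x = np.random.uniform(-1, 1, 8).astype('float32')
out = np.maximum(x, 0.0)                    # forward: out = relu(x)
ddx = np.random.randn(8).astype('float32')  # incoming grad of grad_x
print(relu_double_backward_ref(out, ddx))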
std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
  auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape());
...@@ -67,10 +81,31 @@ std::vector<paddle::Tensor> relu_cpu_backward(const paddle::Tensor& x,
  return {grad_x};
}
std::vector<paddle::Tensor> relu_cpu_double_backward(
    const paddle::Tensor& out, const paddle::Tensor& ddx) {
  CHECK_CPU_INPUT(out);
  CHECK_CPU_INPUT(ddx);
  auto ddout = paddle::Tensor(paddle::PlaceType::kCPU, out.shape());
  PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_double_backward", ([&] {
                               relu_cpu_double_backward_kernel<data_t>(
                                   out.data<data_t>(),
                                   ddx.data<data_t>(),
                                   ddout.mutable_data<data_t>(out.place()),
                                   ddout.size());
                             }));
  std::cout << "Debug info: run relu cpu double backward success." << std::endl;
  return {ddout};
}
std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x);
std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& x,
                                               const paddle::Tensor& out,
                                               const paddle::Tensor& grad_out);
std::vector<paddle::Tensor> relu_cuda_double_backward(
    const paddle::Tensor& out, const paddle::Tensor& ddx);
std::vector<paddle::Tensor> ReluForward(const paddle::Tensor& x) {
  // TODO(chenweihang): Check Input
...@@ -96,6 +131,23 @@ std::vector<paddle::Tensor> ReluBackward(const paddle::Tensor& x,
  }
}
std::vector<paddle::Tensor> ReluDoubleBackward(const paddle::Tensor& out,
                                               const paddle::Tensor& ddx) {
  if (out.place() == paddle::PlaceType::kCPU) {
    return relu_cpu_double_backward(out, ddx);
  } else if (out.place() == paddle::PlaceType::kGPU) {
    return relu_cuda_double_backward(out, ddx);
  } else {
    PD_THROW("Not implemented.");
  }
}
std::vector<std::vector<int64_t>> ReluDoubleBackwardInferShape(
    const std::vector<int64_t>& out_shape,
    const std::vector<int64_t>& ddx_shape) {
  return {out_shape};
}
PD_BUILD_OP(custom_relu)
    .Inputs({"X"})
    .Outputs({"Out"})
...@@ -106,6 +158,12 @@ PD_BUILD_GRAD_OP(custom_relu)
    .Outputs({paddle::Grad("X")})
    .SetKernelFn(PD_KERNEL(ReluBackward));
PD_BUILD_DOUBLE_GRAD_OP(custom_relu)
    .Inputs({"Out", paddle::Grad(paddle::Grad("X"))})
    .Outputs({paddle::Grad(paddle::Grad("Out"))})
    .SetKernelFn(PD_KERNEL(ReluDoubleBackward))
    .SetInferShapeFn(PD_INFER_SHAPE(ReluDoubleBackwardInferShape));
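With this registration, the framework resolves the double-grad kernel under the name custom_relu_grad_grad (the `_grad_grad` suffix appended by OpMetaInfoBuilder above) whenever a second-order graph is built. A minimal usage sketch, assuming the built extension is importable as a module named custom_relu_module (the module name is an assumption; the paddle API calls are standard):

import numpy as np
import paddle
from custom_relu_module import custom_relu  # hypothetical module name

x = paddle.to_tensor(
    np.random.uniform(-1, 1, [4, 8]).astype('float32'), stop_gradient=False)
out = custom_relu(x)
# create_graph=True records the backward pass so it can itself be
# differentiated; the subsequent backward() then runs ReluDoubleBackward.
dx = paddle.grad(outputs=[out], inputs=[x], create_graph=True,
                 retain_graph=True)
dx[0].backward()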
std::vector<paddle::Tensor> relu_cpu_backward_without_x(
    const paddle::Tensor& out, const paddle::Tensor& grad_out) {
  auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, out.shape());
...
...@@ -14,6 +14,9 @@
#include "paddle/extension.h"
#define CHECK_GPU_INPUT(x) \
  PD_CHECK(x.place() == paddle::PlaceType::kGPU, #x " must be a GPU Tensor.")
template <typename data_t>
__global__ void relu_cuda_forward_kernel(const data_t* x,
                                         data_t* y,
...@@ -36,6 +39,19 @@ __global__ void relu_cuda_backward_kernel(const data_t* dy,
  }
}
template <typename data_t>
__global__ void relu_cuda_double_backward_kernel(const data_t* out_data,
                                                 const data_t* ddx_data,
                                                 data_t* ddout_data,
                                                 int64_t num) {
  int64_t gid = blockIdx.x * blockDim.x + threadIdx.x;
  // Grid-stride loop starting at this thread's global index; the original
  // `i = num` initializer would never enter the loop.
  for (int64_t i = gid; i < num; i += blockDim.x * gridDim.x) {
    ddout_data[i] = ddx_data[i] * (out_data[i] > static_cast<data_t>(0.)
                                       ? static_cast<data_t>(1.)
                                       : static_cast<data_t>(0.));
  }
}
std::vector<paddle::Tensor> relu_cuda_forward(const paddle::Tensor& x) {
  auto out = paddle::Tensor(paddle::PlaceType::kGPU, x.shape());
...@@ -71,6 +87,30 @@ std::vector<paddle::Tensor> relu_cuda_backward(const paddle::Tensor& x,
  return {grad_x};
}
std::vector<paddle::Tensor> relu_cuda_double_backward(
    const paddle::Tensor& out, const paddle::Tensor& ddx) {
  CHECK_GPU_INPUT(out);
  CHECK_GPU_INPUT(ddx);
  auto ddout = paddle::Tensor(paddle::PlaceType::kGPU, out.shape());
  int64_t numel = out.size();
  int64_t block = 512;
  int64_t grid = (numel + block - 1) / block;
  PD_DISPATCH_FLOATING_AND_HALF_TYPES(
      out.type(), "relu_cuda_double_backward_kernel", ([&] {
        relu_cuda_double_backward_kernel<data_t><<<grid, block, 0,
                                                   out.stream()>>>(
            out.data<data_t>(),
            ddx.data<data_t>(),
            ddout.mutable_data<data_t>(out.place()),
            numel);
      }));
  std::cout << "Debug info: run relu gpu double backward success." << std::endl;
  return {ddout};
}
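The launch configuration gives one thread per element, with the grid-stride loop in the kernel absorbing any remainder. A quick arithmetic check using the [4, 8] tensors from the test below (values for illustration only):

numel, block = 4 * 8, 512
grid = (numel + block - 1) // block  # ceil division -> 1 block suffices
print(grid)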
std::vector<paddle::Tensor> relu_cuda_backward_without_x(
    const paddle::Tensor& out, const paddle::Tensor& grad_out) {
  auto grad_x = paddle::Tensor(paddle::PlaceType::kGPU, out.shape());
...
...@@ -31,4 +31,5 @@ setup(
    ext_modules=Extension(  # test for not specific name here.
        sources=sources,  # test for multi ops
        include_dirs=paddle_includes,
        extra_compile_args=extra_compile_args,
        verbose=True))
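The added verbose=True asks the build to print its compilation log. As an alternative to this setup.py-based install, the same sources can be JIT-compiled with paddle.utils.cpp_extension.load; a minimal sketch (module name and source paths are assumptions):

from paddle.utils.cpp_extension import load

# JIT-compiles the C++/CUDA sources and returns a module exposing the
# registered ops; verbose=True prints the build log, like the flag above.
custom_ops = load(
    name='custom_relu_jit',  # assumed module name
    sources=['custom_relu_op.cc', 'custom_relu_op.cu'],  # assumed paths
    verbose=True)
# custom_ops.custom_relu(x) is then usable like any other paddle op.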
...@@ -138,6 +138,23 @@ def custom_relu_static_inference(func, device, np_data, np_label, path_prefix):
    return predict_v

def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True):
    paddle.set_device(device)
    t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False)
    out = func(t) if use_func else paddle.nn.functional.relu(t)
    out.stop_gradient = False
    dx = paddle.grad(
        outputs=[out], inputs=[t], create_graph=True, retain_graph=True)
    dx[0].backward()
    assert dx[0].grad is not None
    return dx[0].numpy(), dx[0].grad.numpy()
class TestNewCustomOpSetUpInstall(unittest.TestCase):
    def setUp(self):
        cur_dir = os.path.dirname(os.path.abspath(__file__))
...@@ -293,6 +310,25 @@ class TestNewCustomOpSetUpInstall(unittest.TestCase):
                    predict, predict_infer))
        paddle.disable_static()

    def test_func_double_grad_dynamic(self):
        for device in self.devices:
            for dtype in self.dtypes:
                if device == 'cpu' and dtype == 'float16':
                    continue
                x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
                out, dx_grad = custom_relu_double_grad_dynamic(
                    self.custom_ops[0], device, dtype, x)
                pd_out, pd_dx_grad = custom_relu_double_grad_dynamic(
                    self.custom_ops[0], device, dtype, x, False)
                self.assertTrue(
                    np.array_equal(out, pd_out),
                    "custom op out: {},\n paddle api out: {}".format(out,
                                                                     pd_out))
                self.assertTrue(
                    np.array_equal(dx_grad, pd_dx_grad),
                    "custom op dx grad: {},\n paddle api dx grad: {}".format(
                        dx_grad, pd_dx_grad))
if __name__ == '__main__':
    unittest.main()