diff --git a/paddle/phi/api/ext/tensor_compat.h b/paddle/phi/api/ext/tensor_compat.h
index 7233744c65c3fd482810608cb04b6be5092e7f7b..17c0dd3f8732dde96d371d99bc8798692146a3f3 100644
--- a/paddle/phi/api/ext/tensor_compat.h
+++ b/paddle/phi/api/ext/tensor_compat.h
@@ -19,7 +19,7 @@ limitations under the License. */
 
 // Note(chenweihang): In order to be compatible with the original custom
 // operator Tensor interface, only available to external users, the file
-// cannot be includeed in paddle
+// cannot be included in paddle
 
 namespace paddle {
 using Tensor = experimental::Tensor;
diff --git a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt
index 1dd6ef6776750c01fa78b6e6a269fea0df63f33d..00eef2d5a77316dcb3918ff32dde55b4fe9a1c73 100644
--- a/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt
+++ b/python/paddle/fluid/tests/custom_runtime/CMakeLists.txt
@@ -28,4 +28,5 @@ if(WITH_CUSTOM_DEVICE AND NOT WITH_GPU)
   set_tests_properties(test_custom_cpu_profiler_plugin PROPERTIES TIMEOUT 120)
   set_tests_properties(test_fleet_launch_custom_device PROPERTIES TIMEOUT 120)
   set_tests_properties(test_custom_cpu_to_static PROPERTIES TIMEOUT 120)
+  set_tests_properties(test_custom_device_relu_setup PROPERTIES TIMEOUT 120)
 endif()
diff --git a/python/paddle/fluid/tests/custom_runtime/custom_relu_op.cc b/python/paddle/fluid/tests/custom_runtime/custom_relu_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..da0563ffeb10e3762dc874676ffc9402d0529bc7
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_runtime/custom_relu_op.cc
@@ -0,0 +1,193 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <algorithm>
+#include <vector>
+
+#include "paddle/extension.h"
+
+#define CHECK_CPU_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.")
+#define CHECK_CUSTOM_INPUT(x) \
+  PD_CHECK(x.is_custom_device(), #x " must be a custom Tensor.")
+
+template <typename data_t>
+void relu_cpu_forward_kernel(const data_t* x_data,
+                             data_t* out_data,
+                             int64_t x_numel) {
+  PD_CHECK(x_data != nullptr, "x_data is nullptr.");
+  PD_CHECK(out_data != nullptr, "out_data is nullptr.");
+  for (int64_t i = 0; i < x_numel; ++i) {
+    out_data[i] = std::max(static_cast<data_t>(0.), x_data[i]);
+  }
+}
+
+template <typename data_t>
+void relu_cpu_backward_kernel(const data_t* grad_out_data,
+                              const data_t* out_data,
+                              data_t* grad_x_data,
+                              int64_t out_numel) {
+  for (int64_t i = 0; i < out_numel; ++i) {
+    grad_x_data[i] =
+        grad_out_data[i] * (out_data[i] > static_cast<data_t>(0) ? 1. : 0.);
+  }
+}
+
+template <typename data_t>
+void relu_cpu_double_backward_kernel(const data_t* out_data,
+                                     const data_t* ddx_data,
+                                     data_t* ddout_data,
+                                     int64_t ddout_numel) {
+  for (int64_t i = 0; i < ddout_numel; ++i) {
+    ddout_data[i] =
+        ddx_data[i] * (out_data[i] > static_cast<data_t>(0) ? 1. : 0.);
+  }
+}
+
+std::vector<paddle::Tensor> relu_cpu_forward(const paddle::Tensor& x) {
+  CHECK_CPU_INPUT(x);
+  auto out = paddle::empty_like(x);
+
+  PD_DISPATCH_FLOATING_TYPES(
+      x.type(), "relu_cpu_forward", ([&] {
+        relu_cpu_forward_kernel<data_t>(
+            x.data<data_t>(), out.data<data_t>(), x.numel());
+      }));
+
+  return {out};
+}
+
+std::vector<paddle::Tensor> relu_cpu_backward(const paddle::Tensor& x,
+                                              const paddle::Tensor& out,
+                                              const paddle::Tensor& grad_out) {
+  auto grad_x = paddle::empty_like(x);
+
+  PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] {
+                               relu_cpu_backward_kernel<data_t>(
+                                   grad_out.data<data_t>(),
+                                   out.data<data_t>(),
+                                   grad_x.data<data_t>(),
+                                   out.size());
+                             }));
+
+  return {grad_x};
+}
+
+std::vector<paddle::Tensor> relu_cpu_double_backward(
+    const paddle::Tensor& out, const paddle::Tensor& ddx) {
+  CHECK_CPU_INPUT(out);
+  CHECK_CPU_INPUT(ddx);
+  auto ddout = paddle::empty(out.shape(), out.dtype(), out.place());
+
+  PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_double_backward", ([&] {
+                               relu_cpu_double_backward_kernel<data_t>(
+                                   out.data<data_t>(),
+                                   ddx.data<data_t>(),
+                                   ddout.mutable_data<data_t>(out.place()),
+                                   ddout.size());
+                             }));
+
+  return {ddout};
+}
+
+std::vector<paddle::Tensor> relu_custom_forward(const paddle::Tensor& x) {
+  CHECK_CUSTOM_INPUT(x);
+  auto out = paddle::relu(x);
+  return {out};
+}
+
+std::vector<paddle::Tensor> relu_custom_backward(
+    const paddle::Tensor& x,
+    const paddle::Tensor& out,
+    const paddle::Tensor& grad_out) {
+  CHECK_CUSTOM_INPUT(x);
+  CHECK_CUSTOM_INPUT(out);
+  auto grad_x = paddle::empty_like(x, x.dtype(), x.place());
+  auto ones = paddle::experimental::full_like(x, 1.0, x.dtype(), x.place());
+  auto zeros = paddle::experimental::full_like(x, 0.0, x.dtype(), x.place());
+  auto condition = paddle::experimental::greater_than(x, zeros);
+
+  grad_x = paddle::multiply(grad_out, paddle::where(condition, ones, zeros));
+
+  return {grad_x};
+}
+
+std::vector<paddle::Tensor> relu_custom_double_backward(
+    const paddle::Tensor& out, const paddle::Tensor& ddx) {
+  CHECK_CUSTOM_INPUT(out);
+  auto ddout = paddle::empty(out.shape(), out.dtype(), out.place());
+  auto ones =
+      paddle::experimental::full_like(out, 1.0, out.dtype(), out.place());
+  auto zeros =
+      paddle::experimental::full_like(out, 0.0, out.dtype(), out.place());
+  auto condition = paddle::experimental::greater_than(out, zeros);
+
+  ddout = paddle::multiply(ddx, paddle::where(condition, ones, zeros));
+
+  return {ddout};
+}
+
+std::vector<paddle::Tensor> ReluForward(const paddle::Tensor& x) {
+  if (x.is_cpu()) {
+    return relu_cpu_forward(x);
+  } else if (x.is_custom_device()) {
+    return relu_custom_forward(x);
+  } else {
+    PD_THROW("Not implemented.");
+  }
+}
+
+std::vector<paddle::Tensor> ReluBackward(const paddle::Tensor& x,
+                                         const paddle::Tensor& out,
+                                         const paddle::Tensor& grad_out) {
+  if (x.is_cpu()) {
+    return relu_cpu_backward(x, out, grad_out);
+  } else if (x.is_custom_device()) {
+    return relu_custom_backward(x, out, grad_out);
+  } else {
+    PD_THROW("Not implemented.");
+  }
+}
+
+std::vector<paddle::Tensor> ReluDoubleBackward(const paddle::Tensor& out,
+                                               const paddle::Tensor& ddx) {
+  if (out.is_cpu()) {
+    return relu_cpu_double_backward(out, ddx);
+  } else if (out.is_custom_device()) {
+    return relu_custom_double_backward(out, ddx);
+  } else {
+    PD_THROW("Not implemented.");
+  }
+}
+
+std::vector<std::vector<int64_t>> ReluDoubleBackwardInferShape(
+    const std::vector<int64_t>& out_shape,
+    const std::vector<int64_t>& ddx_shape) {
+  return {out_shape};
+}
+
+PD_BUILD_OP(custom_relu)
+    .Inputs({"X"})
+    .Outputs({"Out"})
+    .SetKernelFn(PD_KERNEL(ReluForward));
+
+PD_BUILD_GRAD_OP(custom_relu)
+    .Inputs({"X", "Out", paddle::Grad("Out")})
+    .Outputs({paddle::Grad("X")})
+    .SetKernelFn(PD_KERNEL(ReluBackward));
+
+PD_BUILD_DOUBLE_GRAD_OP(custom_relu)
+    .Inputs({"Out", paddle::Grad(paddle::Grad("X"))})
+    .Outputs({paddle::Grad(paddle::Grad("Out"))})
+    .SetKernelFn(PD_KERNEL(ReluDoubleBackward))
+    .SetInferShapeFn(PD_INFER_SHAPE(ReluDoubleBackwardInferShape));
diff --git a/python/paddle/fluid/tests/custom_runtime/test_custom_device_relu_setup.py b/python/paddle/fluid/tests/custom_runtime/test_custom_device_relu_setup.py
new file mode 100644
index 0000000000000000000000000000000000000000..760ad56cc3380e4d5b53fd65e07638e14d5859f5
--- /dev/null
+++ b/python/paddle/fluid/tests/custom_runtime/test_custom_device_relu_setup.py
@@ -0,0 +1,325 @@
+# Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import tempfile
+import unittest
+from site import getsitepackages
+
+import numpy as np
+
+
+def custom_relu_dynamic(func, device, dtype, np_x, use_func=True):
+    import paddle
+
+    paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
+    paddle.set_device(device)
+
+    t = paddle.to_tensor(np_x, dtype=dtype)
+    t.stop_gradient = False
+    sys.stdout.flush()
+
+    out = func(t) if use_func else paddle.nn.functional.relu(t)
+    out.stop_gradient = False
+
+    out.backward()
+
+    paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
+    if t.grad is None:
+        return out.numpy(), t.grad
+    else:
+        return out.numpy(), t.grad.numpy()
+
+
+def custom_relu_static(func, device, dtype, np_x, use_func=True):
+    import paddle
+    import paddle.static as static
+
+    paddle.enable_static()
+    paddle.set_device(device)
+
+    with static.scope_guard(static.Scope()):
+        with static.program_guard(static.Program()):
+            x = static.data(name="X", shape=[None, 8], dtype=dtype)
+            x.stop_gradient = False
+            out = func(x) if use_func else paddle.nn.functional.relu(x)
+            static.append_backward(out)
+
+            exe = static.Executor()
+            exe.run(static.default_startup_program())
+            # in static graph mode, x's data has been overwritten by out
+            out_v = exe.run(
+                static.default_main_program(),
+                feed={"X": np_x},
+                fetch_list=[out.name],
+            )
+
+    paddle.disable_static()
+    return out_v
+
+
+def custom_relu_static_pe(func, device, dtype, np_x, use_func=True):
+    import paddle
+    import paddle.static as static
+
+    paddle.enable_static()
+    paddle.set_device(device)
+
+    places = paddle.CustomPlace("custom_cpu", 0)
+
+    with static.scope_guard(static.Scope()):
+        with static.program_guard(static.Program()):
+            x = static.data(name="X", shape=[None, 8], dtype=dtype)
+            x.stop_gradient = False
+            out = func(x) if use_func else paddle.nn.functional.relu(x)
+            static.append_backward(out)
+
+            exe = static.Executor()
+            exe.run(static.default_startup_program())
+
+            # in static graph mode, x's data has been overwritten by out
+            compiled_prog = static.CompiledProgram(
+                static.default_main_program()
+            ).with_data_parallel(loss_name=out.name, places=places)
+            out_v = exe.run(
+                compiled_prog, feed={"X": np_x}, fetch_list=[out.name]
+            )
+
+    paddle.disable_static()
+    return out_v
+
+
+def custom_relu_double_grad_dynamic(func, device, dtype, np_x, use_func=True):
+    import paddle
+
+    paddle.set_device(device)
+    paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": True})
+
+    t = paddle.to_tensor(np_x, dtype=dtype, stop_gradient=False)
+
+    out = func(t) if use_func else paddle.nn.functional.relu(t)
+    dx = paddle.grad(
+        outputs=out,
+        inputs=t,
+        grad_outputs=paddle.ones_like(t),
+        create_graph=True,
+        retain_graph=True,
+    )
+
+    ddout = paddle.grad(
+        outputs=dx[0],
+        inputs=out.grad,
+        grad_outputs=paddle.ones_like(t),
+        create_graph=False,
+    )
+
+    paddle.fluid.set_flags({"FLAGS_retain_grad_for_all_tensor": False})
+    assert ddout[0].numpy() is not None
+    return dx[0].numpy(), ddout[0].numpy()
+
+
+class TestNewCustomOpSetUpInstall(unittest.TestCase):
+    def setUp(self):
+        # compile the plugin .so in a temp directory
+        self.cur_dir = os.path.dirname(os.path.abspath(__file__))
+        self.temp_dir = tempfile.TemporaryDirectory()
+        cmd = 'cd {} \
+            && git clone {} \
+            && cd PaddleCustomDevice \
+            && git fetch origin \
+            && git checkout {} -b dev \
+            && cd backends/custom_cpu \
+            && mkdir build && cd build && cmake .. && make -j8 \
+            && cd {}'.format(
+            self.temp_dir.name,
+            os.getenv('PLUGIN_URL'),
+            os.getenv('PLUGIN_TAG'),
+            self.cur_dir,
+        )
+        os.system(cmd)
+
+        # set environment for loading and registering compiled custom kernels;
+        # only valid in the current process
+        os.environ['CUSTOM_DEVICE_ROOT'] = os.path.join(
+            self.cur_dir,
+            '{}/PaddleCustomDevice/backends/custom_cpu/build'.format(
+                self.temp_dir.name
+            ),
+        )
+
+        # `import paddle` loads custom_cpu.so, hence paddle must be imported
+        # after the PaddleCustomDevice build has finished
+        import paddle
+
+        # [Why a specific paddle_includes directory?]
+        # Add paddle_includes to pass CI; for more details,
+        # please refer to the comments in `paddle/fluid/tests/custom_op/utils.py`
+        paddle_includes = []
+        for site_packages_path in getsitepackages():
+            paddle_includes.append(
+                os.path.join(site_packages_path, 'paddle', 'include')
+            )
+            paddle_includes.append(
+                os.path.join(
+                    site_packages_path, 'paddle', 'include', 'third_party'
+                )
+            )
+
+        custom_module = paddle.utils.cpp_extension.load(
+            name='custom_device_relu',
+            sources=['custom_relu_op.cc'],
+            extra_include_paths=paddle_includes,  # add for Coverage CI
+            extra_cxx_cflags=["-w", "-g"],  # test for cc flags
+            # build_directory=self.cur_dir,
+            verbose=True,
+        )
+        self.custom_op = custom_module.custom_relu
+
+        self.dtypes = ["float32", "float64"]
+        self.device = "custom_cpu"
+
+        # configure the random seed
+        SEED = 2021
+        paddle.seed(SEED)
+        paddle.framework.random._manual_program_seed(SEED)
+
+    def tearDown(self):
+        self.temp_dir.cleanup()
+        del os.environ['CUSTOM_DEVICE_ROOT']
+
+    def test_custom_device(self):
+        self._test_static()
+        self._test_static_pe()
+        self._test_dynamic()
+        self._test_double_grad_dynamic()
+        self._test_with_dataloader()
+
+    def _test_static(self):
+        for dtype in self.dtypes:
+            x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
+            out = custom_relu_static(self.custom_op, self.device, dtype, x)
+            pd_out = custom_relu_static(
+                self.custom_op, self.device, dtype, x, False
+            )
+            np.testing.assert_array_equal(
+                out,
+                pd_out,
+                err_msg="custom op out: {},\n paddle api out: {}".format(
+                    out, pd_out
+                ),
+            )
+
+    def _test_static_pe(self):
+        for dtype in self.dtypes:
+            x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
+            out = custom_relu_static_pe(self.custom_op, self.device, dtype, x)
+            pd_out = custom_relu_static_pe(
+                self.custom_op, self.device, dtype, x, False
+            )
+            np.testing.assert_array_equal(
+                out,
+                pd_out,
+                err_msg="custom op out: {},\n paddle api out: {}".format(
+                    out, pd_out
+                ),
+            )
+
+    def _test_dynamic(self):
+        for dtype in self.dtypes:
+            x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
+            out, x_grad = custom_relu_dynamic(
+                self.custom_op, self.device, dtype, x
+            )
+            pd_out, pd_x_grad = custom_relu_dynamic(
+                self.custom_op, self.device, dtype, x, False
+            )
+            np.testing.assert_array_equal(
+                out,
+                pd_out,
+                err_msg="custom op out: {},\n paddle api out: {}".format(
+                    out, pd_out
+                ),
+            )
+            np.testing.assert_array_equal(
+                x_grad,
+                pd_x_grad,
+                err_msg="custom op x grad: {},\n paddle api x grad: {}".format(
+                    x_grad, pd_x_grad
+                ),
+            )
+
+    def _test_double_grad_dynamic(self):
+        for dtype in self.dtypes:
+            x = np.random.uniform(-1, 1, [4, 8]).astype(dtype)
+            out, dx_grad = custom_relu_double_grad_dynamic(
+                self.custom_op, self.device, dtype, x
+            )
+            pd_out, pd_dx_grad = custom_relu_double_grad_dynamic(
+                self.custom_op, self.device, dtype, x, False
+            )
+            np.testing.assert_array_equal(
+                out,
+                pd_out,
+                err_msg="custom op out: {},\n paddle api out: {}".format(
+                    out, pd_out
+                ),
+            )
+            np.testing.assert_array_equal(
+                dx_grad,
+                pd_dx_grad,
+                err_msg="custom op dx grad: {},\n paddle api dx grad: {}".format(
+                    dx_grad, pd_dx_grad
+                ),
+            )
+
+    def _test_with_dataloader(self):
+        import paddle
+        from paddle.vision.transforms import Compose, Normalize
+
+        paddle.set_device(self.device)
+        # data loader
+        transform = Compose(
+            [Normalize(mean=[127.5], std=[127.5], data_format="CHW")]
+        )
+        train_dataset = paddle.vision.datasets.MNIST(
+            mode="train", transform=transform
+        )
+        train_loader = paddle.io.DataLoader(
+            train_dataset,
+            batch_size=64,
+            shuffle=True,
+            drop_last=True,
+            num_workers=0,
+        )
+
+        for batch_id, (image, _) in enumerate(train_loader()):
+            out = self.custom_op(image)
+            pd_out = paddle.nn.functional.relu(image)
+            np.testing.assert_array_equal(
+                out,
+                pd_out,
+                err_msg="custom op out: {},\n paddle api out: {}".format(
+                    out, pd_out
+                ),
+            )
+
+            if batch_id == 5:
+                break
+
+
+if __name__ == "__main__":
+    if os.name == 'nt' or sys.platform.startswith('darwin'):
+        # only supported on Linux for now
+        exit()
+    unittest.main()
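
Reviewer note: for anyone who wants to exercise the new op outside the unittest harness, below is a minimal usage sketch. It is not part of the patch; it assumes the custom_cpu plugin from PaddleCustomDevice has already been built and that CUSTOM_DEVICE_ROOT points at its build directory, exactly as setUp() above arranges, and it only uses APIs that already appear in this diff (paddle.utils.cpp_extension.load and the generated custom_relu).

    # Usage sketch (assumption: CUSTOM_DEVICE_ROOT is set to the built
    # custom_cpu plugin directory before `import paddle`, as in setUp()).
    import numpy as np
    import paddle
    from paddle.utils.cpp_extension import load

    # JIT-compile custom_relu_op.cc from this diff into a loadable module.
    module = load(name="custom_device_relu", sources=["custom_relu_op.cc"])

    paddle.set_device("custom_cpu")
    x = paddle.to_tensor(np.random.uniform(-1, 1, [4, 8]).astype("float32"))
    x.stop_gradient = False

    out = module.custom_relu(x)  # ReluForward dispatches on x's place
    out.backward()               # ReluBackward, registered via PD_BUILD_GRAD_OP

    # The custom op should match the built-in ReLU exactly.
    np.testing.assert_array_equal(
        out.numpy(), paddle.nn.functional.relu(x).numpy()
    )

Because ReluForward falls back to relu_cpu_forward when x.is_cpu(), the same sketch also runs on a plain CPU place with paddle.set_device("cpu") and no plugin built.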