From 9e776f62f90ec525525c9f57072449811bc31b4f Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Mon, 27 Jun 2022 14:16:56 +0800 Subject: [PATCH] [Cherry-pick] Fix incompatible error for place type (#43830) * Create Tensor by paddle::empty in custom operator (#41840) * create tensor by empty in custom op * fix some bug * update relu custom op demo (#43173) * Fix incompatible error for custom op Placetype (#43749) * fix incompatible error * rmeove default constructor * add macro * fix cpu make error * add DefaultGPUPlace api Co-authored-by: zyfncg --- .../final_state_generator/codegen_utils.py | 2 +- .../final_state_generator/python_c_gen.py | 2 +- paddle/fluid/pybind/eager_utils.cc | 10 ++- paddle/fluid/pybind/eager_utils.h | 10 ++- paddle/phi/api/lib/tensor.cc | 22 +------ paddle/phi/common/CMakeLists.txt | 20 +++++- paddle/phi/common/place.cc | 32 +++++++++- paddle/phi/common/place.h | 5 +- paddle/phi/tests/api/test_data_transform.cc | 19 +++--- paddle/phi/tests/api/test_scale_benchmark.cc | 2 +- .../tests/custom_op/context_pool_test_op.cc | 6 +- .../fluid/tests/custom_op/custom_concat_op.cc | 8 +-- .../fluid/tests/custom_op/custom_conj_op.cc | 2 +- .../fluid/tests/custom_op/custom_relu_op.cc | 30 +++++---- .../fluid/tests/custom_op/custom_relu_op.cu | 61 ++++++++++--------- .../fluid/tests/custom_op/custom_tanh_op.cc | 8 +-- 16 files changed, 128 insertions(+), 111 deletions(-) diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py index ab8c28c33e7..7769c5371ba 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/codegen_utils.py @@ -45,7 +45,7 @@ yaml_types_mapping = { 'int' : 'int', 'int32_t' : 'int32_t', 'int64_t' : 'int64_t', 'size_t' : 'size_t', \ 'float' : 'float', 'double' : 'double', 'bool' : 'bool', \ 'str' : 'std::string', \ - 'Place' : 'paddle::experimental::Place', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ + 'Place' : 'paddle::Place', 'DataLayout' : 'paddle::experimental::DataLayout', 'DataType' : 'paddle::experimental::DataType', \ 'int64_t[]' : 'std::vector', 'int[]' : 'std::vector', 'Tensor' : 'Tensor', 'Tensor[]' : 'std::vector', diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py index 5a361ef39b7..7ca5fc833ea 100644 --- a/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py +++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/python_c_gen.py @@ -46,7 +46,7 @@ atype_to_parsing_function = { "std::vector": "CastPyArg2Strings", "paddle::experimental::Scalar": "CastPyArg2Scalar", "paddle::experimental::IntArray": "CastPyArg2IntArray", - "paddle::experimental::Place": "CastPyArg2Place", + "paddle::Place": "CastPyArg2Place", "paddle::experimental::DataType": "CastPyArg2DataType", } diff --git a/paddle/fluid/pybind/eager_utils.cc b/paddle/fluid/pybind/eager_utils.cc index 124e5883324..4033e2d424f 100644 --- a/paddle/fluid/pybind/eager_utils.cc +++ b/paddle/fluid/pybind/eager_utils.cc @@ -1194,15 +1194,13 @@ std::vector GetScopePtrListFromArgs( return result; } -paddle::experimental::Place CastPyArg2Place(PyObject* obj, - const std::string& op_type, - ssize_t arg_pos) { +paddle::Place CastPyArg2Place(PyObject* obj, const std::string& op_type, + ssize_t 
arg_pos) { return CastPyArg2Place(obj, arg_pos); } -paddle::experimental::DataType CastPyArg2DataType(PyObject* obj, - const std::string& op_type, - ssize_t arg_pos) { +paddle::DataType CastPyArg2DataType(PyObject* obj, const std::string& op_type, + ssize_t arg_pos) { if (obj == Py_None) { return paddle::experimental::DataType::UNDEFINED; } diff --git a/paddle/fluid/pybind/eager_utils.h b/paddle/fluid/pybind/eager_utils.h index f1fab6db6ea..c4ddb347632 100644 --- a/paddle/fluid/pybind/eager_utils.h +++ b/paddle/fluid/pybind/eager_utils.h @@ -171,13 +171,11 @@ paddle::experimental::IntArray CastPyArg2IntArray(PyObject* obj, const std::string& op_type, ssize_t arg_pos); -paddle::experimental::Place CastPyArg2Place(PyObject* obj, - const std::string& op_type, - ssize_t arg_pos); +paddle::Place CastPyArg2Place(PyObject* obj, const std::string& op_type, + ssize_t arg_pos); -paddle::experimental::DataType CastPyArg2DataType(PyObject* obj, - const std::string& op_type, - ssize_t arg_pos); +paddle::DataType CastPyArg2DataType(PyObject* obj, const std::string& op_type, + ssize_t arg_pos); paddle::optional GetOptionalTensorFromArgs( const std::string& op_type, const std::string& arg_name, PyObject* args, diff --git a/paddle/phi/api/lib/tensor.cc b/paddle/phi/api/lib/tensor.cc index a7b89d7a4dc..fb81092ffee 100644 --- a/paddle/phi/api/lib/tensor.cc +++ b/paddle/phi/api/lib/tensor.cc @@ -37,24 +37,6 @@ limitations under the License. */ namespace paddle { namespace experimental { -namespace detail { -static Place GetCorrectPlaceByPlaceType(const Place &place_type) { - auto alloc_type = place_type.GetType(); - switch (alloc_type) { - case AllocationType::CPU: - return place_type; -#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) - case AllocationType::GPU: - return phi::Place(AllocationType::GPU, - phi::backends::gpu::GetCurrentDeviceId()); -#endif - default: - PADDLE_THROW(phi::errors::Unavailable( - "The PlaceType is a legacy design, only supports CPU and GPU, " - "and will not support other place types in the future.")); - } -} -} // namespace detail /////// Tensor Methods //////// @@ -76,7 +58,7 @@ Tensor::Tensor(const Place &place) { "Reason: A legal tensor cannot be constructed only based on " "the `place`, and datatype, shape, layout, etc. is also " "required."; - DefaultAllocator alloc(detail::GetCorrectPlaceByPlaceType(place)); + DefaultAllocator alloc(place); impl_ = std::move(std::make_shared( &alloc, std::move(phi::DenseTensorMeta( @@ -92,7 +74,7 @@ Tensor::Tensor(const Place &place, const std::vector &shape) { "Reason: A legal tensor cannot be constructed only based on " "the `place` and `shape`, and datatype, layout, etc. 
is also " "required."; - DefaultAllocator alloc(detail::GetCorrectPlaceByPlaceType(place)); + DefaultAllocator alloc(place); impl_ = std::move(std::make_shared( &alloc, std::move(phi::DenseTensorMeta(phi::DataType::FLOAT32, diff --git a/paddle/phi/common/CMakeLists.txt b/paddle/phi/common/CMakeLists.txt index aa839eab587..98f55a4f721 100644 --- a/paddle/phi/common/CMakeLists.txt +++ b/paddle/phi/common/CMakeLists.txt @@ -1,2 +1,18 @@ -cc_library(phi_place SRCS place.cc) -cc_library(scalar SRCS scalar.cc DEPS phi_enforce tensor) +if(WITH_GPU) + nv_library( + phi_place + SRCS place.cc + DEPS phi_gpu_info) +elseif(WITH_ROCM) + hip_library( + phi_place + SRCS place.cc + DEPS phi_gpu_info) +else() + cc_library(phi_place SRCS place.cc) +endif() + +cc_library( + scalar + SRCS scalar.cc + DEPS phi_enforce tensor) diff --git a/paddle/phi/common/place.cc b/paddle/phi/common/place.cc index 667d0a32b93..1a67f1a192d 100644 --- a/paddle/phi/common/place.cc +++ b/paddle/phi/common/place.cc @@ -21,6 +21,7 @@ limitations under the License. */ #include "glog/logging.h" #include "paddle/phi/api/ext/exception.h" +#include "paddle/phi/backends/gpu/gpu_info.h" namespace phi { @@ -110,14 +111,32 @@ uint32_t Place::Hash::operator()(const Place &place) const { return hash_value; } +namespace detail { +static int8_t GetCorrectDeviceIdByPlaceType( + const paddle::PlaceType &place_type) { + switch (place_type) { + case paddle::PlaceType::kCPU: + return 0; +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + case paddle::PlaceType::kGPU: + return phi::backends::gpu::GetCurrentDeviceId(); +#endif + default: + PD_THROW( + "The PlaceType is a legacy design, only supports CPU and GPU, " + "and will not support other place types in the future."); + } +} +} // namespace detail + Place::Place(paddle::PlaceType type) - : device(0), + : device(detail::GetCorrectDeviceIdByPlaceType(type)), alloc_type_(static_cast(type)), device_type_id_(GetOrRegisterGlobalDeviceTypeId("")) { LOG_FIRST_N(WARNING, 1) << "The `paddle::PlaceType::kCPU/kGPU` is deprecated since version " "2.3, and will be removed in version 2.4! 
Please use " - "`paddle::CPUPlace()/GPUPlace()` to represent the place type."; + "`paddle::CPUPlace()/DefaultGPUPlace()` to represent the place type."; } } // namespace phi @@ -140,4 +159,13 @@ bool operator==(PlaceType place_type, const Place &place) { return static_cast(place_type) == place.GetType(); } +GPUPlace DefaultGPUPlace() { + return GPUPlace( +#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) + phi::backends::gpu::GetCurrentDeviceId()); +#else + 0); +#endif +} + } // namespace paddle diff --git a/paddle/phi/common/place.h b/paddle/phi/common/place.h index ed9fb787642..cbc1faf94f0 100644 --- a/paddle/phi/common/place.h +++ b/paddle/phi/common/place.h @@ -213,9 +213,6 @@ std::ostream& operator<<(std::ostream&, const Place&); namespace paddle { namespace experimental { using AllocationType = phi::AllocationType; -using Place = phi::Place; -using CPUPlace = phi::CPUPlace; -using GPUPlace = phi::GPUPlace; using GPUPinnedPlace = phi::GPUPinnedPlace; using XPUPlace = phi::XPUPlace; using NPUPlace = phi::NPUPlace; @@ -259,4 +256,6 @@ enum class PlaceType { PADDLE_API bool operator==(const Place& place, PlaceType place_type); PADDLE_API bool operator==(PlaceType place_type, const Place& place); +PADDLE_API GPUPlace DefaultGPUPlace(); + } // namespace paddle diff --git a/paddle/phi/tests/api/test_data_transform.cc b/paddle/phi/tests/api/test_data_transform.cc index a2bd1f2cad9..21d5eef4098 100644 --- a/paddle/phi/tests/api/test_data_transform.cc +++ b/paddle/phi/tests/api/test_data_transform.cc @@ -37,13 +37,11 @@ namespace tests { // TODO(chenweihang): Remove this test after the API is used in the dygraph TEST(API, data_transform_same_place) { // 1. create tensor - auto x = paddle::experimental::full({3, 3}, - 1.0, - experimental::DataType::COMPLEX128, - experimental::CPUPlace()); + auto x = + paddle::experimental::full({3, 3}, 1.0, DataType::COMPLEX128, CPUPlace()); - auto y = paddle::experimental::full( - {3, 3}, 2.0, experimental::DataType::FLOAT32, experimental::CPUPlace()); + auto y = + paddle::experimental::full({3, 3}, 2.0, DataType::FLOAT32, CPUPlace()); std::vector> sum(9, 6.0); @@ -75,10 +73,10 @@ TEST(API, data_transform_same_place) { TEST(Tensor, data_transform_diff_place) { // 1. 
create tensor auto x = paddle::experimental::full( - {3, 3}, 1.0, experimental::DataType::FLOAT64, experimental::CPUPlace()); + {3, 3}, 1.0, experimental::DataType::FLOAT64, CPUPlace()); auto y = paddle::experimental::full( - {3, 3}, 2.0, experimental::DataType::FLOAT64, experimental::GPUPlace()); + {3, 3}, 2.0, experimental::DataType::FLOAT64, GPUPlace()); std::vector sum(9, 6.0); @@ -93,10 +91,9 @@ TEST(Tensor, data_transform_diff_place) { ASSERT_EQ(out.dtype(), phi::DataType::FLOAT64); ASSERT_EQ(out.layout(), phi::DataLayout::NCHW); ASSERT_EQ(out.initialized(), true); - ASSERT_EQ(out.impl()->place(), - phi::TransToPhiPlace(experimental::Backend::GPU)); + ASSERT_EQ(out.impl()->place(), phi::TransToPhiPlace(phi::Backend::GPU)); - auto ref_out = experimental::copy_to(out, experimental::CPUPlace(), true); + auto ref_out = experimental::copy_to(out, CPUPlace(), true); auto dense_out = std::dynamic_pointer_cast(ref_out.impl()); for (size_t i = 0; i < 9; i++) { diff --git a/paddle/phi/tests/api/test_scale_benchmark.cc b/paddle/phi/tests/api/test_scale_benchmark.cc index ca4a264e511..e2870a780ae 100644 --- a/paddle/phi/tests/api/test_scale_benchmark.cc +++ b/paddle/phi/tests/api/test_scale_benchmark.cc @@ -30,7 +30,7 @@ namespace tests { TEST(API, scale) { auto x = experimental::full( - {3, 4}, 1.0, experimental::DataType::FLOAT32, experimental::CPUPlace()); + {3, 4}, 1.0, experimental::DataType::FLOAT32, CPUPlace()); const size_t cycles = 300; phi::tests::Timer timer; diff --git a/python/paddle/fluid/tests/custom_op/context_pool_test_op.cc b/python/paddle/fluid/tests/custom_op/context_pool_test_op.cc index 6b0edcc7ab1..9286ae7ca00 100644 --- a/python/paddle/fluid/tests/custom_op/context_pool_test_op.cc +++ b/python/paddle/fluid/tests/custom_op/context_pool_test_op.cc @@ -22,8 +22,7 @@ std::vector ContextPoolTest(const paddle::Tensor& x) { // 1. test cpu context - paddle::experimental::Place cpu_place( - paddle::experimental::AllocationType::CPU); + paddle::Place cpu_place(paddle::experimental::AllocationType::CPU); auto* cpu_ctx = paddle::experimental::DeviceContextPool::Instance() .Get(cpu_place); @@ -34,8 +33,7 @@ std::vector ContextPoolTest(const paddle::Tensor& x) { #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) // 2. 
test gpu context - paddle::experimental::Place gpu_place( - paddle::experimental::AllocationType::GPU); + paddle::Place gpu_place(paddle::experimental::AllocationType::GPU); auto* gpu_ctx = paddle::experimental::DeviceContextPool::Instance() .Get(gpu_place); diff --git a/python/paddle/fluid/tests/custom_op/custom_concat_op.cc b/python/paddle/fluid/tests/custom_op/custom_concat_op.cc index 66cc36c300e..80f76e2df54 100644 --- a/python/paddle/fluid/tests/custom_op/custom_concat_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_concat_op.cc @@ -75,7 +75,7 @@ std::vector ConcatForwardDynamicAxis( auto out_shape = ComputeOutShape(in_shapes, axis); // create output - auto out = paddle::Tensor(paddle::PlaceType::kCPU, out_shape); + auto out = paddle::empty(out_shape, inputs[0].type(), paddle::CPUPlace()); // calc PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES( @@ -106,7 +106,7 @@ std::vector ConcatBackwardDynamicAxis( // create outputs std::vector grad_inputs; for (auto& t : inputs) { - auto grad = paddle::Tensor(paddle::PlaceType::kCPU, t.shape()); + auto grad = paddle::empty(t.shape(), t.dtype(), t.place()); grad_inputs.emplace_back(grad); } @@ -161,7 +161,7 @@ std::vector ConcatForwardStaticAxis( auto out_shape = ComputeOutShape(in_shapes, final_axis); // create output - auto out = paddle::Tensor(paddle::PlaceType::kCPU, out_shape); + auto out = paddle::empty(out_shape, inputs[0].type(), paddle::CPUPlace()); // calc PD_DISPATCH_FLOATING_AND_INTEGRAL_TYPES( @@ -190,7 +190,7 @@ std::vector ConcatBackwardStaticAxis( // create outputs std::vector grad_inputs; for (auto& t : inputs) { - auto grad = paddle::Tensor(paddle::PlaceType::kCPU, t.shape()); + auto grad = paddle::empty(t.shape(), t.dtype(), t.place()); grad_inputs.emplace_back(grad); } diff --git a/python/paddle/fluid/tests/custom_op/custom_conj_op.cc b/python/paddle/fluid/tests/custom_op/custom_conj_op.cc index b9c10f479e0..56938552420 100644 --- a/python/paddle/fluid/tests/custom_op/custom_conj_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_conj_op.cc @@ -71,7 +71,7 @@ void ConjCPUKernel(const data_t* x_data, int64_t numel, data_t* out_data) { std::vector ConjFunction(const paddle::Tensor& x) { CHECK_INPUT(x); - paddle::Tensor out(x.place(), x.shape()); + paddle::Tensor out = paddle::empty(x.shape(), x.dtype(), x.place()); PD_DISPATCH_FLOATING_AND_COMPLEX_TYPES( x.type(), "ConjCPUKernel", ([&] { diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc index 121a855a18f..f1860635ed5 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cc @@ -17,8 +17,7 @@ #include "paddle/extension.h" -#define CHECK_CPU_INPUT(x) \ - PD_CHECK(x.place() == paddle::PlaceType::kCPU, #x " must be a CPU Tensor.") +#define CHECK_CPU_INPUT(x) PD_CHECK(x.is_cpu(), #x " must be a CPU Tensor.") template void relu_cpu_forward_kernel(const data_t* x_data, @@ -26,7 +25,7 @@ void relu_cpu_forward_kernel(const data_t* x_data, int64_t x_numel) { PD_CHECK(x_data != nullptr, "x_data is nullptr."); PD_CHECK(out_data != nullptr, "out_data is nullptr."); - for (int i = 0; i < x_numel; ++i) { + for (int64_t i = 0; i < x_numel; ++i) { out_data[i] = std::max(static_cast(0.), x_data[i]); } } @@ -36,7 +35,7 @@ void relu_cpu_backward_kernel(const data_t* grad_out_data, const data_t* out_data, data_t* grad_x_data, int64_t out_numel) { - for (int i = 0; i < out_numel; ++i) { + for (int64_t i = 0; i < out_numel; ++i) { grad_x_data[i] = 
grad_out_data[i] * (out_data[i] > static_cast(0) ? 1. : 0.); } @@ -54,12 +53,12 @@ void relu_cpu_double_backward_kernel(const data_t* out_data, } std::vector relu_cpu_forward(const paddle::Tensor& x) { - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty_like(x); PD_DISPATCH_FLOATING_TYPES( x.type(), "relu_cpu_forward", ([&] { relu_cpu_forward_kernel( - x.data(), out.mutable_data(x.place()), x.size()); + x.data(), out.data(), x.numel()); })); return {out}; @@ -68,13 +67,13 @@ std::vector relu_cpu_forward(const paddle::Tensor& x) { std::vector relu_cpu_backward(const paddle::Tensor& x, const paddle::Tensor& out, const paddle::Tensor& grad_out) { - auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto grad_x = paddle::empty_like(x); PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] { relu_cpu_backward_kernel( grad_out.data(), out.data(), - grad_x.mutable_data(x.place()), + grad_x.data(), out.size()); })); @@ -85,7 +84,7 @@ std::vector relu_cpu_double_backward( const paddle::Tensor& out, const paddle::Tensor& ddx) { CHECK_CPU_INPUT(out); CHECK_CPU_INPUT(ddx); - auto ddout = paddle::Tensor(paddle::PlaceType::kCPU, out.shape()); + auto ddout = paddle::empty(out.shape(), out.dtype(), out.place()); PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_double_backward", ([&] { relu_cpu_double_backward_kernel( @@ -108,9 +107,9 @@ std::vector relu_cuda_double_backward( const paddle::Tensor& out, const paddle::Tensor& ddx); std::vector ReluForward(const paddle::Tensor& x) { - if (x.place() == paddle::PlaceType::kCPU) { + if (x.is_cpu()) { return relu_cpu_forward(x); - } else if (x.place() == paddle::PlaceType::kGPU) { + } else if (x.is_gpu()) { return relu_cuda_forward(x); } else { PD_THROW("Not implemented."); @@ -120,10 +119,9 @@ std::vector ReluForward(const paddle::Tensor& x) { std::vector ReluBackward(const paddle::Tensor& x, const paddle::Tensor& out, const paddle::Tensor& grad_out) { - // TODO(chenweihang): Check Input - if (x.place() == paddle::PlaceType::kCPU) { + if (x.is_cpu()) { return relu_cpu_backward(x, out, grad_out); - } else if (x.place() == paddle::PlaceType::kGPU) { + } else if (x.is_gpu()) { return relu_cuda_backward(x, out, grad_out); } else { PD_THROW("Not implemented."); @@ -165,7 +163,7 @@ PD_BUILD_DOUBLE_GRAD_OP(custom_relu) std::vector relu_cpu_backward_without_x( const paddle::Tensor& out, const paddle::Tensor& grad_out) { - auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, out.shape()); + auto grad_x = paddle::empty(out.shape(), out.dtype(), out.place()); PD_DISPATCH_FLOATING_TYPES(out.type(), "relu_cpu_backward", ([&] { relu_cpu_backward_kernel( @@ -214,7 +212,7 @@ void relu_cpu_forward_out(const paddle::Tensor& x, paddle::Tensor* out) { PD_DISPATCH_FLOATING_TYPES( x.type(), "relu_cpu_forward", ([&] { relu_cpu_forward_kernel( - x.data(), out->mutable_data(x.place()), x.size()); + x.data(), out->mutable_data(x.place()), x.numel()); })); } diff --git a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu index 364a2216b9e..e791ea8cb76 100644 --- a/python/paddle/fluid/tests/custom_op/custom_relu_op.cu +++ b/python/paddle/fluid/tests/custom_op/custom_relu_op.cu @@ -14,15 +14,14 @@ #include "paddle/extension.h" -#define CHECK_GPU_INPUT(x) \ - PD_CHECK(x.place() == paddle::PlaceType::kGPU, #x " must be a GPU Tensor.") +#define CHECK_GPU_INPUT(x) PD_CHECK(x.is_gpu(), #x " must be a GPU Tensor.") template __global__ void relu_cuda_forward_kernel(const 
data_t* x, data_t* y, - const int num) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = gid; i < num; i += blockDim.x * gridDim.x) { + int64_t num) { + int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + for (int64_t i = gid; i < num; i += blockDim.x * gridDim.x) { y[i] = x[i] > static_cast(0.) ? x[i] : static_cast(0.); } } @@ -31,9 +30,9 @@ template __global__ void relu_cuda_backward_kernel(const data_t* dy, const data_t* y, data_t* dx, - const int num) { - int gid = blockIdx.x * blockDim.x + threadIdx.x; - for (int i = gid; i < num; i += blockDim.x * gridDim.x) { + int64_t num) { + int64_t gid = blockIdx.x * blockDim.x + threadIdx.x; + for (int64_t i = gid; i < num; i += blockDim.x * gridDim.x) { dx[i] = dy[i] * (y[i] > static_cast(0.) ? static_cast(1.) : static_cast(0.)); } @@ -54,15 +53,17 @@ __global__ void relu_cuda_double_backward_kernel(const data_t* out_data, std::vector relu_cuda_forward(const paddle::Tensor& x) { CHECK_GPU_INPUT(x); - auto out = paddle::Tensor(paddle::PlaceType::kGPU, x.shape()); + auto out = paddle::empty_like(x); - int numel = x.size(); - int block = 512; - int grid = (numel + block - 1) / block; + PD_CHECK(x.place() == paddle::DefaultGPUPlace()); + + int64_t numel = x.numel(); + int64_t block = 512; + int64_t grid = (numel + block - 1) / block; PD_DISPATCH_FLOATING_AND_HALF_TYPES( x.type(), "relu_cuda_forward_kernel", ([&] { relu_cuda_forward_kernel<<>>( - x.data(), out.mutable_data(x.place()), numel); + x.data(), out.data(), numel); })); return {out}; @@ -74,11 +75,13 @@ std::vector relu_cuda_backward(const paddle::Tensor& x, CHECK_GPU_INPUT(x); CHECK_GPU_INPUT(out); CHECK_GPU_INPUT(grad_out); - auto grad_x = paddle::Tensor(paddle::PlaceType::kGPU, x.shape()); + auto grad_x = paddle::empty_like(x); - int numel = out.size(); - int block = 512; - int grid = (numel + block - 1) / block; + PD_CHECK(x.place() == paddle::DefaultGPUPlace()); + + int64_t numel = out.numel(); + int64_t block = 512; + int64_t grid = (numel + block - 1) / block; PD_DISPATCH_FLOATING_AND_HALF_TYPES( out.type(), "relu_cuda_backward_kernel", ([&] { relu_cuda_backward_kernel<<>>( @@ -95,19 +98,19 @@ std::vector relu_cuda_double_backward( const paddle::Tensor& out, const paddle::Tensor& ddx) { CHECK_GPU_INPUT(out); CHECK_GPU_INPUT(ddx); - auto ddout = paddle::Tensor(paddle::PlaceType::kGPU, out.shape()); + auto ddout = paddle::empty(out.shape(), out.dtype(), out.place()); - int64_t numel = out.size(); + int64_t numel = out.numel(); int64_t block = 512; int64_t grid = (numel + block - 1) / block; PD_DISPATCH_FLOATING_AND_HALF_TYPES( out.type(), "relu_cuda_double_backward_kernel", ([&] { - relu_cuda_double_backward_kernel< - data_t><<>>( - out.data(), - ddx.data(), - ddout.mutable_data(out.place()), - numel); + relu_cuda_double_backward_kernel + <<>>( + out.data(), + ddx.data(), + ddout.mutable_data(out.place()), + numel); })); std::cout << "Debug info: run relu gpu double backward success." 
<< std::endl; @@ -117,9 +120,9 @@ std::vector relu_cuda_double_backward( std::vector relu_cuda_backward_without_x( const paddle::Tensor& out, const paddle::Tensor& grad_out) { - auto grad_x = paddle::Tensor(paddle::PlaceType::kGPU, out.shape()); + auto grad_x = paddle::empty(out.shape(), out.dtype(), out.place()); - int numel = out.size(); + int numel = out.numel(); int block = 512; int grid = (numel + block - 1) / block; PD_DISPATCH_FLOATING_AND_HALF_TYPES( @@ -135,7 +138,7 @@ std::vector relu_cuda_backward_without_x( } void relu_cuda_forward_out(const paddle::Tensor& x, paddle::Tensor* out) { - int numel = x.size(); + int numel = x.numel(); int block = 512; int grid = (numel + block - 1) / block; out->reshape(x.shape()); @@ -150,7 +153,7 @@ void relu_cuda_backward_out(const paddle::Tensor& x, const paddle::Tensor& out, const paddle::Tensor& grad_out, paddle::Tensor* grad_x) { - int numel = out.size(); + int numel = out.numel(); int block = 512; int grid = (numel + block - 1) / block; grad_x->reshape(x.shape()); diff --git a/python/paddle/fluid/tests/custom_op/custom_tanh_op.cc b/python/paddle/fluid/tests/custom_op/custom_tanh_op.cc index f96297d69bd..399eb5b6366 100644 --- a/python/paddle/fluid/tests/custom_op/custom_tanh_op.cc +++ b/python/paddle/fluid/tests/custom_op/custom_tanh_op.cc @@ -68,7 +68,7 @@ void tanh_cpu_double_backward_kernel(const data_t* out_data, std::vector TanhForward(const paddle::Tensor& x) { CHECK_CPU_INPUT(x); - auto out = paddle::Tensor(paddle::PlaceType::kCPU, x.shape()); + auto out = paddle::empty(x.shape(), x.dtype(), x.place()); PD_DISPATCH_FLOATING_TYPES( x.dtype(), "tanh_cpu_forward", ([&] { @@ -82,7 +82,7 @@ std::vector TanhForward(const paddle::Tensor& x) { std::vector TanhBackward(const paddle::Tensor& out, const paddle::Tensor& grad_out) { CHECK_CPU_INPUT(out); - auto grad_x = paddle::Tensor(paddle::PlaceType::kCPU, out.shape()); + auto grad_x = paddle::empty(out.shape(), out.dtype(), out.place()); PD_DISPATCH_FLOATING_TYPES(out.dtype(), "tanh_cpu_backward", ([&] { tanh_cpu_backward_kernel( @@ -101,8 +101,8 @@ std::vector TanhDoubleBackward(const paddle::Tensor& out, CHECK_CPU_INPUT(out); CHECK_CPU_INPUT(ddx); CHECK_CPU_INPUT(dout); - auto dout_new = paddle::Tensor(paddle::PlaceType::kCPU, out.shape()); - auto ddout = paddle::Tensor(paddle::PlaceType::kCPU, out.shape()); + auto dout_new = paddle::empty(out.shape(), out.dtype(), out.place()); + auto ddout = paddle::empty(out.shape(), out.dtype(), out.place()); PD_DISPATCH_FLOATING_TYPES(out.dtype(), "tanh_cpu_double_backward", ([&] { tanh_cpu_double_backward_kernel( -- GitLab
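
Note (illustrative, added by the editor; not part of the patch): every custom-op test file touched above follows the same migration. The deprecated paddle::Tensor(paddle::PlaceType::kCPU/kGPU, shape) constructor and mutable_data<T>(place) are replaced by paddle::empty / paddle::empty_like and data<T>(), and place checks move to Tensor::is_cpu() / is_gpu() or paddle::DefaultGPUPlace(). A minimal sketch of a CPU custom op written against the updated API follows; the op name ScaleByTwo and its body are hypothetical, while the API calls (paddle::empty_like, is_cpu, data<T>, numel, PD_CHECK, PD_DISPATCH_FLOATING_TYPES) are the ones used in the diffs above.

  #include "paddle/extension.h"

  // Hypothetical custom op illustrating the post-migration API usage.
  std::vector<paddle::Tensor> ScaleByTwo(const paddle::Tensor& x) {
    PD_CHECK(x.is_cpu(), "x must be a CPU Tensor.");  // was: x.place() == paddle::PlaceType::kCPU
    auto out = paddle::empty_like(x);                 // was: paddle::Tensor(paddle::PlaceType::kCPU, x.shape())
    PD_DISPATCH_FLOATING_TYPES(
        x.type(), "ScaleByTwo", ([&] {
          const data_t* x_data = x.data<data_t>();
          data_t* out_data = out.data<data_t>();      // was: out.mutable_data<data_t>(x.place())
          for (int64_t i = 0; i < x.numel(); ++i) {
            out_data[i] = x_data[i] * static_cast<data_t>(2);
          }
        }));
    return {out};
  }

A GPU kernel follows the same pattern, checking x.is_gpu() and, where a concrete place is needed, using paddle::DefaultGPUPlace() instead of a hard-coded device id, as custom_relu_op.cu does above.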