diff --git a/paddle/pten/api/lib/kernel_declare.h b/paddle/pten/api/lib/kernel_declare.h
index f8874cfb17fef0a2dd2a77c6e1d0c5fd9c682aae..a5d9537097a97bbe007a44716446f36a1eac98eb 100644
--- a/paddle/pten/api/lib/kernel_declare.h
+++ b/paddle/pten/api/lib/kernel_declare.h
@@ -21,12 +21,12 @@ limitations under the License. */
 
 // file name of the kernel, and this header file will be removed
 PT_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
-PT_DECLARE_KERNEL(cast, CPU, ALL_LAYOUT);
+PT_DECLARE_KERNEL(reshape, CPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(mean, CPU, ALL_LAYOUT);
 
 #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
 PT_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
-PT_DECLARE_KERNEL(cast, GPU, ALL_LAYOUT);
+PT_DECLARE_KERNEL(reshape, GPU, ALL_LAYOUT);
 PT_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT);
 #endif
 
diff --git a/paddle/pten/include/manipulation.h b/paddle/pten/include/manipulation.h
index 80bfcef89f73a270b513fbb7515319f55d2c0e10..2d429c060f4498fa492f3751ceeb8123bb9b48d6 100644
--- a/paddle/pten/include/manipulation.h
+++ b/paddle/pten/include/manipulation.h
@@ -17,6 +17,7 @@
 // See Note: [ How do we organize the kernel directory ]
 #include "paddle/pten/api/lib/utils/storage.h"
 #include "paddle/pten/include/infermeta.h"
+#include "paddle/pten/kernels/cast_kernel.h"
 #include "paddle/pten/kernels/cpu/manipulation.h"
 #include "paddle/pten/kernels/flatten_kernel.h"
 #include "paddle/pten/kernels/gpu/manipulation.h"
@@ -48,7 +49,7 @@ DenseTensor Cast(const ContextT& dev_ctx,
       pten::make_intrusive<paddle::experimental::SharedStorage>(
           dev_ctx.GetPlace()),
       std::move(out_meta));
-  Cast<T>(dev_ctx, x, out_dtype, in_dtype, &dense_out);
+  Cast<T, ContextT>(dev_ctx, x, out_dtype, in_dtype, &dense_out);
   return dense_out;
 }
 
diff --git a/paddle/pten/kernels/cast_kernel.h b/paddle/pten/kernels/cast_kernel.h
new file mode 100644
index 0000000000000000000000000000000000000000..968139f6f17636abe303502035dd8e15c0b48dff
--- /dev/null
+++ b/paddle/pten/kernels/cast_kernel.h
@@ -0,0 +1,28 @@
+/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "paddle/pten/core/dense_tensor.h"
+
+namespace pten {
+
+template <typename T, typename ContextT>
+void Cast(const ContextT& dev_ctx,
+          const DenseTensor& x,
+          DataType out_dtype,
+          DataType in_dtype,
+          DenseTensor* out);
+
+}  // namespace pten
diff --git a/paddle/pten/kernels/cpu/CMakeLists.txt b/paddle/pten/kernels/cpu/CMakeLists.txt
index f7dabf47eb68f929cb733d572dac2931a44fa366..b67bb6296bc98b4f77857b2fa9984746b34c3a4f 100644
--- a/paddle/pten/kernels/cpu/CMakeLists.txt
+++ b/paddle/pten/kernels/cpu/CMakeLists.txt
@@ -1,3 +1,3 @@
-cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function blas pten_transpose_cpu)
+cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function blas pten_transpose_cpu cast_kernel)
 cc_library(linalg_cpu SRCS linalg.cc DEPS dense_tensor kernel_context kernel_factory)
 cc_library(manipulation_cpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory copy_kernel unary)
diff --git a/paddle/pten/kernels/cpu/cast_kernel.cc b/paddle/pten/kernels/cpu/cast_kernel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..db57da1d41d339d97559648b3077f912e9b441a7
--- /dev/null
+++ b/paddle/pten/kernels/cpu/cast_kernel.cc
@@ -0,0 +1,78 @@
+// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/pten/kernels/cast_kernel.h"
+
+#include "paddle/pten/api/ext/dispatch.h"
+#include "paddle/pten/backends/cpu/cpu_context.h"
+#include "paddle/pten/core/kernel_registry.h"
+
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/transform.h"
+
+namespace pten {
+
+template <typename InT, typename OutT>
+struct CastOpTransformFunctor {
+  HOSTDEVICE OutT operator()(InT in) const { return static_cast<OutT>(in); }
+};
+
+template <typename InT, typename OutT>
+void CastKernelImpl(const CPUContext& dev_ctx,
+                    const DenseTensor& x,
+                    DenseTensor* out) {
+  auto* in_begin = x.data<InT>();
+  auto numel = x.numel();
+  auto* in_end = in_begin + numel;
+
+  auto* out_begin = out->mutable_data<OutT>();
+
+  paddle::platform::Transform<CPUContext> trans;
+  trans(dev_ctx,
+        in_begin,
+        in_end,
+        out_begin,
+        CastOpTransformFunctor<InT, OutT>());
+}
+
+template <typename T, typename ContextT>
+void Cast(const ContextT& dev_ctx,
+          const DenseTensor& x,
+          DataType out_dtype,
+          DataType in_dtype,
+          DenseTensor* out) {
+  PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] {
+                       CastKernelImpl<T, data_t>(dev_ctx, x, out);
+                     }));
+}
+
+}  // namespace pten
+
+PT_REGISTER_CTX_KERNEL(cast,
+                       CPU,
+                       ALL_LAYOUT,
+                       pten::Cast,
+                       float,
+                       double,
+                       int,
+                       int64_t,
+                       int16_t,
+                       bool,
+                       uint8_t,
+                       paddle::platform::float16,
+                       paddle::platform::bfloat16,
+                       paddle::platform::complex<float>,
+                       paddle::platform::complex<double>) {
+  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
+}
diff --git a/paddle/pten/kernels/cpu/manipulation.cc b/paddle/pten/kernels/cpu/manipulation.cc
index 3dba89ea19620b4c8ea41e725e03c512062a0e6c..e55eb316325107db9142fcb36ae3e1935acf2e82 100644
--- a/paddle/pten/kernels/cpu/manipulation.cc
+++ b/paddle/pten/kernels/cpu/manipulation.cc
@@ -17,7 +17,6 @@
 #include "paddle/pten/infermeta/unary.h"
 #include "paddle/pten/kernels/copy_kernel.h"
 #include "paddle/pten/kernels/hybird/general/manipulation.h"
-#include "paddle/pten/kernels/hybird/math/cast_func.h"
 
 namespace pten {
 
@@ -44,38 +43,8 @@ void ReshapeWithXShape(const CPUContext& dev_ctx,
   Reshape(dev_ctx, x, shape, out);
 }
 
-template <typename T>
-void Cast(const CPUContext& dev_ctx,
-          const DenseTensor& x,
-          DataType out_dtype,
-          DataType in_dtype,
-          DenseTensor* out) {
-  PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] {
-                       math::CastKernelImpl<CPUContext, T, data_t>(
-                           dev_ctx, x, out);
-                     }));
-}
-
 }  // namespace pten
 
-PT_REGISTER_KERNEL(cast,
-                   CPU,
-                   ALL_LAYOUT,
-                   pten::Cast,
-                   float,
-                   double,
-                   int,
-                   int64_t,
-                   int16_t,
-                   bool,
-                   uint8_t,
-                   paddle::platform::float16,
-                   paddle::platform::bfloat16,
-                   paddle::platform::complex<float>,
-                   paddle::platform::complex<double>) {
-  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
-}
-
 PT_REGISTER_NO_TEMPLATE_KERNEL(
     reshape, CPU, ALL_LAYOUT, pten::Reshape, ALL_DTYPE) {}
 PT_REGISTER_NO_TEMPLATE_KERNEL(
diff --git a/paddle/pten/kernels/cpu/manipulation.h b/paddle/pten/kernels/cpu/manipulation.h
index 5aa5344b192dd3ca5b792b26571e303a979a9523..7eaa430f5f6652c309a4a246305fcb29a781ba6c 100644
--- a/paddle/pten/kernels/cpu/manipulation.h
+++ b/paddle/pten/kernels/cpu/manipulation.h
@@ -21,13 +21,6 @@ limitations under the License. */
 
 namespace pten {
 
-template <typename T>
-void Cast(const CPUContext& dev_ctx,
-          const DenseTensor& x,
-          DataType out_dtype,
-          DataType in_dtype,
-          DenseTensor* out);
-
 void Reshape(const CPUContext& dev_ctx,
              const DenseTensor& x,
              const ScalarArray& shape,
diff --git a/paddle/pten/kernels/gpu/CMakeLists.txt b/paddle/pten/kernels/gpu/CMakeLists.txt
index 5fdb5359e1fd757311ebc8dbcaa094fd84ff583a..99e3871044f2ff158d0c9df4f10a528852ba1b4e 100644
--- a/paddle/pten/kernels/gpu/CMakeLists.txt
+++ b/paddle/pten/kernels/gpu/CMakeLists.txt
@@ -1,9 +1,9 @@
 if(WITH_GPU)
-  nv_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu)
+  nv_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu cast_kernel)
   nv_library(linalg_gpu SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
   nv_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory copy_kernel unary)
 elseif(WITH_ROCM)
-  hip_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu)
+  hip_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu cast_kernel)
   hip_library(linalg_gpu SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
   hip_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory copy_kernel unary)
 endif()
diff --git a/paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h b/paddle/pten/kernels/gpu/cast_kernel.cu
similarity index 60%
rename from paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h
rename to paddle/pten/kernels/gpu/cast_kernel.cu
index d8c58448c9867232d5a9512d6070a821a69104ed..011fc9077dca9041c7828a69fc996757042124ba 100644
--- a/paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h
+++ b/paddle/pten/kernels/gpu/cast_kernel.cu
@@ -13,15 +13,21 @@
 // limitations under the License.
 
 #pragma once
-#include "paddle/fluid/platform/device/gpu/gpu_helper.h"
-#include "paddle/fluid/platform/float16.h"
+
+#include "paddle/pten/kernels/cast_kernel.h"
+
+#include "paddle/pten/api/ext/dispatch.h"
 #include "paddle/pten/backends/gpu/gpu_context.h"
-#include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/core/kernel_registry.h"
 
+// See Note [ Why still include the fluid headers? ]
 #include "paddle/fluid/platform/aligned_vector.h"
+#include "paddle/fluid/platform/bfloat16.h"
+#include "paddle/fluid/platform/device/gpu/gpu_helper.h"
 #include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
+#include "paddle/fluid/platform/float16.h"
+
 namespace pten {
-namespace detail {
 
 template <typename InT, typename OutT, int VecSize>
 __global__ void VecCastCUDAKernel(const InT* in, const int64_t N, OutT* out) {
@@ -74,6 +80,41 @@ void CastCUDAKernelImpl(const GPUContext& dev_ctx,
   }
 }
 
-}  // namespace detail
+template <typename T, typename ContextT>
+void Cast(const ContextT& dev_ctx,
+          const DenseTensor& x,
+          DataType out_dtype,
+          DataType in_dtype,
+          DenseTensor* out) {
+  PD_VISIT_ALL_TYPES(out_dtype, "CastCUDAKernelImpl", ([&] {
+                       CastCUDAKernelImpl<T, data_t>(dev_ctx, x, out);
+                     }));
+}
 
 }  // namespace pten
+
+#define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...)     \
+  PT_REGISTER_CTX_KERNEL(cast,                              \
+                         GPU,                               \
+                         ALL_LAYOUT,                        \
+                         pten::Cast,                        \
+                         float,                             \
+                         double,                            \
+                         int,                               \
+                         int64_t,                           \
+                         int16_t,                           \
+                         bool,                              \
+                         uint8_t,                           \
+                         paddle::platform::float16,         \
+                         paddle::platform::complex<float>,  \
+                         paddle::platform::complex<double>, \
+                         ##__VA_ARGS__) {                   \
+    kernel->OutputAt(0).SetDataType(                        \
+        paddle::experimental::DataType::UNDEFINED);         \
+  }
+
+#if !defined(PADDLE_WITH_HIP)
+PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16)
+#else
+PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast)
+#endif
diff --git a/paddle/pten/kernels/gpu/manipulation.cu b/paddle/pten/kernels/gpu/manipulation.cu
index 4dbf8e69b450e2ed085f94cc3bfe39dbcd806023..7e99510c015d07395778e238c041890546a49796 100644
--- a/paddle/pten/kernels/gpu/manipulation.cu
+++ b/paddle/pten/kernels/gpu/manipulation.cu
@@ -12,11 +12,10 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/pten/api/ext/dispatch.h"
+#include "paddle/pten/kernels/gpu/manipulation.h"
+
 #include "paddle/pten/infermeta/unary.h"
 #include "paddle/pten/kernels/copy_kernel.h"
-#include "paddle/pten/kernels/gpu/manipulation.h"
-#include "paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h"
 #include "paddle/pten/kernels/hybird/general/manipulation.h"
 
 namespace pten {
@@ -44,47 +43,8 @@ void ReshapeWithXShape(const GPUContext& dev_ctx,
   Reshape(dev_ctx, x, shape, out);
 }
 
-template <typename T>
-void Cast(const GPUContext& dev_ctx,
-          const DenseTensor& x,
-          DataType out_dtype,
-          DataType in_dtype,
-          DenseTensor* out) {
-  PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] {
-                       detail::CastCUDAKernelImpl<T, data_t>(dev_ctx, x, out);
-                     }));
-}
-
 }  // namespace pten
 
-using float16 = paddle::platform::float16;
-
-#define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...)   \
-  PT_REGISTER_KERNEL(cast,                                \
-                     GPU,                                 \
-                     ALL_LAYOUT,                          \
-                     pten::Cast,                          \
-                     float,                               \
-                     double,                              \
-                     int,                                 \
-                     int64_t,                             \
-                     int16_t,                             \
-                     bool,                                \
-                     uint8_t,                             \
-                     paddle::platform::float16,           \
-                     paddle::platform::complex<float>,    \
-                     paddle::platform::complex<double>,   \
-                     ##__VA_ARGS__) {                     \
-    kernel->OutputAt(0).SetDataType(                      \
-        paddle::experimental::DataType::UNDEFINED);       \
-  }
-
-#if !defined(PADDLE_WITH_HIP)
-PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16)
-#else
-PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast)
-#endif
-
 PT_REGISTER_NO_TEMPLATE_KERNEL(
     reshape, GPU, ALL_LAYOUT, pten::Reshape, ALL_DTYPE) {}
 PT_REGISTER_NO_TEMPLATE_KERNEL(
diff --git a/paddle/pten/kernels/gpu/manipulation.h b/paddle/pten/kernels/gpu/manipulation.h
index af49932c2e5201a92f4f3b55f09ee5a137d92c51..a099763706b2d081a54de0484c91321919310ee3 100644
--- a/paddle/pten/kernels/gpu/manipulation.h
+++ b/paddle/pten/kernels/gpu/manipulation.h
@@ -24,13 +24,6 @@
 
 namespace pten {
 
-template <typename T>
-void Cast(const GPUContext& dev_ctx,
-          const DenseTensor& x,
-          DataType out_dtype,
-          DataType in_dtype,
-          DenseTensor* out);
-
 void Reshape(const GPUContext& dev_ctx,
              const DenseTensor& x,
              const ScalarArray& shape,
diff --git a/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h b/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h
index a2faf2a5416942900ee8ab6ec895286a06996d50..2680f2d3fddca53d56ee1f8931aeac2e13dcc66e 100644
--- a/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h
+++ b/paddle/pten/kernels/hybird/cuda/reduce/reduce_cuda_impl.h
@@ -41,8 +41,8 @@ namespace cub = hipcub;
 
 #include "paddle/pten/api/ext/dispatch.h"
 #include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/kernels/cast_kernel.h"
 #include "paddle/pten/kernels/copy_kernel.h"
-#include "paddle/pten/kernels/hybird/math/cast_func.h"
 
 // Reduce split or not, Whether to use ReduceHigherDim
 #define REDUCE_SPLIT_BOUNDARY 512
@@ -1112,12 +1112,7 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x,
       AsyncCopy(x, y);
       y->Resize(out_dims);
     } else {
-      PD_VISIT_ALL_TYPES(
-          y->dtype(), "CastKernelImpl", ([&] {
-            pten::math::CastKernelImpl<GPUContext, Tx, data_t>(*dev_ctx, x, y);
-          }));
+      pten::Cast<Tx>(*dev_ctx, x, y->dtype(), x.dtype(), y);
     }
     return;
   }
diff --git a/paddle/pten/kernels/hybird/general/reduce_impl.h b/paddle/pten/kernels/hybird/general/reduce_impl.h
index 52bdf18ad5a31d87f53fb48e983f6b07539f7044..a8e0bc5de2196ea8890d299a5065af9d7cb93aef 100644
--- a/paddle/pten/kernels/hybird/general/reduce_impl.h
+++ b/paddle/pten/kernels/hybird/general/reduce_impl.h
@@ -16,8 +16,9 @@
 #include "paddle/fluid/platform/transform.h"
 #include "paddle/pten/api/ext/dispatch.h"
 #include "paddle/pten/core/dense_tensor.h"
+#include "paddle/pten/kernels/cast_kernel.h"
 #include "paddle/pten/kernels/hybird/eigen/reduce.h"
-#include "paddle/pten/kernels/hybird/math/cast_func.h"
+
 
 namespace pten {
 namespace general {
@@ -57,11 +58,8 @@ void Reduce(const DeviceContext& dev_ctx,
       pten::make_intrusive<paddle::experimental::SharedStorage>(x.place()),
       pten::DenseTensorMeta(out_dtype, x.dims(), x.layout()));
 
-  // cast x tensor to out_dtype first
-  PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] {
-                       math::CastKernelImpl<DeviceContext, T, data_t>(
-                           dev_ctx, x, &tmp_tensor);
-                     }));
+  // cast x tensor to out_dtype
+  pten::Cast<T>(dev_ctx, x, out_dtype, x.dtype(), &tmp_tensor);
 
   // do reduce sum
   PD_VISIT_ALL_TYPES(
diff --git a/paddle/pten/kernels/hybird/math/cast_func.h b/paddle/pten/kernels/hybird/math/cast_func.h
deleted file mode 100644
index 0a67736dbb27b66eaf6daa482a0bbd4f07db892b..0000000000000000000000000000000000000000
--- a/paddle/pten/kernels/hybird/math/cast_func.h
+++ /dev/null
@@ -1,48 +0,0 @@
-// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
-//
-// Licensed under the Apache License, Version 2.0 (the "License");
-// you may not use this file except in compliance with the License.
-// You may obtain a copy of the License at
-//
-//     http://www.apache.org/licenses/LICENSE-2.0
-//
-// Unless required by applicable law or agreed to in writing, software
-// distributed under the License is distributed on an "AS IS" BASIS,
-// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-// See the License for the specific language governing permissions and
-// limitations under the License.
-
-#pragma once
-
-#include "paddle/fluid/platform/transform.h"
-#include "paddle/pten/core/dense_tensor.h"
-
-namespace pten {
-namespace math {
-
-template <typename InT, typename OutT>
-struct CastOpTransformFunctor {
-  HOSTDEVICE OutT operator()(InT in) const { return static_cast<OutT>(in); }
-};
-
-template <typename DeviceContext, typename InT, typename OutT>
-void CastKernelImpl(const DeviceContext& dev_ctx,
-                    const DenseTensor& x,
-                    DenseTensor* out) {
-  auto* in_begin = x.data<InT>();
-  auto numel = x.numel();
-  auto* in_end = in_begin + numel;
-
-  auto* out_begin = out->mutable_data<OutT>();
-
-  paddle::platform::Transform<DeviceContext> trans;
-  trans(dev_ctx,
-        in_begin,
-        in_end,
-        out_begin,
-        CastOpTransformFunctor<InT, OutT>());
-}
-
-}  // namespace math
-
-}  // namespace pten
diff --git a/paddle/pten/kernels/hybird/transpose.cu b/paddle/pten/kernels/hybird/transpose.cu
index 195277c216fe9127f3664c543c7a7e69fac93e46..6ea5e36e106d54fba6e72ce9edfd2fb74f4ed813 100644
--- a/paddle/pten/kernels/hybird/transpose.cu
+++ b/paddle/pten/kernels/hybird/transpose.cu
@@ -16,7 +16,6 @@
 #include "paddle/fluid/memory/memcpy.h"
 #include "paddle/pten/backends/gpu/gpu_context.h"
 #include "paddle/pten/core/dense_tensor.h"
-#include "paddle/pten/kernels/hybird/math/cast_func.h"
 #include "paddle/pten/kernels/hybird/transpose.h"
 
 // See Note [ Why still include the fluid headers? ]
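
Usage sketch (editor's note, not part of the diff): after this change, callers go through the single pten::Cast template declared in paddle/pten/kernels/cast_kernel.h instead of the per-backend Cast functions removed from cpu/manipulation.h and gpu/manipulation.h. The snippet below is a minimal illustration under stated assumptions: the CastFloatToDouble helper and the float-to-double conversion are invented for the example, and the explicit Cast<T> call mirrors the call sites visible in reduce_impl.h above; only the Cast signature itself comes from cast_kernel.h.

#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/kernels/cast_kernel.h"

// Hypothetical helper: cast a float32 DenseTensor to float64 on CPU.
// T (the source element type) cannot be deduced from the arguments, so it is
// passed explicitly; ContextT is deduced from dev_ctx.
void CastFloatToDouble(const pten::CPUContext& dev_ctx,
                       const pten::DenseTensor& x,  // float32 input
                       pten::DenseTensor* out) {    // meta prepared for float64
  pten::Cast<float>(dev_ctx,
                    x,
                    pten::DataType::FLOAT64,  // out_dtype
                    pten::DataType::FLOAT32,  // in_dtype
                    out);
}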