Unverified commit 1fb734d7 authored by Chen Weihang, committed by GitHub

[PTen] Move cast kernel impl (#38382)

* rename the 'to' API to 'copy_to'

* revert needless change

* polish format
Parent 04527ee3
......@@ -21,12 +21,12 @@ limitations under the License. */
// file name of the kernel, and this header file will be removed
PT_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT);
PT_DECLARE_KERNEL(cast, CPU, ALL_LAYOUT);
PT_DECLARE_KERNEL(reshape, CPU, ALL_LAYOUT);
PT_DECLARE_KERNEL(mean, CPU, ALL_LAYOUT);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PT_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT);
PT_DECLARE_KERNEL(cast, GPU, ALL_LAYOUT);
PT_DECLARE_KERNEL(reshape, GPU, ALL_LAYOUT);
PT_DECLARE_KERNEL(mean, GPU, ALL_LAYOUT);
#endif
......
......@@ -17,6 +17,7 @@
// See Note: [ How do we organize the kernel directory ]
#include "paddle/pten/api/lib/utils/storage.h"
#include "paddle/pten/include/infermeta.h"
#include "paddle/pten/kernels/cast_kernel.h"
#include "paddle/pten/kernels/cpu/manipulation.h"
#include "paddle/pten/kernels/flatten_kernel.h"
#include "paddle/pten/kernels/gpu/manipulation.h"
......@@ -48,7 +49,7 @@ DenseTensor Cast(const ContextT& dev_ctx,
pten::make_intrusive<paddle::experimental::SharedStorage>(
dev_ctx.GetPlace()),
std::move(out_meta));
Cast<T>(dev_ctx, x, out_dtype, in_dtype, &dense_out);
Cast<T, ContextT>(dev_ctx, x, out_dtype, in_dtype, &dense_out);
return dense_out;
}
......
/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/pten/core/dense_tensor.h"
namespace pten {
template <typename T, typename ContextT>
void Cast(const ContextT& dev_ctx,
const DenseTensor& x,
DataType out_dtype,
DataType in_dtype,
DenseTensor* out);
} // namespace pten
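The new cast_kernel.h above only declares the templated kernel; the per-backend implementations and registrations appear further down in this diff. A minimal calling sketch, not part of the commit, might look like the following (hypothetical helper name; assumes the usual DataType::FLOAT32/INT32 enum values and an output tensor that already carries the meta produced by CastInferMeta, as in the DenseTensor Cast wrapper shown earlier):
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/kernels/cast_kernel.h"
// Hypothetical helper, not part of this commit: cast a float tensor to int32.
void CastFloatToInt32(const pten::CPUContext& dev_ctx,
                      const pten::DenseTensor& x,
                      pten::DenseTensor* out) {
  // The output element type is chosen at runtime from out_dtype inside the
  // kernel via PD_VISIT_ALL_TYPES; T only fixes the input element type.
  pten::Cast<float, pten::CPUContext>(
      dev_ctx, x, pten::DataType::INT32, pten::DataType::FLOAT32, out);
}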
cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function blas pten_transpose_cpu)
cc_library(math_cpu SRCS math.cc DEPS dense_tensor kernel_context kernel_factory eigen_function blas pten_transpose_cpu cast_kernel)
cc_library(linalg_cpu SRCS linalg.cc DEPS dense_tensor kernel_context kernel_factory)
cc_library(manipulation_cpu SRCS manipulation.cc DEPS dense_tensor kernel_context kernel_factory copy_kernel unary)
......@@ -12,21 +12,24 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/pten/kernels/cast_kernel.h"
#include "paddle/pten/api/ext/dispatch.h"
#include "paddle/pten/backends/cpu/cpu_context.h"
#include "paddle/pten/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/transform.h"
#include "paddle/pten/core/dense_tensor.h"
namespace pten {
namespace math {
template <typename InT, typename OutT>
struct CastOpTransformFunctor {
HOSTDEVICE OutT operator()(InT in) const { return static_cast<OutT>(in); }
};
template <typename DeviceContext, typename InT, typename OutT>
void CastKernelImpl(const DeviceContext& dev_ctx,
template <typename InT, typename OutT>
void CastKernelImpl(const CPUContext& dev_ctx,
const DenseTensor& x,
DenseTensor* out) {
auto* in_begin = x.data<InT>();
......@@ -35,7 +38,7 @@ void CastKernelImpl(const DeviceContext& dev_ctx,
auto* out_begin = out->mutable_data<OutT>();
paddle::platform::Transform<DeviceContext> trans;
paddle::platform::Transform<CPUContext> trans;
trans(dev_ctx,
in_begin,
in_end,
......@@ -43,6 +46,33 @@ void CastKernelImpl(const DeviceContext& dev_ctx,
CastOpTransformFunctor<InT, OutT>());
}
} // namespace math
template <typename T, typename ContextT>
void Cast(const ContextT& dev_ctx,
const DenseTensor& x,
DataType out_dtype,
DataType in_dtype,
DenseTensor* out) {
PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] {
CastKernelImpl<T, data_t>(dev_ctx, x, out);
}));
}
} // namespace pten
PT_REGISTER_CTX_KERNEL(cast,
CPU,
ALL_LAYOUT,
pten::Cast,
float,
double,
int,
int64_t,
int16_t,
bool,
uint8_t,
paddle::platform::float16,
paddle::platform::bfloat16,
paddle::platform::complex<float>,
paddle::platform::complex<double>) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
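For orientation, the pattern above (runtime dtype dispatch followed by an element-wise static_cast over the buffer) can be illustrated in freestanding C++. The sketch below is not PaddlePaddle code: std::transform stands in for paddle::platform::Transform, a plain switch stands in for the PD_VISIT_ALL_TYPES macro, and the names Dtype, CastRange, and SimpleCast are made up.
#include <algorithm>
#include <cstddef>
#include <cstdint>
enum class Dtype { kFloat32, kInt32 };
template <typename InT, typename OutT>
void CastRange(const InT* in, std::size_t n, OutT* out) {
  // Mirrors CastOpTransformFunctor: a per-element static_cast.
  std::transform(in, in + n, out,
                 [](InT v) { return static_cast<OutT>(v); });
}
template <typename InT>
void SimpleCast(const InT* in, std::size_t n, Dtype out_dtype, void* out) {
  // Mirrors the PD_VISIT_ALL_TYPES dispatch: map the runtime dtype to a
  // statically typed output buffer, then run the element-wise transform.
  switch (out_dtype) {
    case Dtype::kFloat32:
      CastRange(in, n, static_cast<float*>(out));
      break;
    case Dtype::kInt32:
      CastRange(in, n, static_cast<std::int32_t*>(out));
      break;
  }
}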
......@@ -17,7 +17,6 @@
#include "paddle/pten/infermeta/unary.h"
#include "paddle/pten/kernels/copy_kernel.h"
#include "paddle/pten/kernels/hybird/general/manipulation.h"
#include "paddle/pten/kernels/hybird/math/cast_func.h"
namespace pten {
......@@ -44,38 +43,8 @@ void ReshapeWithXShape(const CPUContext& dev_ctx,
Reshape(dev_ctx, x, shape, out);
}
template <typename T>
void Cast(const CPUContext& dev_ctx,
const DenseTensor& x,
DataType out_dtype,
DataType in_dtype,
DenseTensor* out) {
PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] {
math::CastKernelImpl<CPUContext, T, data_t>(
dev_ctx, x, out);
}));
}
} // namespace pten
PT_REGISTER_KERNEL(cast,
CPU,
ALL_LAYOUT,
pten::Cast,
float,
double,
int,
int64_t,
int16_t,
bool,
uint8_t,
paddle::platform::float16,
paddle::platform::bfloat16,
paddle::platform::complex<float>,
paddle::platform::complex<double>) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
PT_REGISTER_NO_TEMPLATE_KERNEL(
reshape, CPU, ALL_LAYOUT, pten::Reshape, ALL_DTYPE) {}
PT_REGISTER_NO_TEMPLATE_KERNEL(
......
......@@ -21,13 +21,6 @@ limitations under the License. */
namespace pten {
template <typename T>
void Cast(const CPUContext& dev_ctx,
const DenseTensor& x,
DataType out_dtype,
DataType in_dtype,
DenseTensor* out);
void Reshape(const CPUContext& dev_ctx,
const DenseTensor& x,
const ScalarArray& shape,
......
if(WITH_GPU)
nv_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu)
nv_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu cast_kernel)
nv_library(linalg_gpu SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
nv_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory copy_kernel unary)
elseif(WITH_ROCM)
hip_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu)
hip_library(math_gpu SRCS math.cu DEPS eigen_function dense_tensor convert_utils kernel_context kernel_factory pten_transpose_gpu cast_kernel)
hip_library(linalg_gpu SRCS linalg.cu DEPS eigen_function dense_tensor kernel_context kernel_factory)
hip_library(manipulation_gpu SRCS manipulation.cu DEPS dense_tensor kernel_context kernel_factory copy_kernel unary)
endif()
......@@ -13,15 +13,21 @@
// limitations under the License.
#pragma once
#include "paddle/fluid/platform/device/gpu/gpu_helper.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/pten/kernels/cast_kernel.h"
#include "paddle/pten/api/ext/dispatch.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/core/kernel_registry.h"
// See Note [ Why still include the fluid headers? ]
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/bfloat16.h"
#include "paddle/fluid/platform/device/gpu/gpu_helper.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/fluid/platform/float16.h"
namespace pten {
namespace detail {
template <typename InT, typename OutT, int VecSize>
__global__ void VecCastCUDAKernel(const InT* in, const int64_t N, OutT* out) {
......@@ -74,6 +80,41 @@ void CastCUDAKernelImpl(const GPUContext& dev_ctx,
}
}
} // namespace detail
template <typename T, typename ContextT>
void Cast(const ContextT& dev_ctx,
const DenseTensor& x,
DataType out_dtype,
DataType in_dtype,
DenseTensor* out) {
PD_VISIT_ALL_TYPES(out_dtype, "CastCUDAKernelImpl", ([&] {
CastCUDAKernelImpl<T, data_t>(dev_ctx, x, out);
}));
}
} // namespace pten
#define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) \
PT_REGISTER_CTX_KERNEL(cast, \
GPU, \
ALL_LAYOUT, \
pten::Cast, \
float, \
double, \
int, \
int64_t, \
int16_t, \
bool, \
uint8_t, \
paddle::platform::float16, \
paddle::platform::complex<float>, \
paddle::platform::complex<double>, \
##__VA_ARGS__) { \
kernel->OutputAt(0).SetDataType( \
paddle::experimental::DataType::UNDEFINED); \
}
#if !defined(PADDLE_WITH_HIP)
PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16)
#else
PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast)
#endif
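The GPU registration is wrapped in a variadic macro so that bfloat16 is appended to the dtype list only on CUDA builds, while the HIP branch keeps the base list. A stripped-down sketch of the same ##__VA_ARGS__ trick (made-up names; relies on the GNU-style comma-swallowing extension that the original macro also uses):
#include <iostream>
template <typename... Args>
void CountArgs(const char* name, Args... args) {
  std::cout << name << " registered with " << sizeof...(args) << " dtypes\n";
}
// Hypothetical macro: a fixed base argument list plus optional extras;
// ##__VA_ARGS__ drops the trailing comma when no extras are passed.
#define COUNT_DTYPES(name, ...) CountArgs(#name, 1.0f, 2.0, 3, ##__VA_ARGS__)
int main() {
  COUNT_DTYPES(cast_base);        // base list only, like the HIP branch
  COUNT_DTYPES(cast_cuda, 4.5L);  // base list plus one extra, like the CUDA branch
  return 0;
}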
......@@ -12,11 +12,10 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/pten/api/ext/dispatch.h"
#include "paddle/pten/kernels/gpu/manipulation.h"
#include "paddle/pten/infermeta/unary.h"
#include "paddle/pten/kernels/copy_kernel.h"
#include "paddle/pten/kernels/gpu/manipulation.h"
#include "paddle/pten/kernels/hybird/cuda/cast_kernel_impl.h"
#include "paddle/pten/kernels/hybird/general/manipulation.h"
namespace pten {
......@@ -44,47 +43,8 @@ void ReshapeWithXShape(const GPUContext& dev_ctx,
Reshape(dev_ctx, x, shape, out);
}
template <typename T>
void Cast(const GPUContext& dev_ctx,
const DenseTensor& x,
DataType out_dtype,
DataType in_dtype,
DenseTensor* out) {
PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] {
detail::CastCUDAKernelImpl<T, data_t>(dev_ctx, x, out);
}));
}
} // namespace pten
using float16 = paddle::platform::float16;
#define PTEN_REGISTER_CAST_CUDA_BASE_TYPE(op_name, ...) \
PT_REGISTER_KERNEL(cast, \
GPU, \
ALL_LAYOUT, \
pten::Cast, \
float, \
double, \
int, \
int64_t, \
int16_t, \
bool, \
uint8_t, \
paddle::platform::float16, \
paddle::platform::complex<float>, \
paddle::platform::complex<double>, \
##__VA_ARGS__) { \
kernel->OutputAt(0).SetDataType( \
paddle::experimental::DataType::UNDEFINED); \
}
#if !defined(PADDLE_WITH_HIP)
PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast, paddle::platform::bfloat16)
#else
PTEN_REGISTER_CAST_CUDA_BASE_TYPE(cast)
#endif
PT_REGISTER_NO_TEMPLATE_KERNEL(
reshape, GPU, ALL_LAYOUT, pten::Reshape, ALL_DTYPE) {}
PT_REGISTER_NO_TEMPLATE_KERNEL(
......
......@@ -24,13 +24,6 @@
namespace pten {
template <typename T>
void Cast(const GPUContext& dev_ctx,
const DenseTensor& x,
DataType out_dtype,
DataType in_dtype,
DenseTensor* out);
void Reshape(const GPUContext& dev_ctx,
const DenseTensor& x,
const ScalarArray& shape,
......
......@@ -41,8 +41,8 @@ namespace cub = hipcub;
#include "paddle/pten/api/ext/dispatch.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/kernels/cast_kernel.h"
#include "paddle/pten/kernels/copy_kernel.h"
#include "paddle/pten/kernels/hybird/math/cast_func.h"
// Reduce split or not, Whether to use ReduceHigherDim
#define REDUCE_SPLIT_BOUNDARY 512
......@@ -1112,12 +1112,7 @@ void TensorReduceFunctorImpl(const pten::DenseTensor& x,
AsyncCopy(x, y);
y->Resize(out_dims);
} else {
PD_VISIT_ALL_TYPES(
y->dtype(), "CastKernelImpl", ([&] {
pten::math::CastKernelImpl<paddle::platform::CUDADeviceContext,
Tx,
data_t>(*dev_ctx, x, y);
}));
pten::Cast<Tx>(*dev_ctx, x, y->dtype(), x.dtype(), y);
}
return;
}
......
......@@ -16,8 +16,9 @@
#include "paddle/fluid/platform/transform.h"
#include "paddle/pten/api/ext/dispatch.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/kernels/cast_kernel.h"
#include "paddle/pten/kernels/hybird/eigen/reduce.h"
#include "paddle/pten/kernels/hybird/math/cast_func.h"
namespace pten {
namespace general {
......@@ -57,11 +58,8 @@ void Reduce(const DeviceContext& dev_ctx,
pten::make_intrusive<paddle::experimental::SharedStorage>(x.place()),
pten::DenseTensorMeta(out_dtype, x.dims(), x.layout()));
// cast x tensor to out_dtype first
PD_VISIT_ALL_TYPES(out_dtype, "CastKernelImpl", ([&] {
math::CastKernelImpl<DeviceContext, T, data_t>(
dev_ctx, x, &tmp_tensor);
}));
// cast x tensor to out_dtype
pten::Cast<T, DeviceContext>(dev_ctx, x, out_dtype, x.dtype(), &tmp_tensor);
// do reduce sum
PD_VISIT_ALL_TYPES(
......
......@@ -16,7 +16,6 @@
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#include "paddle/pten/core/dense_tensor.h"
#include "paddle/pten/kernels/hybird/math/cast_func.h"
#include "paddle/pten/kernels/hybird/transpose.h"
// See Note [ Why still include the fluid headers? ]
......