Unverified commit a0f586bc, authored by YuanRisheng, committed by GitHub

[PTen]Separate origin Kernel and add Kernel for C++ API (#39002)

* add kernel for c++ api

* fix compile bugs

* fix kunlun compile bugs

* perfect cmake

* fix compile bugs when run ci-inference

* fix compile bugs

* add non-raw kernel for fluid op

* fix compile bugs

* fix compile bugs

* fix unit test bug
Parent 854a7ab3
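In short, this change keeps each original fluid-facing kernel under a `*_raw` name (still taking fluid-only arguments such as `axis` and `reduce_all`) and adds a slimmer kernel for the new C++ API that simply forwards to the raw one with default values. A minimal sketch of the pattern, condensed from the pten::AddKernel / pten::AddRawKernel pair added in this diff (not a new API, just a summary of what follows):

// New C++-API kernel: no fluid-specific broadcast axis argument.
// It forwards to the original ("raw") kernel, which keeps the axis parameter.
template <typename T, typename Context>
void AddKernel(const Context& dev_ctx,
               const DenseTensor& x,
               const DenseTensor& y,
               DenseTensor* out) {
  int axis = -1;  // -1 lets the raw kernel infer broadcasting from the dims
  AddRawKernel<T>(dev_ctx, x, y, axis, out);
}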
@@ -103,38 +103,55 @@ function(kernel_library TARGET)
  list(LENGTH gpu_srcs gpu_srcs_len)
  list(LENGTH xpu_srcs xpu_srcs_len)
- if (${common_srcs_len} GREATER 0)
-   # If the kernel has a device independent public implementation,
-   # we will use this implementation and will not adopt the implementation
-   # under specific devices
-   if (WITH_GPU)
-     nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-   elseif (WITH_ROCM)
-     hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-   else()
-     cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-   endif()
- else()
-   # If the kernel has a header file declaration, but no corresponding
-   # implementation can be found, this is not allowed
-   if (${cpu_srcs_len} EQUAL 0 AND ${gpu_srcs_len} EQUAL 0 AND
-       ${xpu_srcs_len} EQUAL 0)
-     message(FATAL_ERROR "Cannot find any implementation for ${TARGET}")
-   else()
-     if (WITH_GPU)
-       if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
-         nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-       endif()
-     elseif (WITH_ROCM)
-       if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
-         hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-       endif()
-     else()
-       if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
-         cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
-       endif()
-     endif()
-   endif()
- endif()
+ # Build Target according different src organization
+ if((${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR
+     ${xpu_srcs_len} GREATER 0) AND ${common_srcs_len} GREATER 0)
+   # If the common_srcs depends on specific device srcs, build target using this rule.
+   if (WITH_GPU)
+     if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
+       nv_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+       nv_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
+     endif()
+   elseif (WITH_ROCM)
+     if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
+       hip_library(${TARGET}_part SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+       hip_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
+     endif()
+   else()
+     if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
+       cc_library(${TARGET}_part SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+       cc_library(${TARGET} SRCS ${common_srcs} DEPS ${TARGET}_part)
+     endif()
+   endif()
+ elseif (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
+   if (WITH_GPU)
+     if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
+       nv_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+     endif()
+   elseif (WITH_ROCM)
+     if (${cpu_srcs_len} GREATER 0 OR ${gpu_srcs_len} GREATER 0)
+       hip_library(${TARGET} SRCS ${cpu_srcs} ${gpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+     endif()
+   else()
+     if (${cpu_srcs_len} GREATER 0 OR ${xpu_srcs_len} GREATER 0)
+       cc_library(${TARGET} SRCS ${cpu_srcs} ${xpu_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+     endif()
+   endif()
+ else()
+   if (${common_srcs_len} EQUAL 0)
+     message(FATAL_ERROR "Cannot find any implementation for ${TARGET}")
+   else()
+     # If the kernel has a device independent public implementation,
+     # we will use this implementation and will not adopt the implementation
+     # under specific devices
+     if (WITH_GPU)
+       nv_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+     elseif (WITH_ROCM)
+       hip_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+     else()
+       cc_library(${TARGET} SRCS ${common_srcs} DEPS ${kernel_library_DEPS} ${kernel_deps})
+     endif()
+   endif()
+ endif()
  if (${common_srcs_len} GREATER 0 OR ${cpu_srcs_len} GREATER 0 OR
...
@@ -202,7 +202,7 @@ class CholeskySolveGradKernel : public framework::OpKernel<T> {
  commonterm_for_range(commonterm_functor);
  commonterm_conj = helper.Transpose(commonterm_conj);
- pten::AddKernel<T>(
+ pten::AddRawKernel<T>(
      static_cast<const typename paddle::framework::ConvertToPtenContext<
          DeviceContext>::TYPE &>(dev_ctx),
      commonterm, commonterm_conj, -1, &commonterm);
...
@@ -61,7 +61,7 @@ class ElementwiseAddKernel : public framework::OpKernel<T> {
  auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
  auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
  auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
- pten::AddKernel<T>(
+ pten::AddRawKernel<T>(
      static_cast<const typename framework::ConvertToPtenContext<
          DeviceContext>::TYPE &>(dev_ctx),
      *pt_x.get(), *pt_y.get(), axis, pt_z.get());
...
@@ -51,7 +51,7 @@ class ElementwiseDivKernel : public framework::OpKernel<T> {
  auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
  auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
  auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
- pten::DivideKernel<T>(
+ pten::DivideRawKernel<T>(
      static_cast<const typename framework::ConvertToPtenContext<
          DeviceContext>::TYPE&>(dev_ctx),
      *pt_x.get(), *pt_y.get(), axis, pt_z.get());
...
@@ -51,8 +51,8 @@ class ElementwiseMulKernel<platform::CUDADeviceContext, T>
  auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
  auto pt_y = paddle::experimental::MakePtenDenseTensor(*y_lod);
  auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
- pten::MultiplyKernel<T>(cuda_ctx, *pt_x.get(), *pt_y.get(), axis,
-                         pt_z.get());
+ pten::MultiplyRawKernel<T>(cuda_ctx, *pt_x.get(), *pt_y.get(), axis,
+                            pt_z.get());
  } else {
    PADDLE_THROW(platform::errors::InvalidArgument(
        "X's type[%s] is not supported by elementwise_op. X's type should be "
...
@@ -124,7 +124,7 @@ class ElementwiseMulKernel : public framework::OpKernel<T> {
  auto pt_x = paddle::experimental::MakePtenDenseTensor(*x_lod);
  auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
  auto pt_z = paddle::experimental::MakePtenDenseTensor(*z_lod);
- pten::MultiplyKernel<T>(
+ pten::MultiplyRawKernel<T>(
      static_cast<const typename framework::ConvertToPtenContext<
          DeviceContext>::TYPE&>(dev_ctx),
      *pt_x.get(), *pt_y.get(), axis, pt_z.get());
...
@@ -140,26 +140,42 @@ class ElementwiseOp : public framework::OperatorWithKernel {
  framework::KernelSignature GetExpectedPtenKernelArgs(
      const framework::ExecutionContext &ctx) const override {
+   int axis = ctx.Attr<int>("axis");
    if (Type() == "elementwise_add") {
      if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
-       return framework::KernelSignature("add", {"X", "Y"}, {"axis"}, {"Out"});
+       if (axis == -1) {
+         return framework::KernelSignature("add", {"X", "Y"}, {}, {"Out"});
+       }
+       return framework::KernelSignature("add_raw", {"X", "Y"}, {"axis"},
+                                         {"Out"});
      }
    }
    if (Type() == "elementwise_sub") {
      if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
-       return framework::KernelSignature("subtract", {"X", "Y"}, {"axis"},
-                                         {"Out"});
+       if (axis == -1) {
+         return framework::KernelSignature("subtract", {"X", "Y"}, {},
+                                           {"Out"});
+       }
+       return framework::KernelSignature("subtract_raw", {"X", "Y"}, {"axis"},
+                                         {"Out"});
      }
    }
    if (Type() == "elementwise_div") {
      if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
-       return framework::KernelSignature("divide", {"X", "Y"}, {"axis"},
-                                         {"Out"});
+       if (axis == -1) {
+         return framework::KernelSignature("divide", {"X", "Y"}, {}, {"Out"});
+       }
+       return framework::KernelSignature("divide_raw", {"X", "Y"}, {"axis"},
+                                         {"Out"});
      }
    }
    if (Type() == "elementwise_mul") {
      if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
-       return framework::KernelSignature("multiply", {"X", "Y"}, {"axis"},
-                                         {"Out"});
+       if (axis == -1) {
+         return framework::KernelSignature("multiply", {"X", "Y"}, {},
+                                           {"Out"});
+       }
+       return framework::KernelSignature("multiply_raw", {"X", "Y"}, {"axis"},
+                                         {"Out"});
      }
    }
...
@@ -51,7 +51,7 @@ class ElementwiseSubKernel : public framework::OpKernel<T> {
  auto pt_x = paddle::experimental::MakePtenDenseTensor(*x);
  auto pt_y = paddle::experimental::MakePtenDenseTensor(*y);
  auto pt_z = paddle::experimental::MakePtenDenseTensor(*z);
- pten::SubtractKernel<T>(
+ pten::SubtractRawKernel<T>(
      static_cast<const typename framework::ConvertToPtenContext<
          DeviceContext>::TYPE&>(dev_ctx),
      *pt_x.get(), *pt_y.get(), axis, pt_z.get());
...
@@ -221,7 +221,7 @@ void Tensor_Add(const DeviceContext& dev_ctx, const framework::Tensor& src1,
  out->Resize(src1.dims());
  out->mutable_data<T>(dev_ctx.GetPlace());
- pten::AddKernel<
+ pten::AddRawKernel<
      T, typename paddle::framework::ConvertToPtenContext<DeviceContext>::TYPE>(
      static_cast<const typename paddle::framework::ConvertToPtenContext<
          DeviceContext>::TYPE&>(dev_ctx),
@@ -234,7 +234,7 @@ void Tensor_Sub(const DeviceContext& dev_ctx, const framework::Tensor& src1,
  out->Resize(src1.dims());
  out->mutable_data<T>(dev_ctx.GetPlace());
- pten::SubtractKernel<
+ pten::SubtractRawKernel<
      T, typename paddle::framework::ConvertToPtenContext<DeviceContext>::TYPE>(
      static_cast<const typename paddle::framework::ConvertToPtenContext<
          DeviceContext>::TYPE&>(dev_ctx),
...
@@ -551,17 +551,26 @@ class ReduceOp : public framework::OperatorWithKernel {
  framework::KernelSignature GetExpectedPtenKernelArgs(
      const framework::ExecutionContext& ctx) const override {
+   bool reduce_all = ctx.Attr<bool>("reduce_all");
    if (Type() == "reduce_sum") {
      if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
+       if (!reduce_all) {
+         return framework::KernelSignature(
+             "sum", {"X"}, {"dim", "keep_dim", "out_dtype"}, {"Out"});
+       }
        return framework::KernelSignature(
-           "sum", {"X"}, {"dim", "keep_dim", "reduce_all", "out_dtype"},
+           "sum_raw", {"X"}, {"dim", "keep_dim", "reduce_all", "out_dtype"},
            {"Out"});
      }
    }
    if (Type() == "reduce_mean") {
      if (ctx.InputVar("X")->IsType<framework::LoDTensor>()) {
+       if (!reduce_all) {
+         return framework::KernelSignature("mean", {"X"}, {"dim", "keep_dim"},
+                                           {"Out"});
+       }
        return framework::KernelSignature(
-           "mean", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
+           "mean_raw", {"X"}, {"dim", "keep_dim", "reduce_all"}, {"Out"});
      }
    }
    // TODO(chentianyu03): support other cases after selected rows added
...
@@ -30,7 +30,6 @@ using DeviceContext = paddle::platform::DeviceContext;
using add_kernel = void (*)(const DeviceContext&,
                            const DenseTensor&,
                            const DenseTensor&,
-                           int,
                            DenseTensor*);
using cast_kernel = void (*)(const DeviceContext&,
@@ -46,7 +45,6 @@ using concat_kernel = void (*)(const DeviceContext&,
using divide_kernel = void (*)(const DeviceContext&,
                               const DenseTensor&,
                               const DenseTensor&,
-                              int,
                               DenseTensor*);
using dot_kernel = void (*)(const DeviceContext&,
@@ -82,13 +80,11 @@ using mean_kernel = void (*)(const DeviceContext&,
                             const DenseTensor&,
                             const std::vector<int64_t>&,
                             bool,
-                            bool,
                             DenseTensor*);
using multiply_kernel = void (*)(const DeviceContext&,
                                 const DenseTensor&,
                                 const DenseTensor&,
-                                int,
                                 DenseTensor*);
using reshape_kernel = void (*)(const DeviceContext&,
@@ -107,14 +103,12 @@ using sum_kernel = void (*)(const DeviceContext&,
                            const DenseTensor&,
                            const std::vector<int64_t>&,
                            bool,
-                           bool,
                            DataType,
                            DenseTensor*);
using subtract_kernel = void (*)(const DeviceContext&,
                                 const DenseTensor&,
                                 const DenseTensor&,
-                                int,
                                 DenseTensor*);
using conj_kernel = void (*)(const DeviceContext&,
...
@@ -20,10 +20,10 @@ namespace pten {
// the key is kernel_name in fluid, the value is the kernel_name in pten
// the key is sorted by key's alphabet
const std::unordered_map<std::string, std::string> kernel_alias_name_map = {
-   {"elementwise_add", "add"},
-   {"elementwise_div", "divide"},
-   {"elementwise_mul", "muliply"},
-   {"elementwise_sub", "subtract"},
+   {"elementwise_add", "add_raw"},
+   {"elementwise_div", "divide_raw"},
+   {"elementwise_mul", "muliply_raw"},
+   {"elementwise_sub", "subtract_raw"},
    {"fill_any_like", "full_like"},
    {"fill_constant", "full"},
    {"flatten_contiguous_range", "flatten"},
@@ -32,8 +32,8 @@ const std::unordered_map<std::string, std::string> kernel_alias_name_map = {
    {"matmul_v2_grad", "matmul_grad"},
    {"matmul_v2_grad_grad", "matmul_double_grad"},
    {"matmul_v2_triple_grad", "matmul_triple_grad"},
-   {"reduce_mean", "mean"},
-   {"reduce_sum", "sum"},
+   {"reduce_mean", "mean_raw"},
+   {"reduce_sum", "sum_raw"},
    {"reshape2", "reshape"},
    {"reshape2_grad", "reshape_grad"},
    {"reshape2_grad_grad", "reshape_double_grad"},
...
@@ -32,11 +32,11 @@ namespace pten {
#define DEFINE_CPU_ELEMENTWISE_OP(name) \
  template <typename T, typename Context> \
- void name##Kernel(const Context& dev_ctx, \
+ void name##RawKernel(const Context& dev_ctx, \
      const DenseTensor& x, \
      const DenseTensor& y, \
      int axis, \
      DenseTensor* out) { \
    out->mutable_data<T>(); \
    if (x.dims() == y.dims()) { \
      SameDimsElementwiseCompute<SameDims##name##Functor<CPUContext, T>>()( \
@@ -55,23 +55,35 @@ namespace pten {
}
template <typename T, typename Context>
- void MeanKernel(const Context& dev_ctx,
+ void MeanRawKernel(const Context& dev_ctx,
      const DenseTensor& x,
      const std::vector<int64_t>& dims,
      bool keep_dim,
      bool reduce_all,
      DenseTensor* out) {
  auto out_dtype = x.dtype();
  pten::Reduce<CPUContext, T, pten::funcs::MeanFunctor>(
      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
template <typename T, typename Context>
- void DivideKernel(const Context& dev_ctx,
-     const DenseTensor& x,
-     const DenseTensor& y,
-     int axis,
-     DenseTensor* out) {
+ void SumRawKernel(const Context& dev_ctx,
+     const DenseTensor& x,
+     const std::vector<int64_t>& dims,
+     bool keep_dim,
+     bool reduce_all,
+     DataType out_dtype,
+     DenseTensor* out) {
+   pten::Reduce<CPUContext, T, pten::funcs::SumFunctor>(
+       dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+ }
+
+ template <typename T, typename Context>
+ void DivideRawKernel(const Context& dev_ctx,
+     const DenseTensor& x,
+     const DenseTensor& y,
+     int axis,
+     DenseTensor* out) {
  // allocate memory for out
  out->mutable_data<T>();
  if (x.dims() == y.dims() && std::is_floating_point<T>::value) {
@@ -90,18 +102,6 @@ void DivideKernel(const Context& dev_ctx,
  }
}
- template <typename T, typename Context>
- void SumKernel(const Context& dev_ctx,
-     const DenseTensor& x,
-     const std::vector<int64_t>& dims,
-     bool keep_dim,
-     bool reduce_all,
-     DataType out_dtype,
-     DenseTensor* out) {
-   pten::Reduce<CPUContext, T, pten::funcs::SumFunctor>(
-       dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
- }
// Create the definition of Add
DEFINE_CPU_ELEMENTWISE_OP(Add)
@@ -118,42 +118,40 @@ using complex128 = ::paddle::platform::complex<double>;
// NOTE(chenweihang): using bfloat16 will cause redefine with xpu bfloat16
// using bfloat16 = ::paddle::platform::bfloat16;
- PT_REGISTER_KERNEL(
-     mean, CPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool) {}
- PT_REGISTER_KERNEL(add,
+ PT_REGISTER_KERNEL(add_raw,
      CPU,
      ALL_LAYOUT,
-     pten::AddKernel,
+     pten::AddRawKernel,
      float,
      double,
      int,
      int64_t,
      complex64,
      complex128) {}
- PT_REGISTER_KERNEL(subtract,
+ PT_REGISTER_KERNEL(subtract_raw,
      CPU,
      ALL_LAYOUT,
-     pten::SubtractKernel,
+     pten::SubtractRawKernel,
      float,
      double,
      int,
      int64_t,
      complex64,
      complex128) {}
- PT_REGISTER_KERNEL(divide,
+ PT_REGISTER_KERNEL(divide_raw,
      CPU,
      ALL_LAYOUT,
-     pten::DivideKernel,
+     pten::DivideRawKernel,
      float,
      double,
      int,
      int64_t,
      complex64,
      complex128) {}
- PT_REGISTER_KERNEL(multiply,
+ PT_REGISTER_KERNEL(multiply_raw,
      CPU,
      ALL_LAYOUT,
-     pten::MultiplyKernel,
+     pten::MultiplyRawKernel,
      float,
      double,
      int,
@@ -161,10 +159,10 @@ PT_REGISTER_KERNEL(multiply,
      bool,
      complex64,
      complex128) {}
- PT_REGISTER_KERNEL(sum,
+ PT_REGISTER_KERNEL(sum_raw,
      CPU,
      ALL_LAYOUT,
-     pten::SumKernel,
+     pten::SumRawKernel,
      bool,
      float,
      double,
@@ -175,3 +173,5 @@ PT_REGISTER_KERNEL(sum,
      complex128) {
  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
+ PT_REGISTER_KERNEL(
+     mean_raw, CPU, ALL_LAYOUT, pten::MeanRawKernel, float, double, bool) {}
@@ -37,11 +37,11 @@ namespace pten {
#define DEFINE_CUDA_ELEMENTWISE_OP(name) \
  template <typename T, typename Context> \
- void name##Kernel(const Context& dev_ctx, \
+ void name##RawKernel(const Context& dev_ctx, \
      const DenseTensor& x, \
      const DenseTensor& y, \
      int axis, \
      DenseTensor* out) { \
    std::vector<const DenseTensor*> inputs; \
    std::vector<DenseTensor*> outputs; \
    inputs.emplace_back(&x); \
@@ -57,17 +57,29 @@ namespace pten {
*/
template <typename T, typename Context>
- void MeanKernel(const Context& dev_ctx,
+ void MeanRawKernel(const Context& dev_ctx,
      const DenseTensor& x,
      const std::vector<int64_t>& dims,
      bool keep_dim,
      bool reduce_all,
      DenseTensor* out) {
  auto out_dtype = x.dtype();
  pten::Reduce<T, kps::AddFunctor, kps::DivideFunctor>(
      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
}
+ template <typename T, typename Context>
+ void SumRawKernel(const Context& dev_ctx,
+     const DenseTensor& x,
+     const std::vector<int64_t>& dims,
+     bool keep_dim,
+     bool reduce_all,
+     DataType out_dtype,
+     DenseTensor* out) {
+   pten::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
+       dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+ }
// Create the definition of Add
DEFINE_CUDA_ELEMENTWISE_OP(Add)
// Create the definition of Subtract
@@ -77,30 +89,16 @@ DEFINE_CUDA_ELEMENTWISE_OP(Multiply)
// Create the definition of Divide
DEFINE_CUDA_ELEMENTWISE_OP(Divide)
- template <typename T, typename Context>
- void SumKernel(const Context& dev_ctx,
-     const DenseTensor& x,
-     const std::vector<int64_t>& dims,
-     bool keep_dim,
-     bool reduce_all,
-     DataType out_dtype,
-     DenseTensor* out) {
-   pten::Reduce<T, kps::AddFunctor, kps::IdentityFunctor>(
-       dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
- }
}  // namespace pten
using float16 = paddle::platform::float16;
using complex64 = ::paddle::platform::complex<float>;
using complex128 = ::paddle::platform::complex<double>;
- PT_REGISTER_KERNEL(
-     mean, GPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool, float16) {}
- PT_REGISTER_KERNEL(add,
+ PT_REGISTER_KERNEL(add_raw,
      GPU,
      ALL_LAYOUT,
-     pten::AddKernel,
+     pten::AddRawKernel,
      float,
      double,
      int,
@@ -108,10 +106,10 @@ PT_REGISTER_KERNEL(add,
      float16,
      complex64,
      complex128) {}
- PT_REGISTER_KERNEL(subtract,
+ PT_REGISTER_KERNEL(subtract_raw,
      GPU,
      ALL_LAYOUT,
-     pten::SubtractKernel,
+     pten::SubtractRawKernel,
      float,
      double,
      int,
@@ -119,10 +117,10 @@ PT_REGISTER_KERNEL(subtract,
      float16,
      complex64,
      complex128) {}
- PT_REGISTER_KERNEL(divide,
+ PT_REGISTER_KERNEL(divide_raw,
      GPU,
      ALL_LAYOUT,
-     pten::DivideKernel,
+     pten::DivideRawKernel,
      float,
      double,
      int,
@@ -130,10 +128,10 @@ PT_REGISTER_KERNEL(divide,
      float16,
      complex64,
      complex128) {}
- PT_REGISTER_KERNEL(multiply,
+ PT_REGISTER_KERNEL(multiply_raw,
      GPU,
      ALL_LAYOUT,
-     pten::MultiplyKernel,
+     pten::MultiplyRawKernel,
      float,
      double,
      int,
@@ -142,10 +140,10 @@ PT_REGISTER_KERNEL(multiply,
      float16,
      complex64,
      complex128) {}
- PT_REGISTER_KERNEL(sum,
+ PT_REGISTER_KERNEL(sum_raw,
      GPU,
      ALL_LAYOUT,
-     pten::SumKernel,
+     pten::SumRawKernel,
      bool,
      float,
      double,
@@ -156,3 +154,12 @@ PT_REGISTER_KERNEL(sum,
      complex128) {
  kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
+ PT_REGISTER_KERNEL(mean_raw,
+     GPU,
+     ALL_LAYOUT,
+     pten::MeanRawKernel,
+     float,
+     double,
+     bool,
+     float16) {}
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/pten/kernels/math_kernel.h"
#include "paddle/pten/backends/all_context.h"
#include "paddle/pten/core/kernel_registry.h"
namespace pten {
template <typename T, typename Context>
void MeanKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DenseTensor* out) {
bool reduce_all = false;
MeanRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out);
}
template <typename T, typename Context>
void SumKernel(const Context& dev_ctx,
const DenseTensor& x,
const std::vector<int64_t>& dims,
bool keep_dim,
DataType out_dtype,
DenseTensor* out) {
bool reduce_all = false;
SumRawKernel<T>(dev_ctx, x, dims, keep_dim, reduce_all, out_dtype, out);
}
template <typename T, typename Context>
void AddKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
AddRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void SubtractKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
SubtractRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void DivideKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
DivideRawKernel<T>(dev_ctx, x, y, axis, out);
}
template <typename T, typename Context>
void MultiplyKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
int axis = -1;
MultiplyRawKernel<T>(dev_ctx, x, y, axis, out);
}
} // namespace pten
using complex64 = ::paddle::platform::complex<float>;
using complex128 = ::paddle::platform::complex<double>;
PT_REGISTER_KERNEL(
mean, CPU, ALL_LAYOUT, pten::MeanKernel, float, double, bool) {}
PT_REGISTER_KERNEL(sum,
CPU,
ALL_LAYOUT,
pten::SumKernel,
bool,
float,
double,
paddle::platform::float16,
int,
int64_t,
complex64,
complex128) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
PT_REGISTER_KERNEL(add,
CPU,
ALL_LAYOUT,
pten::AddKernel,
float,
double,
int,
int64_t,
complex64,
complex128) {}
PT_REGISTER_KERNEL(subtract,
CPU,
ALL_LAYOUT,
pten::SubtractKernel,
float,
double,
int,
int64_t,
complex64,
complex128) {}
PT_REGISTER_KERNEL(divide,
CPU,
ALL_LAYOUT,
pten::DivideKernel,
float,
double,
int,
int64_t,
complex64,
complex128) {}
PT_REGISTER_KERNEL(multiply,
CPU,
ALL_LAYOUT,
pten::MultiplyKernel,
float,
double,
int,
int64_t,
bool,
complex64,
complex128) {}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
PT_REGISTER_KERNEL(mean,
GPU,
ALL_LAYOUT,
pten::MeanKernel,
float,
double,
bool,
paddle::platform::float16) {}
PT_REGISTER_KERNEL(sum,
GPU,
ALL_LAYOUT,
pten::SumKernel,
bool,
float,
double,
paddle::platform::float16,
int,
int64_t,
complex64,
complex128) {
kernel->OutputAt(0).SetDataType(paddle::experimental::DataType::UNDEFINED);
}
PT_REGISTER_KERNEL(add,
GPU,
ALL_LAYOUT,
pten::AddKernel,
float,
double,
int,
int64_t,
paddle::platform::float16,
complex64,
complex128) {}
PT_REGISTER_KERNEL(subtract,
GPU,
ALL_LAYOUT,
pten::SubtractKernel,
float,
double,
int,
int64_t,
paddle::platform::float16,
complex64,
complex128) {}
PT_REGISTER_KERNEL(divide,
GPU,
ALL_LAYOUT,
pten::DivideKernel,
float,
double,
int,
int64_t,
paddle::platform::float16,
complex64,
complex128) {}
PT_REGISTER_KERNEL(multiply,
GPU,
ALL_LAYOUT,
pten::MultiplyKernel,
float,
double,
int,
int64_t,
bool,
paddle::platform::float16,
complex64,
complex128) {}
#endif
@@ -22,104 +22,127 @@ limitations under the License. */
namespace pten {
+ template <typename T, typename Context>
+ void MeanRawKernel(const Context& dev_ctx,
+     const DenseTensor& x,
+     const std::vector<int64_t>& dims,
+     bool keep_dim,
+     bool reduce_all,
+     DenseTensor* out);
+
template <typename T, typename Context>
void MeanKernel(const Context& dev_ctx,
    const DenseTensor& x,
    const std::vector<int64_t>& dims,
    bool keep_dim,
-   bool reduce_all,
    DenseTensor* out);
+ template <typename T, typename Context>
+ void SumRawKernel(const Context& dev_ctx,
+     const DenseTensor& x,
+     const std::vector<int64_t>& dims,
+     bool keep_dim,
+     bool reduce_all,
+     DataType out_dtype,
+     DenseTensor* out);
+
+ template <typename T, typename Context>
+ void SumKernel(const Context& dev_ctx,
+     const DenseTensor& x,
+     const std::vector<int64_t>& dims,
+     bool keep_dim,
+     DataType out_dtype,
+     DenseTensor* out);
+
+ template <typename T, typename Context>
+ void AddRawKernel(const Context& dev_ctx,
+     const DenseTensor& x,
+     const DenseTensor& y,
+     int axis,
+     DenseTensor* out);
+
template <typename T, typename Context>
void AddKernel(const Context& dev_ctx,
    const DenseTensor& x,
    const DenseTensor& y,
-   int axis,
    DenseTensor* out);
+ template <typename T, typename Context>
+ void SubtractRawKernel(const Context& dev_ctx,
+     const DenseTensor& x,
+     const DenseTensor& y,
+     int axis,
+     DenseTensor* out);
+
template <typename T, typename Context>
void SubtractKernel(const Context& dev_ctx,
    const DenseTensor& x,
    const DenseTensor& y,
-   int axis,
    DenseTensor* out);
+ template <typename T, typename Context>
+ void DivideRawKernel(const Context& dev_ctx,
+     const DenseTensor& x,
+     const DenseTensor& y,
+     int axis,
+     DenseTensor* out);
+
template <typename T, typename Context>
void DivideKernel(const Context& dev_ctx,
    const DenseTensor& x,
    const DenseTensor& y,
-   int axis,
    DenseTensor* out);
+ template <typename T, typename Context>
+ void MultiplyRawKernel(const Context& dev_ctx,
+     const DenseTensor& x,
+     const DenseTensor& y,
+     int axis,
+     DenseTensor* out);
+
template <typename T, typename Context>
void MultiplyKernel(const Context& dev_ctx,
    const DenseTensor& x,
    const DenseTensor& y,
-   int axis,
    DenseTensor* out);
- template <typename T, typename Context>
- void SumKernel(const Context& dev_ctx,
-     const DenseTensor& x,
-     const std::vector<int64_t>& dims,
-     bool keep_dim,
-     bool reduce_all,
-     DataType out_dtype,
-     DenseTensor* out);
template <typename T, typename Context>
DenseTensor Add(const Context& dev_ctx,
    const DenseTensor& x,
-   const DenseTensor& y,
-   int axis) {
-   auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
-   pten::DenseTensor dense_out(
-       pten::make_intrusive<paddle::experimental::SharedStorage>(
-           dev_ctx.GetPlace()),
-       std::move(out_meta));
-   AddKernel<T, Context>(dev_ctx, x, y, axis, &dense_out);
+   const DenseTensor& y) {
+   auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1);
+   auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
+   AddKernel<T, Context>(dev_ctx, x, y, &dense_out);
    return dense_out;
}
template <typename T, typename Context>
DenseTensor Subtract(const Context& dev_ctx,
    const DenseTensor& x,
-   const DenseTensor& y,
-   int axis) {
-   auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
-   pten::DenseTensor dense_out(
-       pten::make_intrusive<paddle::experimental::SharedStorage>(
-           dev_ctx.GetPlace()),
-       std::move(out_meta));
-   SubtractKernel<T, Context>(dev_ctx, x, y, axis, &dense_out);
+   const DenseTensor& y) {
+   auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1);
+   auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
+   SubtractKernel<T, Context>(dev_ctx, x, y, &dense_out);
    return dense_out;
}
template <typename T, typename Context>
DenseTensor Divide(const Context& dev_ctx,
    const DenseTensor& x,
-   const DenseTensor& y,
-   int axis) {
-   auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
-   pten::DenseTensor dense_out(
-       pten::make_intrusive<paddle::experimental::SharedStorage>(
-           dev_ctx.GetPlace()),
-       std::move(out_meta));
-   DivideKernel<T, Context>(dev_ctx, x, y, axis, &dense_out);
+   const DenseTensor& y) {
+   auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1);
+   auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
+   DivideKernel<T, Context>(dev_ctx, x, y, &dense_out);
    return dense_out;
}
template <typename T, typename Context>
DenseTensor Multiply(const Context& dev_ctx,
    const DenseTensor& x,
-   const DenseTensor& y,
-   int axis) {
-   auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), axis);
-   pten::DenseTensor dense_out(
-       pten::make_intrusive<paddle::experimental::SharedStorage>(
-           dev_ctx.GetPlace()),
-       std::move(out_meta));
-   MultiplyKernel<T, Context>(dev_ctx, x, y, axis, &dense_out);
+   const DenseTensor& y) {
+   auto out_meta = ElementwiseInferMeta(x.meta(), y.meta(), -1);
+   auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
+   MultiplyKernel<T, Context>(dev_ctx, x, y, &dense_out);
    return dense_out;
}
@@ -130,8 +153,7 @@ DenseTensor Mean(const Context& dev_ctx,
    bool keep_dim) {
  auto out_meta = ReduceInferMeta(x.meta(), axis, keep_dim);
  auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
- bool reduce_all = false;
- MeanKernel<T, Context>(dev_ctx, x, axis, keep_dim, reduce_all, &dense_out);
+ MeanKernel<T, Context>(dev_ctx, x, axis, keep_dim, &dense_out);
  return dense_out;
}
@@ -144,12 +166,7 @@ DenseTensor Sum(const Context& dev_ctx,
  auto out_meta = ReduceInferMeta(x.meta(), axis, keep_dim, dtype);
  auto dense_out = pten::Empty<T, Context>(dev_ctx, std::move(out_meta));
- // The real value of reduce_all will be get in kernel
- // so use default value(false) is OK.
- bool reduce_all = false;
- SumKernel<T, Context>(
-     dev_ctx, x, axis, keep_dim, reduce_all, out_meta.dtype, &dense_out);
+ SumKernel<T, Context>(dev_ctx, x, axis, keep_dim, dtype, &dense_out);
  return dense_out;
}
...
@@ -54,11 +54,10 @@ TEST(DEV_API, add) {
  for (size_t i = 0; i < 10; ++i) {
    dense_y_data[i] = i * 2.0;
  }
- int axis = 1;
  // 2. test API
  pten::CPUContext dev_ctx;
- auto dense_out = pten::Add<float>(dev_ctx, dense_x, dense_y, axis);
+ auto dense_out = pten::Add<float>(dev_ctx, dense_x, dense_y);
  // 3. check result
  ASSERT_EQ(dense_out.dims().size(), 2);
@@ -101,11 +100,10 @@ TEST(DEV_API, subtract) {
  for (size_t i = 0; i < 10; ++i) {
    dense_y_data[i] = i * 2.0;
  }
- int axis = 1;
  // 2. test API
  pten::CPUContext dev_ctx;
- auto dense_out = pten::Subtract<float>(dev_ctx, dense_x, dense_y, axis);
+ auto dense_out = pten::Subtract<float>(dev_ctx, dense_x, dense_y);
  // 3. check result
  ASSERT_EQ(dense_out.dims().size(), 2);
@@ -148,11 +146,10 @@ TEST(DEV_API, divide) {
  for (size_t i = 0; i < 10; ++i) {
    dense_y_data[i] = i * 2.0 + 1;
  }
- int axis = 1;
  // 2. test API
  pten::CPUContext dev_ctx;
- auto dense_out = pten::Divide<float>(dev_ctx, dense_x, dense_y, axis);
+ auto dense_out = pten::Divide<float>(dev_ctx, dense_x, dense_y);
  // 3. check result
  ASSERT_EQ(dense_out.dims().size(), 2);
@@ -195,11 +192,10 @@ TEST(DEV_API, multiply) {
  for (size_t i = 0; i < 10; ++i) {
    dense_y_data[i] = i * 2.0;
  }
- int axis = 1;
  // 2. test API
  pten::CPUContext dev_ctx;
- auto dense_out = pten::Multiply<float>(dev_ctx, dense_x, dense_y, axis);
+ auto dense_out = pten::Multiply<float>(dev_ctx, dense_x, dense_y);
  // 3. check result
  ASSERT_EQ(dense_out.dims().size(), 2);
...
@@ -6,7 +6,6 @@
    param : [x, y, -1]
  kernel :
    func : add
-   param : [x, y, -1]

- api : cast
  args : (const Tensor& x, DataType out_dtype)
@@ -44,7 +43,6 @@
    param : [x, y, -1]
  kernel :
    func : divide
-   param : [x, y, -1]

- api : dot
  args : (const Tensor& x, const Tensor& y)
@@ -130,7 +128,6 @@
    param: [x, axis, keep_dim]
  kernel :
    func : mean
-   param : [x, axis, keep_dim, false]

- api : multiply
  args : (const Tensor& x, const Tensor& y)
@@ -140,7 +137,6 @@
    param : [x, y, -1]
  kernel :
    func : multiply
-   param : [x, y, -1]

- api : ones_like
  args : (const Tensor& x, DataType dtype=DataType::UNDEFINED, Backend place=Backend::UNDEFINED, DataLayout layout=DataLayout::UNDEFINED)
@@ -172,7 +168,6 @@
    param : [x, y, -1]
  kernel :
    func : subtract
-   param : [x, y, -1]

- api : sum
  args : (const Tensor& x, const std::vector<int64_t>& axis={}, DataType dtype=DataType::UNDEFINED, bool keep_dim=false)
@@ -182,7 +177,7 @@
    param: [x, axis, keep_dim, dtype]
  kernel :
    func : sum
-   param : [x, axis, keep_dim, false, DataType::UNDEFINED]
+   param : [x, axis, keep_dim, dtype]
    data_type : x

- api : zeros_like
...