From 0a4ffbc777700509908e2818de574d9588e2afad Mon Sep 17 00:00:00 2001 From: Chen Weihang Date: Wed, 22 Dec 2021 22:01:26 -0600 Subject: [PATCH] [PTen] Move dot kernel impl (#38359) * move dot kernel impl * remove needless cmake items --- paddle/pten/api/lib/kernel_declare.h | 4 +- paddle/pten/include/linalg.h | 3 +- paddle/pten/kernels/cpu/dot_kernel.cc | 61 ++++++++++++++++++++++++ paddle/pten/kernels/cpu/linalg.cc | 33 ------------- paddle/pten/kernels/cpu/linalg.h | 6 --- paddle/pten/kernels/dot_kernel.h | 27 +++++++++++ paddle/pten/kernels/gpu/dot_kernel.cu | 64 ++++++++++++++++++++++++++ paddle/pten/kernels/gpu/linalg.cu | 20 -------- paddle/pten/kernels/gpu/linalg.h | 6 --- paddle/pten/kernels/hybird/eigen/dot.h | 50 -------------------- 10 files changed, 156 insertions(+), 118 deletions(-) create mode 100644 paddle/pten/kernels/cpu/dot_kernel.cc create mode 100644 paddle/pten/kernels/dot_kernel.h create mode 100644 paddle/pten/kernels/gpu/dot_kernel.cu delete mode 100644 paddle/pten/kernels/hybird/eigen/dot.h diff --git a/paddle/pten/api/lib/kernel_declare.h b/paddle/pten/api/lib/kernel_declare.h index 4f2160a761..6ca6495e2c 100644 --- a/paddle/pten/api/lib/kernel_declare.h +++ b/paddle/pten/api/lib/kernel_declare.h @@ -20,13 +20,13 @@ limitations under the License. */ // the kernel declare statement is automatically generated according to the // file name of the kernel, and this header file will be removed -PT_DECLARE_KERNEL(dot, CPU, ALL_LAYOUT); +PT_DECLARE_KERNEL(matmul, CPU, ALL_LAYOUT); PT_DECLARE_KERNEL(cast, CPU, ALL_LAYOUT); PT_DECLARE_KERNEL(sign, CPU, ALL_LAYOUT); PT_DECLARE_KERNEL(conj, CPU, ALL_LAYOUT); #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) -PT_DECLARE_KERNEL(dot, GPU, ALL_LAYOUT); +PT_DECLARE_KERNEL(matmul, GPU, ALL_LAYOUT); PT_DECLARE_KERNEL(cast, GPU, ALL_LAYOUT); PT_DECLARE_KERNEL(sign, GPU, ALL_LAYOUT); PT_DECLARE_KERNEL(conj, GPU, ALL_LAYOUT); diff --git a/paddle/pten/include/linalg.h b/paddle/pten/include/linalg.h index 8f627f5fc8..34b0183778 100644 --- a/paddle/pten/include/linalg.h +++ b/paddle/pten/include/linalg.h @@ -18,6 +18,7 @@ #include "paddle/pten/api/lib/utils/storage.h" #include "paddle/pten/include/infermeta.h" #include "paddle/pten/kernels/cpu/linalg.h" +#include "paddle/pten/kernels/dot_kernel.h" #include "paddle/pten/kernels/gpu/linalg.h" namespace pten { @@ -31,7 +32,7 @@ DenseTensor Dot(const ContextT& dev_ctx, pten::make_intrusive( dev_ctx.GetPlace()), std::move(out_meta)); - Dot(dev_ctx, x, y, &dense_out); + Dot(dev_ctx, x, y, &dense_out); return dense_out; } diff --git a/paddle/pten/kernels/cpu/dot_kernel.cc b/paddle/pten/kernels/cpu/dot_kernel.cc new file mode 100644 index 0000000000..c485cc8ac5 --- /dev/null +++ b/paddle/pten/kernels/cpu/dot_kernel.cc @@ -0,0 +1,61 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/dot_kernel.h" + +#include "paddle/pten/backends/cpu/cpu_context.h" +#include "paddle/pten/core/kernel_registry.h" + +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/platform/complex.h" + +namespace pten { + +template +void Dot(const ContextT& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + auto const *x_ptr = x.data(), *x_ptr_ = &x_ptr[0]; + auto const *y_ptr = y.data(), *y_ptr_ = &y_ptr[0]; + auto* z = out->mutable_data(); + + // Loop over the total N elements of both operands while sum-reducing every + // B pairs along the way where B is the dimension of the least ordered axis + auto&& d = x.dims(); + auto const N = x.numel(); + auto const B = d[d.size() - 1]; + + for (int j = 0; j < N / B; j++) { + T ss = 0; + for (int i = 0; i < B; i++) ss += (*x_ptr_++) * (*y_ptr_++); + z[j] = ss; + } +} + +} // namespace pten + +using complex64 = ::paddle::platform::complex; +using complex128 = ::paddle::platform::complex; + +PT_REGISTER_CTX_KERNEL(dot, + CPU, + ALL_LAYOUT, + pten::Dot, + float, + double, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/pten/kernels/cpu/linalg.cc b/paddle/pten/kernels/cpu/linalg.cc index 87c4078896..0b58b36c59 100644 --- a/paddle/pten/kernels/cpu/linalg.cc +++ b/paddle/pten/kernels/cpu/linalg.cc @@ -25,28 +25,6 @@ namespace pten { -template -void Dot(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - auto const *x_ptr = x.data(), *x_ptr_ = &x_ptr[0]; - auto const *y_ptr = y.data(), *y_ptr_ = &y_ptr[0]; - auto* z = out->mutable_data(); - - // Loop over the total N elements of both operands while sum-reducing every - // B pairs along the way where B is the dimension of the least ordered axis - auto&& d = x.dims(); - auto const N = x.numel(); - auto const B = d[d.size() - 1]; - - for (int j = 0; j < N / B; j++) { - T ss = 0; - for (int i = 0; i < B; i++) ss += (*x_ptr_++) * (*y_ptr_++); - z[j] = ss; - } -} - template void Matmul(const CPUContext& dev_ctx, const DenseTensor& x, @@ -73,17 +51,6 @@ void Matmul(const CPUContext& dev_ctx, using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_KERNEL(dot, - CPU, - ALL_LAYOUT, - pten::Dot, - float, - double, - int, - int64_t, - complex64, - complex128) {} - PT_REGISTER_KERNEL(matmul, CPU, ALL_LAYOUT, diff --git a/paddle/pten/kernels/cpu/linalg.h b/paddle/pten/kernels/cpu/linalg.h index 29c6cd16cf..d9fc391996 100644 --- a/paddle/pten/kernels/cpu/linalg.h +++ b/paddle/pten/kernels/cpu/linalg.h @@ -22,12 +22,6 @@ namespace pten { -template -void Dot(const CPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - template void Matmul(const CPUContext& dev_ctx, const DenseTensor& x, diff --git a/paddle/pten/kernels/dot_kernel.h b/paddle/pten/kernels/dot_kernel.h new file mode 100644 index 0000000000..f6db41cbbe --- /dev/null +++ b/paddle/pten/kernels/dot_kernel.h @@ -0,0 +1,27 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#pragma once + +#include "paddle/pten/core/dense_tensor.h" + +namespace pten { + +template +void Dot(const ContextT& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out); + +} // namespace pten diff --git a/paddle/pten/kernels/gpu/dot_kernel.cu b/paddle/pten/kernels/gpu/dot_kernel.cu new file mode 100644 index 0000000000..7742e57a02 --- /dev/null +++ b/paddle/pten/kernels/gpu/dot_kernel.cu @@ -0,0 +1,64 @@ +// Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/pten/kernels/dot_kernel.h" + +#include "paddle/pten/backends/gpu/gpu_context.h" +#include "paddle/pten/core/kernel_registry.h" +#include "paddle/pten/kernels/hybird/eigen/common.h" + +// See Note [ Why still include the fluid headers? ] +#include "paddle/fluid/operators/eigen/eigen_function.h" +#include "paddle/fluid/platform/complex.h" + +namespace pten { + +template +void Dot(const ContextT& dev_ctx, + const DenseTensor& x, + const DenseTensor& y, + DenseTensor* out) { + out->mutable_data(); + if (1 == out->dims().size()) { + auto eigen_out = pten::EigenScalar::From(*out); + auto eigen_x = pten::EigenVector::Flatten(x); + auto eigen_y = pten::EigenVector::Flatten(y); + + auto& dev = *dev_ctx.eigen_device(); + eigen_out.device(dev) = (eigen_x * eigen_y).sum(); + } else { + auto eigen_out = pten::EigenMatrix::From(*out); + auto eigen_x = pten::EigenMatrix::From(x); + auto eigen_y = pten::EigenMatrix::From(y); + + auto& dev = *dev_ctx.eigen_device(); + eigen_out.device(dev) = (eigen_x * eigen_y).sum(Eigen::DSizes(1)); + } +} + +} // namespace pten + +using complex64 = ::paddle::platform::complex; +using complex128 = ::paddle::platform::complex; + +PT_REGISTER_CTX_KERNEL(dot, + GPU, + ALL_LAYOUT, + pten::Dot, + float, + double, + int, + int64_t, + complex64, + complex128) {} diff --git a/paddle/pten/kernels/gpu/linalg.cu b/paddle/pten/kernels/gpu/linalg.cu index c9bc4cbd07..e4a69b28e6 100644 --- a/paddle/pten/kernels/gpu/linalg.cu +++ b/paddle/pten/kernels/gpu/linalg.cu @@ -15,7 +15,6 @@ #include "paddle/pten/kernels/gpu/linalg.h" #include "paddle/pten/core/kernel_registry.h" -#include "paddle/pten/kernels/hybird/eigen/dot.h" #include "paddle/pten/kernels/hybird/math/matmul_func.h" // See Note [ Why still include the fluid headers? ] @@ -23,14 +22,6 @@ namespace pten { -template -void Dot(const GPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - eigen::Dot(dev_ctx, x, y, out); -} - template void Matmul(const GPUContext& dev_ctx, const DenseTensor& x, @@ -58,17 +49,6 @@ using float16 = paddle::platform::float16; using complex64 = ::paddle::platform::complex; using complex128 = ::paddle::platform::complex; -PT_REGISTER_KERNEL(dot, - GPU, - ALL_LAYOUT, - pten::Dot, - float, - double, - int, - int64_t, - complex64, - complex128) {} - PT_REGISTER_KERNEL(matmul, GPU, ALL_LAYOUT, diff --git a/paddle/pten/kernels/gpu/linalg.h b/paddle/pten/kernels/gpu/linalg.h index a848f55c7b..a0f7c0c0aa 100644 --- a/paddle/pten/kernels/gpu/linalg.h +++ b/paddle/pten/kernels/gpu/linalg.h @@ -22,12 +22,6 @@ namespace pten { -template -void Dot(const GPUContext& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out); - template void Matmul(const GPUContext& dev_ctx, const DenseTensor& x, diff --git a/paddle/pten/kernels/hybird/eigen/dot.h b/paddle/pten/kernels/hybird/eigen/dot.h deleted file mode 100644 index eb089037fa..0000000000 --- a/paddle/pten/kernels/hybird/eigen/dot.h +++ /dev/null @@ -1,50 +0,0 @@ -/* Copyright (c) 2021 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#pragma once - -#include "paddle/pten/core/dense_tensor.h" -#include "paddle/pten/kernels/hybird/eigen/common.h" - -// See Note [ Why still include the fluid headers? ] -#include "paddle/fluid/operators/eigen/eigen_function.h" - -namespace pten { -namespace eigen { - -template -void Dot(const DevCtx& dev_ctx, - const DenseTensor& x, - const DenseTensor& y, - DenseTensor* out) { - out->mutable_data(); - if (1 == out->dims().size()) { - auto eigen_out = pten::EigenScalar::From(*out); - auto eigen_x = pten::EigenVector::Flatten(x); - auto eigen_y = pten::EigenVector::Flatten(y); - - auto& dev = *dev_ctx.eigen_device(); - eigen_out.device(dev) = (eigen_x * eigen_y).sum(); - } else { - auto eigen_out = pten::EigenMatrix::From(*out); - auto eigen_x = pten::EigenMatrix::From(x); - auto eigen_y = pten::EigenMatrix::From(y); - - auto& dev = *dev_ctx.eigen_device(); - eigen_out.device(dev) = (eigen_x * eigen_y).sum(Eigen::DSizes(1)); - } -} - -} // namespace eigen -} // namespace pten -- GitLab