Unverified commit 553afc07, authored by Yiqun Liu and committed by GitHub

Rename the general elementwise and broadcast functions. (#39623)

Parent 267275d9
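In short: the fluid-side wrappers keep their names but now forward to the renamed pten::funcs entry points. A before/after sketch of the rename, distilled from the call sites changed in this diff (ctx/ins/outs/axis/func are placeholders):

// Old pten entry points (removed in this commit):
pten::LaunchElementwiseCudaKernel<ET, InT, OutT, Functor, NumOuts>(
    ctx, ins, &outs, axis, func);
pten::LaunchBroadcastElementwiseCudaKernel<ET, InT, OutT, Functor, NumOuts>(
    ctx, ins, &outs, axis, func);
pten::funcs::LaunchSameDimsElementwiseCudaKernel<OutT, Functor, NumOuts>(
    ctx, ins, &outs, func);

// New pten entry points (as called throughout the diff below):
pten::funcs::BroadcastKernel<ET, InT, OutT, Functor, NumOuts>(
    ctx, ins, &outs, axis, func);
pten::funcs::ElementwiseKernel<OutT, Functor, NumOuts>(ctx, ins, &outs, func);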
......
@@ -36,7 +36,6 @@ limitations under the License. */
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
#include "paddle/fluid/platform/aligned_vector.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/pten/kernels/funcs/cuda_kernel_config.h"
namespace paddle {
namespace operators {
......
......
@@ -15,45 +15,13 @@
#pragma once
#include "paddle/fluid/operators/elementwise/elementwise_op_impl.cu.h"
#include "paddle/fluid/operators/kernel_primitives/kernel_primitives.h"
// Only headers under the paddle/top/api dirs can be included.
#include "paddle/pten/kernels/gpu/elementwise.h"
namespace paddle {
namespace operators {
namespace kps = paddle::operators::kernel_primitives;
template <ElementwiseType ET, typename InT, typename OutT, typename Functor,
int NumOuts = 1>
void LaunchBroadcastElementwiseCudaKernel(
const KPDevice &ctx, const std::vector<const framework::Tensor *> &ins,
std::vector<framework::Tensor *> *outs, int axis, Functor func) {
std::vector<const pten::DenseTensor *> pt_inputs;
std::vector<pten::DenseTensor *> pt_outputs;
// TODO(YuanRisheng): The *_tmp vectors cache the DenseTensors because the
// temporary DenseTensor objects generated by MakePtenDenseTensor are
// destroyed when the loop exits. The *_tmp vectors can be removed once
// DenseTensor supports a copy constructor.
std::vector<std::unique_ptr<pten::DenseTensor>> pt_inputs_tmp;
std::vector<std::unique_ptr<pten::DenseTensor>> pt_outputs_tmp;
for (auto in : ins) {
pt_inputs_tmp.emplace_back(
std::move(paddle::experimental::MakePtenDenseTensor(*in)));
}
for (auto out : *outs) {
pt_outputs_tmp.emplace_back(
std::move(paddle::experimental::MakePtenDenseTensor(*out)));
}
for (int i = 0; i < pt_inputs_tmp.size(); i++) {
pt_inputs.push_back(pt_inputs_tmp[i].get());
}
for (int i = 0; i < pt_outputs_tmp.size(); i++) {
pt_outputs.push_back(pt_outputs_tmp[i].get());
}
pten::LaunchBroadcastElementwiseCudaKernel<ET, InT, OutT, Functor, NumOuts>(
ctx, pt_inputs, &pt_outputs, axis, func);
}
template <ElementwiseType ET, typename InT, typename OutT, typename Functor,
int NumOuts = 1>
void LaunchElementwiseCudaKernel(
......
@@ -82,7 +50,7 @@ void LaunchElementwiseCudaKernel(
for (int i = 0; i < pt_outputs_tmp.size(); i++) {
pt_outputs.push_back(pt_outputs_tmp[i].get());
}
pten::LaunchElementwiseCudaKernel<ET, InT, OutT, Functor, NumOuts>(
pten::funcs::BroadcastKernel<ET, InT, OutT, Functor, NumOuts>(
ctx, pt_inputs, &pt_outputs, axis, func);
}
......
......
@@ -19,7 +19,7 @@ limitations under the License. */
// Only headers under the paddle/top/api dirs can be included.
#include "paddle/pten/api/lib/utils/tensor_utils.h"
#include "paddle/pten/kernels/gpu/elementwise.h"
#include "paddle/pten/kernels/funcs/elementwise_base.h"
namespace paddle {
namespace operators {
......
@@ -53,8 +53,8 @@ void LaunchSameDimsElementwiseCudaKernel(
for (int i = 0; i < pt_outputs_tmp.size(); i++) {
pt_outputs.push_back(pt_outputs_tmp[i].get());
}
pten::funcs::LaunchSameDimsElementwiseCudaKernel<OutT, Functor, NumOuts>(
ctx, pt_inputs, &pt_outputs, func);
pten::funcs::ElementwiseKernel<OutT, Functor, NumOuts>(ctx, pt_inputs,
&pt_outputs, func);
}
} // namespace operators
......
This diff is collapsed.
/* Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/fluid/platform/device/gpu/gpu_device_function.h"
#include "paddle/fluid/platform/device/gpu/gpu_launch_config.h"
#include "paddle/pten/backends/gpu/gpu_context.h"
#ifdef __HIPCC__
#define ELEMENTWISE_BLOCK_SIZE 256
#else
#define ELEMENTWISE_BLOCK_SIZE 512
#endif
namespace pten {
namespace funcs {
/*
 * According to NVIDIA, CUDA performs better when the number of threads
 * per block is 64/128/256/512, and the number of blocks should be at
 * least 2x~4x the number of SMs. Hence, the SM count is taken into
 * account in this function to determine the right number of threads per
 * block.
 */
inline int GetThreadsConfig(const pten::GPUContext &ctx,
int64_t numel,
int vec_size) {
int threads = ELEMENTWISE_BLOCK_SIZE;
int sm_count = ctx.GetSMCount();
int active_threads_num = numel / vec_size;
if (active_threads_num / (sm_count << 1) < ELEMENTWISE_BLOCK_SIZE) {
// Round the thread count up to a power of two so that the number of
// active blocks is about twice the SM count, for better performance.
threads = paddle::platform::RoundToPowerOfTwo(active_threads_num /
(sm_count << 1));
} else if (active_threads_num / (sm_count << 2) < ELEMENTWISE_BLOCK_SIZE) {
// Round the thread count up to a power of two so that the number of
// active blocks is about four times the SM count, for better performance.
threads = paddle::platform::RoundToPowerOfTwo(active_threads_num /
(sm_count << 2));
}
// The number of threads per block shall be at least 64.
return std::max(64, threads);
}
} // namespace funcs
} // namespace pten
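For orientation, a minimal sketch of how a caller might turn GetThreadsConfig into a 1-D launch configuration; the helper name, the grid-size arithmetic, and the usage lines are illustrative assumptions, not part of this commit:

#include <utility>  // for std::pair

// Hypothetical helper (not in this diff): compute block/thread counts for a
// vectorized elementwise kernel, where each thread handles vec_size elements.
inline std::pair<int, int> GetLaunchConfig(const pten::GPUContext &ctx,
                                           int64_t numel,
                                           int vec_size) {
  int threads = pten::funcs::GetThreadsConfig(ctx, numel, vec_size);
  int64_t active_threads = (numel + vec_size - 1) / vec_size;
  int blocks = static_cast<int>((active_threads + threads - 1) / threads);
  return {blocks, threads};
}

// Usage (illustrative):
//   auto cfg = GetLaunchConfig(ctx, out->numel(), 4);
//   some_kernel<<<cfg.first, cfg.second, 0, ctx.stream()>>>(...);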
......
@@ -746,11 +746,10 @@ void ElementwiseCudaKernel(const KPDevice &ctx,
}
template <typename OutT, typename Functor, int NumOuts = 1>
void LaunchSameDimsElementwiseCudaKernel(
const KPDevice &ctx,
const std::vector<const DenseTensor *> &ins,
std::vector<DenseTensor *> *outs,
Functor func) {
void ElementwiseKernel(const KPDevice &ctx,
const std::vector<const DenseTensor *> &ins,
std::vector<DenseTensor *> *outs,
Functor func) {
using Traits = paddle::platform::FunctionTraits<Functor>;
const int kArity = Traits::arity;
PADDLE_ENFORCE_EQ(ins.size(),
......
......
@@ -47,8 +47,7 @@ void AbsKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) {
std::vector<DenseTensor*> outs = {out};
auto functor = CudaAbsFunctor<T>();
funcs::LaunchSameDimsElementwiseCudaKernel<pten::funcs::Real<T>>(
ctx, ins, &outs, functor);
funcs::ElementwiseKernel<pten::funcs::Real<T>>(ctx, ins, &outs, functor);
}
} // namespace pten
......
......
@@ -44,7 +44,7 @@ void CastCUDAKernelImpl(const GPUContext& dev_ctx,
inputs.emplace_back(&x);
outputs.emplace_back(out);
dev_ctx.Alloc<OutT>(out);
pten::funcs::LaunchSameDimsElementwiseCudaKernel<OutT>(
pten::funcs::ElementwiseKernel<OutT>(
dev_ctx, inputs, &outputs, CastFuctor<InT, OutT>());
}
......
......
@@ -49,7 +49,7 @@ void FullKernel(const Context& dev_ctx,
// This function has no inputs, so inputs.size() == 0. kUnary is used, but
// no data will be loaded in the kernel because the functor takes zero
// parameters.
pten::funcs::LaunchSameDimsElementwiseCudaKernel<T>(
pten::funcs::ElementwiseKernel<T>(
dev_ctx, inputs, &outputs, FullFuctor<T>(val.to<T>()));
}
}
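For context on the comment above: the functor takes no tensor operands, so the kernel only writes. A plausible sketch of FullFuctor — its real definition is outside this diff, so treat the shape below as an assumption:

template <typename T>
struct FullFuctor {
  T value;
  explicit FullFuctor(T v) : value(v) {}
  // Zero input parameters: the elementwise kernel performs no global-memory
  // loads and simply stores `value` into every output element.
  __device__ __forceinline__ T operator()() const { return value; }
};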
......
@@ -91,7 +91,7 @@ void FullLikeKernel(const Context& dev_ctx,
// the operator is 0
int numel = out->numel();
if (numel > 0) {
pten::funcs::LaunchSameDimsElementwiseCudaKernel<T>(
pten::funcs::ElementwiseKernel<T>(
dev_ctx, inputs, &outputs, FullFuctor<T>(value));
}
}
......
......
@@ -48,7 +48,7 @@ namespace pten {
inputs.emplace_back(&y); \
outputs.emplace_back(out); \
dev_ctx.template Alloc<T>(out); \
LaunchElementwiseCudaKernel<ElementwiseType::kBinary, T, T>( \
funcs::BroadcastKernel<ElementwiseType::kBinary, T, T>( \
dev_ctx, inputs, &outputs, axis, funcs::name##Functor<T>()); \
}
......
......
@@ -1091,8 +1091,7 @@ void TensorReduceImpl(const pten::GPUContext& dev_ctx,
if (config.reduce_num == 1) {
std::vector<const DenseTensor*> inputs = {&x};
std::vector<DenseTensor*> outputs = {y};
funcs::LaunchSameDimsElementwiseCudaKernel<Ty>(
dev_ctx, inputs, &outputs, transform);
funcs::ElementwiseKernel<Ty>(dev_ctx, inputs, &outputs, transform);
return;
}
......
......
@@ -22,8 +22,10 @@
#include <numeric>
#include <set>
#include <vector>
#include "paddle/pten/kernels/gpu/elementwise.h"
#include "paddle/pten/kernels/funcs/broadcast_function.h"
namespace pten {
template <typename InT, typename Functor>
void ReduceGrad(const GPUContext& dev_ctx,
DenseTensor* d_out,
......
@@ -33,12 +35,11 @@ void ReduceGrad(const GPUContext& dev_ctx,
std::vector<const DenseTensor*> inputs = {d_out};
std::vector<DenseTensor*> outputs = {d_x};
PD_VISIT_ALL_TYPES(
out_dtype, "LaunchBroadcastElementwiseCudaKernel", ([&] {
LaunchBroadcastElementwiseCudaKernel<pten::ElementwiseType::kUnary,
InT,
data_t>(
out_dtype, "BroadcastKernel", ([&] {
funcs::BroadcastKernel<pten::ElementwiseType::kUnary, InT, data_t>(
dev_ctx, inputs, &outputs, 0, functor);
}));
}
} // namespace pten
#endif
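A note on the dispatch above: PD_VISIT_ALL_TYPES expands the lambda once per supported dtype, binding data_t to the concrete C++ type, so BroadcastKernel is instantiated for whatever out_dtype holds at runtime. Schematically — an illustrative expansion, not the real macro output:

switch (out_dtype) {
  case pten::DataType::FLOAT32: {
    using data_t = float;  // the type the macro binds for this case
    funcs::BroadcastKernel<pten::ElementwiseType::kUnary, InT, data_t>(
        dev_ctx, inputs, &outputs, 0, functor);
    break;
  }
  // ... one case per supported dtype ...
}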
......
@@ -54,7 +54,7 @@ void ScaleKernel(const Context& dev_ctx,
inputs.emplace_back(&x);
outputs.emplace_back(out);
dev_ctx.template Alloc<T>(out);
pten::funcs::LaunchSameDimsElementwiseCudaKernel<T>(
pten::funcs::ElementwiseKernel<T>(
dev_ctx,
inputs,
&outputs,
......