未验证 提交 e592534a 编写于 作者: W Winters Montagne 提交者: GitHub

【PaddlePaddle Hackathon 4 No.34】为 Paddle 优化 Lerp OP 在 GPU 上的性能 (#53154)

* modify lerp_kernel.cu

* pre-commit

* fix some CI issues

* fix some CI issues

* fix some CI issues

* fix some CI issues

* fix some CI issues

* fix some CI issues

* fix some CI issues

* fix some CI issues

* Add files via upload

fix some CI issues
上级 0ab7f949
......@@ -15,8 +15,115 @@
#include "paddle/phi/kernels/lerp_kernel.h"
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/common/amp_type_traits.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/lerp_kernel_impl.h"
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/broadcast_function.h"
#include "paddle/phi/kernels/funcs/common_shape.h"
#include "paddle/phi/kernels/funcs/math_function.h"
namespace phi {
// Identity functor: hands back its single argument unchanged.
// LerpKernel feeds it to BroadcastKernel so that one input tensor is copied
// (and thereby expanded) into an output-shaped tensor.
template <typename T>
struct BroadcastMinElementWiseDirectCUDAFunctor {
  HOSTDEVICE inline T operator()(const T min) const {
    const T passthrough = min;
    return passthrough;
  }
};
// Element-wise linear interpolation with a per-element weight.
// For each triple it evaluates x + weight * (y - x), i.e. weight == 0 yields
// x and weight == 1 yields y (up to rounding in T).
template <typename T>
struct LerpElementWiseDirectCUDAFunctor {
  HOSTDEVICE inline T operator()(const T x, const T y, const T weight) const {
    // Distance from the start value to the end value.
    const T span = y - x;
    // The multiply and the add stay in a single expression, as before.
    return x + weight * span;
  }
};
// Linear interpolation with one weight shared by every element.
// The functor stores a pointer to the weight instead of a copy, and reads the
// first element behind that pointer on every call, so the pointee must stay
// valid and readable wherever operator() executes.
// NOTE(review): when invoked from GPU code the pointer presumably has to
// reference device-accessible memory - confirm against the caller.
template <typename T>
struct LerpScalarDirectCUDAFunctor {
  const T *weight_;

  HOSTDEVICE inline LerpScalarDirectCUDAFunctor(const T *weight)
      : weight_(weight) {}

  // Returns x + w * (y - x), where w is the value weight_ points to.
  HOSTDEVICE inline T operator()(const T x, const T y) const {
    const T span = y - x;
    return x + (*weight_) * span;
  }
};
template <typename T, typename Context>
void LerpKernel(const Context &ctx,
const DenseTensor &x,
const DenseTensor &y,
const DenseTensor &weight,
DenseTensor *out) {
int rank = out->dims().size();
PADDLE_ENFORCE_GE(
rank,
0,
phi::errors::InvalidArgument(
"The number of dimensions for LerpOp must be "
"greater than or equal to 0, but the value received is %d.",
rank));
ctx.template Alloc<T>(out);
std::vector<DenseTensor *> outputs = {out};
std::vector<const DenseTensor *> inputs;
if (weight.numel() == 1) {
const T *weight_ptr = weight.data<T>();
inputs.reserve(2);
inputs.emplace_back(&x);
inputs.emplace_back(&y);
auto functor = LerpScalarDirectCUDAFunctor<T>(weight_ptr);
phi::funcs::BroadcastKernel<T>(ctx, inputs, &outputs, functor);
} else {
inputs.reserve(3);
auto functor = LerpElementWiseDirectCUDAFunctor<T>();
DenseTensor b_min = phi::EmptyLike<T>(ctx, *out);
if (x.dims().size() != y.dims().size() &&
weight.dims().size() != y.dims().size()) {
std::vector<const DenseTensor *> broadcast_min_inputs;
broadcast_min_inputs.reserve(1);
std::vector<DenseTensor *> broadcast_min_outputs = {&b_min};
auto broadcast_min_functor =
BroadcastMinElementWiseDirectCUDAFunctor<T>();
if (x.dims().size() < y.dims().size() &&
x.dims().size() < weight.dims().size()) {
broadcast_min_inputs.emplace_back(&x);
phi::funcs::BroadcastKernel<T>(ctx,
broadcast_min_inputs,
&broadcast_min_outputs,
broadcast_min_functor);
inputs.emplace_back(&b_min);
inputs.emplace_back(&y);
inputs.emplace_back(&weight);
} else if (y.dims().size() < weight.dims().size()) {
broadcast_min_inputs.emplace_back(&y);
phi::funcs::BroadcastKernel<T>(ctx,
broadcast_min_inputs,
&broadcast_min_outputs,
broadcast_min_functor);
inputs.emplace_back(&x);
inputs.emplace_back(&b_min);
inputs.emplace_back(&weight);
} else {
broadcast_min_inputs.emplace_back(&weight);
phi::funcs::BroadcastKernel<T>(ctx,
broadcast_min_inputs,
&broadcast_min_outputs,
broadcast_min_functor);
inputs.emplace_back(&x);
inputs.emplace_back(&y);
inputs.emplace_back(&b_min);
}
} else {
inputs.emplace_back(&x);
inputs.emplace_back(&y);
inputs.emplace_back(&weight);
}
phi::funcs::BroadcastKernel<T>(ctx, inputs, &outputs, functor);
}
}
} // namespace phi
PD_REGISTER_KERNEL(lerp,
GPU,
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册