Unverified commit 772be4f5 authored by niuliling123, committed by GitHub

Replace EigenBroadcast with ElementwiseBroadcast in ReduceGrad (#39255)

Parent: b3e049f8
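
The commit swaps the Eigen-broadcast-based ReduceGradKernel for a new ReduceCudaGradKernel that broadcasts the output gradient back to the input shape with the elementwise broadcast kernel, applying a per-element transform (kps::IdentityFunctor for reduce_sum, kps::DivideFunctor for reduce_mean). The following is a minimal standalone sketch of that computation in plain host C++ with made-up shapes; it is an illustration of the idea, not Paddle code:

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical shapes: x is [2, 3], y = reduce_mean(x, dim = {1}), so d_out is [2, 1].
  const int rows = 2, cols = 3;
  const int reduce_num = cols;                 // elements folded into each output value
  std::vector<float> d_out = {6.0f, 9.0f};     // incoming gradient dL/dy, one value per row
  std::vector<float> d_x(rows * cols);

  // Elementwise-broadcast idea: replicate d_out across the reduced axis and apply the
  // per-element transform (identity for reduce_sum, scale by 1/reduce_num for reduce_mean).
  for (int i = 0; i < rows; ++i) {
    for (int j = 0; j < cols; ++j) {
      d_x[i * cols + j] = d_out[i] / reduce_num;
    }
  }

  for (float v : d_x) std::printf("%.2f ", v);  // prints: 2.00 2.00 2.00 3.00 3.00 3.00
  std::printf("\n");
  return 0;
}
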
@@ -17,15 +17,9 @@
 template <typename T>
 using CUDAReduceMeanGradKernel =
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, T,
-                          ops::MeanGradFunctor, true>;
-
-using FP16CUDAReduceMeanGradKernel =
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                          paddle::platform::float16, ops::FP16MeanGradFunctor,
-                          true>;
+    ops::ReduceCudaGradKernel<T, kps::DivideFunctor>;
 
 REGISTER_OP_CUDA_KERNEL(reduce_mean_grad, CUDAReduceMeanGradKernel<bool>,
-                        FP16CUDAReduceMeanGradKernel,
+                        CUDAReduceMeanGradKernel<paddle::platform::float16>,
                         CUDAReduceMeanGradKernel<float>,
                         CUDAReduceMeanGradKernel<double>);

@@ -30,6 +30,7 @@ limitations under the License. */
 #if defined(__HIPCC__) || defined(__NVCC__)
 #include "paddle/pten/kernels/gpu/reduce.h"
+#include "paddle/pten/kernels/gpu/reduce_grad.h"
 #endif
 
 namespace paddle {

@@ -620,11 +621,12 @@ class ReduceGradOp : public framework::OperatorWithKernel {
  protected:
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const override {
-    int in_dtype = ctx.Attr<int>("in_dtype");
+    int out_dtype = ctx.Attr<int>("out_dtype");
     auto input_data_type =
-        (in_dtype >= 0) ? static_cast<framework::proto::VarType::Type>(in_dtype)
-                        : OperatorWithKernel::IndicateVarDataType(
-                              ctx, framework::GradVarName("Out"));
+        (out_dtype >= 0)
+            ? static_cast<framework::proto::VarType::Type>(out_dtype)
+            : OperatorWithKernel::IndicateVarDataType(
+                  ctx, framework::GradVarName("Out"));
 #ifdef PADDLE_WITH_MKLDNN
     auto CanMKLDNNReduceGradBeUsed = [&]() {
       auto dx_dims = ctx.Input<Tensor>("X")->dims();

@@ -730,6 +732,55 @@ class ReduceCudaKernel : public framework::OpKernel<T> {
         dev_ctx, *input, reduce_all, dims_int64, false, pt_out_dtype, output);
   }
 };
+
+template <typename T, template <typename, typename> class TransformOp>
+class ReduceCudaGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    std::vector<int> dims = context.Attr<std::vector<int>>("dim");
+    auto* in_x = context.Input<Tensor>("X");
+    auto* d_out =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* d_x = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto out_dtype = context.Attr<int>("in_dtype");
+    // get reduce_dim and reduce_num for reduce_mean_grad
+    int dim_size = in_x->dims().size();
+    std::vector<int> reduce_dims = GetReduceDim(dims, dim_size, reduce_all);
+    auto update_dims = vectorize(d_x->dims());
+    int reduce_num = 1;
+    for (auto i : reduce_dims) {
+      reduce_num *= (in_x->dims())[i];
+      update_dims[i] = 1;
+    }
+    // make new tensor
+    framework::Tensor new_d_out(d_out->type());
+    new_d_out.ShareDataWith(*d_out);
+    new_d_out.Resize(paddle::framework::make_ddim(update_dims));
+    auto& dev_ctx = context.cuda_device_context();
+    if (out_dtype > 0) {
+      d_x->mutable_data(
+          dev_ctx.GetPlace(),
+          static_cast<framework::proto::VarType::Type>(out_dtype));
+    } else {
+      d_x->mutable_data(
+          dev_ctx.GetPlace(),
+          static_cast<framework::proto::VarType::Type>(d_out->type()));
+    }
+    auto pt_d_out = paddle::experimental::MakePtenDenseTensor(new_d_out);
+    auto pt_d_x = paddle::experimental::MakePtenDenseTensor(*d_x);
+    auto pt_out_dtype = pten::TransToPtenDataType(
+        static_cast<framework::proto::VarType::Type>(out_dtype));
+    if (out_dtype <= 0) {
+      pt_out_dtype = pten::TransToPtenDataType(
+          static_cast<framework::proto::VarType::Type>(d_out->type()));
+    }
+    using MPType = typename kps::details::MPTypeTrait<T>::Type;
+    pten::ReduceGrad<T, TransformOp<T, MPType>>(
+        dev_ctx, pt_d_out.get(), pt_d_x.get(), pt_out_dtype,
+        TransformOp<T, MPType>(reduce_num));
+  }
+};
+
 #endif
 
 }  // namespace operators
...
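
To make the bookkeeping in ReduceCudaGradKernel concrete: for an input of shape [4, 5, 6] reduced over dims {0, 2}, reduce_num becomes 4 * 6 = 24 and update_dims becomes [1, 5, 1], the shape d_out is resized to before being broadcast back over d_x. A small host-side sketch with hypothetical shapes (plain C++ in place of the framework types):

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical example: x has shape [4, 5, 6] and is reduced over dims {0, 2}.
  std::vector<int> x_dims = {4, 5, 6};
  std::vector<int> reduce_dims = {0, 2};
  std::vector<int> update_dims = x_dims;  // starts as d_x's shape
  int reduce_num = 1;
  for (int i : reduce_dims) {
    reduce_num *= x_dims[i];  // number of elements folded into each output value
    update_dims[i] = 1;       // reduced dims collapse to 1 so d_out can broadcast
  }
  std::printf("reduce_num = %d, update_dims = [%d, %d, %d]\n",
              reduce_num, update_dims[0], update_dims[1], update_dims[2]);
  // prints: reduce_num = 24, update_dims = [1, 5, 1]
  return 0;
}
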
@@ -50,7 +50,7 @@ class ReduceSumOpGradMaker : public framework::SingleGradOpMaker<T> {
   framework::OpKernelType GetExpectedKernelType(
       const framework::ExecutionContext& ctx) const {
-    int in_dtype = ctx.Attr<int>("in_dtype");
+    int in_dtype = ctx.Attr<int>("out_dtype");
     if (in_dtype >= 0) {
       return framework::OpKernelType(
           static_cast<framework::proto::VarType::Type>(in_dtype),
...
@@ -74,7 +74,7 @@ class ReduceSumGradKernel : public framework::OpKernel<T> {
     auto dims = context.Attr<std::vector<int>>("dim");
     if (context.GetPlace().GetType() == platform::CPUPlace().GetType() &&
         dims.size() == 1) {
-      int in_dtype = context.Attr<int>("in_dtype");
+      int in_dtype = context.Attr<int>("out_dtype");
       if (in_dtype >= 0) {
         Tensor tmp_tensor;
...
@@ -17,8 +17,7 @@
 template <typename T>
 using CUDAReduceSumGradKernel =
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, T,
-                          ops::SumGradFunctor, true>;
+    ops::ReduceCudaGradKernel<T, kps::IdentityFunctor>;
 
 REGISTER_OP_CUDA_KERNEL(
     reduce_sum_grad, CUDAReduceSumGradKernel<bool>,
...
@@ -134,12 +134,19 @@ struct DimensionsTransform {
   explicit DimensionsTransform(const std::vector<const DenseTensor *> &ins,
                                const pten::framework::DDim &dims,
                                int axis) {
-    const int N = ins.size();
+    const int N = max(static_cast<int>(ins.size()), 2);
     dim_size = dims.size();
     out_dims = pten::framework::vectorize<int64_t>(dims);
     in_dims.resize(N);
-    for (int j = 0; j < N; ++j) {
-      in_dims[j] = pten::framework::vectorize<int64_t>(ins[j]->dims());
+    if (ins.size() == 1) {
+      // when ins.size() = 1, broadcast input to output
+      in_dims[0] = pten::framework::vectorize<int64_t>(ins[0]->dims());
+      in_dims[1] = out_dims;
+      // Add out_dims to in_dims to avoid errors in dims merging
+    } else {
+      for (int j = 0; j < N; ++j) {
+        in_dims[j] = pten::framework::vectorize<int64_t>(ins[j]->dims());
+      }
     }
     InputDimensionsExtend(N, axis);
...
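
The single-input branch above is what the reduce-grad path relies on: the only input is d_out (with the reduced dims already resized to 1), and appending out_dims as a pseudo second input gives the dimension-merging logic a pair of shapes to align, just as in the binary case. An illustrative sketch with made-up shapes (not the DimensionsTransform code itself):

#include <cstdio>
#include <vector>

int main() {
  // Hypothetical case: one input of shape [2, 1] broadcast to an output of shape [2, 3].
  std::vector<int64_t> out_dims = {2, 3};
  std::vector<std::vector<int64_t>> in_dims(2);
  in_dims[0] = {2, 1};    // the real (single) input, e.g. the resized d_out
  in_dims[1] = out_dims;  // pseudo input so dims merging sees two shapes
  for (const auto& shape : in_dims) {
    for (long long d : shape) std::printf("%lld ", d);
    std::printf("\n");
  }
  return 0;
}
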
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
// CUDA and HIP use the same API
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
#include <algorithm>
#include <cmath>
#include <numeric>
#include <set>
#include <vector>
#include "paddle/pten/kernels/gpu/elementwise.h"
namespace pten {
template <typename InT, typename Functor>
void ReduceGrad(const GPUContext& dev_ctx,
                DenseTensor* d_out,
                DenseTensor* d_x,
                DataType out_dtype,
                Functor functor) {
  std::vector<const DenseTensor*> inputs = {d_out};
  std::vector<DenseTensor*> outputs = {d_x};
  PD_VISIT_ALL_TYPES(
      out_dtype, "LaunchBroadcastElementwiseCudaKernel", ([&] {
        LaunchBroadcastElementwiseCudaKernel<pten::ElementwiseType::kUnary,
                                             InT,
                                             data_t>(
            dev_ctx, inputs, &outputs, 0, functor);
      }));
}
} // namespace pten
#endif
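
PD_VISIT_ALL_TYPES is what lets ReduceGrad pick the output data type at run time: it dispatches on out_dtype and makes a data_t type available inside the lambda, with which LaunchBroadcastElementwiseCudaKernel is instantiated. Roughly the following pattern, shown here as a standalone host-side sketch (the enum and VisitAllTypes helper are made up, not the real macro):

#include <cstdio>

enum class DataType { FLOAT32, FLOAT64 };

// Stand-in for the visit macro: switch on the runtime dtype and hand the callable a
// value whose static type plays the role of data_t.
template <typename Visitor>
void VisitAllTypes(DataType dtype, Visitor&& visit) {
  switch (dtype) {
    case DataType::FLOAT32: visit(float{}); break;
    case DataType::FLOAT64: visit(double{}); break;
  }
}

int main() {
  DataType out_dtype = DataType::FLOAT64;
  VisitAllTypes(out_dtype, [&](auto tag) {
    using data_t = decltype(tag);  // the type the broadcast kernel would be instantiated with
    std::printf("launch with sizeof(data_t) = %zu\n", sizeof(data_t));
  });
  return 0;
}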