Unverified commit e161979e, authored by niuliling123, committed by GitHub

Replace ReduceAmax/Amax.part.cu with KP (#43202) (#43263)

The original implementations of the reduce amax/amin and frobenius_norm kernels were Eigen-based, which made their files slow to compile, so this PR replaces them with KP implementations.
Remove the duplicated functionality in DefaultElementwiseOperator to reduce the compile time of the elementwise_double_grad OPs.
Parent commit: 8f127681
@@ -12,14 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
 
-REGISTER_OP_CUDA_KERNEL(
-    reduce_amax_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                            float, ops::AMaxOrAMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::AMaxOrAMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::AMaxOrAMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::AMaxOrAMinGradFunctor>);
+template <typename T>
+using CUDAReduceMaxGradKernel =
+    ops::ReduceCudaAMaxAMinGradKernel<T, kps::IdentityFunctor>;
+REGISTER_OP_CUDA_KERNEL(reduce_amax_grad, CUDAReduceMaxGradKernel<int>,
+                        CUDAReduceMaxGradKernel<int64_t>,
+                        CUDAReduceMaxGradKernel<float>,
+                        CUDAReduceMaxGradKernel<double>);
@@ -12,14 +12,12 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
-#include "paddle/fluid/operators/reduce_ops/reduce_min_max_op.h"
+#include "paddle/fluid/operators/reduce_ops/reduce_op.h"
 
-REGISTER_OP_CUDA_KERNEL(
-    reduce_amin_grad, ops::ReduceGradKernel<paddle::platform::CUDADeviceContext,
-                                            float, ops::AMaxOrAMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, double,
-                          ops::AMaxOrAMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int,
-                          ops::AMaxOrAMinGradFunctor>,
-    ops::ReduceGradKernel<paddle::platform::CUDADeviceContext, int64_t,
-                          ops::AMaxOrAMinGradFunctor>);
+template <typename T>
+using CUDAReduceMinGradKernel =
+    ops::ReduceCudaAMaxAMinGradKernel<T, kps::IdentityFunctor>;
+REGISTER_OP_CUDA_KERNEL(reduce_amin_grad, CUDAReduceMinGradKernel<int>,
+                        CUDAReduceMinGradKernel<int64_t>,
+                        CUDAReduceMinGradKernel<float>,
+                        CUDAReduceMinGradKernel<double>);
@@ -23,7 +23,6 @@ limitations under the License. */
 #include "paddle/fluid/operators/cast_op.h"
 #include "paddle/fluid/operators/reduce_ops/reduce_op_function.h"
 #include "paddle/phi/kernels/funcs/math_function.h"
-
 // only can include the headers in paddle/phi/api dirs
 #include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/phi/api/lib/utils/tensor_utils.h"
@@ -649,6 +648,7 @@ class ReduceCudaGradKernel : public framework::OpKernel<T> {
     bool reduce_all = context.Attr<bool>("reduce_all");
     std::vector<int> dims = context.Attr<std::vector<int>>("dim");
     auto* in_x = context.Input<Tensor>("X");
+
     auto* d_out =
         context.Input<framework::Tensor>(framework::GradVarName("Out"));
     auto* d_x = context.Output<framework::Tensor>(framework::GradVarName("X"));
@@ -679,12 +679,106 @@ class ReduceCudaGradKernel : public framework::OpKernel<T> {
     if (out_dtype <= 0) {
       pt_out_dtype = d_out->dtype();
     }
     using MPType = typename kps::details::MPTypeTrait<T>::Type;
     phi::ReduceGrad<T, TransformOp<T, MPType>>(
         dev_ctx, pt_d_out.get(), pt_d_x.get(), pt_out_dtype,
         TransformOp<T, MPType>(reduce_num));
   }
 };
+
+template <typename T>
+struct EqualFunctor {
+  inline T initial() { return static_cast<T>(0.0f); }
+
+  inline HOSTDEVICE T operator()(const T a, const T b) const {
+    return static_cast<T>(a == b);
+  }
+};
+
+template <typename T, typename Enable = void>
+struct DivideFunctor {
+  inline T initial() { return static_cast<T>(1.0f); }
+
+  inline HOSTDEVICE T operator()(const T a, const T b) const { return a / b; }
+};
+
+template <typename T, template <typename, typename> class TransformOp>
+class ReduceCudaAMaxAMinGradKernel : public framework::OpKernel<T> {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    bool reduce_all = context.Attr<bool>("reduce_all");
+    std::vector<int> dims = context.Attr<std::vector<int>>("dim");
+    auto* in_x = context.Input<Tensor>("X");
+    auto* out_y = context.Input<Tensor>("Out");
+    auto* d_out =
+        context.Input<framework::Tensor>(framework::GradVarName("Out"));
+    auto* d_x = context.Output<framework::Tensor>(framework::GradVarName("X"));
+    auto out_dtype = context.Attr<int>("in_dtype");
+    auto pt_out_dtype = framework::TransToPhiDataType(
+        static_cast<framework::proto::VarType::Type>(out_dtype));
+    // get reduce_dim and reduce_num for reduce_mean_grad
+    int dim_size = in_x->dims().size();
+    std::vector<int> reduce_dims = GetReduceDim(dims, dim_size, reduce_all);
+    auto update_dims = vectorize(d_x->dims());
+    int reduce_num = 1;
+    for (auto i : reduce_dims) {
+      reduce_num *= (in_x->dims())[i];
+      update_dims[i] = 1;
+    }
+    auto& dev_ctx = context.cuda_device_context();
+
+    // make new tensor reduce_out
+    phi::DenseTensor new_y(out_y->type());
+    new_y.ShareDataWith(*out_y);
+    new_y.Resize(phi::make_ddim(update_dims));
+
+    // make new tensor d_out
+    phi::DenseTensor new_dout(d_out->type());
+    new_dout.ShareDataWith(*d_out);
+    new_dout.Resize(phi::make_ddim(update_dims));
+    d_x->mutable_data(dev_ctx.GetPlace(), d_out->dtype());
+
+    auto new_in = paddle::experimental::MakePhiDenseTensor(*in_x);
+    auto new_in_tensor = new_in.get();
+
+    auto new_dx = paddle::experimental::MakePhiDenseTensor(*d_x);
+    auto new_dx_tensor = new_dx.get();
+
+    // make equal_out
+    phi::DenseTensor* equal_out = new phi::DenseTensor();
+    equal_out->Resize(in_x->dims());
+    dev_ctx.template Alloc<T>(equal_out);
+    auto equal_out_tensor = *equal_out;
+
+    // make new tensor equal_count
+    phi::DenseTensor* equal_count = new phi::DenseTensor();
+    equal_count->Resize(phi::make_ddim(update_dims));
+    dev_ctx.template Alloc<T>(equal_count);
+
+    // compute
+    // 1. equal_out = Equal(x, y)
+    std::vector<const phi::DenseTensor*> equal_inputs = {&new_y,
+                                                         new_in_tensor};
+    std::vector<phi::DenseTensor*> equal_outputs = {&equal_out_tensor};
+    phi::funcs::BroadcastKernel<phi::ElementwiseType::kBinary, T, T>(
+        dev_ctx, equal_inputs, &equal_outputs, 0, EqualFunctor<T>());
+    // 2. equal_count = reduceSum(equal_out)
+    using MPType = typename kps::details::MPTypeTrait<T>::Type;
+    phi::funcs::ReduceKernel<T, T, kps::AddFunctor,
+                             kps::IdentityFunctor<T, MPType>>(
+        dev_ctx, equal_out_tensor, equal_count,
+        kps::IdentityFunctor<T, MPType>(), reduce_dims, false);
+    // 3. dx = Div(dout, equal_out)
+    std::vector<const phi::DenseTensor*> grad_inputs = {&equal_out_tensor,
+                                                        equal_count};
+    std::vector<phi::DenseTensor*> grad_outputs = {new_dx_tensor};
+    phi::funcs::BroadcastKernel<phi::ElementwiseType::kBinary, T, T>(
+        dev_ctx, grad_inputs, &grad_outputs, 0, DivideFunctor<T>());
+    delete equal_out;
+    delete equal_count;
+  }
+};
 #endif
 
 }  // namespace operators
......
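For orientation, the new ReduceCudaAMaxAMinGradKernel builds the gradient from two KP pieces visible above: an equal_out mask (BroadcastKernel with EqualFunctor) marking where the input ties with the reduced extremum, and an equal_count reduction (ReduceKernel with AddFunctor) counting those ties. Under the usual amax/amin convention, positions that tie for the extremum split the upstream gradient evenly. The sketch below illustrates that convention on a plain 1-D slice; it is an independent reference, not Paddle code, and the function name AmaxGradReference and the even-split assumption are mine.

```cpp
// Illustrative only: amax gradient on a 1-D slice, assuming the upstream
// gradient dout is split evenly among all elements that tie for the maximum.
#include <algorithm>
#include <cstddef>
#include <iostream>
#include <vector>

std::vector<float> AmaxGradReference(const std::vector<float>& x, float dout) {
  const float y = *std::max_element(x.begin(), x.end());  // forward amax
  std::size_t ties = 0;  // plays the role of equal_count
  for (float v : x) ties += (v == y) ? 1u : 0u;
  std::vector<float> dx(x.size(), 0.0f);
  for (std::size_t i = 0; i < x.size(); ++i) {
    if (x[i] == y) dx[i] = dout / static_cast<float>(ties);  // equal_out mask
  }
  return dx;
}

int main() {
  for (float g : AmaxGradReference({1.f, 3.f, 3.f, 2.f}, 1.f)) {
    std::cout << g << ' ';  // prints: 0 0.5 0.5 0
  }
  std::cout << '\n';
}
```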
@@ -522,7 +522,22 @@ void ElementwiseCompute(const GPUContext &dev_ctx,
       dev_ctx, ins, &outs, axis, func);
 }
 
-#endif
+template <typename DeviceContext,
+          typename T,
+          typename Functor,
+          typename InverseFunctor>
+void DefaultElementwiseOperator(const DeviceContext &dev_ctx,
+                                const DenseTensor &x,
+                                const DenseTensor &y,
+                                DenseTensor *z,
+                                int axis = -1) {
+  auto x_dims = x.dims();
+  auto y_dims = y.dims();
+  dev_ctx.template Alloc<T>(z);
+  funcs::ElementwiseCompute<Functor, T>(dev_ctx, x, y, axis, Functor(), z);
+}
+
+#else
 
 template <typename DeviceContext,
           typename T,
@@ -544,5 +559,7 @@ void DefaultElementwiseOperator(const DeviceContext &dev_ctx,
   }
 }
 
+#endif
+
 }  // namespace funcs
 }  // namespace phi
@@ -13,7 +13,27 @@
 // limitations under the License.
 
 #include "paddle/phi/kernels/frobenius_norm_kernel.h"
-#include "paddle/phi/kernels/impl/frobenius_norm_kernel_impl.h"
+
+#include "paddle/phi/kernels/funcs/activation_functor.h"
+#include "paddle/phi/kernels/gpu/reduce.h"
+
+namespace phi {
+
+template <typename T, typename Context>
+void FrobeniusNormKernel(const Context& dev_ctx,
+                         const DenseTensor& x,
+                         const std::vector<int64_t>& dims,
+                         bool keep_dim,
+                         bool reduce_all,
+                         DenseTensor* out) {
+  auto out_dtype = x.dtype();
+  phi::Reduce<T, kps::AddFunctor, kps::SquareFunctor>(
+      dev_ctx, x, reduce_all, dims, keep_dim, out_dtype, out);
+  std::vector<const DenseTensor*> ins = {out};
+  std::vector<DenseTensor*> outs = {out};
+  auto functor = funcs::CudaSqrtFunctor<T>();
+  funcs::ElementwiseKernel<T>(dev_ctx, ins, &outs, functor);
+}
+
+}  // namespace phi
 
 #include "paddle/phi/core/kernel_registry.h"
......
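The rewritten GPU FrobeniusNormKernel is a two-step pipeline: a sum-of-squares reduction (kps::AddFunctor over kps::SquareFunctor-transformed inputs) over dims, followed by an element-wise funcs::CudaSqrtFunctor on the reduced tensor, matching the definition ||X||_F = sqrt(sum over i,j of x_ij^2). A tiny standalone check of that decomposition, illustrative only and not the kernel itself:

```cpp
// Illustrative only: Frobenius norm as reduce-sum-of-squares followed by sqrt.
#include <cmath>
#include <iostream>
#include <vector>

int main() {
  std::vector<float> x = {3.f, 4.f, 0.f};  // a flattened tensor, fully reduced
  float sum_sq = 0.f;
  for (float v : x) sum_sq += v * v;       // reduce with Add over Square
  float frobenius = std::sqrt(sum_sq);     // element-wise sqrt of the reduction
  std::cout << frobenius << '\n';          // prints 5
}
```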