Unverified commit 3825b40f, authored by Noel, committed by GitHub

[pnorm] fix bug in fp16 & optimize memory (#39011)

Parent c1e5a393
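Background for the diff below: p_norm computes ||x||_p = (sum_i |x_i|^p)^(1/p) along an axis (or over the whole tensor when asvector is set). The commit fixes a half-precision type-mixing bug in the CUDA forward path and removes two temporary tensors by fusing the |x|^p transform into the reduction. A minimal NumPy sketch of the forward semantics (a reference sketch with a hypothetical name, not Paddle's API):

```python
import numpy as np

# Reference semantics of the p_norm forward pass for finite nonzero p:
# reduce |x|**p along `axis`, then take the 1/p power of the result.
def p_norm_ref(x, axis, porder):
    return np.power(np.sum(np.power(np.abs(x), porder), axis=axis), 1.0 / porder)
```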
@@ -76,22 +76,13 @@ struct AbsFunctor {
   }
 };
 
-template <typename Tx, typename Ty = Tx>
+template <typename T>
 struct UnsignedPowFunctor {
   HOSTDEVICE explicit inline UnsignedPowFunctor(float porder) {
     this->porder = porder;
   }
-  HOSTDEVICE inline Ty operator()(const Tx x) const {
-    return static_cast<Ty>(inline_pow(inline_abs(x), static_cast<Tx>(porder)));
-  }
-  float porder;
-};
-
-template <typename Tx, typename Ty = Tx>
-struct PowFunctor {
-  HOSTDEVICE explicit inline PowFunctor(float porder) { this->porder = porder; }
-  HOSTDEVICE inline Ty operator()(const Tx x) const {
-    return static_cast<Ty>(inline_pow(x, static_cast<Tx>(porder)));
+  HOSTDEVICE inline T operator()(const T x) const {
+    return static_cast<T>(inline_pow(inline_abs(x), static_cast<T>(porder)));
   }
   float porder;
 };
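This hunk collapses the two-type UnsignedPowFunctor<Tx, Ty> into a single-type form and drops the now-unused PowFunctor. The old forward path instantiated it as UnsignedPowFunctor<MT, T>, mixing the fp16 storage type T with the float compute type MT, which appears to be the fp16 bug the commit title refers to. In NumPy terms the functor is simply |x|**porder; a minimal sketch with a hypothetical name, not the Paddle code:

```python
import numpy as np

# NumPy analogue of UnsignedPowFunctor: the elementwise transform |x| ** porder
# that the CUDA kernel applies to each value before reducing.
def unsigned_pow(x, porder):
    return np.power(np.abs(x), porder)
```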
@@ -105,13 +96,11 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
     const T* x = in_x->data<T>();
     T* norm = out_norm->mutable_data<T>(ctx.GetPlace());
     auto xdim = in_x->dims();
-    auto ndim = out_norm->dims();
     float porder = ctx.Attr<float>("porder");
     bool asvector = ctx.Attr<bool>("asvector");
     int axis = ctx.Attr<int>("axis");
     std::vector<int> reduce_axis = {axis};
     reduce_axis = GetReduceDim(reduce_axis, xdim.size(), asvector);
     auto stream = ctx.cuda_device_context().stream();
 
     using MT = typename details::MPTypeTrait<T>::Type;
@@ -125,29 +114,17 @@ class PnormCUDAKernel : public framework::OpKernel<T> {
       TensorReduceFunctorImpl<T, T, kps::MinFunctor, AbsFunctor<T>>(
           *in_x, out_norm, AbsFunctor<T>(), reduce_axis, stream);
     } else {
-      framework::Tensor tmp_x;
-      tmp_x.mutable_data<T>(xdim, ctx.GetPlace());
-      std::vector<const framework::Tensor*> ins = {in_x};
-      std::vector<framework::Tensor*> outs = {&tmp_x};
-      auto func = UnsignedPowFunctor<MT, T>(porder);
+      TensorReduceFunctorImpl<T, T, kps::AddFunctor, UnsignedPowFunctor<T>>(
+          *in_x, out_norm, UnsignedPowFunctor<T>(porder), reduce_axis, stream);
+      const framework::Tensor* tmp_norm = out_norm;
+      std::vector<const framework::Tensor*> ins = {tmp_norm};
+      std::vector<framework::Tensor*> outs = {out_norm};
       const auto& cuda_ctx =
           ctx.template device_context<platform::CUDADeviceContext>();
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-          ElementwiseType::kUnary, MT, T, UnsignedPowFunctor<MT, T>>(
-          cuda_ctx, ins, &outs, func);
-      framework::Tensor tmp_y;
-      tmp_y.mutable_data<T>(ndim, ctx.GetPlace());
-      TensorReduceFunctorImpl<T, T, kps::AddFunctor, kps::IdentityFunctor<T>>(
-          tmp_x, &tmp_y, kps::IdentityFunctor<T>(), reduce_axis, stream);
-      const framework::Tensor* tmp_norm = &tmp_y;
-      ins = {tmp_norm};
-      outs = {out_norm};
-      auto func_inverse = UnsignedPowFunctor<MT, T>(1. / porder);
       paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-          ElementwiseType::kUnary, MT, T, UnsignedPowFunctor<MT, T>>(
-          cuda_ctx, ins, &outs, func_inverse);
+          ElementwiseType::kUnary, T, T, UnsignedPowFunctor<T>>(
+          cuda_ctx, ins, &outs, UnsignedPowFunctor<T>(1. / porder));
     }
   }
 };
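This hunk is the memory optimization: the old path materialized |x|^porder into an input-sized tmp_x, reduced it into an output-sized tmp_y, and then applied the 1/porder power into out_norm. The new path hands UnsignedPowFunctor<T>(porder) to TensorReduceFunctorImpl as the per-element transform, so |x|^porder is computed on the fly inside the reduction, and the final 1/porder power runs in place on out_norm (ins and outs both alias it). A NumPy sketch of the two data flows, with hypothetical function names (NumPy cannot literally fuse a transform into a reduction, so this only illustrates the buffers involved):

```python
import numpy as np

def pnorm_forward_old(x, axis, porder):
    tmp_x = np.abs(x) ** porder     # input-sized temporary
    tmp_y = tmp_x.sum(axis=axis)    # output-sized temporary
    return tmp_y ** (1.0 / porder)  # separate result buffer

def pnorm_forward_new(x, axis, porder):
    out = (np.abs(x) ** porder).sum(axis=axis)  # transform fused into the reduce
    out **= 1.0 / porder                        # 1/p power applied in place
    return out
```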
@@ -158,29 +135,25 @@ struct AbsMaxAndMinGradFunctor {
             typename DY, typename Dim>
   void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                   const Dim& dim, int size) {
-    auto equals = ((*x).abs() == y->broadcast(dim));
-    auto ones = dx->constant(static_cast<T>(1.));
-    auto negs = dx->constant(static_cast<T>(-1.));
-    auto zeros = dx->constant(static_cast<T>(0.));
-    auto positives = (*x) > zeros;
-    dx->device(place) = dy->broadcast(dim) * equals.select(ones, zeros) *
-                        positives.select(ones, negs);
+    dx->device(place) = dy->broadcast(dim) * (*x).sign() *
+                        ((*x).abs() == y->broadcast(dim)).template cast<T>();
   }
 };
 
 template <typename T>
-struct PNormPostGradFunctor {
+struct PNormGradFunctor {
+  HOSTDEVICE explicit inline PNormGradFunctor(float porder) {
+    this->porder = static_cast<T>(porder - 1.);
+  }
   template <typename DeviceContext, typename X, typename Y, typename DX,
             typename DY, typename Dim>
   void operator()(const DeviceContext& place, X* x, Y* y, DX* dx, DY* dy,
                   const Dim& dim, int size) {
-    auto ones = dx->constant(static_cast<T>(1.));
-    auto negs = dx->constant(static_cast<T>(-1.));
-    auto zeros = dx->constant(static_cast<T>(0.));
-    auto positives = (*x) > zeros;
-    dx->device(place) = (*dx) * dy->broadcast(dim) * y->broadcast(dim) *
-                        positives.select(ones, negs);
+    dx->device(place) = (*x).abs().pow(this->porder) * (*x).sign() *
+                        dy->broadcast(dim) *
+                        (*y).pow(-this->porder).broadcast(dim);
   }
+  T porder;
 };
 
 template <typename DeviceContext, typename T, typename AttrType = T>
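For reference, with y = ||x||_p and upstream gradient dy broadcast over the reduced axis, these functors implement dx = |x|^(p-1) * sign(x) * dy * y^(1-p) for finite nonzero p (the stored porder member is p-1, so (*y).pow(-porder) is y^(1-p)), and dx = dy * sign(x) * 1[|x| = y] for p = +/-inf. The rewrite fuses each formula into one Eigen expression instead of building ones/zeros masks with select(), and PNormGradFunctor now consumes in_norm directly rather than a precomputed power of it. A NumPy sketch of the math (reference formulas under hypothetical names, not Paddle's API):

```python
import numpy as np

def pnorm_grad(x, y, dy, porder):
    # dx = |x|**(p-1) * sign(x) * dy * y**(1-p); y and dy broadcast over x
    return np.abs(x) ** (porder - 1) * np.sign(x) * dy * y ** (1 - porder)

def absmaxmin_grad(x, y, dy):
    # p = +/-inf: gradient flows only where |x| attains the extreme value y
    return dy * np.sign(x) * (np.abs(x) == y).astype(x.dtype)
```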
@@ -207,26 +180,13 @@ class PnormGradCUDAKernel : public framework::OpKernel<T> {
       math::SetConstant<DeviceContext, T> set_zero;
       set_zero(cuda_ctx, out_dx, static_cast<T>(0));
     } else if (porder == INFINITY || porder == -INFINITY) {
+      AbsMaxAndMinGradFunctor<T> functor;
       LaunchReduceGradKernel<DeviceContext, T, AbsMaxAndMinGradFunctor<T>>(
-          ctx, in_x, in_norm, in_norm_dy, out_dx, dims, reduce_all);
+          ctx, in_x, in_norm, in_norm_dy, out_dx, functor, dims, reduce_all);
     } else {
-      framework::Tensor tmp_norm;
-      tmp_norm.mutable_data<T>(in_norm->dims(), ctx.GetPlace());
-      std::vector<const framework::Tensor*> ins = {in_norm};
-      std::vector<framework::Tensor*> outs = {&tmp_norm};
-      auto pow_functor = PowFunctor<T>(1. - porder);
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-          ElementwiseType::kUnary, T, T, PowFunctor<T>>(cuda_ctx, ins, &outs,
-                                                        pow_functor);
-      ins = {in_x};
-      outs = {out_dx};
-      auto unsigned_pow = UnsignedPowFunctor<T>(porder - 1.);
-      paddle::operators::LaunchSameDimsElementwiseCudaKernel<
-          ElementwiseType::kUnary, T, T, UnsignedPowFunctor<T>>(
-          cuda_ctx, ins, &outs, unsigned_pow);
-      const framework::Tensor* tmp_norm_const = &tmp_norm;
-      LaunchReduceGradKernel<DeviceContext, T, PNormPostGradFunctor<T>>(
-          ctx, in_x, tmp_norm_const, in_norm_dy, out_dx, dims, reduce_all);
+      auto functor = PNormGradFunctor<T>(porder);
+      LaunchReduceGradKernel<DeviceContext, T, PNormGradFunctor<T>>(
+          ctx, in_x, in_norm, in_norm_dy, out_dx, functor, dims, reduce_all);
     }
   }
 };
......
@@ -139,26 +139,27 @@ class LogsumexpGradKernel : public framework::OpKernel<T> {
               broadcast_dim[0]);
     } else {
       int rank = input->dims().size();
+      LogsumexpGradFunctor functor;
       switch (rank) {
         case 1:
           ReduceGradFunctor<DeviceContext, T, 1, LogsumexpGradFunctor>(
               context.template device_context<DeviceContext>(), *input, *output,
-              *output_grad, input_grad, axis);
+              *output_grad, input_grad, functor, axis);
           break;
         case 2:
           ReduceGradFunctor<DeviceContext, T, 2, LogsumexpGradFunctor>(
               context.template device_context<DeviceContext>(), *input, *output,
-              *output_grad, input_grad, axis);
+              *output_grad, input_grad, functor, axis);
           break;
        case 3:
           ReduceGradFunctor<DeviceContext, T, 3, LogsumexpGradFunctor>(
               context.template device_context<DeviceContext>(), *input, *output,
-              *output_grad, input_grad, axis);
+              *output_grad, input_grad, functor, axis);
           break;
        case 4:
           ReduceGradFunctor<DeviceContext, T, 4, LogsumexpGradFunctor>(
               context.template device_context<DeviceContext>(), *input, *output,
-              *output_grad, input_grad, axis);
+              *output_grad, input_grad, functor, axis);
           break;
       }
     }
......
@@ -143,7 +143,7 @@ void HandleLargeDimGrad(const framework::ExecutionContext& context,
                         const framework::Tensor* x,
                         const framework::Tensor* out,
                         const framework::Tensor* dout, framework::Tensor* dx,
-                        const std::vector<int>& dims) {
+                        Functor functor, const std::vector<int>& dims) {
   const int64_t unreduced = out->numel();
   const int64_t reduced = x->numel() / unreduced;
   DDim out_dim(out->dims());
@@ -157,7 +157,7 @@ void HandleLargeDimGrad(const framework::ExecutionContext& context,
   dx->Resize({unreduced, reduced});
   ReduceGradFunctor<DeviceContext, T, 2, Functor>(
       context.template device_context<DeviceContext>(), shuffled_x, *out, *dout,
-      dx, {1});
+      dx, functor, {1});
   // transpose dX
   std::vector<int> origin_axis(x_dim.size());
   GetOriginDimFromShuffled(x_dim, dims, &origin_axis);
@@ -333,7 +333,7 @@ void LaunchReduceGradKernel(const framework::ExecutionContext& context,
                             const framework::Tensor* input0,
                             const framework::Tensor* input1,
                             const framework::Tensor* input2,
-                            paddle::framework::Tensor* output,
+                            paddle::framework::Tensor* output, Functor functor,
                             const std::vector<int>& dims,
                             bool reduce_all = false) {
   if (reduce_all) {
@@ -345,7 +345,6 @@ void LaunchReduceGradKernel(const framework::ExecutionContext& context,
         *context.template device_context<DeviceContext>().eigen_device();
     auto broadcast_dim =
         Eigen::array<int, 1>({{static_cast<int>(input0->numel())}});
-    Functor functor;
     functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
             broadcast_dim[0]);
   } else {
@@ -354,36 +353,36 @@ void LaunchReduceGradKernel(const framework::ExecutionContext& context,
       case 1:
         ReduceGradFunctor<DeviceContext, T, 1, Functor>(
             context.template device_context<DeviceContext>(), *input0, *input1,
-            *input2, output, dims);
+            *input2, output, functor, dims);
         break;
       case 2:
         ReduceGradFunctor<DeviceContext, T, 2, Functor>(
            context.template device_context<DeviceContext>(), *input0, *input1,
-            *input2, output, dims);
+            *input2, output, functor, dims);
         break;
       case 3:
         ReduceGradFunctor<DeviceContext, T, 3, Functor>(
            context.template device_context<DeviceContext>(), *input0, *input1,
-            *input2, output, dims);
+            *input2, output, functor, dims);
         break;
       case 4:
         ReduceGradFunctor<DeviceContext, T, 4, Functor>(
            context.template device_context<DeviceContext>(), *input0, *input1,
-            *input2, output, dims);
+            *input2, output, functor, dims);
         break;
       case 5:
         ReduceGradFunctor<DeviceContext, T, 5, Functor>(
            context.template device_context<DeviceContext>(), *input0, *input1,
-            *input2, output, dims);
+            *input2, output, functor, dims);
         break;
       case 6:
         ReduceGradFunctor<DeviceContext, T, 6, Functor>(
            context.template device_context<DeviceContext>(), *input0, *input1,
-            *input2, output, dims);
+            *input2, output, functor, dims);
         break;
       default:
-        HandleLargeDimGrad<DeviceContext, T, Functor>(context, input0, input1,
-                                                      input2, output, dims);
+        HandleLargeDimGrad<DeviceContext, T, Functor>(
+            context, input0, input1, input2, output, functor, dims);
         break;
     }
   }
@@ -430,8 +429,10 @@ class ReduceGradKernel : public framework::OpKernel<T> {
     // NOTE(dengkaipeng): Out is unnecessary in some reduce kernel and
     // not be set as Input in grad Maker, use Out_grad to replace here
     if (!input1) input1 = input2;
-    LaunchReduceGradKernel<DeviceContext, T, Functor>(
-        context, input0, input1, input2, output, const_dims, reduce_all);
+    Functor functor;
+    LaunchReduceGradKernel<DeviceContext, T, Functor>(context, input0, input1,
+                                                      input2, output, functor,
+                                                      const_dims, reduce_all);
   }
 
   void Compute(const framework::ExecutionContext& context) const override {
......
@@ -74,7 +74,7 @@ void ReduceGradFunctor(const DeviceContext& context,
                        const framework::Tensor& input0,
                        const framework::Tensor& input1,
                        const framework::Tensor& input2,
-                       framework::Tensor* output,
+                       framework::Tensor* output, Functor functor,
                        const std::vector<int>& dims) {
   auto x = EigenTensor<T, D>::From(input0);
   auto x_grad = EigenTensor<T, D>::From(*output);
@@ -100,7 +100,6 @@ void ReduceGradFunctor(const DeviceContext& context,
   auto& place = *context.eigen_device();
-  Functor functor;
   functor(place, &x, &x_reduce, &x_grad, &x_reduce_grad, broadcast_dim,
           broad_cats_times);
 }
......
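The reduce_op.h and reduce_op_function.h changes above are the plumbing that makes the fused gradient possible: ReduceGradFunctor, LaunchReduceGradKernel, and HandleLargeDimGrad used to default-construct `Functor functor;` internally, which only works for stateless functors. They now take the functor as a parameter, so a stateful one such as PNormGradFunctor<T>(porder) can be threaded through, while stateless callers (ReduceGradKernel, the logsumexp kernel) simply construct theirs at the call site. A minimal sketch of the pattern with hypothetical names, not the Paddle signatures:

```python
import numpy as np

class PNormGrad:
    """Stateful functor: carries porder, so it cannot be default-constructed."""

    def __init__(self, porder):
        self.porder = porder

    def __call__(self, x, y, dy):
        return (np.abs(x) ** (self.porder - 1) * np.sign(x) * dy *
                y ** (1 - self.porder))

def launch_reduce_grad(x, y, dy, functor):
    # The launcher now receives a ready-made functor instead of calling
    # Functor() itself.
    return functor(x, y, dy)
```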
@@ -19,11 +19,12 @@ import numpy as np
 from op_test import OpTest
 import paddle
 import paddle.fluid as fluid
+import paddle.fluid.core as core
 
 
-def p_norm(x, axis, porder, keepdims=False):
+def p_norm(x, axis, porder, keepdims=False, reduce_all=False):
     r = []
-    if axis is None:
+    if axis is None or reduce_all:
         x = x.flatten()
         if porder == np.inf:
             r = np.amax(np.abs(x), keepdims=keepdims)
@@ -53,8 +54,8 @@ def p_norm(x, axis, porder, keepdims=False):
     else:
         if isinstance(axis, list):
             axis = tuple(axis)
-        r = np.linalg.norm(
-            x, ord=porder, axis=axis, keepdims=keepdims).astype(x.dtype)
+        r = np.linalg.norm(x, ord=porder, axis=axis, keepdims=keepdims)
+        r = r.astype(x.dtype)
     return r
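A quick sanity check of this reference helper (assuming the elided flattened branch falls through to np.linalg.norm for finite p):

```python
import numpy as np

x = np.ones((2, 3), dtype=np.float64)
print(p_norm(x, axis=1, porder=2.0))                   # [sqrt(3), sqrt(3)]
print(p_norm(x, axis=1, porder=2.0, reduce_all=True))  # sqrt(6): whole tensor as one vector
```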
@@ -111,13 +112,14 @@ class TestPnormOp(OpTest):
         self.op_type = "p_norm"
         self.init_test_case()
         x = (np.random.random(self.shape) + 0.5).astype(self.dtype)
-        norm = p_norm(x, self.axis, self.porder, self.keepdim)
+        norm = p_norm(x, self.axis, self.porder, self.keepdim, self.asvector)
         self.inputs = {'X': x}
         self.attrs = {
             'epsilon': self.epsilon,
             'axis': self.axis,
             'keepdim': self.keepdim,
-            'porder': float(self.porder)
+            'porder': float(self.porder),
+            'asvector': self.asvector
         }
         self.outputs = {'Out': norm}
         self.gradient = self.calc_gradient()
@@ -135,34 +137,42 @@ class TestPnormOp(OpTest):
         self.porder = 2.0
         self.keepdim = False
         self.dtype = "float64"
+        self.asvector = False
 
     def calc_gradient(self):
         self.attrs = {
             'epsilon': self.epsilon,
             'axis': self.axis,
             'keepdim': self.keepdim,
-            'porder': float(self.porder)
+            'porder': float(self.porder),
+            'asvector': self.asvector
         }
         x = self.inputs["X"]
         porder = self.attrs["porder"]
         axis = self.attrs["axis"]
+        asvector = self.attrs["asvector"]
+        x_dtype = x.dtype
+        x = x.astype(np.float32) if x.dtype == np.float16 else x
         if porder == 0:
             grad = np.zeros(x.shape).astype(x.dtype)
         elif porder in [float("inf"), float("-inf")]:
-            norm = p_norm(x, axis=axis, porder=porder, keepdims=True)
+            norm = p_norm(
+                x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector)
             x_abs = np.abs(x)
             grad = np.sign(x)
             grad[x_abs != norm] = 0.0
         else:
-            norm = p_norm(x, axis=axis, porder=porder, keepdims=True)
+            norm = p_norm(
+                x, axis=axis, porder=porder, keepdims=True, reduce_all=asvector)
             grad = np.power(norm, 1 - porder) * np.power(
                 np.abs(x), porder - 1) * np.sign(x)
 
         numel = 1
         for s in x.shape:
             numel *= s
-        numel /= x.shape[axis]
-        return [grad.astype(x.dtype) * 1 / numel]
+        divisor = numel if asvector else x.shape[axis]
+        numel /= divisor
+        return [grad.astype(x_dtype) * 1 / numel]
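Two details in calc_gradient above are worth calling out. First, the fp16 fix on the test side: the reference gradient is computed in float32 (x is upcast when its dtype is float16) and only cast back on return, so the hand-rolled grads keep enough precision to compare against the CUDA fp16 kernel. Second, the divisor: when asvector is set the whole tensor collapses to a single norm, so the averaging divides by the full element count instead of one axis length. A sketch of the upcast pattern (hypothetical helper, not in the test file):

```python
import numpy as np

def fp16_safe_grad(grad_fn, x):
    # Compute a reference gradient in float32, return it in the caller's dtype.
    x_dtype = x.dtype
    x32 = x.astype(np.float32) if x.dtype == np.float16 else x
    return grad_fn(x32).astype(x_dtype)
```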
class TestPnormOp2(TestPnormOp):
@@ -173,6 +183,7 @@ class TestPnormOp2(TestPnormOp):
         self.porder = 2.0
         self.keepdim = True
         self.dtype = "float32"
+        self.asvector = False
 
     def test_check_grad(self):
         self.check_grad(['X'], 'Out')
@@ -186,6 +197,7 @@ class TestPnormOp3(TestPnormOp):
         self.porder = np.inf
         self.keepdim = True
         self.dtype = "float32"
+        self.asvector = False
 
     def test_check_grad(self):
         self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
@@ -199,6 +211,7 @@ class TestPnormOp4(TestPnormOp):
         self.porder = -np.inf
         self.keepdim = True
         self.dtype = "float32"
+        self.asvector = False
 
     def test_check_grad(self):
         self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
@@ -212,11 +225,63 @@ class TestPnormOp5(TestPnormOp):
         self.porder = 0
         self.keepdim = True
         self.dtype = "float32"
+        self.asvector = False
 
     def test_check_grad(self):
         self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
 
 
+class TestPnormOp6(TestPnormOp):
+    def init_test_case(self):
+        self.shape = [3, 20, 3]
+        self.axis = -1
+        self.epsilon = 1e-12
+        self.porder = 2
+        self.keepdim = False
+        self.dtype = "float32"
+        self.asvector = True
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out', user_defined_grads=self.gradient)
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestPnormOpFP16(TestPnormOp):
+    def init_test_case(self):
+        self.shape = [2, 3, 4, 5]
+        self.axis = 1
+        self.epsilon = 1e-12
+        self.porder = 2.0
+        self.keepdim = False
+        self.dtype = "float16"
+        self.asvector = False
+
+    def test_check_output(self):
+        place = core.CUDAPlace(0)
+        if core.is_float16_supported(place):
+            self.check_output_with_place(place, atol=1e-3)
+
+    def test_check_grad(self):
+        place = core.CUDAPlace(0)
+        if core.is_float16_supported(place):
+            self.check_grad_with_place(
+                place, ['X'], 'Out', user_defined_grads=self.gradient)
+
+
+@unittest.skipIf(not core.is_compiled_with_cuda(),
+                 "core is not compiled with CUDA")
+class TestPnormOpFP161(TestPnormOpFP16):
+    def init_test_case(self):
+        self.shape = [2, 3, 4, 5]
+        self.axis = -1
+        self.epsilon = 1e-12
+        self.porder = 2.0
+        self.keepdim = False
+        self.dtype = "float16"
+        self.asvector = True
+
+
 def run_fro(self, p, axis, shape_x, dtype, keep_dim, check_dim=False):
     with fluid.program_guard(fluid.Program()):
         data = fluid.data(name="X", shape=shape_x, dtype=dtype)
......