未验证 提交 fecbc958 编写于 作者: Q QingshuChen 提交者: GitHub

add some fp16 op for kunlun resnet50 model (#44672)

* add some fp16 op for kunlun resnet50 model
*test=kunlun

* tmp
*test=kunlun
上级 a9919903
......@@ -23,6 +23,8 @@ using Tensor = framework::Tensor;
template <typename T>
class ResNetUnitXPUKernel : public framework::OpKernel<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto place = ctx.GetPlace();
......@@ -63,9 +65,12 @@ class ResNetUnitXPUKernel : public framework::OpKernel<T> {
std::string act_type = ctx.Attr<std::string>("act_type");
auto &dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
std::vector<const T *> x_list = {input_x->data<T>()};
std::vector<const T *> w_list = {filter_x->data<T>()};
std::vector<T *> conv_y_list = {conv_out_x->mutable_data<T>(place)};
std::vector<const XPUType *> x_list = {
reinterpret_cast<const XPUType *>(input_x->data<T>())};
std::vector<const XPUType *> w_list = {
reinterpret_cast<const XPUType *>(filter_x->data<T>())};
std::vector<XPUType *> conv_y_list = {
reinterpret_cast<XPUType *>(conv_out_x->mutable_data<T>(place))};
std::vector<std::vector<int>> x_shape_list = {
phi::vectorize<int>(input_x->dims())};
......@@ -107,9 +112,10 @@ class ResNetUnitXPUKernel : public framework::OpKernel<T> {
Tensor *running_mean_z = ctx.Output<Tensor>("RunningMeanZ");
Tensor *running_var_z = ctx.Output<Tensor>("RunningVarZ");
x_list.push_back(input_z->data<T>());
w_list.push_back(filter_z->data<T>());
conv_y_list.push_back(conv_out_z->mutable_data<T>(place));
x_list.push_back(reinterpret_cast<const XPUType *>(input_z->data<T>()));
w_list.push_back(reinterpret_cast<const XPUType *>(filter_z->data<T>()));
conv_y_list.push_back(
reinterpret_cast<XPUType *>(conv_out_z->mutable_data<T>(place)));
x_shape_list.push_back(phi::vectorize<int>(input_z->dims()));
......@@ -133,17 +139,17 @@ class ResNetUnitXPUKernel : public framework::OpKernel<T> {
if (fuse_add) {
const Tensor *input_z = ctx.Input<Tensor>("Z");
auto input_z_shape = phi::vectorize<int>(input_z->dims());
x_list.push_back(input_z->data<T>());
x_list.push_back(reinterpret_cast<const XPUType *>(input_z->data<T>()));
x_shape_list.push_back(input_z_shape);
x_maxlist.push_back(nullptr);
}
}
int r = xpu::resnet_unit_fusion<T, T, T, int16_t>(
int r = xpu::resnet_unit_fusion<XPUType, XPUType, XPUType, int16_t>(
dev_ctx.x_context(),
x_list,
w_list,
conv_y_list,
output->mutable_data<T>(place),
reinterpret_cast<XPUType *>(output->mutable_data<T>(place)),
x_shape_list,
filter_x_shape[0],
ksize_list,
......@@ -172,6 +178,8 @@ class ResNetUnitXPUKernel : public framework::OpKernel<T> {
template <typename T>
class ResNetUnitGradXPUKernel : public framework::OpKernel<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext &ctx) const override {
auto place = ctx.GetPlace();
......@@ -208,11 +216,16 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel<T> {
auto &dev_ctx = ctx.template device_context<platform::XPUDeviceContext>();
std::vector<const T *> x_list = {x->data<T>()};
std::vector<const T *> w_list = {filter_x->data<T>()};
std::vector<const T *> conv_y_list = {conv_out_x->data<T>()};
std::vector<T *> dx_list = {x_grad->mutable_data<T>(place)};
std::vector<T *> dw_list = {filter_x_grad->mutable_data<T>(place)};
std::vector<const XPUType *> x_list = {
reinterpret_cast<const XPUType *>(x->data<T>())};
std::vector<const XPUType *> w_list = {
reinterpret_cast<const XPUType *>(filter_x->data<T>())};
std::vector<const XPUType *> conv_y_list = {
reinterpret_cast<const XPUType *>(conv_out_x->data<T>())};
std::vector<XPUType *> dx_list = {
reinterpret_cast<XPUType *>(x_grad->mutable_data<T>(place))};
std::vector<XPUType *> dw_list = {
reinterpret_cast<XPUType *>(filter_x_grad->mutable_data<T>(place))};
std::vector<std::vector<int>> x_shape_list = {
phi::vectorize<int>(x->dims())};
......@@ -262,11 +275,14 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel<T> {
Tensor *scale_z_grad =
ctx.Output<Tensor>(framework::GradVarName("ScaleZ"));
Tensor *bias_z_grad = ctx.Output<Tensor>(framework::GradVarName("BiasZ"));
x_list.push_back(z->data<T>());
w_list.push_back(filter_z->data<T>());
conv_y_list.push_back(conv_out_z->data<T>());
dx_list.push_back(z_grad->mutable_data<T>(place));
dw_list.push_back(filter_z_grad->mutable_data<T>(place));
x_list.push_back(reinterpret_cast<const XPUType *>(z->data<T>()));
w_list.push_back(reinterpret_cast<const XPUType *>(filter_z->data<T>()));
conv_y_list.push_back(
reinterpret_cast<const XPUType *>(conv_out_z->data<T>()));
dx_list.push_back(
reinterpret_cast<XPUType *>(z_grad->mutable_data<T>(place)));
dw_list.push_back(
reinterpret_cast<XPUType *>(filter_z_grad->mutable_data<T>(place)));
x_shape_list.push_back(phi::vectorize<int>(z->dims()));
auto filter_z_shape = phi::vectorize<int>(filter_z->dims());
......@@ -288,38 +304,39 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel<T> {
} else {
if (fuse_add) {
auto z_grad = ctx.Output<Tensor>(framework::GradVarName("Z"));
dx_list.push_back(z_grad->mutable_data<T>(place));
dx_list.push_back(
reinterpret_cast<XPUType *>(z_grad->mutable_data<T>(place)));
}
}
int r =
xpu::resnet_unit_grad_fusion<T, T, T, int16_t>(dev_ctx.x_context(),
x_list,
w_list,
y_grad->data<T>(),
output->data<T>(),
conv_y_list,
dx_list,
dw_list,
x_shape_list,
filter_x_shape[0],
ksize_list,
stride_list,
paddings,
dilations,
group,
x_maxlist,
w_maxlist,
scale_list,
batch_mean_list,
batch_invstd_list,
dscale_list,
dbias_list,
xpu::Activation_t::RELU,
eps,
is_nchw,
has_shortcut,
fuse_add);
int r = xpu::resnet_unit_grad_fusion<XPUType, XPUType, XPUType, int16_t>(
dev_ctx.x_context(),
x_list,
w_list,
reinterpret_cast<const XPUType *>(y_grad->data<T>()),
reinterpret_cast<const XPUType *>(output->data<T>()),
conv_y_list,
dx_list,
dw_list,
x_shape_list,
filter_x_shape[0],
ksize_list,
stride_list,
paddings,
dilations,
group,
x_maxlist,
w_maxlist,
scale_list,
batch_mean_list,
batch_invstd_list,
dscale_list,
dbias_list,
xpu::Activation_t::RELU,
eps,
is_nchw,
has_shortcut,
fuse_add);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "resnet_unit_grad_fusion");
}
};
......@@ -329,5 +346,9 @@ class ResNetUnitGradXPUKernel : public framework::OpKernel<T> {
namespace ops = paddle::operators;
namespace plat = paddle::platform;
REGISTER_OP_XPU_KERNEL(resnet_unit, ops::ResNetUnitXPUKernel<float>);
REGISTER_OP_XPU_KERNEL(resnet_unit_grad, ops::ResNetUnitGradXPUKernel<float>);
REGISTER_OP_XPU_KERNEL(resnet_unit,
ops::ResNetUnitXPUKernel<plat::float16>,
ops::ResNetUnitXPUKernel<float>);
REGISTER_OP_XPU_KERNEL(resnet_unit_grad,
ops::ResNetUnitGradXPUKernel<plat::float16>,
ops::ResNetUnitGradXPUKernel<float>);
......@@ -22,6 +22,8 @@ namespace operators {
template <typename T>
class LarsMomentumOpXPUKernel : public framework::OpKernel<T> {
using XPUType = typename XPUTypeTrait<T>::Type;
public:
void Compute(const framework::ExecutionContext& ctx) const override {
bool multi_precision = ctx.Attr<bool>("multi_precision");
......@@ -35,14 +37,14 @@ class LarsMomentumOpXPUKernel : public framework::OpKernel<T> {
auto master_param = ctx.MultiInput<framework::LoDTensor>("MasterParam");
auto master_param_out =
ctx.MultiOutput<framework::LoDTensor>("MasterParamOut");
T mu = static_cast<T>(ctx.Attr<float>("mu"));
T lars_coeff = ctx.Attr<float>("lars_coeff");
T epsilon = ctx.Attr<float>("epsilon");
T rescale_grad = ctx.Attr<float>("rescale_grad");
// Hyper-parameters are read as float for every kernel instantiation. `mu` must
// not be routed through T: for the float16 kernel that cast would round it to
// half precision before it is widened back to float.
float mu = ctx.Attr<float>("mu");
float lars_coeff = ctx.Attr<float>("lars_coeff");
float epsilon = ctx.Attr<float>("epsilon");
float rescale_grad = ctx.Attr<float>("rescale_grad");
std::vector<T*> param_list;
std::vector<T*> grad_list;
std::vector<T*> param_out_list;
std::vector<XPUType*> param_list;
std::vector<XPUType*> grad_list;
std::vector<XPUType*> param_out_list;
std::vector<float*> velocity_list;
std::vector<float*> velocity_out_list;
std::vector<float*> lrs;
......@@ -52,9 +54,12 @@ class LarsMomentumOpXPUKernel : public framework::OpKernel<T> {
std::vector<float*> master_param_out_list;
int op_num = param.size();
for (int i = 0; i < op_num; ++i) {
param_list.push_back(const_cast<T*>(param[i]->data<T>()));
grad_list.push_back(const_cast<T*>(grad[i]->data<T>()));
param_out_list.push_back(param_out[i]->mutable_data<T>(ctx.GetPlace()));
param_list.push_back(
reinterpret_cast<XPUType*>(const_cast<T*>((param[i]->data<T>()))));
grad_list.push_back(
reinterpret_cast<XPUType*>(const_cast<T*>(grad[i]->data<T>())));
param_out_list.push_back(reinterpret_cast<XPUType*>(
param_out[i]->mutable_data<T>(ctx.GetPlace())));
velocity_list.push_back(const_cast<float*>(velocity[i]->data<float>()));
velocity_out_list.push_back(
velocity_out[i]->mutable_data<float>(ctx.GetPlace()));
......@@ -111,5 +116,7 @@ class LarsMomentumOpXPUKernel : public framework::OpKernel<T> {
} // namespace paddle
namespace ops = paddle::operators;
REGISTER_OP_XPU_KERNEL(lars_momentum, ops::LarsMomentumOpXPUKernel<float>);
REGISTER_OP_XPU_KERNEL(lars_momentum,
ops::LarsMomentumOpXPUKernel<paddle::platform::float16>,
ops::LarsMomentumOpXPUKernel<float>);
#endif
......@@ -231,7 +231,9 @@ XPUOpMap& get_kl2_ops() {
pOpKernelType(vartype::FP16, XPUPlace())})},
{"generate_proposals_v2",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"grad_add", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"grad_add",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"greater_equal",
XPUKernelSet({pOpKernelType(vartype::INT64, XPUPlace()),
pOpKernelType(vartype::INT32, XPUPlace()),
......@@ -254,9 +256,8 @@ XPUOpMap& get_kl2_ops() {
{"label_smooth",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"lars_momentum",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"layer_norm_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"layer_norm_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
......@@ -380,9 +381,12 @@ XPUOpMap& get_kl2_ops() {
pOpKernelType(vartype::INT32, XPUPlace()),
pOpKernelType(vartype::BOOL, XPUPlace()),
pOpKernelType(vartype::FP32, XPUPlace())})},
{"resnet_unit", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"resnet_unit",
XPUKernelSet({pOpKernelType(vartype::FP16, XPUPlace()),
pOpKernelType(vartype::FP32, XPUPlace())})},
{"resnet_unit_grad",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
XPUKernelSet({pOpKernelType(vartype::FP16, XPUPlace()),
pOpKernelType(vartype::FP32, XPUPlace())})},
{"rmsprop", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"rnn", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"rnn_grad", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
......@@ -502,6 +506,9 @@ XPUOpMap& get_kl2_ops() {
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"top_k_v2", XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace())})},
{"update_loss_scaling",
XPUKernelSet({pOpKernelType(vartype::FP32, XPUPlace()),
pOpKernelType(vartype::FP16, XPUPlace())})},
{"unsqueeze2_grad",
XPUKernelSet({pOpKernelType(vartype::FP64, XPUPlace()),
pOpKernelType(vartype::INT64, XPUPlace()),
......
......@@ -24,13 +24,15 @@ void GradAddXPUKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& y,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
dev_ctx.template Alloc<T>(out);
auto x_shape = phi::vectorize<int>(x.dims());
auto y_shape = phi::vectorize<int>(y.dims());
int r = xpu::broadcast_add(dev_ctx.x_context(),
x.data<T>(),
y.data<T>(),
out->data<T>(),
reinterpret_cast<const XPUType*>(x.data<T>()),
reinterpret_cast<const XPUType*>(y.data<T>()),
reinterpret_cast<XPUType*>(out->data<T>()),
x_shape,
y_shape);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "broadcast_add");
......@@ -38,4 +40,9 @@ void GradAddXPUKernel(const Context& dev_ctx,
} // namespace phi
PD_REGISTER_KERNEL(grad_add, XPU, ALL_LAYOUT, phi::GradAddXPUKernel, float) {}
PD_REGISTER_KERNEL(grad_add,
XPU,
ALL_LAYOUT,
phi::GradAddXPUKernel,
phi::dtype::float16,
float) {}
......@@ -26,6 +26,7 @@ void LogSoftmaxGradKernel(const Context& dev_ctx,
const DenseTensor& out_grad,
int axis,
DenseTensor* x_grad) {
using XPUType = typename XPUTypeTrait<T>::Type;
const int rank = out.dims().size();
axis = funcs::CanonicalAxis(axis, rank);
......@@ -40,24 +41,29 @@ void LogSoftmaxGradKernel(const Context& dev_ctx,
PADDLE_ENFORCE_NE(
tmp2_ptr, nullptr, phi::errors::External("no enough memory in xpu"));
int r =
xpu::exp(dev_ctx.x_context(), out.data<T>(), tmp_ptr, out_grad.numel());
int r = xpu::exp<XPUType>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(out.data<T>()),
reinterpret_cast<XPUType*>(tmp_ptr),
out_grad.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "exp");
r = xpu::reciprocal(
dev_ctx.x_context(), tmp_ptr, tmp2_ptr, out_grad.numel());
r = xpu::reciprocal<XPUType>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(tmp_ptr),
reinterpret_cast<XPUType*>(tmp2_ptr),
out_grad.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "reciprocal");
r = xpu::mul(dev_ctx.x_context(),
tmp2_ptr,
out_grad.data<T>(),
tmp2_ptr,
out_grad.numel());
r = xpu::mul<XPUType>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(tmp2_ptr),
reinterpret_cast<const XPUType*>(out_grad.data<T>()),
reinterpret_cast<XPUType*>(tmp2_ptr),
out_grad.numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "mul");
r = xpu::softmax_grad(dev_ctx.x_context(),
tmp_ptr,
tmp2_ptr,
x_grad->data<T>(),
out_shape,
axis);
r = xpu::softmax_grad<XPUType>(
dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(tmp_ptr),
reinterpret_cast<const XPUType*>(tmp2_ptr),
reinterpret_cast<XPUType*>(x_grad->data<T>()),
out_shape,
axis);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax_grad");
}
}
......
......@@ -25,6 +25,7 @@ void LogSoftmaxKernel(const Context& dev_ctx,
const DenseTensor& x,
int axis,
DenseTensor* out) {
using XPUType = typename XPUTypeTrait<T>::Type;
const int rank = x.dims().size();
axis = funcs::CanonicalAxis(axis, rank);
......@@ -32,11 +33,16 @@ void LogSoftmaxKernel(const Context& dev_ctx,
auto x_shape = phi::vectorize<int>(x.dims());
dev_ctx.template Alloc<T>(out);
if (axis < 0) axis += rank;
int r = xpu::softmax<T>(
dev_ctx.x_context(), x.data<T>(), out->data<T>(), x_shape, axis);
int r = xpu::softmax<XPUType>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(x.data<T>()),
reinterpret_cast<XPUType*>(out->data<T>()),
x_shape,
axis);
PADDLE_ENFORCE_XDNN_SUCCESS(r, "softmax");
r = xpu::log<T>(
dev_ctx.x_context(), out->data<T>(), out->data<T>(), out->numel());
r = xpu::log<XPUType>(dev_ctx.x_context(),
reinterpret_cast<const XPUType*>(out->data<T>()),
reinterpret_cast<XPUType*>(out->data<T>()),
out->numel());
PADDLE_ENFORCE_XDNN_SUCCESS(r, "log");
}
}
......
......@@ -23,231 +23,242 @@ import paddle
import paddle.fluid as fluid
import paddle.fluid.contrib.mixed_precision.amp_nn as amp_nn
from op_test_xpu import XPUOpTest
from xpu.get_test_cover_info import create_test_class, get_xpu_op_support_types, XPUOpTestWrapper
paddle.enable_static()
class TestUpdateLossScalingOp(XPUOpTest):
    """Checks ``update_loss_scaling`` on XPU when ``FoundInfinite`` is False.

    The expected outputs describe a loss scale multiplied by ``incr_ratio``
    and both step counters reset to zero.
    """

    def setUp(self):
        """Assemble the operator inputs and the expected outputs."""
        self.op_type = "update_loss_scaling"
        self.init()

        overflow_flag = np.array([False], dtype=np.bool_)
        grad = np.random.random((1024, 1024)).astype(self.dtype)
        grown_scale = self.prev_loss_scaling * self.incr_ratio

        self.inputs = {
            'X': [('x0', grad)],
            'FoundInfinite': overflow_flag,
            'PrevLossScaling': self.prev_loss_scaling,
            'InGoodSteps': self.num_good_steps,
            'InBadSteps': self.num_bad_steps
        }
        # 'Out' is listed here but excluded from comparison in
        # test_check_output via no_check_set.
        self.outputs = {
            'Out': [('out0', grad)],
            'LossScaling': grown_scale,
            'OutGoodSteps': self.zero_steps,
            'OutBadSteps': self.zero_steps
        }

    def init(self):
        """Set the dtype, ratios, step counters and op attributes."""
        self.dtype = np.float32
        self.incr_ratio = 2.0
        self.decr_ratio = 0.8
        self.attrs = {
            'incr_every_n_steps': 1000,
            'decr_every_n_nan_or_inf': 2,
            'incr_ratio': self.incr_ratio,
            'decr_ratio': self.decr_ratio,
        }
        self.prev_loss_scaling = np.array([2048]).astype(self.dtype)
        self.num_good_steps = np.array([999], dtype=np.int32)
        self.num_bad_steps = np.array([1], dtype=np.int32)
        self.zero_steps = np.array([0], dtype=np.int32)

    def test_check_output(self):
        """Compare op outputs on XPU device 0; no-op without XPU support."""
        if not paddle.is_compiled_with_xpu():
            return
        self.check_output_with_place(paddle.XPUPlace(0), no_check_set=['Out'])
class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp):
    """Checks ``update_loss_scaling`` on XPU when ``FoundInfinite`` is True.

    One randomly chosen element of the input is set to ``inf``. The expected
    outputs are an all-zero ``Out``, a loss scale multiplied by
    ``decr_ratio`` and both step counters reset to zero.
    """

    def setUp(self):
        """Assemble inputs containing a single inf and the expected outputs."""
        self.op_type = "update_loss_scaling"
        self.init()

        overflow_flag = np.array([True], dtype=np.bool_)
        grad = np.random.random((1024, 1024)).astype(self.dtype)
        # Draw the row first, then the column, and poison that one entry.
        row = np.random.randint(0, 1024, 1)[0]
        col = np.random.randint(0, 1024, 1)[0]
        grad[row, col] = np.inf
        shrunk_scale = self.prev_loss_scaling * self.decr_ratio

        self.inputs = {
            'X': [('x0', grad)],
            'FoundInfinite': overflow_flag,
            'PrevLossScaling': self.prev_loss_scaling,
            'InGoodSteps': self.num_good_steps,
            'InBadSteps': self.num_bad_steps
        }
        self.outputs = {
            'Out': [('out0', np.zeros_like(grad))],
            'LossScaling': shrunk_scale,
            'OutGoodSteps': self.zero_steps,
            'OutBadSteps': self.zero_steps
        }

    def test_check_output(self):
        """Compare every output on XPU device 0; no-op without XPU support."""
        if not paddle.is_compiled_with_xpu():
            return
        self.check_output_with_place(paddle.XPUPlace(0))
class TestUpdateLossScalingLayer(unittest.TestCase):
def loss_scaling_check(self, scope=fluid.Scope()):
a = fluid.data(name="a", shape=[1024, 1024], dtype='float32')
b = fluid.data(name="b", shape=[512, 128], dtype='float32')
x = [a, b]
found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
prev_loss_scaling = fluid.data(name="prev_loss_scaling",
class XPUTestUpdateLossScalingOp(XPUOpTestWrapper):
def __init__(self):
    """Record the op under test and how its test classes are created."""
    # NOTE(review): presumably False means the nested test classes are used
    # as written rather than generated dynamically; confirm against
    # XPUOpTestWrapper.
    self.use_dynamic_create_class = False
    self.op_name = "update_loss_scaling"
class TestUpdateLossScalingOp(XPUOpTest):
    """Checks ``update_loss_scaling`` on XPU when ``FoundInfinite`` is False.

    The expected outputs describe a loss scale multiplied by ``incr_ratio``
    and both step counters reset to zero.
    """

    def setUp(self):
        """Assemble the operator inputs and the expected outputs."""
        self.op_type = "update_loss_scaling"
        self.init()

        overflow_flag = np.array([False], dtype=np.bool_)
        grad = np.random.random((1024, 1024)).astype(self.dtype)
        grown_scale = self.prev_loss_scaling * self.incr_ratio

        self.inputs = {
            'X': [('x0', grad)],
            'FoundInfinite': overflow_flag,
            'PrevLossScaling': self.prev_loss_scaling,
            'InGoodSteps': self.num_good_steps,
            'InBadSteps': self.num_bad_steps
        }
        # 'Out' is listed here but excluded from comparison in
        # test_check_output via no_check_set.
        self.outputs = {
            'Out': [('out0', grad)],
            'LossScaling': grown_scale,
            'OutGoodSteps': self.zero_steps,
            'OutBadSteps': self.zero_steps
        }

    def init(self):
        """Set the dtype, ratios, step counters and op attributes."""
        self.dtype = np.float32
        self.incr_ratio = 2.0
        self.decr_ratio = 0.8
        self.attrs = {
            'incr_every_n_steps': 1000,
            'decr_every_n_nan_or_inf': 2,
            'incr_ratio': self.incr_ratio,
            'decr_ratio': self.decr_ratio,
        }
        self.prev_loss_scaling = np.array([2048]).astype(self.dtype)
        self.num_good_steps = np.array([999], dtype=np.int32)
        self.num_bad_steps = np.array([1], dtype=np.int32)
        self.zero_steps = np.array([0], dtype=np.int32)

    def test_check_output(self):
        """Compare op outputs on XPU device 0; no-op without XPU support."""
        if not paddle.is_compiled_with_xpu():
            return
        self.check_output_with_place(paddle.XPUPlace(0), no_check_set=['Out'])
class TestUpdateLossScalingOpBad(TestUpdateLossScalingOp):
    """Checks ``update_loss_scaling`` on XPU when ``FoundInfinite`` is True.

    One randomly chosen element of the input is set to ``inf``. The expected
    outputs are an all-zero ``Out``, a loss scale multiplied by
    ``decr_ratio`` and both step counters reset to zero.
    """

    def setUp(self):
        """Assemble inputs containing a single inf and the expected outputs."""
        self.op_type = "update_loss_scaling"
        self.init()

        overflow_flag = np.array([True], dtype=np.bool_)
        grad = np.random.random((1024, 1024)).astype(self.dtype)
        # Draw the row first, then the column, and poison that one entry.
        row = np.random.randint(0, 1024, 1)[0]
        col = np.random.randint(0, 1024, 1)[0]
        grad[row, col] = np.inf
        shrunk_scale = self.prev_loss_scaling * self.decr_ratio

        self.inputs = {
            'X': [('x0', grad)],
            'FoundInfinite': overflow_flag,
            'PrevLossScaling': self.prev_loss_scaling,
            'InGoodSteps': self.num_good_steps,
            'InBadSteps': self.num_bad_steps
        }
        self.outputs = {
            'Out': [('out0', np.zeros_like(grad))],
            'LossScaling': shrunk_scale,
            'OutGoodSteps': self.zero_steps,
            'OutBadSteps': self.zero_steps
        }

    def test_check_output(self):
        """Compare every output on XPU device 0; no-op without XPU support."""
        if not paddle.is_compiled_with_xpu():
            return
        self.check_output_with_place(paddle.XPUPlace(0))
class TestUpdateLossScalingLayer(unittest.TestCase):
def loss_scaling_check(self, scope=fluid.Scope()):
a = fluid.data(name="a", shape=[1024, 1024], dtype='float32')
b = fluid.data(name="b", shape=[512, 128], dtype='float32')
x = [a, b]
found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
prev_loss_scaling = fluid.data(name="prev_loss_scaling",
shape=[1],
dtype='float32')
num_good_steps = fluid.data(name="num_good_steps",
shape=[1],
dtype='int32')
num_bad_steps = fluid.data(name="num_bad_steps",
shape=[1],
dtype='float32')
num_good_steps = fluid.data(name="num_good_steps",
shape=[1],
dtype='int32')
num_bad_steps = fluid.data(name="num_bad_steps",
shape=[1],
dtype='int32')
a_v = np.random.random([1024, 1024]).astype('float32')
b_v = np.random.random([512, 128]).astype('float32')
found_inf_v = np.array([False]).astype('bool')
prev_loss_scaling_v = np.array([2048]).astype('float32')
num_good_steps_v = np.array([999], dtype=np.int32)
num_bad_steps_v = np.array([1], dtype=np.int32)
incr_every_n_steps = 1000
decr_every_n_nan_or_inf = 2
incr_ratio = 2
decr_ratio = 0.8
result = amp_nn.update_loss_scaling(x,
found_inf,
prev_loss_scaling,
num_good_steps,
num_bad_steps,
incr_every_n_steps,
decr_every_n_nan_or_inf,
incr_ratio,
decr_ratio,
name="update_loss_scaling")
place = fluid.XPUPlace(0)
exe = fluid.Executor(place)
with fluid.scope_guard(scope):
exe.run(fluid.default_startup_program())
result_v = exe.run(feed={
'a': a_v,
'b': b_v,
'found_inf': found_inf_v,
'prev_loss_scaling': prev_loss_scaling_v,
'num_good_steps': num_good_steps_v,
'num_bad_steps': num_bad_steps_v
},
fetch_list=[
result, x, found_inf, prev_loss_scaling,
num_good_steps, num_bad_steps
])
assert np.array_equal(result_v[0], a_v)
assert np.array_equal(result_v[1], b_v)
assert np.array_equal(result_v[0], result_v[2])
assert np.array_equal(result_v[1], result_v[3])
assert np.array_equal(result_v[4], found_inf_v)
assert np.array_equal(result_v[5], prev_loss_scaling_v * incr_ratio)
assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v))
assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v))
def loss_scaling_check_inf(self, use_cuda=True, scope=fluid.Scope()):
a = fluid.data(name="a", shape=[1024, 1024], dtype='float32')
b = fluid.data(name="b", shape=[512, 128], dtype='float32')
x = [a, b]
found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
prev_loss_scaling = fluid.data(name="prev_loss_scaling",
dtype='int32')
a_v = np.random.random([1024, 1024]).astype('float32')
b_v = np.random.random([512, 128]).astype('float32')
found_inf_v = np.array([False]).astype('bool')
prev_loss_scaling_v = np.array([2048]).astype('float32')
num_good_steps_v = np.array([999], dtype=np.int32)
num_bad_steps_v = np.array([1], dtype=np.int32)
incr_every_n_steps = 1000
decr_every_n_nan_or_inf = 2
incr_ratio = 2
decr_ratio = 0.8
result = amp_nn.update_loss_scaling(x,
found_inf,
prev_loss_scaling,
num_good_steps,
num_bad_steps,
incr_every_n_steps,
decr_every_n_nan_or_inf,
incr_ratio,
decr_ratio,
name="update_loss_scaling")
place = fluid.XPUPlace(0)
exe = fluid.Executor(place)
with fluid.scope_guard(scope):
exe.run(fluid.default_startup_program())
result_v = exe.run(feed={
'a': a_v,
'b': b_v,
'found_inf': found_inf_v,
'prev_loss_scaling': prev_loss_scaling_v,
'num_good_steps': num_good_steps_v,
'num_bad_steps': num_bad_steps_v
},
fetch_list=[
result, x, found_inf, prev_loss_scaling,
num_good_steps, num_bad_steps
])
assert np.array_equal(result_v[0], a_v)
assert np.array_equal(result_v[1], b_v)
assert np.array_equal(result_v[0], result_v[2])
assert np.array_equal(result_v[1], result_v[3])
assert np.array_equal(result_v[4], found_inf_v)
assert np.array_equal(result_v[5], prev_loss_scaling_v * incr_ratio)
assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v))
assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v))
def loss_scaling_check_inf(self, use_cuda=True, scope=fluid.Scope()):
a = fluid.data(name="a", shape=[1024, 1024], dtype='float32')
b = fluid.data(name="b", shape=[512, 128], dtype='float32')
x = [a, b]
found_inf = fluid.data(name="found_inf", shape=[1], dtype='bool')
prev_loss_scaling = fluid.data(name="prev_loss_scaling",
shape=[1],
dtype='float32')
num_good_steps = fluid.data(name="num_good_steps",
shape=[1],
dtype='int32')
num_bad_steps = fluid.data(name="num_bad_steps",
shape=[1],
dtype='float32')
num_good_steps = fluid.data(name="num_good_steps",
shape=[1],
dtype='int32')
num_bad_steps = fluid.data(name="num_bad_steps",
shape=[1],
dtype='int32')
a_v = np.random.random([1024, 1024]).astype('float32')
b_v = np.random.random([512, 128]).astype('float32')
i = np.random.randint(0, 1024, 1)
j = np.random.randint(0, 1024, 1)
a_v[i[0]][j[0]] = np.inf
found_inf_v = np.array([True]).astype('bool')
prev_loss_scaling_v = np.array([2048]).astype('float32')
num_good_steps_v = np.array([999], dtype=np.int32)
num_bad_steps_v = np.array([1], dtype=np.int32)
incr_every_n_steps = 1000
decr_every_n_nan_or_inf = 2
incr_ratio = 2
decr_ratio = 0.8
result = amp_nn.update_loss_scaling(x,
found_inf,
prev_loss_scaling,
num_good_steps,
num_bad_steps,
incr_every_n_steps,
decr_every_n_nan_or_inf,
incr_ratio,
decr_ratio,
name="update_loss_scaling")
place = fluid.XPUPlace(0)
exe = fluid.Executor(place)
with fluid.scope_guard(scope):
exe.run(fluid.default_startup_program())
result_v = exe.run(feed={
'a': a_v,
'b': b_v,
'found_inf': found_inf_v,
'prev_loss_scaling': prev_loss_scaling_v,
'num_good_steps': num_good_steps_v,
'num_bad_steps': num_bad_steps_v
},
fetch_list=[
result, x, found_inf, prev_loss_scaling,
num_good_steps, num_bad_steps
])
assert np.array_equal(result_v[0], np.zeros_like(a_v))
assert np.array_equal(result_v[1], np.zeros_like(b_v))
assert np.array_equal(result_v[2], np.zeros_like(a_v))
assert np.array_equal(result_v[3], np.zeros_like(b_v))
assert np.array_equal(result_v[4], found_inf_v)
assert np.array_equal(result_v[5], prev_loss_scaling_v * decr_ratio)
assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v))
assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v))
def test_loss_scaling(self):
main = fluid.Program()
startup = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
self.loss_scaling_check()
def test_loss_scaling_inf(self):
main = fluid.Program()
startup = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
self.loss_scaling_check_inf()
dtype='int32')
a_v = np.random.random([1024, 1024]).astype('float32')
b_v = np.random.random([512, 128]).astype('float32')
i = np.random.randint(0, 1024, 1)
j = np.random.randint(0, 1024, 1)
a_v[i[0]][j[0]] = np.inf
found_inf_v = np.array([True]).astype('bool')
prev_loss_scaling_v = np.array([2048]).astype('float32')
num_good_steps_v = np.array([999], dtype=np.int32)
num_bad_steps_v = np.array([1], dtype=np.int32)
incr_every_n_steps = 1000
decr_every_n_nan_or_inf = 2
incr_ratio = 2
decr_ratio = 0.8
result = amp_nn.update_loss_scaling(x,
found_inf,
prev_loss_scaling,
num_good_steps,
num_bad_steps,
incr_every_n_steps,
decr_every_n_nan_or_inf,
incr_ratio,
decr_ratio,
name="update_loss_scaling")
place = fluid.XPUPlace(0)
exe = fluid.Executor(place)
with fluid.scope_guard(scope):
exe.run(fluid.default_startup_program())
result_v = exe.run(feed={
'a': a_v,
'b': b_v,
'found_inf': found_inf_v,
'prev_loss_scaling': prev_loss_scaling_v,
'num_good_steps': num_good_steps_v,
'num_bad_steps': num_bad_steps_v
},
fetch_list=[
result, x, found_inf, prev_loss_scaling,
num_good_steps, num_bad_steps
])
assert np.array_equal(result_v[0], np.zeros_like(a_v))
assert np.array_equal(result_v[1], np.zeros_like(b_v))
assert np.array_equal(result_v[2], np.zeros_like(a_v))
assert np.array_equal(result_v[3], np.zeros_like(b_v))
assert np.array_equal(result_v[4], found_inf_v)
assert np.array_equal(result_v[5], prev_loss_scaling_v * decr_ratio)
assert np.array_equal(result_v[6], np.zeros_like(num_good_steps_v))
assert np.array_equal(result_v[7], np.zeros_like(num_bad_steps_v))
def test_loss_scaling(self):
main = fluid.Program()
startup = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
self.loss_scaling_check()
def test_loss_scaling_inf(self):
main = fluid.Program()
startup = fluid.Program()
with fluid.unique_name.guard():
with fluid.program_guard(main, startup):
self.loss_scaling_check_inf()
# Create one concrete test class in this module's globals for each entry
# returned by get_xpu_op_support_types for this op.
# NOTE(review): the entries are presumably dtype names; confirm against
# xpu.get_test_cover_info.
support_types = get_xpu_op_support_types('update_loss_scaling')
for stype in support_types:
    create_test_class(globals(), XPUTestUpdateLossScalingOp, stype)

# Run the collected tests when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册