Unverified commit 4bea0ff1, authored by heliqi, committed by GitHub

[NPU]add depthwise_conv_npu_grad op (#35374)

* add depthwise_conv_npu_grad op

* add depthwise_conv_npu_grad op

* add depthwise_conv_npu_grad op

* add NHWC test case
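
For context: a depthwise convolution is a grouped conv2d whose group count equals the number of input channels, and the kernel added here computes its filter and input gradients through the CANN ops DepthwiseConv2DBackpropFilterD / DepthwiseConv2DBackpropInputD (see the diff below). The following is a minimal sketch, illustration only and not part of this PR, of how such a backward pass would be triggered from dygraph mode; it assumes an NPU-enabled Paddle build where paddle.set_device('npu') is available.

    # Sketch only: exercise the depthwise conv2d grad kernels on NPU.
    # Assumes an NPU-enabled Paddle build; on other builds use 'cpu' instead.
    import numpy as np
    import paddle
    import paddle.nn.functional as F

    paddle.set_device('npu')

    x = paddle.to_tensor(
        np.random.random((2, 4, 16, 16)).astype('float16'), stop_gradient=False)
    w = paddle.to_tensor(
        np.random.random((4, 1, 3, 3)).astype('float16'), stop_gradient=False)

    # groups == in_channels (as in the new tests) selects the depthwise_conv2d path
    y = F.conv2d(x, w, bias=None, groups=4)
    paddle.sum(paddle.cast(y, 'float32')).backward()

    print(x.grad.shape, w.grad.shape)  # [2, 4, 16, 16] and [4, 1, 3, 3]

The static-graph NCHW and NHWC cases are covered by the new unit tests further down in this diff.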
Parent 60c5adaa
@@ -126,6 +126,117 @@ class DepthwiseConvNPUKernel : public framework::OpKernel<T> {
  }
};
template <typename T>
class DepthwiseConvGradNPUKernel : public framework::OpKernel<T> {
 public:
  void Compute(const framework::ExecutionContext& context) const override {
    // input
    const Tensor* input = context.Input<Tensor>("Input");
    const Tensor* filter = context.Input<Tensor>("Filter");
    // output
    auto output_grad = context.Input<Tensor>(framework::GradVarName("Output"));
    auto input_grad = context.Output<Tensor>(framework::GradVarName("Input"));
    auto filter_grad = context.Output<Tensor>(framework::GradVarName("Filter"));
    // attr
    const std::vector<int> stride = context.Attr<std::vector<int>>("strides");
    std::vector<int> padding = context.Attr<std::vector<int>>("paddings");
    std::vector<int> dilation = context.Attr<std::vector<int>>("dilations");
    const std::string data_format = context.Attr<std::string>("data_format");
    const std::string padding_algorithm =
        context.Attr<std::string>("padding_algorithm");

    // npu stream
    auto stream =
        context.template device_context<platform::NPUDeviceContext>().stream();

    // check dimension
    const bool channel_last = data_format == "NHWC";

    // update padding and dilation
    auto in_dims = input->dims();
    auto filter_dims = filter->dims();
    framework::DDim in_data_dims;
    framework::DDim filter_data_dims;

    if (channel_last) {
      in_data_dims = framework::slice_ddim(in_dims, 1, in_dims.size() - 1);
    } else {
      in_data_dims = framework::slice_ddim(in_dims, 2, in_dims.size());
    }
    filter_data_dims = framework::slice_ddim(filter_dims, 2, in_dims.size());

    std::vector<int> ksize = framework::vectorize<int>(filter_data_dims);
    UpdatePaddingAndDilation(&padding, &dilation, padding_algorithm,
                             in_data_dims, stride, ksize);

    // Transform filter (n, 1, h, w) --> (1, n, h, w)
    Tensor transformed_filter(filter->type());
    transformed_filter.mutable_data<T>({filter->dims()[1], filter->dims()[0],
                                        filter->dims()[2], filter->dims()[3]},
                                       context.device_context().GetPlace());
    std::vector<int> perm = {1, 0, 2, 3};
    const auto& runner_trans = NpuOpRunner(
        "TransposeD", {*filter}, {transformed_filter}, {{"perm", perm}});
    runner_trans.Run(stream);

    // construct NPU attr
    std::vector<int> strides(4, 1);
    std::vector<int> dilations(4, 1);

    Tensor input_tensor, output_grad_tensor;
    input_tensor.ShareDataWith(*input);
    output_grad_tensor.ShareDataWith(*output_grad);
    if (channel_last) {
      input_tensor.set_layout(DataLayout::kNHWC);
      output_grad_tensor.set_layout(DataLayout::kNHWC);
      strides[1] = stride[0];
      strides[2] = stride[1];
      dilations[1] = dilation[0];
      dilations[2] = dilation[1];
    } else {
      strides[2] = stride[0];
      strides[3] = stride[1];
      dilations[2] = dilation[0];
      dilations[3] = dilation[1];
    }

    if (filter_grad) {
      filter_grad->mutable_data<T>(context.GetPlace());
      std::vector<int> filter_shape_vec =
          framework::vectorize<int>(transformed_filter.dims());

      const auto& runner = NpuOpRunner(
          "DepthwiseConv2DBackpropFilterD", {input_tensor, output_grad_tensor},
          {*filter_grad}, {{"filter_size", filter_shape_vec},
                           {"strides", strides},
                           {"pads", padding},
                           {"dilations", dilations},
                           {"data_format", data_format}});
      runner.Run(stream);
    }

    if (input_grad) {
      input_grad->mutable_data<T>(context.GetPlace());
      std::vector<int> input_shape_vec =
          framework::vectorize<int>(input->dims());

      Tensor input_grad_tensor;
      input_grad_tensor.ShareDataWith(*input_grad);
      if (channel_last) {
        input_grad_tensor.set_layout(DataLayout::kNHWC);
      }
      const auto& runner =
          NpuOpRunner("DepthwiseConv2DBackpropInputD",
                      {transformed_filter, output_grad_tensor},
                      {input_grad_tensor}, {{"input_size", input_shape_vec},
                                            {"strides", strides},
                                            {"pads", padding},
                                            {"dilations", dilations},
                                            {"data_format", data_format}});
      runner.Run(stream);
    }
  }
};

template <typename T>
class NPUConvOpKernel : public framework::OpKernel<T> {
 public:
@@ -298,6 +409,9 @@ REGISTER_OP_NPU_KERNEL(
    depthwise_conv2d,
    ops::DepthwiseConvNPUKernel<paddle::platform::NPUDeviceContext,
                                paddle::platform::float16>);
REGISTER_OP_NPU_KERNEL(
    depthwise_conv2d_grad,
    ops::DepthwiseConvGradNPUKernel<paddle::platform::float16>);
REGISTER_OP_NPU_KERNEL(conv2d, ops::NPUConvOpKernel<float>,
                       ops::NPUConvOpKernel<paddle::platform::float16>);
REGISTER_OP_NPU_KERNEL(conv2d_grad, ops::NPUConvGradOpKernel<float>,
...
@@ -22,8 +22,12 @@ import sys
sys.path.append("..")
from op_test import OpTest, skip_check_grad_ci
from test_conv2d_op import conv2d_forward_naive
from paddle import ParamAttr
from paddle.regularizer import L2Decay
from paddle.nn.initializer import KaimingNormal

paddle.enable_static()
SEED = 2021
def create_test_channel_last_class(parent):
@@ -279,5 +283,151 @@ create_test_padding_VALID_class(TestDepthwiseConvNPU_Padding)
create_test_padding_VALID_class(TestDepthwiseConvNPU2_Padding)
create_test_padding_VALID_class(TestDepthwiseConvNPU3_Padding)

class TestDepthwiseConvNet(unittest.TestCase):
    def __init__(self, methodName='runTest'):
        super().__init__(methodName=methodName)

    def _test(self, run_npu=True):
        main_prog = paddle.static.Program()
        startup_prog = paddle.static.Program()
        main_prog.random_seed = SEED
        startup_prog.random_seed = SEED
        np.random.seed(SEED)

        a_np = np.random.random(size=(2, 4, 16, 16)).astype('float16')
        b_np = np.random.random(size=(4, 1, 3, 3)).astype('float16')
        if not run_npu:
            a_np = a_np.astype('float32')
            b_np = b_np.astype('float32')
        label_np = np.random.randint(10, size=(2, 10)).astype('float32')

        with paddle.static.program_guard(main_prog, startup_prog):
            if run_npu:
                a = paddle.static.data(
                    name="a", shape=[2, 4, 16, 16], dtype='float16')
                b = paddle.static.data(
                    name="b", shape=[4, 1, 3, 3], dtype='float16')
            else:
                a = paddle.static.data(
                    name="a", shape=[2, 4, 16, 16], dtype='float32')
                b = paddle.static.data(
                    name="b", shape=[4, 1, 3, 3], dtype='float32')
            label = paddle.static.data(
                name="label", shape=[2, 10], dtype='float32')

            a *= 2.0
            b += 0.01

            fc_1 = paddle.nn.functional.conv2d(a, b, bias=None, groups=4)
            if run_npu:
                fc_1 = paddle.cast(fc_1, dtype='float32')
            fc_1 = paddle.nn.functional.relu(fc_1)
            prediction = fluid.layers.fc(input=fc_1, size=10, act='softmax')

            cost = paddle.nn.functional.smooth_l1_loss(
                input=prediction, label=label)
            loss = paddle.sum(cost)
            sgd = fluid.optimizer.SGD(learning_rate=0.00001)
            sgd.minimize(loss)

        if run_npu:
            place = paddle.NPUPlace(0)
        else:
            place = paddle.CPUPlace()

        exe = paddle.static.Executor(place)
        exe.run(startup_prog)

        print("Start run on {}".format(place))
        for epoch in range(100):
            pred_res, loss_res = exe.run(
                main_prog,
                feed={"a": a_np,
                      "b": b_np,
                      "label": label_np},
                fetch_list=[prediction, loss])

        return pred_res, loss_res

    def test_npu(self):
        cpu_pred, cpu_loss = self._test(False)
        npu_pred, npu_loss = self._test(True)

        self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-04, atol=1e-03))
        self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-04, atol=1e-03))

class TestDepthwiseConvNet_NHWC(unittest.TestCase):
    def __init__(self, methodName='runTest'):
        super().__init__(methodName=methodName)

    def _test(self, run_npu=True):
        main_prog = paddle.static.Program()
        startup_prog = paddle.static.Program()
        main_prog.random_seed = SEED
        startup_prog.random_seed = SEED
        np.random.seed(SEED)

        a_np = np.random.random(size=(2, 16, 16, 4)).astype('float16')
        b_np = np.random.random(size=(4, 1, 3, 3)).astype('float16')
        if not run_npu:
            a_np = a_np.astype('float32')
            b_np = b_np.astype('float32')
        label_np = np.random.randint(10, size=(2, 10)).astype('float32')

        with paddle.static.program_guard(main_prog, startup_prog):
            if run_npu:
                a = paddle.static.data(
                    name="a", shape=[2, 16, 16, 4], dtype='float16')
                b = paddle.static.data(
                    name="b", shape=[4, 1, 3, 3], dtype='float16')
            else:
                a = paddle.static.data(
                    name="a", shape=[2, 16, 16, 4], dtype='float32')
                b = paddle.static.data(
                    name="b", shape=[4, 1, 3, 3], dtype='float32')
            label = paddle.static.data(
                name="label", shape=[2, 10], dtype='float32')

            a *= 2.0
            b += 0.01

            fc_1 = paddle.nn.functional.conv2d(
                a, b, bias=None, groups=4, data_format='NHWC')
            if run_npu:
                fc_1 = paddle.cast(fc_1, dtype='float32')
            fc_1 = paddle.nn.functional.relu(fc_1)
            prediction = fluid.layers.fc(input=fc_1, size=10, act='softmax')

            cost = paddle.nn.functional.smooth_l1_loss(
                input=prediction, label=label)
            loss = paddle.sum(cost)
            sgd = fluid.optimizer.SGD(learning_rate=0.00001)
            sgd.minimize(loss)

        if run_npu:
            place = paddle.NPUPlace(0)
        else:
            place = paddle.CPUPlace()

        exe = paddle.static.Executor(place)
        exe.run(startup_prog)

        print("Start run on {}".format(place))
        for epoch in range(100):
            pred_res, loss_res = exe.run(
                main_prog,
                feed={"a": a_np,
                      "b": b_np,
                      "label": label_np},
                fetch_list=[prediction, loss])

        return pred_res, loss_res

    def test_npu(self):
        cpu_pred, cpu_loss = self._test(False)
        npu_pred, npu_loss = self._test(True)

        self.assertTrue(np.allclose(npu_pred, cpu_pred, rtol=1e-04, atol=1e-03))
        self.assertTrue(np.allclose(npu_loss, cpu_loss, rtol=1e-04, atol=1e-03))

if __name__ == '__main__':
    unittest.main()