Refine gemm convolution kernel.

8219f206 · hedaoyuan · 5860150d · 8219f206
隐藏空白更改
内联并排

Showing with 12 additions and 16 deletions

paddle/operators/gemm_conv_op.h paddle/operators/gemm_conv_op.h +12 -16

未找到文件。
--- a/paddle/operators/gemm_conv_op.h
+++ b/paddle/operators/gemm_conv_op.h
@@ -58,7 +58,7 @@ class GemmConvKernel : public framework::OpKernel {
        input_channels * filter_height * filter_width,
        output_height * output_width};
    Tensor col;
-    col.mutable_data<float>(col_shape, context.GetPlace());
+    col.mutable_data<T>(col_shape, context.GetPlace());
    // col_matrix shares the same piece of data with col,
    // but will be reshaped into a two-dimensional matrix shape
    // to call the matrix multiplication interface.
@@ -67,8 +67,8 @@ class GemmConvKernel : public framework::OpKernel {

    framework::DDim input_shape = {input->dims()[1], input->dims()[2],
                                   input->dims()[3]};
-    framework::DDim filter_matrix_shape = {
-        filter.dims()[0], framework::product(filter.dims()) / filter.dims()[0]};
+    framework::DDim filter_matrix_shape = {filter.dims()[0],
+                                           filter.numel() / filter.dims()[0]};
    filter.Resize(filter_matrix_shape);

    framework::DDim output_matrix_shape = {output_channels,
@@ -80,14 +80,12 @@ class GemmConvKernel : public framework::OpKernel {
    // convolution operator: im2col + gemm
    for (int i = 0; i < batch_size; i++) {
      // im2col
-      Tensor in_slice = input->Slice<T>(i, i + 1);
-      in_slice.Resize(input_shape);
+      Tensor in_slice = input->Slice<T>(i, i + 1).Resize(input_shape);
      im2col(in_slice, col, strides[0], strides[1], paddings[0], paddings[1],
             device_context);

      // gemm
-      Tensor out_slice = output->Slice<T>(i, i + 1);
-      out_slice.Resize(output_matrix_shape);
+      Tensor out_slice = output->Slice<T>(i, i + 1).Resize(output_matrix_shape);
      math::matmul<Place, T>(filter, false, col_matrix, false, T(1.0),
                             &out_slice, T(0.0), device_context);
    }
@@ -138,7 +136,7 @@ class GemmConvGradKernel : public framework::OpKernel {
        input_channels * filter_height * filter_width,
        output_height * output_width};
    Tensor col;
-    col.mutable_data<float>(col_shape, context.GetPlace());
+    col.mutable_data<T>(col_shape, context.GetPlace());
    // col_matrix shares the same piece of data with col,
    // but will be reshaped into a two-dimensional matrix shape
    // to call the matrix multiplication interface.
@@ -151,8 +149,8 @@ class GemmConvGradKernel : public framework::OpKernel {
        output_grad->dims()[1],
        output_grad->dims()[2] * output_grad->dims()[3]};

-    framework::DDim filter_matrix_shape = {
-        filter.dims()[0], framework::product(filter.dims()) / filter.dims()[0]};
+    framework::DDim filter_matrix_shape = {filter.dims()[0],
+                                           filter.numel() / filter.dims()[0]};
    filter.Resize(filter_matrix_shape);
    filter_grad.Resize(filter_matrix_shape);

@@ -168,20 +166,18 @@ class GemmConvGradKernel : public framework::OpKernel {
    // convolution backward weight operator: im2col + gemm
    for (int i = 0; i < batch_size; i++) {
      // gemm
-      Tensor out_slice = output_grad->Slice<T>(i, i + 1);
-      out_slice.Resize(output_matrix_shape);
+      Tensor out_slice =
+          output_grad->Slice<T>(i, i + 1).Resize(output_matrix_shape);
      math::matmul<Place, T>(filter, true, out_slice, false, T(1.0),
                             &col_matrix, T(0.0), device_context);

      // col2im
-      Tensor in_grad_slice = input_grad->Slice<T>(i, i + 1);
-      in_grad_slice.Resize(input_shape);
+      Tensor in_grad_slice = input_grad->Slice<T>(i, i + 1).Resize(input_shape);
      col2im(in_grad_slice, col, strides[0], strides[1], paddings[0],
             paddings[1], device_context);

      // im2col
-      Tensor in_slice = input->Slice<T>(i, i + 1);
-      in_slice.Resize(input_shape);
+      Tensor in_slice = input->Slice<T>(i, i + 1).Resize(input_shape);
      im2col(in_slice, col, strides[0], strides[1], paddings[0], paddings[1],
             device_context);