Merge pull request #882 from codeWorm2015/metal

fix conv add 1x5 error

Merge pull request #882 from codeWorm2015/metal
fix conv add 1x5 error
cbfc1d74 · Ruilong Liu · GitHub · 4fdf6612 · 2fd85fd3 · cbfc1d74
2 changed files
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift
@@ -33,6 +33,8 @@ class ConvAddKernel<P: PrecisionType>: Kernel, Computable {
        super.init(device: device, inFunctionName: "depthwise_conv_add_3x3")
      } else if param.filter.width == 1 && param.filter.height == 5 {
        super.init(device: device, inFunctionName: "conv_add_5x1")
+      } else if param.filter.width == 5 && param.filter.height == 1 {
+        super.init(device: device, inFunctionName: "conv_add_1x5")
      } else {
        super.init(device: device, inFunctionName: "conv_add_3x3")
      }

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal
@@ -183,10 +183,69 @@ kernel void conv_add_5x1(texture2d_array<float, access::sample> inTexture [[text
      output.w += dot(input[j], weight_w);
    }
  }
+  output = output + biase[gid.z];
  outTexture.write(output, gid.xy, gid.z);
 }
+kernel void conv_add_1x5(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                         constant MetalConvParam &param [[buffer(0)]],
+                         const device float4 *weights [[buffer(1)]],
+                         const device float4 *biase [[buffer(2)]],
+                         uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 5;
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+  float4 output = float4(0.0);
+  ushort dilation_x = param.dilationX;
+  float4 input[5];
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i);
+    input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i);
+    input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+    input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i);
+    input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i);
+    for (int j = 0; j < 5; ++j) {
+      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.x += dot(input[j], weight_x);
+      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.y += dot(input[j], weight_y);
+      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.z += dot(input[j], weight_z);
+      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.w += dot(input[j], weight_w);
+    }
+  }
+  output = output + biase[gid.z];
+  outTexture.write(output, gid.xy, gid.z);
+}
 kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
                                   texture2d_array<float, access::write> outTexture [[texture(1)]],