Merge pull request #833 from codeWorm2015/metal

fix crash

Merge pull request #833 from codeWorm2015/metal
fix crash
0bb67049 · Ruilong Liu · GitHub · ca22a7a5 · 6e5e698d · 0bb67049
7 changed files
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift
@@ -34,6 +34,6 @@ class BoxcoderKernel<P: PrecisionType>: Kernel, Computable{
    required init(device: MTLDevice, param: BoxcoderParam<P>) {
        param.output.initTexture(device: device)
-        super.init(device: device, inFunctionName: "priorbox")
+        super.init(device: device, inFunctionName: "boxcoder")
    }
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift
@@ -16,124 +16,126 @@ import Foundation
 import MetalPerformanceShaders
 struct ConvBNReluTestParam: TestParam {
-    let inputTexture: MTLTexture
+  let inputTexture: MTLTexture
-    let outputTexture: MTLTexture
+  let outputTexture: MTLTexture
-    var metalParam: MetalConvParam
+  var metalParam: MetalConvParam
-    let filterBuffer: MTLBuffer
+  let filterBuffer: MTLBuffer
-    let biaseBuffer: MTLBuffer
+  let biaseBuffer: MTLBuffer
-    let newScaleBuffer: MTLBuffer
+  let newScaleBuffer: MTLBuffer
-    let newBiaseBuffer: MTLBuffer
+  let newBiaseBuffer: MTLBuffer
-    let filterSize: (width: Int, height: Int, channel: Int)
+  let filterSize: (width: Int, height: Int, channel: Int)
-    init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) {
+  init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) {
-        inputTexture = inInputTexture
+    inputTexture = inInputTexture
-        outputTexture = inOutputTexture
+    outputTexture = inOutputTexture
-        metalParam = inMetalParam
+    metalParam = inMetalParam
-        filterBuffer = inFilterBuffer
+    filterBuffer = inFilterBuffer
-        biaseBuffer = inBiaseBuffer
+    biaseBuffer = inBiaseBuffer
-        newScaleBuffer = inNewScaleBuffer
+    newScaleBuffer = inNewScaleBuffer
-        newBiaseBuffer = inNewBiaseBuffer
+    newBiaseBuffer = inNewBiaseBuffer
-        filterSize = inFilterSize
+    filterSize = inFilterSize
-    }
+  }
 }
 class ConvBNReluKernel<P: PrecisionType>: Kernel, Computable, Testable {
-    required init(device: MTLDevice, testParam: ConvBNReluTestParam) {
+  required init(device: MTLDevice, testParam: ConvBNReluTestParam) {
-        if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 {
+    if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 {
-            super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
+      super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1")
-        } else if testParam.filterSize.channel == 1 {
+    } else if testParam.filterSize.channel == 1 {
-            super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
+      super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3")
-        } else {
+    } else {
-            super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
+      super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3")
-        }
+    }
+  }
+  var metalParam: MetalConvParam!
+  required init(device: MTLDevice, param: ConvBNReluParam<P>) {
+    if param.filter.width == 1 && param.filter.height == 1 {
+      super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1")
+    } else if param.filter.channel == 1 {
+      super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3")
+    } else {
+      super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3")
+    }
+    param.output.initTexture(device: device, transpose: [0, 2, 3, 1])
+    param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
+    param.variance.initBuffer(device: device)
+    param.mean.initBuffer(device: device)
+    param.scale.initBuffer(device: device)
+    param.bias.initBuffer(device: device)
+    let offsetX = param.filter.width/2 - Int(param.paddings[0])
+    let offsetY = param.filter.height/2 - Int(param.paddings[1])
+    print("offset x: \(offsetX)")
+    print("offset y: \(offsetY)")
+    let offsetZ = 0.0
+    print(" fuck ")
+    metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]))
+    var invs: [P] = []
+    let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
+    for i in 0..<param.variance.buffer.length/MemoryLayout<P>.stride {
+      let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
+      invs.append(P(inv))
    }
-    var metalParam: MetalConvParam!
+    let newScale: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.scale.buffer.length)
+    let newBiase: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.bias.buffer.length)
-    required init(device: MTLDevice, param: ConvBNReluParam<P>) {
+    let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self)
-        if param.filter.width == 1 && param.filter.height == 1 {
+    let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self)
-            super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1")
+    let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self)
-        } else if param.filter.channel == 1 {
+    for i in 0..<param.scale.buffer.length/MemoryLayout<P>.stride {
-            super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3")
+      newScale[i] = invs[i] * scaleContents[i]
-        } else {
+      newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i]
-            super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3")
-        }
-        param.output.initTexture(device: device, transpose: [0, 2, 3, 1])
-        param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32)
-        param.variance.initBuffer(device: device)
-        param.mean.initBuffer(device: device)
-        param.scale.initBuffer(device: device)
-        param.bias.initBuffer(device: device)
-        let offsetX = param.filter.width/2 - Int(param.paddings[0])
-        let offsetY = param.filter.height/2 - Int(param.paddings[1])
-        print("offset x: \(offsetX)")
-        print("offset y: \(offsetY)")
-        let offsetZ = 0.0
-        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]))
-        var invs: [P] = []
-        let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
-        for i in 0..<param.variance.buffer.length/MemoryLayout<P>.stride {
-            let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5)
-            invs.append(P(inv))
-        }
-        let newScale: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.scale.buffer.length)
-        let newBiase: UnsafeMutablePointer<P> = UnsafeMutablePointer<P>.allocate(capacity: param.bias.buffer.length)
-        let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self)
-        let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self)
-        let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self)
-        for i in 0..<param.scale.buffer.length/MemoryLayout<P>.stride {
-            newScale[i] = invs[i] * scaleContents[i]
-            newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i]
-        }
-        param.newBiase = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)
-        param.newScale = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)
-        newScale.deinitialize(count: param.scale.buffer.length)
-        newScale.deallocate()
-        newBiase.deinitialize(count: param.bias.buffer.length)
-        newBiase.deallocate()
    }
-    func compute(commandBuffer: MTLCommandBuffer, param: ConvBNReluParam<P>) throws {
+    param.newBiase = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)
-        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+    param.newScale = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)
-            throw PaddleMobileError.predictError(message: " encode is nil")
-        }
+    newScale.deinitialize(count: param.scale.buffer.length)
+    newScale.deallocate()
-        encoder.setTexture(param.input.metalTexture, index: 0)
-        encoder.setTexture(param.output.metalTexture, index: 1)
+    newBiase.deinitialize(count: param.bias.buffer.length)
-        encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+    newBiase.deallocate()
-        encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+  }
-        encoder.setBuffer(param.newScale!, offset: 0, index: 3)
-        encoder.setBuffer(param.newBiase!, offset: 0, index: 4)
+  func compute(commandBuffer: MTLCommandBuffer, param: ConvBNReluParam<P>) throws {
-        encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-        encoder.endEncoding()
+      throw PaddleMobileError.predictError(message: " encode is nil")
    }
-    public func test(commandBuffer: MTLCommandBuffer, param: ConvBNReluTestParam) {
+    encoder.setTexture(param.input.metalTexture, index: 0)
-        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+    encoder.setTexture(param.output.metalTexture, index: 1)
-            fatalError()
+    encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
-        }
+    encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
+    encoder.setBuffer(param.newScale!, offset: 0, index: 3)
-        encoder.setTexture(param.inputTexture, index: 0)
+    encoder.setBuffer(param.newBiase!, offset: 0, index: 4)
-        encoder.setTexture(param.outputTexture, index: 1)
+    encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
-        var inMetalParam = param.metalParam
+    encoder.endEncoding()
-        encoder.setBytes(&inMetalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+  }
-        encoder.setBuffer(param.filterBuffer, offset: 0, index: 1)
-        encoder.setBuffer(param.biaseBuffer, offset: 0, index: 2)
+  public func test(commandBuffer: MTLCommandBuffer, param: ConvBNReluTestParam) {
-        encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3)
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-        encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4)
+      fatalError()
-        encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture)
-        encoder.endEncoding()
    }
+    encoder.setTexture(param.inputTexture, index: 0)
+    encoder.setTexture(param.outputTexture, index: 1)
+    var inMetalParam = param.metalParam
+    encoder.setBytes(&inMetalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
+    encoder.setBuffer(param.filterBuffer, offset: 0, index: 1)
+    encoder.setBuffer(param.biaseBuffer, offset: 0, index: 2)
+    encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3)
+    encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4)
+    encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture)
+    encoder.endEncoding()
+  }
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/MulticlassNMSKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/MulticlassNMSKernel.swift
@@ -26,6 +26,6 @@ class MulticlassNMSKernel<P: PrecisionType>: Kernel, Computable{
    }
    required init(device: MTLDevice, param: MulticlassNMSParam<P>) {
-        super.init(device: device, inFunctionName: "priorbox")
+        super.init(device: device, inFunctionName: "prior_box")
    }
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift
@@ -33,7 +33,7 @@ class PriorBoxKernel<P: PrecisionType>: Kernel, Computable{
    var metalParam: PriorBoxMetalParam!
    required init(device: MTLDevice, param: PriorBoxParam<P>) {
-        super.init(device: device, inFunctionName: "priorbox")
+        super.init(device: device, inFunctionName: "prior_box")
        param.output.initTexture(device: device, transpose: [2, 0, 1, 3])
        param.outputVariances.initTexture(device: device, transpose: [2, 0, 1, 3])

--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Texture2DTo2DArrayKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Texture2DTo2DArrayKernel.swift
@@ -15,23 +15,24 @@
 import Foundation
 struct Texture2DTo2DArrayParam {
-    let input: MTLTexture
+  let input: MTLTexture
-    let output: MTLTexture
+  let output: MTLTexture
-    let expectDim: Dim
+  let expectDim: Dim
 }
 class Texture2DTo2DArrayKernel<P: PrecisionType>: Kernel, Computable{
-    func compute(commandBuffer: MTLCommandBuffer, param: FeedParam<P>) throws {
+  func compute(commandBuffer: MTLCommandBuffer, param: FeedParam<P>) throws {
-        guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
+    guard let encoder = commandBuffer.makeComputeCommandEncoder() else {
-            throw PaddleMobileError.predictError(message: " encode is nil")
+      throw PaddleMobileError.predictError(message: " encode is nil")
-        }
-        encoder.setTexture(param.input.mtlTexture, index: 0)
-        encoder.setTexture(param.output.metalTexture, index: 1)
-        encoder.dispatch(computePipline: pipline, outTexture: param.input.mtlTexture)
-        encoder.endEncoding()
-    }
-    required init(device: MTLDevice, param: FeedParam<P>) {
-        super.init(device: device, inFunctionName: "texture2d_to_2d_array")
    }
+    encoder.setTexture(param.input.mtlTexture, index: 0)
+    encoder.setTexture(param.output.metalTexture, index: 1)
+    encoder.dispatch(computePipline: pipline, outTexture: param.input.mtlTexture)
+    encoder.endEncoding()
+  }
+  required init(device: MTLDevice, param: FeedParam<P>) {
+    param.output.initTexture(device: device, transpose: [0, 2, 3, 1])
+    super.init(device: device, inFunctionName: "texture2d_to_2d_array")
+  }
 }
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal
@@ -699,3 +699,144 @@ kernel void depthwise_conv_add_3x3(texture2d_array<float, access::sample> inText
  outTexture.write(output, gid.xy, gid.z);
 }
+#pragma mark - conv bn relu
+kernel void conv_batch_norm_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                         constant MetalConvParam &param [[buffer(0)]],
+                                         const device float4 *weights [[buffer(1)]],
+                                         const device float4 *biase [[buffer(2)]],
+                                         const device float4 *new_scale [[buffer(3)]],
+                                         const device float4 *new_biase [[buffer(4)]],
+                                         uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 1;
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+  float4 output = float4(0.0);
+  float4 input;
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i);
+    float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size  + i];
+    output.x += dot(input, weight_x);
+    float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size  + i];
+    output.y += dot(input, weight_y);
+    float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size  + i];
+    output.z += dot(input, weight_z);
+    float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i];
+    output.w += dot(input, weight_w);
+  }
+  output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
+  outTexture.write(output, gid.xy, gid.z);
+}
+kernel void conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                         texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                         constant MetalConvParam &param [[buffer(0)]],
+                                         const device float4 *weights [[buffer(1)]],
+                                         const device float4 *biase [[buffer(2)]],
+                                         const device float4 *new_scale [[buffer(3)]],
+                                         const device float4 *new_biase [[buffer(4)]],
+                                         uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 9;
+  uint input_arr_size = inTexture.get_array_size();
+  uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
+  float4 output = float4(0.0);
+  float4 input[9];
+  for (uint i = 0; i < input_arr_size; ++i) {
+    input[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), i);
+    input[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), i);
+    input[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), i);
+    input[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), i);
+    input[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), i);
+    input[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), i);
+    input[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), i);
+    input[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), i);
+    input[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), i);
+    for (int j = 0; j < 9; ++j) {
+      float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.x += dot(input[j], weight_x);
+      float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.y += dot(input[j], weight_y);
+      float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.z += dot(input[j], weight_z);
+      float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
+      output.w += dot(input[j], weight_w);
+    }
+  }
+  output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
+  outTexture.write(output, gid.xy, gid.z);
+}
+kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                                                   texture2d_array<float, access::write> outTexture [[texture(1)]],
+                                                   constant MetalConvParam &param [[buffer(0)]],
+                                                   const device float *weights [[buffer(1)]],
+                                                   const device float4 *new_scale [[buffer(3)]],
+                                                   const device float4 *new_biase [[buffer(4)]],
+                                                   uint3 gid [[thread_position_in_grid]]) {
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+  uint output_slice = gid.z;
+  ushort2 stride = ushort2(param.strideX, param.strideY);
+  ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  const uint kernelHXW = 9;
+  uint weithTo = gid.z * kernelHXW * 4;
+  float4 output = float4(0.0);
+  float4 inputs[9];
+  inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y - 1), output_slice);
+  inputs[1] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y - 1), output_slice);
+  inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y - 1), output_slice);
+  inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y), output_slice);
+  inputs[4] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y), output_slice);
+  inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y), output_slice);
+  inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1,    posInInput.y + 1), output_slice);
+  inputs[7] = inTexture.sample(sample, float2(posInInput.x,        posInInput.y + 1), output_slice);
+  inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1,    posInInput.y + 1), output_slice);
+  for (int j = 0; j < 9; ++j) {
+    float4 input = inputs[j];
+    output.x += input.x * weights[weithTo + 0 * kernelHXW + j];
+    output.y += input.y * weights[weithTo + 1 * kernelHXW + j];
+    output.z += input.z * weights[weithTo + 2 * kernelHXW + j];
+    output.w += input.w * weights[weithTo + 3 * kernelHXW + j];
+  }
+  output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0);
+  outTexture.write(output, gid.xy, gid.z);
+}
--- a/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift
@@ -27,7 +27,7 @@ class PriorBoxParam<P: PrecisionType>: OpParam {
            aspectRatios = try PriorBoxParam.getAttr(key: "aspect_ratios", attrs: opDesc.attrs)
            variances = try PriorBoxParam.getAttr(key: "variances", attrs: opDesc.attrs)
            flip = try PriorBoxParam.getAttr(key: "flip", attrs: opDesc.attrs)
-            clip = try PriorBoxParam.getAttr(key: "clop", attrs: opDesc.attrs)
+            clip = try PriorBoxParam.getAttr(key: "clip", attrs: opDesc.attrs)
            stepW = try PriorBoxParam.getAttr(key: "step_w", attrs: opDesc.attrs)
            stepH = try PriorBoxParam.getAttr(key: "step_h", attrs: opDesc.attrs)
            offset = try PriorBoxParam.getAttr(key: "offset", attrs: opDesc.attrs)