1.add groups to conv3x3. 2.add original size to tensor and texture and adjust...

1.add groups to conv3x3. 2.add original size to tensor and texture and adjust concat layer to handle tensor truncation logic. (#1643)

1.add groups to conv3x3. 2.add original size to tensor and texture and adjust...
1.add groups to conv3x3. 2.add original size to tensor and texture and adjust concat layer to handle tensor truncation logic. (#1643)
02ebdf59 · Yanzhan Yang · GitHub · e8200397 · 02ebdf59 · 02ebdf59
17 changed file
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal
@@ -6,14 +6,6 @@
 #include <metal_stdlib>
 using namespace metal;
-struct MetalConvParam {
-    short offsetX;
-    short offsetY;
-    short offsetZ;
-    ushort strideX;
-    ushort strideY;
-};
 kernel void batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
                                texture2d_array<float, access::write> outTexture [[texture(1)]],
                                const device float4 *new_scale [[buffer(0)]],

--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal
@@ -116,4 +116,8 @@ struct MetalConvParam {
    ushort strideY;
    ushort dilationX;
    ushort dilationY;
+    ushort groups;
+    ushort iC;
+    ushort fC;
+    ushort oC;
 };
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal
@@ -67,8 +67,8 @@ struct ConcatParam {
 #undef R
 #undef V
-// lens: (R=4, N=3, V=y)
+// lens: (R=4, N=3, V=normal)
-#define V VY
+#define V VNORMAL
 #define R 4
 #define N 3
 #define P float

--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
@@ -129,6 +129,61 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [
    outTexture.write(relu, gid.xy, gid.z);
 }
+// Grouped 3x3 convolution followed by a bias add and ReLU (float precision).
+//
+// Each thread produces one output texel: the four channel lanes stored in array
+// slice gid.z at position gid.xy. For every lane the kernel walks the fC filter
+// channels of that output channel, samples the 3x3 (dilated) neighbourhood of the
+// matching input channel and accumulates input * weight on top of the bias.
+//
+//   inTexture   input feature map, four channels packed per array slice
+//   outTexture  output feature map, four channels packed per array slice
+//   param       offsets, strides, dilations, group count and the channel counts
+//               iC (input), fC (filter channels per output channel), oC (output)
+//   weights     scalar filter weights, indexed as
+//               (output_c * 9 + tap) * (fC rounded up to a multiple of 4) + i
+//   biase       one float4 of bias values per output array slice
+kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                              texture2d_array<float, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device float *weights [[buffer(1)]],
+                              const device float4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+    // Threads dispatched outside the output texture have nothing to write.
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    // Pixel coordinates, no filtering; reads outside the texture return zero (implicit zero padding).
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    float4 output = biase[gid.z];
+    ushort dilation_x = param.dilationX;
+    ushort dilation_y = param.dilationY;
+    float input[9];
+    uint iC = param.iC, fC = param.fC, oC = param.oC;
+    // Filter channels are padded up to a multiple of four in the weight buffer.
+    uint filter_array_size = (fC + 3) / 4;
+    // Number of output channels that share one group of fC input channels.
+    // Both values are clamped to 1 so a degenerate parameter cannot divide by zero.
+    uint group_count = param.groups > 0 ? uint(param.groups) : 1;
+    uint channels_per_group = oC / group_count;
+    if (channels_per_group == 0) {
+        channels_per_group = 1;
+    }
+    for (uint c = 0; c < 4; ++c) {
+        // output_c: channel inside one image; output_n: which oC-sized block of the
+        // output depth this lane falls in (presumably the batch index).
+        uint output_depth = gid.z * 4 + c, output_c = output_depth % oC, output_n = output_depth / oC;
+        // Group that owns this output channel; its inputs start at group_index * fC.
+        uint group_index = output_c / channels_per_group;
+        for (uint i = 0; i < fC; ++i) {
+            uint input_depth = output_n * iC + group_index * fC + i;
+            uint input_array_index = input_depth / 4;
+            uint input_array_item_index = input_depth % 4;
+            input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), input_array_index)[input_array_item_index];
+            input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), input_array_index)[input_array_item_index];
+            input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), input_array_index)[input_array_item_index];
+            input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), input_array_index)[input_array_item_index];
+            input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), input_array_index)[input_array_item_index];
+            input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), input_array_index)[input_array_item_index];
+            input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), input_array_index)[input_array_item_index];
+            input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), input_array_index)[input_array_item_index];
+            input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), input_array_index)[input_array_item_index];
+            for (int j = 0; j < 9; ++j) {
+                float weight = weights[(output_c * kernelHXW + j) * filter_array_size * 4 + i];
+                output[c] += input[j] * weight;
+            }
+        }
+    }
+    float4 relu = fmax(output, 0.0);
+    outTexture.write(relu, gid.xy, gid.z);
+}
 kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
                         texture2d_array<float, access::write> outTexture [[texture(1)]],
                         constant MetalConvParam &param [[buffer(0)]],
@@ -385,6 +440,61 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
    outTexture.write(half4(relu), gid.xy, gid.z);
 }
+// Grouped 3x3 convolution followed by a bias add and ReLU (half precision).
+//
+// Each thread produces one output texel: the four channel lanes stored in array
+// slice gid.z at position gid.xy. For every lane the kernel walks the fC filter
+// channels of that output channel, samples the 3x3 (dilated) neighbourhood of the
+// matching input channel and accumulates input * weight on top of the bias.
+//
+//   inTexture   input feature map, four channels packed per array slice
+//   outTexture  output feature map, four channels packed per array slice
+//   param       offsets, strides, dilations, group count and the channel counts
+//               iC (input), fC (filter channels per output channel), oC (output)
+//   weights     scalar filter weights, indexed as
+//               (output_c * 9 + tap) * (fC rounded up to a multiple of 4) + i
+//   biase       one half4 of bias values per output array slice
+kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                    texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                    constant MetalConvParam &param [[buffer(0)]],
+                                    const device half *weights [[buffer(1)]],
+                                    const device half4 *biase [[buffer(2)]],
+                                    uint3 gid [[thread_position_in_grid]]) {
+    // Threads dispatched outside the output texture have nothing to write.
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    // Pixel coordinates, no filtering; reads outside the texture return zero (implicit zero padding).
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    const uint kernelHXW = 9;
+    half4 output = biase[gid.z];
+    ushort dilation_x = param.dilationX;
+    ushort dilation_y = param.dilationY;
+    half input[9];
+    uint iC = param.iC, fC = param.fC, oC = param.oC;
+    // Filter channels are padded up to a multiple of four in the weight buffer.
+    uint filter_array_size = (fC + 3) / 4;
+    // Number of output channels that share one group of fC input channels.
+    // Both values are clamped to 1 so a degenerate parameter cannot divide by zero.
+    uint group_count = param.groups > 0 ? uint(param.groups) : 1;
+    uint channels_per_group = oC / group_count;
+    if (channels_per_group == 0) {
+        channels_per_group = 1;
+    }
+    for (uint c = 0; c < 4; ++c) {
+        // output_c: channel inside one image; output_n: which oC-sized block of the
+        // output depth this lane falls in (presumably the batch index).
+        uint output_depth = gid.z * 4 + c, output_c = output_depth % oC, output_n = output_depth / oC;
+        // Group that owns this output channel; its inputs start at group_index * fC.
+        uint group_index = output_c / channels_per_group;
+        for (uint i = 0; i < fC; ++i) {
+            uint input_depth = output_n * iC + group_index * fC + i;
+            uint input_array_index = input_depth / 4;
+            uint input_array_item_index = input_depth % 4;
+            input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), input_array_index)[input_array_item_index];
+            input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), input_array_index)[input_array_item_index];
+            input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), input_array_index)[input_array_item_index];
+            input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), input_array_index)[input_array_item_index];
+            input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), input_array_index)[input_array_item_index];
+            input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), input_array_index)[input_array_item_index];
+            input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), input_array_index)[input_array_item_index];
+            input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), input_array_index)[input_array_item_index];
+            input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), input_array_index)[input_array_item_index];
+            for (int j = 0; j < 9; ++j) {
+                half weight = weights[(output_c * kernelHXW + j) * filter_array_size * 4 + i];
+                output[c] += input[j] * weight;
+            }
+        }
+    }
+    half4 relu = fmax(output, 0.0);
+    outTexture.write(relu, gid.xy, gid.z);
+}
 kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
                                        texture2d_array<half, access::write> outTexture [[texture(1)]],
                                        constant MetalConvParam &param [[buffer(0)]],

--- a/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift
@@ -320,7 +320,12 @@ public class PaddleMobileUnitTest {
        let offsetX = filterSize.width/2 - paddings.0
        let offsetY = filterSize.height/2 - paddings.1
-        let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1))
+        let groups = 1
+        let iC = 4
+        let fC = 4
+        let oC = 4
+        let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1), groups: UInt16(groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
        let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize)

--- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift
@@ -244,7 +244,7 @@ public class Loader<P: PrecisionProtocol>: Loaderable{
                            }
                            let dim = Dim.init(inDim: dimArr)
-                            let tensor = Tensor<P>.init(inDim: dim, inLayout: tensorDesc.dataLayout)
+                            let tensor = Tensor<P>.init(inDim: dim, inLayout: tensorDesc.dataLayout, originDimsCount: tensorDesc.originDimsCount)
                            do {
                                if paraLoaderPointer != nil {
                                    try paraLoaderPointer!.read(tensor: tensor)
@@ -261,7 +261,9 @@ public class Loader<P: PrecisionProtocol>: Loaderable{
                            scope[varDesc.name] = tensor
                        } else {
                            let dim = Dim.init(inDim: tensorDesc.dims)
-                            scope[varDesc.name] = Texture.init(device: device, inDim: dim)
+                            let texture = Texture.init(device: device, inDim: dim)
+                            texture.originDimsCount = tensorDesc.originDimsCount
+                            scope[varDesc.name] = texture
                        }
                    } else {
                        if varDesc.name == fetchKey {

--- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift
@@ -202,6 +202,7 @@ class Tensor<P: PrecisionProtocol>: Tensorial {
    var data: Data
    var dim: Dim
+    var originDimsCount: Int
    /// 模型中的维度: 未经过转换 paddle 模型维度为 N C H W
    var tensorDim: Dim
@@ -243,12 +244,13 @@ class Tensor<P: PrecisionProtocol>: Tensorial {
        }
    }
-    init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) {
+    /// Creates a tensor and allocates (uninitialized) storage for `inDim.numel()` elements.
+    ///
+    /// - Parameters:
+    ///   - inDim: Dimensions of the tensor; stored as both `dim` and `tensorDim`.
+    ///   - inLayout: Data layout of the tensor, `NCHW` by default.
+    ///   - originDimsCount: Original number of dimensions of the tensor. Defaults to `nil`,
+    ///     in which case `inDim.cout()` is used, so call sites that only pass
+    ///     `inDim`/`inLayout` keep compiling.
+    init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW(), originDimsCount: Int? = nil) {
        tensorDim = inDim
        dim = inDim
        let pointer = UnsafeMutablePointer<P>.allocate(capacity: inDim.numel())
        data = Data.init(inCount: inDim.numel(), inPointer: pointer)
        layout = inLayout
+        // Fall back to the current dimension count when no original count is supplied.
+        self.originDimsCount = originDimsCount ?? inDim.cout()
    }
    func convert(converter: DataConverter<P>) -> UnsafeMutablePointer<P> {

--- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Texture.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Texture.swift
@@ -72,6 +72,7 @@ public class Texture: Tensorial {
    public var dim: Dim
    public var tensorDim: Dim
    public var useMPS = false
+    public var originDimsCount: Int?
    /// tensor dim pad to four
    public var padToFourDim: Dim

--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift
@@ -35,6 +35,15 @@ class ConcatParam<P: PrecisionProtocol>: OpParam {
                input.append(v)
            }
            axis = try ConcatParam.getAttr(key: "axis", attrs: opDesc.attrs)
+            if input.count > 0 {
+                if let originDimsCount = input[0].originDimsCount {
+                    let nowDimsCount = input[0].dim.cout()
+                    let diff = originDimsCount - nowDimsCount
+                    if diff > 0 {
+                        axis -= diff
+                    }
+                }
+            }
            output = try ConcatParam.outputOut(outputs: opDesc.outputs, from: inScope)
        } catch let error {
            throw error
@@ -43,7 +52,7 @@ class ConcatParam<P: PrecisionProtocol>: OpParam {
    var input: [Texture] = []
    var output: Texture
    var transpose: [Int] = []
-    let axis: Int
+    var axis: Int
 }
 class ConcatOp<P: PrecisionProtocol>: Operator<ConcatKernel<P>, ConcatParam<P>>, Runable, Creator, InferShaperable{

--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
@@ -132,7 +132,10 @@ class ConvAddAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
        //    print("offset y: \(offsetY)")
        let offsetZ = 0.0
-        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+        let iC = param.input.tensorDim[1];
+        let fC = param.filter.tensorDim[1];
+        let oC = param.output.tensorDim[1];
+        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
        //    print("metal param: ")
        //    print(inMetalParam)

--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
@@ -95,7 +95,10 @@ class ConvAddBatchNormReluKernel<P: PrecisionProtocol>: Kernel, Computable, Test
        print("offset y: \(offsetY)")
        let offsetZ = 0.0
-        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+        let iC = param.input.tensorDim[1];
+        let fC = param.filter.tensorDim[1];
+        let oC = param.output.tensorDim[1];
+        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
        var invs: [P] = []
        let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)

--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
@@ -181,7 +181,10 @@ class ConvAddKernel<P: PrecisionProtocol>: Kernel, Computable {
        let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
        let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
        let offsetZ = 0.0
-        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+        let iC = param.input.tensorDim[1];
+        let fC = param.filter.tensorDim[1];
+        let oC = param.output.tensorDim[1];
+        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
        metalParam = inMetalParam
        if type(of: self).isWinoGrad(functionName: functionName) {

--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
@@ -132,7 +132,10 @@ class ConvAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
        //    print("offset y: \(offsetY)")
        let offsetZ = 0.0
-        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+        let iC = param.input.tensorDim[1];
+        let fC = param.filter.tensorDim[1];
+        let oC = param.output.tensorDim[1];
+        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
        //    print("metal param: ")
        //    print(inMetalParam)

--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
@@ -25,7 +25,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: ConvAddKernel<P> {
                }
                return "depthwise_conv_add_relu_3x3_half"
            } else if param.filter.width == 3 && param.filter.height == 3 {
-                return "conv_add_relu_3x3_half"
+                if param.groups == 1 {
+                    return "conv_add_relu_3x3_half"
+                } else {
+                    return "group_conv_add_relu_3x3_half"
+                }
            } else if param.filter.width == 1 && param.filter.height == 5 {
                return "conv_add_relu_5x1_half"
            } else if param.filter.width == 5 && param.filter.height == 1 {
@@ -43,7 +47,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: ConvAddKernel<P> {
            } else if param.filter.width == 5 && param.filter.height == 1 {
                return "conv_add_relu_1x5"
            } else if param.filter.width == 3 && param.filter.height == 3 {
-                return "conv_add_relu_3x3"
+                if param.groups == 1 {
+                    return "conv_add_relu_3x3"
+                } else {
+                    return "group_conv_add_relu_3x3"
+                }
            } else {
                return nil
            }

--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
@@ -102,8 +102,10 @@ class ConvBNReluKernel<P: PrecisionProtocol>: Kernel, Computable, Testable {
        //    print("ConvBNReluKernel offset y: \(offsetY)")
        let offsetZ = 0.0
+        let iC = param.input.tensorDim[1];
-        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+        let fC = param.filter.tensorDim[1];
+        let oC = param.output.tensorDim[1];
+        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
        var invs: [P] = []
        let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)

--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
@@ -22,6 +22,10 @@ public struct MetalConvParam {
    let strideY: UInt16
    let dilationX: UInt16
    let dilationY: UInt16
+    let groups: UInt16
+    let iC: UInt16
+    let fC: UInt16
+    let oC: UInt16
 }
 class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
@@ -41,8 +45,11 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
        let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0])
        let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1])
        let offsetZ = 0.0
+        let iC = param.input.tensorDim[1];
+        let fC = param.filter.tensorDim[1];
+        let oC = param.output.tensorDim[1];
-        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
    }
    func compute(commandBuffer: MTLCommandBuffer, param: ConvParam<P>) throws {

--- a/metal/paddle-mobile/paddle-mobile/Src/Program/TensorDesc.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Program/TensorDesc.swift
@@ -16,6 +16,7 @@ import Foundation
 class TensorDesc {
    let dims: [Int]
+    let originDimsCount: Int
    let dataType: VarTypeType
    let dataLayout: DataLayout = DataLayout.NCHW()
    var NCHWDim: [Int] {
@@ -63,6 +64,8 @@ class TensorDesc {
            dimsArray.append(dim)
        }
+        originDimsCount = Int(dimsCount)
        if dimsCount > 4 {
            let headDims = Int(dimsCount - 4)
            for i in 0..<headDims {