From 6b32f9adc595c0aa638d0e7760f90a6bdc9043bf Mon Sep 17 00:00:00 2001
From: Yanzhan Yang <yangyanzhan@gmail.com>
Date: Tue, 21 May 2019 22:54:02 +0800
Subject: [PATCH] 1.add groups to conv3x3. 2.add original size to tensor and
 texture and adjust concat layer to handle tensor truncation logic. (#1643)

---
 .../BatchNormRelu.metal                       |   8 --
 .../paddle-mobile-metallib/Common.metal       |   4 +
 .../paddle-mobile-metallib/ConcatKernel.metal |   4 +-
 .../ConvAddReluMetal.metal                    | 110 ++++++++++++++++++
 .../Src/Common/PaddleMobileUnitTest.swift     |   7 +-
 .../paddle-mobile/Src/Framework/Loader.swift  |   6 +-
 .../paddle-mobile/Src/Framework/Tensor.swift  |   4 +-
 .../paddle-mobile/Src/Framework/Texture.swift |   1 +
 .../Src/Operators/ConcatOp.swift              |  11 +-
 .../Kernels/ConvAddAddPreluKernel.swift       |   5 +-
 .../Kernels/ConvAddBatchNormReluKernel.swift  |   5 +-
 .../Src/Operators/Kernels/ConvAddKernel.swift |   5 +-
 .../Kernels/ConvAddPreluKernel.swift          |   5 +-
 .../Operators/Kernels/ConvAddReluKernel.swift |  12 +-
 .../Operators/Kernels/ConvBNReluKernel.swift  |   6 +-
 .../Src/Operators/Kernels/ConvKernel.swift    |   9 +-
 .../Src/Program/TensorDesc.swift              |   3 +
 17 files changed, 181 insertions(+), 24 deletions(-)

diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal
index 98ba10d847..18f1ee3769 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal
@@ -6,14 +6,6 @@
 #include <metal_stdlib>
 using namespace metal;
 
-struct MetalConvParam {
-    short offsetX;
-    short offsetY;
-    short offsetZ;
-    ushort strideX;
-    ushort strideY;
-};
-
 kernel void batch_norm_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
                                 texture2d_array<float, access::write> outTexture [[texture(1)]],
                                 const device float4 *new_scale [[buffer(0)]],
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal
index a25e354d71..fb4677a071 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal
@@ -116,4 +116,8 @@ struct MetalConvParam {
     ushort strideY;
     ushort dilationX;
     ushort dilationY;
+    ushort groups;
+    ushort iC;
+    ushort fC;
+    ushort oC;
 };
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal
index c4c9c7bbcf..671b912bb2 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal
@@ -67,8 +67,8 @@ struct ConcatParam {
 #undef R
 #undef V
 
-// lens: (R=4, N=3, V=y)
-#define V VY
+// lens: (R=4, N=3, V=normal)
+#define V VNORMAL
 #define R 4
 #define N 3
 #define P float
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
index f440af8740..7b937282c6 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
@@ -129,6 +129,61 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [
     outTexture.write(relu, gid.xy, gid.z);
 }
 
+kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [[texture(0)]],
+                              texture2d_array<float, access::write> outTexture [[texture(1)]],
+                              constant MetalConvParam &param [[buffer(0)]],
+                              const device float *weights [[buffer(1)]],
+                              const device float4 *biase [[buffer(2)]],
+                              uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    // Grouped 3x3 convolution + bias + ReLU: each thread writes one output texel (four channels packed in slice gid.z).
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    // Taps that fall outside the input texture read as zero (address::clamp_to_zero).
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    // Number of taps in the 3x3 filter window.
+    const uint kernelHXW = 9;
+    // The accumulator starts from the bias of the four channels stored in slice gid.z.
+    float4 output = biase[gid.z];
+    // Pixel spacing between neighbouring taps along x and y.
+    ushort dilation_x = param.dilationX;
+    ushort dilation_y = param.dilationY;
+    // The nine samples of one input channel, row-major over the 3x3 window.
+    float input[9];
+    // iC / oC: input / output channel counts; fC: channels per filter; weights are indexed with fC padded to a multiple of 4.
+    uint iC = param.iC, fC = param.fC, oC = param.oC;
+    uint filter_array_size = (fC + 3) / 4;
+    uint group_out_channels = max(oC / max(uint(param.groups), 1u), 1u); // output channels per group, never 0
+    for (uint c = 0; c < 4; ++c) {
+        uint output_depth = gid.z * 4 + c, output_c = output_depth % oC, output_n = output_depth / oC;
+        for (uint i = 0; i < fC; ++i) {
+            uint input_depth = output_n * iC + (output_c / group_out_channels) * fC + i; // i-th input channel of this output channel's group
+            uint input_array_index = input_depth / 4;
+            uint input_array_item_index = input_depth % 4;
+            input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), input_array_index)[input_array_item_index];
+            input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), input_array_index)[input_array_item_index];
+            input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), input_array_index)[input_array_item_index];
+            input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), input_array_index)[input_array_item_index];
+            input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), input_array_index)[input_array_item_index];
+            input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), input_array_index)[input_array_item_index];
+            input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), input_array_index)[input_array_item_index];
+            input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), input_array_index)[input_array_item_index];
+            input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), input_array_index)[input_array_item_index];
+            for (int j = 0; j < 9; ++j) {
+                float weight = weights[(output_c * kernelHXW + j) * filter_array_size * 4 + i];
+                output[c] += input[j] * weight;
+            }
+        }
+    }
+    // ReLU: clamp negative sums to zero before storing.
+    float4 relu = fmax(output, 0.0);
+    outTexture.write(relu, gid.xy, gid.z);
+}
+
 kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
                          texture2d_array<float, access::write> outTexture [[texture(1)]],
                          constant MetalConvParam &param [[buffer(0)]],
@@ -385,6 +440,61 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
     outTexture.write(half4(relu), gid.xy, gid.z);
 }
 
+kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
+                                    texture2d_array<half, access::write> outTexture [[texture(1)]],
+                                    constant MetalConvParam &param [[buffer(0)]],
+                                    const device half *weights [[buffer(1)]],
+                                    const device half4 *biase [[buffer(2)]],
+                                    uint3 gid [[thread_position_in_grid]]) {
+    if (gid.x >= outTexture.get_width() ||
+        gid.y >= outTexture.get_height() ||
+        gid.z >= outTexture.get_array_size()) {
+        return;
+    }
+    // Half-precision grouped 3x3 convolution + bias + ReLU: each thread writes one output texel (four channels in slice gid.z).
+    ushort2 stride = ushort2(param.strideX, param.strideY);
+    const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY);
+    // Taps that fall outside the input texture read as zero (address::clamp_to_zero).
+    constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+    // Number of taps in the 3x3 filter window.
+    const uint kernelHXW = 9;
+    // The accumulator starts from the bias of the four channels stored in slice gid.z.
+    half4 output = biase[gid.z];
+    // Pixel spacing between neighbouring taps along x and y.
+    ushort dilation_x = param.dilationX;
+    ushort dilation_y = param.dilationY;
+    // The nine samples of one input channel, row-major over the 3x3 window.
+    half input[9];
+    // iC / oC: input / output channel counts; fC: channels per filter; weights are indexed with fC padded to a multiple of 4.
+    uint iC = param.iC, fC = param.fC, oC = param.oC;
+    uint filter_array_size = (fC + 3) / 4;
+    uint group_out_channels = max(oC / max(uint(param.groups), 1u), 1u); // output channels per group, never 0
+    for (uint c = 0; c < 4; ++c) {
+        uint output_depth = gid.z * 4 + c, output_c = output_depth % oC, output_n = output_depth / oC;
+        for (uint i = 0; i < fC; ++i) {
+            uint input_depth = output_n * iC + (output_c / group_out_channels) * fC + i; // i-th input channel of this output channel's group
+            uint input_array_index = input_depth / 4;
+            uint input_array_item_index = input_depth % 4;
+            input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), input_array_index)[input_array_item_index];
+            input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), input_array_index)[input_array_item_index];
+            input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), input_array_index)[input_array_item_index];
+            input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), input_array_index)[input_array_item_index];
+            input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), input_array_index)[input_array_item_index];
+            input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), input_array_index)[input_array_item_index];
+            input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), input_array_index)[input_array_item_index];
+            input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), input_array_index)[input_array_item_index];
+            input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), input_array_index)[input_array_item_index];
+            for (int j = 0; j < 9; ++j) {
+                half weight = weights[(output_c * kernelHXW + j) * filter_array_size * 4 + i];
+                output[c] += input[j] * weight;
+            }
+        }
+    }
+    // ReLU: clamp negative sums to zero before storing.
+    half4 relu = fmax(output, 0.0);
+    outTexture.write(relu, gid.xy, gid.z);
+}
+
 kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTexture [[texture(0)]],
                                         texture2d_array<half, access::write> outTexture [[texture(1)]],
                                         constant MetalConvParam &param [[buffer(0)]],
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift
index 52c27ccead..02dc760d59 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift
@@ -320,7 +320,12 @@ public class PaddleMobileUnitTest {
         let offsetX = filterSize.width/2 - paddings.0
         let offsetY = filterSize.height/2 - paddings.1
         
-        let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1))
+        let groups = 1
+        let iC = 4
+        let fC = 4
+        let oC = 4
+        
+        let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1), groups: UInt16(groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
         
         let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize)
         
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift
index 588ae5de88..664f2dfff9 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift
@@ -244,7 +244,7 @@ public class Loader<P: PrecisionProtocol>: Loaderable{
                             }
                             
                             let dim = Dim.init(inDim: dimArr)
-                            let tensor = Tensor<P>.init(inDim: dim, inLayout: tensorDesc.dataLayout)
+                            let tensor = Tensor<P>.init(inDim: dim, inLayout: tensorDesc.dataLayout, originDimsCount: tensorDesc.originDimsCount)
                             do {
                                 if paraLoaderPointer != nil {
                                     try paraLoaderPointer!.read(tensor: tensor)
@@ -261,7 +261,9 @@ public class Loader<P: PrecisionProtocol>: Loaderable{
                             scope[varDesc.name] = tensor
                         } else {
                             let dim = Dim.init(inDim: tensorDesc.dims)
-                            scope[varDesc.name] = Texture.init(device: device, inDim: dim)
+                            let texture = Texture.init(device: device, inDim: dim)
+                            texture.originDimsCount = tensorDesc.originDimsCount
+                            scope[varDesc.name] = texture
                         }
                     } else {
                         if varDesc.name == fetchKey {
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift
index 4a0bf10567..b993850434 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift
@@ -202,6 +202,7 @@ class Tensor<P: PrecisionProtocol>: Tensorial {
     
     var data: Data
     var dim: Dim
+    var originDimsCount: Int
     
     /// 模型中的维度: 未经过转换 paddle 模型维度为 N C H W
     var tensorDim: Dim
@@ -243,12 +244,13 @@ class Tensor<P: PrecisionProtocol>: Tensorial {
         }
     }
     
-    init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) {
+    init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW(), originDimsCount: Int? = nil) { // nil default keeps init(inDim:inLayout:) call sites compiling
         tensorDim = inDim
         dim = inDim
         let pointer = UnsafeMutablePointer<P>.allocate(capacity: inDim.numel())
         data = Data.init(inCount: inDim.numel(), inPointer: pointer)
         layout = inLayout
+        self.originDimsCount = originDimsCount ?? inDim.cout() // falls back to inDim.cout() when no original dims count is supplied
     }
     
     func convert(converter: DataConverter<P>) -> UnsafeMutablePointer<P> {
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Texture.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Texture.swift
index 63250fa085..3c77bc16b4 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Texture.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Texture.swift
@@ -72,6 +72,7 @@ public class Texture: Tensorial {
     public var dim: Dim
     public var tensorDim: Dim
     public var useMPS = false
+    public var originDimsCount: Int?
     
     /// tensor dim pad to four
     public var padToFourDim: Dim
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift
index e526bf05db..6cf8d741a9 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift
@@ -35,6 +35,15 @@ class ConcatParam<P: PrecisionProtocol>: OpParam {
                 input.append(v)
             }
             axis = try ConcatParam.getAttr(key: "axis", attrs: opDesc.attrs)
+            if input.count > 0 {
+                if let originDimsCount = input[0].originDimsCount {
+                    let nowDimsCount = input[0].dim.cout()
+                    let diff = originDimsCount - nowDimsCount
+                    if diff > 0 {
+                        axis -= diff
+                    }
+                }
+            }
             output = try ConcatParam.outputOut(outputs: opDesc.outputs, from: inScope)
         } catch let error {
             throw error
@@ -43,7 +52,7 @@ class ConcatParam<P: PrecisionProtocol>: OpParam {
     var input: [Texture] = []
     var output: Texture
     var transpose: [Int] = []
-    let axis: Int
+    var axis: Int
 }
 
 class ConcatOp<P: PrecisionProtocol>: Operator<ConcatKernel<P>, ConcatParam<P>>, Runable, Creator, InferShaperable{
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
index 6c019c089c..f3e1df7a60 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
@@ -132,7 +132,10 @@ class ConvAddAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
         //    print("offset y: \(offsetY)")
         
         let offsetZ = 0.0
-        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+        let iC = param.input.tensorDim[1];
+        let fC = param.filter.tensorDim[1];
+        let oC = param.output.tensorDim[1];
+        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
         //    print("metal param: ")
         //    print(inMetalParam)
         
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
index 09b7d3966b..c5405fc8cf 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
@@ -95,7 +95,10 @@ class ConvAddBatchNormReluKernel<P: PrecisionProtocol>: Kernel, Computable, Test
         print("offset y: \(offsetY)")
         
         let offsetZ = 0.0
-        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+        let iC = param.input.tensorDim[1];
+        let fC = param.filter.tensorDim[1];
+        let oC = param.output.tensorDim[1];
+        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
         
         var invs: [P] = []
         let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
index c46640943b..cb6944980a 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
@@ -181,7 +181,10 @@ class ConvAddKernel<P: PrecisionProtocol>: Kernel, Computable {
         let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0])
         let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1])
         let offsetZ = 0.0
-        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+        let iC = param.input.tensorDim[1];
+        let fC = param.filter.tensorDim[1];
+        let oC = param.output.tensorDim[1];
+        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
         metalParam = inMetalParam
         
         if type(of: self).isWinoGrad(functionName: functionName) {
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
index 4186f10dbe..5b5daaacbc 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
@@ -132,7 +132,10 @@ class ConvAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
         //    print("offset y: \(offsetY)")
         
         let offsetZ = 0.0
-        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+        let iC = param.input.tensorDim[1];
+        let fC = param.filter.tensorDim[1];
+        let oC = param.output.tensorDim[1];
+        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
         //    print("metal param: ")
         //    print(inMetalParam)
         
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
index fc43a6c17e..353ac9f467 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
@@ -25,7 +25,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: ConvAddKernel<P> {
                 }
                 return "depthwise_conv_add_relu_3x3_half"
             } else if param.filter.width == 3 && param.filter.height == 3 {
-                return "conv_add_relu_3x3_half"
+                if param.groups == 1 {
+                    return "conv_add_relu_3x3_half"
+                } else {
+                    return "group_conv_add_relu_3x3_half"
+                }
             } else if param.filter.width == 1 && param.filter.height == 5 {
                 return "conv_add_relu_5x1_half"
             } else if param.filter.width == 5 && param.filter.height == 1 {
@@ -43,7 +47,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: ConvAddKernel<P> {
             } else if param.filter.width == 5 && param.filter.height == 1 {
                 return "conv_add_relu_1x5"
             } else if param.filter.width == 3 && param.filter.height == 3 {
-                return "conv_add_relu_3x3"
+                if param.groups == 1 {
+                    return "conv_add_relu_3x3"
+                } else {
+                    return "group_conv_add_relu_3x3"
+                }
             } else {
                 return nil
             }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
index 7239a3e1a5..0ecf929093 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
@@ -102,8 +102,10 @@ class ConvBNReluKernel<P: PrecisionProtocol>: Kernel, Computable, Testable {
         //    print("ConvBNReluKernel offset y: \(offsetY)")
         
         let offsetZ = 0.0
-        
-        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+        let iC = param.input.tensorDim[1];
+        let fC = param.filter.tensorDim[1];
+        let oC = param.output.tensorDim[1];
+        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
         
         var invs: [P] = []
         let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
index d4d4236605..5dbabf9fab 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
@@ -22,6 +22,10 @@ public struct MetalConvParam {
     let strideY: UInt16
     let dilationX: UInt16
     let dilationY: UInt16
+    let groups: UInt16
+    let iC: UInt16
+    let fC: UInt16
+    let oC: UInt16
 }
 
 class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
@@ -41,8 +45,11 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
         let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0])
         let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1])
         let offsetZ = 0.0
+        let iC = param.input.tensorDim[1];
+        let fC = param.filter.tensorDim[1];
+        let oC = param.output.tensorDim[1];
         
-        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]))
+        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC))
     }
     
     func compute(commandBuffer: MTLCommandBuffer, param: ConvParam<P>) throws {
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/TensorDesc.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/TensorDesc.swift
index badca3dbac..5871fe0451 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Program/TensorDesc.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Program/TensorDesc.swift
@@ -16,6 +16,7 @@ import Foundation
 
 class TensorDesc {
     let dims: [Int]
+    let originDimsCount: Int
     let dataType: VarTypeType
     let dataLayout: DataLayout = DataLayout.NCHW()
     var NCHWDim: [Int] {
@@ -63,6 +64,8 @@ class TensorDesc {
             dimsArray.append(dim)
         }
         
+        originDimsCount = Int(dimsCount)
+        
         if dimsCount > 4 {
             let headDims = Int(dimsCount - 4)
             for i in 0..<headDims {
-- 
GitLab