From 6b32f9adc595c0aa638d0e7760f90a6bdc9043bf Mon Sep 17 00:00:00 2001 From: Yanzhan Yang Date: Tue, 21 May 2019 22:54:02 +0800 Subject: [PATCH] 1.add groups to conv3x3. 2.add original size to tensor and texture and adjust concat layer to handle tensor truncation logic. (#1643) --- .../BatchNormRelu.metal | 8 -- .../paddle-mobile-metallib/Common.metal | 4 + .../paddle-mobile-metallib/ConcatKernel.metal | 4 +- .../ConvAddReluMetal.metal | 110 ++++++++++++++++++ .../Src/Common/PaddleMobileUnitTest.swift | 7 +- .../paddle-mobile/Src/Framework/Loader.swift | 6 +- .../paddle-mobile/Src/Framework/Tensor.swift | 4 +- .../paddle-mobile/Src/Framework/Texture.swift | 1 + .../Src/Operators/ConcatOp.swift | 11 +- .../Kernels/ConvAddAddPreluKernel.swift | 5 +- .../Kernels/ConvAddBatchNormReluKernel.swift | 5 +- .../Src/Operators/Kernels/ConvAddKernel.swift | 5 +- .../Kernels/ConvAddPreluKernel.swift | 5 +- .../Operators/Kernels/ConvAddReluKernel.swift | 12 +- .../Operators/Kernels/ConvBNReluKernel.swift | 6 +- .../Src/Operators/Kernels/ConvKernel.swift | 9 +- .../Src/Program/TensorDesc.swift | 3 + 17 files changed, 181 insertions(+), 24 deletions(-) diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal index 98ba10d847..18f1ee3769 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/BatchNormRelu.metal @@ -6,14 +6,6 @@ #include using namespace metal; -struct MetalConvParam { - short offsetX; - short offsetY; - short offsetZ; - ushort strideX; - ushort strideY; -}; - kernel void batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], texture2d_array outTexture [[texture(1)]], const device float4 *new_scale [[buffer(0)]], diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal index 
a25e354d71..fb4677a071 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal @@ -116,4 +116,8 @@ struct MetalConvParam { ushort strideY; ushort dilationX; ushort dilationY; + ushort groups; + ushort iC; + ushort fC; + ushort oC; }; diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal index c4c9c7bbcf..671b912bb2 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal @@ -67,8 +67,8 @@ struct ConcatParam { #undef R #undef V -// lens: (R=4, N=3, V=y) -#define V VY +// lens: (R=4, N=3, V=normal) +#define V VNORMAL #define R 4 #define N 3 #define P float diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal index f440af8740..7b937282c6 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal @@ -129,6 +129,61 @@ kernel void conv_add_relu_3x3(texture2d_array inTexture [ outTexture.write(relu, gid.xy, gid.z); } +kernel void group_conv_add_relu_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam &param [[buffer(0)]], + const device float *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, 
address::clamp_to_zero); + + const uint kernelHXW = 9; + + float4 output = biase[gid.z]; + + ushort dilation_x = param.dilationX; + ushort dilation_y = param.dilationY; + + float input[9]; + + uint iC = param.iC, fC = param.fC, oC = param.oC; + uint filter_array_size = (fC + 3) / 4; + + for (uint c = 0; c < 4; ++c) { + uint output_depth = gid.z * 4 + c, output_c = output_depth % oC, output_n = output_depth / oC; + for (uint i = 0; i < fC; ++i) { + uint input_depth = output_n * iC + output_c * fC + i; + uint input_array_index = input_depth / 4; + uint input_array_item_index = input_depth % 4; + input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), input_array_index)[input_array_item_index]; + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), input_array_index)[input_array_item_index]; + input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), input_array_index)[input_array_item_index]; + input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), input_array_index)[input_array_item_index]; + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), input_array_index)[input_array_item_index]; + input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), input_array_index)[input_array_item_index]; + input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), input_array_index)[input_array_item_index]; + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), input_array_index)[input_array_item_index]; + input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), input_array_index)[input_array_item_index]; + for (int j = 0; j < 9; ++j) { + float weight = weights[(output_c * kernelHXW + j) * filter_array_size * 4 + i]; + output[c] += input[j] * weight; + } + } + } + + float4 relu = 
fmax(output, 0.0); + outTexture.write(relu, gid.xy, gid.z); +} + kernel void conv_add_relu_5x1(texture2d_array inTexture [[texture(0)]], texture2d_array outTexture [[texture(1)]], constant MetalConvParam &param [[buffer(0)]], @@ -385,6 +440,61 @@ kernel void conv_add_relu_3x3_half(texture2d_array inTextu outTexture.write(half4(relu), gid.xy, gid.z); } +kernel void group_conv_add_relu_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam &param [[buffer(0)]], + const device half *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 9; + + half4 output = biase[gid.z]; + + ushort dilation_x = param.dilationX; + ushort dilation_y = param.dilationY; + + half input[9]; + + uint iC = param.iC, fC = param.fC, oC = param.oC; + uint filter_array_size = (fC + 3) / 4; + + for (uint c = 0; c < 4; ++c) { + uint output_depth = gid.z * 4 + c, output_c = output_depth % oC, output_n = output_depth / oC; + for (uint i = 0; i < fC; ++i) { + uint input_depth = output_n * iC + output_c * fC + i; + uint input_array_index = input_depth / 4; + uint input_array_item_index = input_depth % 4; + input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), input_array_index)[input_array_item_index]; + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), input_array_index)[input_array_item_index]; + input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), 
input_array_index)[input_array_item_index]; + input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), input_array_index)[input_array_item_index]; + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), input_array_index)[input_array_item_index]; + input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), input_array_index)[input_array_item_index]; + input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), input_array_index)[input_array_item_index]; + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), input_array_index)[input_array_item_index]; + input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), input_array_index)[input_array_item_index]; + for (int j = 0; j < 9; ++j) { + half weight = weights[(output_c * kernelHXW + j) * filter_array_size * 4 + i]; + output[c] += input[j] * weight; + } + } + } + + half4 relu = fmax(output, 0.0); + outTexture.write(relu, gid.xy, gid.z); +} + kernel void depthwise_conv_add_relu_3x3_half(texture2d_array inTexture [[texture(0)]], texture2d_array outTexture [[texture(1)]], constant MetalConvParam &param [[buffer(0)]], diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift index 52c27ccead..02dc760d59 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Common/PaddleMobileUnitTest.swift @@ -320,7 +320,12 @@ public class PaddleMobileUnitTest { let offsetX = filterSize.width/2 - paddings.0 let offsetY = filterSize.height/2 - paddings.1 - let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1)) + let groups = 1 + let iC = 4 + let fC = 4 
+ let oC = 4 + + let metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: 0, strideX: UInt16(stride.0), strideY: UInt16(stride.1), dilationX: UInt16(1), dilationY: UInt16(1), groups: UInt16(groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC)) let param = ConvAddBatchNormReluTestParam.init(inInputTexture: inputeTexture, inOutputTexture: outputTexture, inMetalParam: metalParam, inFilterBuffer: filterBuffer, inBiaseBuffer: biaseBuffer, inNewScaleBuffer: newScalueBuffer, inNewBiaseBuffer: newBiaseBuffer, inFilterSize: filterSize) diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift index 588ae5de88..664f2dfff9 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift @@ -244,7 +244,7 @@ public class Loader: Loaderable{ } let dim = Dim.init(inDim: dimArr) - let tensor = Tensor

.init(inDim: dim, inLayout: tensorDesc.dataLayout) + let tensor = Tensor

.init(inDim: dim, inLayout: tensorDesc.dataLayout, originDimsCount: tensorDesc.originDimsCount) do { if paraLoaderPointer != nil { try paraLoaderPointer!.read(tensor: tensor) @@ -261,7 +261,9 @@ public class Loader: Loaderable{ scope[varDesc.name] = tensor } else { let dim = Dim.init(inDim: tensorDesc.dims) - scope[varDesc.name] = Texture.init(device: device, inDim: dim) + let texture = Texture.init(device: device, inDim: dim) + texture.originDimsCount = tensorDesc.originDimsCount + scope[varDesc.name] = texture } } else { if varDesc.name == fetchKey { diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift index 4a0bf10567..b993850434 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Tensor.swift @@ -202,6 +202,7 @@ class Tensor: Tensorial { var data: Data var dim: Dim + var originDimsCount: Int /// 模型中的维度: 未经过转换 paddle 模型维度为 N C H W var tensorDim: Dim @@ -243,12 +244,13 @@ class Tensor: Tensorial { } } - init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW()) { + init(inDim: Dim, inLayout: DataLayout = DataLayout.NCHW(), originDimsCount: Int?) { tensorDim = inDim dim = inDim let pointer = UnsafeMutablePointer

.allocate(capacity: inDim.numel()) data = Data.init(inCount: inDim.numel(), inPointer: pointer) layout = inLayout + self.originDimsCount = originDimsCount ?? inDim.cout() } func convert(converter: DataConverter

) -> UnsafeMutablePointer

{ diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Texture.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Texture.swift index 63250fa085..3c77bc16b4 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Framework/Texture.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Framework/Texture.swift @@ -72,6 +72,7 @@ public class Texture: Tensorial { public var dim: Dim public var tensorDim: Dim public var useMPS = false + public var originDimsCount: Int? /// tensor dim pad to four public var padToFourDim: Dim diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift index e526bf05db..6cf8d741a9 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConcatOp.swift @@ -35,6 +35,15 @@ class ConcatParam: OpParam { input.append(v) } axis = try ConcatParam.getAttr(key: "axis", attrs: opDesc.attrs) + if input.count > 0 { + if let originDimsCount = input[0].originDimsCount { + let nowDimsCount = input[0].dim.cout() + let diff = originDimsCount - nowDimsCount + if diff > 0 { + axis -= diff + } + } + } output = try ConcatParam.outputOut(outputs: opDesc.outputs, from: inScope) } catch let error { throw error @@ -43,7 +52,7 @@ class ConcatParam: OpParam { var input: [Texture] = [] var output: Texture var transpose: [Int] = [] - let axis: Int + var axis: Int } class ConcatOp: Operator, ConcatParam

>, Runable, Creator, InferShaperable{ diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift index 6c019c089c..f3e1df7a60 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift @@ -132,7 +132,10 @@ class ConvAddAddPreluKernel: Kernel, Computable { // print("offset y: \(offsetY)") let offsetZ = 0.0 - let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + let iC = param.input.tensorDim[1]; + let fC = param.filter.tensorDim[1]; + let oC = param.output.tensorDim[1]; + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC)) // print("metal param: ") // print(inMetalParam) diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift index 09b7d3966b..c5405fc8cf 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift @@ -95,7 +95,10 @@ class ConvAddBatchNormReluKernel: Kernel, Computable, Test print("offset y: \(offsetY)") let offsetZ = 0.0 - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: 
UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + let iC = param.input.tensorDim[1]; + let fC = param.filter.tensorDim[1]; + let oC = param.output.tensorDim[1]; + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC)) var invs: [P] = [] let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift index c46640943b..cb6944980a 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift @@ -181,7 +181,10 @@ class ConvAddKernel: Kernel, Computable { let offsetX = (Int(param.dilations[0]) * (param.filter.tensorDim[3] - 1) + 1) / 2 - Int(param.paddings[0]) let offsetY = (Int(param.dilations[1]) * (param.filter.tensorDim[2] - 1) + 1) / 2 - Int(param.paddings[1]) let offsetZ = 0.0 - let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + let iC = param.input.tensorDim[1]; + let fC = param.filter.tensorDim[1]; + let oC = param.output.tensorDim[1]; + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: 
UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC)) metalParam = inMetalParam if type(of: self).isWinoGrad(functionName: functionName) { diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift index 4186f10dbe..5b5daaacbc 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift @@ -132,7 +132,10 @@ class ConvAddPreluKernel: Kernel, Computable { // print("offset y: \(offsetY)") let offsetZ = 0.0 - let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + let iC = param.input.tensorDim[1]; + let fC = param.filter.tensorDim[1]; + let oC = param.output.tensorDim[1]; + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC)) // print("metal param: ") // print(inMetalParam) diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift index fc43a6c17e..353ac9f467 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift @@ -25,7 +25,11 @@ class ConvAddReluKernel: ConvAddKernel

{ } return "depthwise_conv_add_relu_3x3_half" } else if param.filter.width == 3 && param.filter.height == 3 { - return "conv_add_relu_3x3_half" + if param.groups == 1 { + return "conv_add_relu_3x3_half" + } else { + return "group_conv_add_relu_3x3_half" + } } else if param.filter.width == 1 && param.filter.height == 5 { return "conv_add_relu_5x1_half" } else if param.filter.width == 5 && param.filter.height == 1 { @@ -43,7 +47,11 @@ class ConvAddReluKernel: ConvAddKernel

{ } else if param.filter.width == 5 && param.filter.height == 1 { return "conv_add_relu_1x5" } else if param.filter.width == 3 && param.filter.height == 3 { - return "conv_add_relu_3x3" + if param.groups == 1 { + return "conv_add_relu_3x3" + } else { + return "group_conv_add_relu_3x3" + } } else { return nil } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift index 7239a3e1a5..0ecf929093 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift @@ -102,8 +102,10 @@ class ConvBNReluKernel: Kernel, Computable, Testable { // print("ConvBNReluKernel offset y: \(offsetY)") let offsetZ = 0.0 - - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + let iC = param.input.tensorDim[1]; + let fC = param.filter.tensorDim[1]; + let oC = param.output.tensorDim[1]; + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC)) var invs: [P] = [] let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift index d4d4236605..5dbabf9fab 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift +++ 
b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift @@ -22,6 +22,10 @@ public struct MetalConvParam { let strideY: UInt16 let dilationX: UInt16 let dilationY: UInt16 + let groups: UInt16 + let iC: UInt16 + let fC: UInt16 + let oC: UInt16 } class ConvKernel: Kernel, Computable { @@ -41,8 +45,11 @@ class ConvKernel: Kernel, Computable { let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0]) let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1]) let offsetZ = 0.0 + let iC = param.input.tensorDim[1]; + let fC = param.filter.tensorDim[1]; + let oC = param.output.tensorDim[1]; - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC)) } func compute(commandBuffer: MTLCommandBuffer, param: ConvParam

) throws { diff --git a/metal/paddle-mobile/paddle-mobile/Src/Program/TensorDesc.swift b/metal/paddle-mobile/paddle-mobile/Src/Program/TensorDesc.swift index badca3dbac..5871fe0451 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Program/TensorDesc.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Program/TensorDesc.swift @@ -16,6 +16,7 @@ import Foundation class TensorDesc { let dims: [Int] + let originDimsCount: Int let dataType: VarTypeType let dataLayout: DataLayout = DataLayout.NCHW() var NCHWDim: [Int] { @@ -63,6 +64,8 @@ class TensorDesc { dimsArray.append(dim) } + originDimsCount = Int(dimsCount) + if dimsCount > 4 { let headDims = Int(dimsCount - 4) for i in 0..