diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal
index 418dcb43965775032f8df51c3125bd64e157e028..185370c519df8e07317ed01469c6a710587307e8 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal
@@ -107,6 +107,15 @@ inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) {
     }
 }
 
+struct ElementwiseAddParam {
+    int32_t fast;
+    int32_t axis;
+    int32_t ylen;
+    int32_t xdim[4];
+    int32_t xtrans[4];
+    int32_t ydim[4];
+    int32_t ytrans[4];
+};
 
 struct MetalConvParam {
     short offsetX;
@@ -122,4 +131,5 @@ struct MetalConvParam {
     ushort oC;
     ushort hasAddOp;
     ushort hasReluOp;
+    ElementwiseAddParam addParam;
 };
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal
index 671b912bb222da44c000e06ad7d273c10099299b..55362f44decce3215e5a235a9639fb1985c068d6 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal
@@ -204,3 +204,16 @@ struct ConcatParam {
 #undef N
 #undef R
 #undef V
+
+#define V VY
+#define R 4
+#define N 3
+#define P float
+#include "ConcatKernel.inc.metal"
+#undef P
+#define P half
+#include "ConcatKernel.inc.metal"
+#undef P
+#undef N
+#undef R
+#undef V
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
index de85897c1014437c05e71887c649f06c7e39e574..d487e00fa3b251f271ce85df2c58a0ec088fe46a 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal
@@ -17,6 +17,56 @@
 
 using namespace metal;
 
+half4 getBiasHalf(uint3 gid, constant ElementwiseAddParam &addParam, texture2d_array<half, access::sample> biasTexture) {
+    half4 output;
+    if (addParam.fast) {
+        output = biasTexture.read(gid.xy, gid.z);
+    } else {
+        int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
+        int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
+        int32_t xtrans[4] = {addParam.xtrans[0], addParam.xtrans[1], addParam.xtrans[2], addParam.xtrans[3]};
+        int32_t ytrans[4] = {addParam.ytrans[0], addParam.ytrans[1], addParam.ytrans[2], addParam.ytrans[3]};
+        int32_t yshift = 4 - addParam.ylen - addParam.axis;
+        for (int n = 0; n < 4; n++) {
+            x_xyzn[3] = n;
+            xyzn2abcd(addParam.xdim[3], x_xyzn, x_abcd);
+            invtrans(xtrans, x_abcd, t_abcd);
+            for (int k = addParam.axis; k < (addParam.axis + addParam.ylen); k++) {
+                y_abcd[yshift+k] = t_abcd[k];
+            }
+            trans(ytrans, y_abcd, t_abcd);
+            abcd2xyzn(addParam.ydim[3], t_abcd, y_xyzn);
+            output[n] = biasTexture.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
+        }
+    }
+    return output;
+}
+
+float4 getBias(uint3 gid, constant ElementwiseAddParam &addParam, texture2d_array<float, access::sample> biasTexture) {
+    float4 output;
+    if (addParam.fast) {
+        output = float4(biasTexture.read(gid.xy, gid.z));
+    } else {
+        int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
+        int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4];
+        int32_t xtrans[4] = {addParam.xtrans[0], addParam.xtrans[1], addParam.xtrans[2], addParam.xtrans[3]};
+        int32_t ytrans[4] = {addParam.ytrans[0], addParam.ytrans[1], addParam.ytrans[2], addParam.ytrans[3]};
+        int32_t yshift = 4 - addParam.ylen - addParam.axis;
+        for (int n = 0; n < 4; n++) {
+            x_xyzn[3] = n;
+            xyzn2abcd(addParam.xdim[3], x_xyzn, x_abcd);
+            invtrans(xtrans, x_abcd, t_abcd);
+            for (int k = addParam.axis; k < (addParam.axis + addParam.ylen); k++) {
+                y_abcd[yshift+k] = t_abcd[k];
+            }
+            trans(ytrans, y_abcd, t_abcd);
+            abcd2xyzn(addParam.ydim[3], t_abcd, y_xyzn);
+            output[n] = biasTexture.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
+        }
+    }
+    return output;
+}
+
 #pragma mark - convAdd
 kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [[texture(0)]],
                               texture2d_array<float, access::sample> biasTexture [[texture(1)]],
@@ -39,7 +89,11 @@ kernel void conv_add_relu_1x1(texture2d_array<float, access::sample> inTexture [
     uint input_arr_size = inTexture.get_array_size();
     uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
     
-    float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
+    float4 output = float4(0.0, 0.0, 0.0, 0.0);
+    if (param.hasAddOp) {
+        constant ElementwiseAddParam &addParam = param.addParam;
+        output = getBias(gid, addParam, biasTexture);
+    }
     
     float4 input;
     for (uint i = 0; i < input_arr_size; ++i) {
@@ -83,7 +137,11 @@ kernel void conv_add_relu_3x3(texture2d_array<float, access::sample> inTexture [
 
     uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
     
-    float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
+    float4 output = float4(0.0, 0.0, 0.0, 0.0);
+    if (param.hasAddOp) {
+        constant ElementwiseAddParam &addParam = param.addParam;
+        output = getBias(gid, addParam, biasTexture);
+    }
     
     ushort dilation_x = param.dilationX;
     ushort dilation_y = param.dilationY;
@@ -146,7 +204,11 @@ kernel void group_conv_add_relu_3x3(texture2d_array<float, access::sample> inTex
 
     const uint kernelHXW = 9;
     
-    float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
+    float4 output = float4(0.0, 0.0, 0.0, 0.0);
+    if (param.hasAddOp) {
+        constant ElementwiseAddParam &addParam = param.addParam;
+        output = getBias(gid, addParam, biasTexture);
+    }
     
     ushort dilation_x = param.dilationX;
     ushort dilation_y = param.dilationY;
@@ -205,7 +267,11 @@ kernel void conv_add_relu_5x1(texture2d_array<float, access::sample> inTexture [
 
     uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
     
-    float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
+    float4 output = float4(0.0, 0.0, 0.0, 0.0);
+    if (param.hasAddOp) {
+        constant ElementwiseAddParam &addParam = param.addParam;
+        output = getBias(gid, addParam, biasTexture);
+    }
     
     ushort dilation_y = param.dilationY;
     float4 input[5];
@@ -262,7 +328,11 @@ kernel void conv_add_relu_1x5(texture2d_array<float, access::sample> inTexture [
 
     uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
     
-    float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
+    float4 output = float4(0.0, 0.0, 0.0, 0.0);
+    if (param.hasAddOp) {
+        constant ElementwiseAddParam &addParam = param.addParam;
+        output = getBias(gid, addParam, biasTexture);
+    }
     
     ushort dilation_x = param.dilationX;
     float4 input[5];
@@ -313,7 +383,13 @@ kernel void depthwise_conv_add_relu_3x3(texture2d_array<float, access::sample> i
     constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
     const uint kernelHXW = 9;
     uint weithTo = gid.z * kernelHXW * 4;
-    float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0);
+    
+    float4 output = float4(0.0, 0.0, 0.0, 0.0);
+    if (param.hasAddOp) {
+        constant ElementwiseAddParam &addParam = param.addParam;
+        output = getBias(gid, addParam, biasTexture);
+    }
+    
     float4 inputs[9];
     inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice);
     inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice);
@@ -358,7 +434,11 @@ kernel void conv_add_relu_1x1_half(texture2d_array<half, access::sample> inTextu
     uint input_arr_size = inTexture.get_array_size();
     uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
    
-    float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
+    float4 output = float4(0.0, 0.0, 0.0, 0.0);
+    if (param.hasAddOp) {
+        constant ElementwiseAddParam &addParam = param.addParam;
+        output = float4(getBiasHalf(gid, addParam, biasTexture));
+    }
     
     float4 input;
     for (uint i = 0; i < input_arr_size; ++i) {
@@ -399,11 +479,15 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
     uint input_arr_size = inTexture.get_array_size();
     uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
     
-    float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
-    
+    float4 output = float4(0.0, 0.0, 0.0, 0.0);
+    if (param.hasAddOp) {
+        constant ElementwiseAddParam &addParam = param.addParam;
+        output = float4(getBiasHalf(gid, addParam, biasTexture));
+    }
+    
     ushort dilation_x = param.dilationX;
     ushort dilation_y = param.dilationY;
-    
+    
     half4 input[9];
     for (uint i = 0; i < input_arr_size; ++i) {
         input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i);
@@ -418,13 +502,13 @@ kernel void conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
         for (int j = 0; j < 9; ++j) {
             half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i];
             output.x += dot(float4(input[j]), float4(weight_x));
-            
+            
             half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i];
             output.y += dot(float4(input[j]), float4(weight_y));
-            
+            
             half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i];
             output.z += dot(float4(input[j]), float4(weight_z));
-            
+            
             half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i];
             output.w += dot(float4(input[j]), float4(weight_w));
         }
@@ -452,7 +536,11 @@ kernel void group_conv_add_relu_3x3_half(texture2d_array<half, access::sample> i
 
     const uint kernelHXW = 9;
     
-    float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
+    float4 output = float4(0.0, 0.0, 0.0, 0.0);
+    if (param.hasAddOp) {
+        constant ElementwiseAddParam &addParam = param.addParam;
+        output = float4(getBiasHalf(gid, addParam, biasTexture));
+    }
     
     ushort dilation_x = param.dilationX;
     ushort dilation_y = param.dilationY;
@@ -505,7 +593,13 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array<half, access::sample> inTextu
     constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
     const uint kernelHXW = 9;
     uint weithTo = gid.z * kernelHXW * 4;
-    float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
+    
+    float4 output = float4(0.0, 0.0, 0.0, 0.0);
+    if (param.hasAddOp) {
+        constant ElementwiseAddParam &addParam = param.addParam;
+        output = float4(getBiasHalf(gid, addParam, biasTexture));
+    }
+    
     half4 inputs[9];
@@ ... @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, access::sample> inTextu
         if (zz >= param.oC) {
-            return;
+            break;
         }
         half I[16];
         for (int i = 0; i < 16; ++i) {
@@ -644,13 +738,14 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array<half, access::sample> inTextu
 
     uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
     
-    float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
-    
+    float4 output = float4(0.0, 0.0, 0.0, 0.0);
+    if (param.hasAddOp) {
+        constant ElementwiseAddParam &addParam = param.addParam;
+        output = float4(getBiasHalf(gid, addParam, biasTexture));
+    }
+    
     ushort dilation_y = param.dilationY;
     
     half4 input[5];
@@ -747,8 +846,12 @@ kernel void conv_add_relu_1x5_half(texture2d_array<half, access::sample> inTextu
 
     uint weithTo = gid.z * kernelHXW * input_arr_size * 4;
     
-    float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0);
-    
+    float4 output = float4(0.0, 0.0, 0.0, 0.0);
+    if (param.hasAddOp) {
+        constant ElementwiseAddParam &addParam = param.addParam;
+        output = float4(getBiasHalf(gid, addParam, biasTexture));
+    }
+    
     ushort dilation_x = param.dilationX;
     
     half4 input[5];
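
Note: getBias/getBiasHalf are duplicated per precision because the bias texture is read in its storage format. The `fast` flag they branch on is, on the host side, just a layout-equality test; a minimal Swift sketch of that predicate (illustrative types, not the library's own) looks like this:

    struct TexLayoutSketch: Equatable {
        var dim: [Int]        // padded 4-D shape, e.g. [1, 112, 112, 32]
        var transpose: [Int]  // dimension order of the backing texture, e.g. [0, 2, 3, 1]
    }

    // Mirrors the condition under which the Swift side sets metalParam.fast = 1:
    // the bias texel can then be read directly at the output coordinate
    // (biasTexture.read(gid.xy, gid.z)); otherwise the shader falls back to the
    // per-component xyzn -> abcd -> xyzn remap through xdim/xtrans/ydim/ytrans.
    func canUseFastBiasPath(x: TexLayoutSketch, y: TexLayoutSketch) -> Bool {
        return x.dim == y.dim && x.transpose == y.transpose
    }
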
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal
index 40cad28df130e2d826500cc840aaabf09d04e79b..45559cb0e809050e869e77ebd472f2eaff0c5871 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal
@@ -17,16 +17,6 @@
 
 using namespace metal;
 
-struct ElementwiseAddParam {
-    int32_t fast;
-    int32_t axis;
-    int32_t ylen;
-    int32_t xdim[4];
-    int32_t xtrans[4];
-    int32_t ydim[4];
-    int32_t ytrans[4];
-};
-
 kernel void elementwise_add(texture2d_array<float, access::read> inputX [[texture(0)]],
                             texture2d_array<float, access::read> inputY [[texture(1)]],
                             texture2d_array<float, access::write> outTexture [[texture(2)]],
diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal
index cca11e80861723668eea05169c060cb7fcc455c2..c68867491872686a02dd30e9993d67ba5ded4cda 100644
--- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal
+++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal
@@ -16,16 +16,6 @@
 #include "Common.metal"
 using namespace metal;
 
-struct ElementwiseAddParam {
-    int32_t fast;
-    int32_t axis;
-    int32_t ylen;
-    int32_t xdim[4];
-    int32_t xtrans[4];
-    int32_t ydim[4];
-    int32_t ytrans[4];
-};
-
 #define P float
 #define PRELU_CHANNEL prelu_channel
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift
index c09669137c9ea34eb1bf829493de70243a75ee0b..a966086c8e74923439e88f109f29a75c8a2131fb 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift
@@ -287,7 +287,13 @@ extension MTLDevice {
        var rcount: Int = (ndim[0] * ndim[3] + 3) / 4
        rcount = rcount * 4 * ndim[1] * ndim[2]
        var nvalue: [Float32] = .init(repeating: 0.0, count: rcount)
-        
+        var value32: [Float32]?
+        if value is [Float16] {
+            var value16 = value as! [Float16]
+            value32 = float16To32(input: &value16, count: value.count)
+        } else {
+            value32 = value as? [Float32]
+        }
         for i0 in 0..<ndim[0] {
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift b/metal/paddle-mobile/paddle-mobile/Src/Framework/Loader.swift
@@ ... @@ class Loader<P: PrecisionProtocol>: Loaderable {
             } while (false)
         } else {
             fseek(file, MemoryLayout<CChar>.size * tensorDescSize, SEEK_CUR)
+            nowIndex += MemoryLayout<CChar>.size * tensorDescSize
         }
-        nowIndex += MemoryLayout<CChar>.size * tensorDescSize
         
         /*
          The Data Type stored in the model is not consulted here; the precision
          is dictated by the generic parameter supplied from outside.
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift
index 72b8f6e4ed0642f99fd4a0732a37d3d2a447da93..72be568a672b8294adee736932a76198ad10d058 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift
@@ -24,6 +24,11 @@ class ConvAddReluParam<P: PrecisionProtocol>: OpParam {
         paddings = try ConvAddReluParam.getAttr(key: "paddings", attrs: opDesc.attrs)
         dilations = try ConvAddReluParam.getAttr(key: "dilations", attrs: opDesc.attrs)
         groups = try ConvAddReluParam.getAttr(key: "groups", attrs: opDesc.attrs)
+        do {
+            axis = try ConvAddReluParam.getAttr(key: "axis", attrs: opDesc.attrs)
+        } catch {
+            axis = -1
+        }
         do {
             y = try ConvAddReluParam.inputY(inputs: opDesc.paraInputs, from: inScope)
         } catch {
@@ -32,7 +37,7 @@ class ConvAddReluParam<P: PrecisionProtocol>: OpParam {
                 let device = input.metalTexture!.device
                 y = Texture.init(device: device, inDim: yTensor.dim)
                 let value: [P] = Array(UnsafeBufferPointer(start: yTensor.data.pointer, count: yTensor.dim.numel()))
-                y?.metalTexture = device.tensor2texture(value: value, dim: yTensor.dim.dims, transpose: [0, 2, 3, 1], inComputePrecision: GlobalConfig.shared.computePrecision)
+                y?.metalTexture = device.tensor2texture(value: value, dim: yTensor.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision)
                 self.yTensor = yTensor
             } catch {
             }
@@ -49,6 +54,7 @@ class ConvAddReluParam<P: PrecisionProtocol>: OpParam {
     let paddings: [Int32]
     let dilations: [Int32]
     let groups: Int
+    let axis: Int
     var y: Texture?
     var yTensor: Tensor<P>?
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift
index 0d9510d2b0353890c517c6ece71b60635a10eaf0..5022d312050cad119982cc409a0a0a0f1b3b0d0c 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift
@@ -64,7 +64,7 @@ class FeedOp<P: PrecisionProtocol>: Operator<Texture2DTo2DArrayKernel<P>, FeedPa
     
     func delogOutput() {
         print(" \(type) output: ")
         print(para.output.metalTexture)
-        print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[3], h: para.output.padToFourDim[2], w: para.output.padToFourDim[1])).strideArray())
+        print(para.output.toTensor().strideArray())
     }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
index 6f7b093b49a5e04ca9d2d4c3ad9219d0f3fabed9..7313837c1293fe864e60a5b4cb699025c706f567 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift
@@ -135,7 +135,7 @@ class ConvAddAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
         let iC = param.input.tensorDim[1];
         let fC = param.filter.tensorDim[1];
         let oC = param.output.tensorDim[1];
-        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0))
+        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam())
         
         //        print("metal param: ")
         //        print(inMetalParam)
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
index 82ccfa99fd2da87233feddbb7b7e50f745ef387b..a10e1939dff66192658c1709787aaa0c65875169 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift
@@ -98,7 +98,7 @@ class ConvAddBatchNormReluKernel<P: PrecisionProtocol>: Kernel, Computable, Test
         let iC = param.input.tensorDim[1];
         let fC = param.filter.tensorDim[1];
         let oC = param.output.tensorDim[1];
-        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0))
+        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam())
         
         var invs: [P] = []
         let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
index bd270273a43438743381224a2a0a8b4c47640f4c..eefffd817a16f3edd21f7d2d70ed9a1face2c713 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift
@@ -16,11 +16,11 @@ import Foundation
 import MetalPerformanceShaders
 
 class ConvAddKernel<P: PrecisionProtocol>: ConvAddReluKernel<P> {
-    override func hasAddOp() -> Bool {
+    override class func hasAddOp() -> Bool {
         return true
     }
     
-    override func hasReluOp() -> Bool {
+    override class func hasReluOp() -> Bool {
         return false
     }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
index 6821992995900ad683619c17c873eb628bab1d2e..6c6f9261627e778212b1007cb00a679266894fbd 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift
@@ -135,7 +135,7 @@ class ConvAddPreluKernel<P: PrecisionProtocol>: Kernel, Computable {
         let iC = param.input.tensorDim[1];
         let fC = param.filter.tensorDim[1];
         let oC = param.output.tensorDim[1];
-        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0))
+        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam())
         
         //        print("metal param: ")
         //        print(inMetalParam)
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
index bfc481a8775238dff05cd8c619780ea83a43d61f..4b742f94d5c33b8e8d7a003691c5e6e9776c4394 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift
@@ -29,6 +29,7 @@ public struct MetalConvParam {
     let oC: UInt16
     let hasAddOp: UInt16
     let hasReluOp: UInt16
+    let addParam: ElementwiseAddMetalParam
 }
 
 @available(iOS 11.0, *)
@@ -124,7 +125,7 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
         if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggressiveOptimization) {
             let inputChannel = param.input.tensorDim[1]
             let outputChannel = param.output.tensorDim[1]
-            if (inputChannel == 1 || inputChannel > 4) && (outputChannel == 1 || outputChannel > 4) {
+            if inputChannel > 4 && outputChannel > 4 {
                 shouldUseMPS = true
             }
         }
@@ -135,6 +136,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
         if !isDepthWise && param.groups > 1 {
             shouldUseMPS = false
         }
+        if type(of: self).hasAddOp() {
+            if !(type(of: self).canAddUseMPS(param: param)) {
+                shouldUseMPS = false
+            }
+        }
         if shouldUseMPS {
             super.init(device: device, inFunctionName: nil, initContext: initContext)
             setupWithMPS(device: device, param: param)
@@ -195,11 +201,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
         param.input.useMPS = true
         param.output.useMPS = true
         if #available(iOS 11.3, *) {
-            if param.y != nil {
+            if type(of: self).hasAddOp() && type(of: self).canMPSAddByElement(param: param) && !type(of: self).canMPSAddByChannel(param: param) {
                 mpsAddOp = MPSCNNAdd(device: device)
-                if hasReluOp() {
-                    mpsReluOp = MPSCNNNeuronReLU(device: device, a: 0.0)
-                }
+            }
+            if type(of: self).hasReluOp() {
+                mpsReluOp = MPSCNNNeuronReLU(device: device, a: 0.0)
             }
         }
         let neuronFilter: MPSCNNNeuron? = param.y != nil ? nil : (neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron)
@@ -217,7 +223,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
         desc.strideInPixelsX = Int(param.stride[0])
         desc.strideInPixelsY = Int(param.stride[1])
         let _ = param.filter.convert(converter: MPSPointerConverter<P>.init())
-        let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: param.yTensor)
+        var biasTerms: Tensor<P>? = nil
+        if type(of: self).hasAddOp() && type(of: self).canMPSAddByChannel(param: param) {
+            biasTerms = param.yTensor
+        }
+        let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: biasTerms)
 
         let conv = MPSCNNConvolution.init(device: device, weights: dataSource)
         conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0)
@@ -233,7 +243,11 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
         let iC = param.input.tensorDim[1];
         let fC = param.filter.tensorDim[1];
         let oC = param.output.tensorDim[1];
-        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 1 : 0))
+        var addParam = ElementwiseAddMetalParam()
+        if let inputY = param.y {
+            addParam = ElementwiseAddKernel<P>.metalParamFrom(inputX: param.output, inputY: inputY, axis: param.axis)
+        }
+        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(type(of: self).hasAddOp() ? 1 : 0), hasReluOp: UInt16(type(of: self).hasReluOp() ? 1 : 0), addParam: addParam)
         metalParam = inMetalParam
         
         if type(of: self).isWinoGrad(functionName: functionName) {
@@ -304,7 +318,7 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
     }
     
     open func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? {
-        if hasReluOp() {
+        if type(of: self).hasReluOp() {
             if #available(iOS 10.0, *) {
                 return MPSCNNNeuronReLU(device: device, a: 0)
             }
@@ -312,11 +326,29 @@ class ConvAddReluKernel<P: PrecisionProtocol>: Kernel, Computable {
         return nil
     }
     
-    open func hasAddOp() -> Bool {
+    open class func canAddUseMPS(param: ConvAddReluParam<P>) -> Bool {
+        return canMPSAddByChannel(param: param) || canMPSAddByElement(param: param)
+    }
+    
+    private class func canMPSAddByChannel(param: ConvAddReluParam<P>) -> Bool {
+        if let yTensor = param.yTensor, yTensor.dim.cout() == 1 {
+            return true
+        }
+        return false
+    }
+    
+    private class func canMPSAddByElement(param: ConvAddReluParam<P>) -> Bool {
+        if let y = param.y, y.dim.dims == param.input.dim.dims {
+            return true
+        }
+        return false
+    }
+    
+    open class func hasAddOp() -> Bool {
         return true
     }
     
-    open func hasReluOp() -> Bool {
+    open class func hasReluOp() -> Bool {
         return true
     }
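
Note: the canAddUseMPS split above distinguishes the two shapes of fused add that MPS can absorb. A hedged sketch of the resulting dispatch, with toy types standing in for the real parameters (the actual checks are the class funcs above):

    enum AddStrategy {
        case mpsChannelBias   // yTensor is rank-1: fold it into MPSCNNConvolution's bias terms
        case mpsElementAdd    // y has the input's dims: run a separate MPSCNNAdd pass
        case metalFallback    // any other broadcast: shouldUseMPS stays false and the
                              // .metal kernels resolve the add through getBias()
    }

    func addStrategy(yRank: Int, yDimsEqualInput: Bool) -> AddStrategy {
        if yRank == 1 { return .mpsChannelBias }          // canMPSAddByChannel wins first
        if yDimsEqualInput { return .mpsElementAdd }      // element-wise but not per-channel
        return .metalFallback
    }
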
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
index 6d20bf6f9ff48e13aa4804b514fcaf328321f209..d70caca5f4301d17d44b01fd9bcad44142eb9b50 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift
@@ -105,7 +105,7 @@ class ConvBNReluKernel<P: PrecisionProtocol>: Kernel, Computable, Testable {
         let iC = param.input.tensorDim[1];
         let fC = param.filter.tensorDim[1];
         let oC = param.output.tensorDim[1];
-        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0))
+        metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam())
         
         var invs: [P] = []
         let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self)
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
index 11f8a2683c446fe59e01364c79a5dbc673cf8406..19dc193ac458336cc80fc2eca0f05c5b873efe5d 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift
@@ -66,7 +66,7 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
             throw PaddleMobileError.predictError(message: " encode is nil")
         }
         encoder.setTexture(param.input.metalTexture, index: 0)
-        encoder.setTexture(param.output.metalTexture, index: 1)
+        encoder.setTexture(param.output.metalTexture, index: 2)
         encoder.setBytes(&metalParam, length: MemoryLayout<MetalConvParam>.size, index: 0)
         encoder.setBuffer(param.filter.buffer, offset: 0, index: 1)
         encoder.setBuffer(blankTensor?.buffer, offset: 0, index: 2)
@@ -111,7 +111,7 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
         let iC = param.input.tensorDim[1];
         let fC = param.filter.tensorDim[1];
         let oC = param.output.tensorDim[1];
-        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 1 : 0))
+        let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 1 : 0), addParam: ElementwiseAddMetalParam())
         metalParam = inMetalParam
         
         if type(of: self).isWinoGrad(functionName: functionName) {
@@ -130,7 +130,7 @@ class ConvKernel<P: PrecisionProtocol>: Kernel, Computable {
         } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] {
             if useAggressiveOptimization {
                 let couldUseWinograd = param.filter.width == 3 && param.filter.height == 3
-                    && param.filter.n == 16 && param.stride[0] == 1 && param.stride[1] == 1
+                    && param.filter.n <= 16 && param.stride[0] == 1 && param.stride[1] == 1
                     && param.dilations[0] == 1 && param.dilations[1] == 1
                 if couldUseWinograd {
                     return "depthwise_conv_add_relu_3x3_half_winograd"
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvReluKernel.swift
index 0b73deb1b081ec357865fbb4fdd8929e36225e31..9937ca158b4808045543a78097ac60b65fe83a08 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvReluKernel.swift
@@ -16,11 +16,11 @@ import Foundation
 import MetalPerformanceShaders
 
 class ConvReluKernel<P: PrecisionProtocol>: ConvAddReluKernel<P> {
-    override func hasAddOp() -> Bool {
+    override class func hasAddOp() -> Bool {
         return false
     }
     
-    override func hasReluOp() -> Bool {
+    override class func hasReluOp() -> Bool {
         return true
     }
 }
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift
index acddad5bb196317b5877febb2c2a50243074b184..c1ee435e3a1809a21449a03ecc37d9e3395a3286 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift
@@ -34,27 +34,8 @@ class ElementwiseAddKernel<P: PrecisionProtocol>: Kernel, Computable {
             throw error
         }
         
-        metalParam = ElementwiseAddMetalParam.init()
+        metalParam = ElementwiseAddKernel.metalParamFrom(inputX: param.inputX, inputY: param.inputY, axis: param.axis)
         
-        let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) }
-        let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) }
-        let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) }
-        let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) }
-        
-        metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
-        metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
-        metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
-        metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
-        if param.axis == -1 {
-            metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout())
-        } else {
-            metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis)
-        }
-        metalParam.ylen = Int32(param.inputY.tensorDim.cout())
-        if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) {
-            // print("===> elementwise_add fast!!!")
-            metalParam.fast = 1
-        }
         if GlobalConfig.shared.computePrecision == .Float32 {
             super.init(device: device, inFunctionName: "elementwise_add", initContext: initContext)
         } else if GlobalConfig.shared.computePrecision == .Float16 {
@@ -75,4 +56,29 @@ class ElementwiseAddKernel<P: PrecisionProtocol>: Kernel, Computable {
         encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
         encoder.endEncoding()
     }
+    
+    static func metalParamFrom(inputX: Texture, inputY: Texture, axis: Int) -> ElementwiseAddMetalParam {
+        var metalParam = ElementwiseAddMetalParam.init()
+        
+        let xdim: [Int32] = (0..<4).map { Int32(inputX.dim[$0]) }
+        let ydim: [Int32] = (0..<4).map { Int32(inputY.dim[$0]) }
+        let xtrans: [Int32] = (0..<4).map { Int32(inputX.transpose[$0]) }
+        let ytrans: [Int32] = (0..<4).map { Int32(inputY.transpose[$0]) }
+        
+        metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3])
+        metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3])
+        metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3])
+        metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3])
+        if axis == -1 {
+            metalParam.axis = 4 - Int32(inputY.tensorDim.cout())
+        } else {
+            metalParam.axis = 4 - Int32(inputX.tensorDim.cout()) + Int32(axis)
+        }
+        metalParam.ylen = Int32(inputY.tensorDim.cout())
+        if (inputX.dim == inputY.dim) && (inputX.transpose == inputY.transpose) {
+            // print("===> elementwise_add fast!!!")
+            metalParam.fast = 1
+        }
+        return metalParam
+    }
 }
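
Worked example of the axis normalization performed by metalParamFrom (values illustrative; tensorDim.cout() is the tensor's rank):

    // Mirrors the normalization in metalParamFrom, host-side only.
    func normalizedAxis(axisAttr: Int, xRank: Int, yRank: Int) -> Int {
        return axisAttr == -1 ? 4 - yRank : 4 - xRank + axisAttr
    }

    // Example: x is a rank-4 NCHW tensor [1, 32, 56, 56], y is a rank-1 bias [32],
    // and the op carries axis = 1, so y aligns with x's C dimension:
    assert(normalizedAxis(axisAttr: 1, xRank: 4, yRank: 1) == 1)   // ylen == 1
    // With the default axis == -1 the alignment is right-most instead:
    assert(normalizedAxis(axisAttr: -1, xRank: 4, yRank: 1) == 3)

In the shader, yshift = 4 - ylen - axis then makes y_abcd[yshift + k] = t_abcd[k] copy exactly the aligned indices (here the channel index) into y's trailing dimensions.
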
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift
index 0e2f4983cb2a0d3cc86ed00c9d3a5c9f083f0a00..1f370db649840362e86755c9cb7e37ea2f34b15b 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift
@@ -26,6 +26,11 @@ class ReluKernel<P: PrecisionProtocol>: Kernel, Computable{
     }
     
     required init(device: MTLDevice, param: ReluParam<P>, initContext: InitContext) throws {
+        do {
+            try param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision)
+        } catch let error {
+            throw error
+        }
         if GlobalConfig.shared.computePrecision == .Float32 {
             super.init(device: device, inFunctionName: "relu", initContext: initContext)
         } else if GlobalConfig.shared.computePrecision == .Float16 {
diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift
index fc63c08ebc8b91c83022356faa2b388c187b6d7b..483bedcb0825049035d8d1a3c76cbfa720bbed7e 100644
--- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift
@@ -34,10 +34,10 @@ class ScaleOpKernel<P: PrecisionProtocol>: Kernel, Computable{
         }
         
         var shouldUseMPS = false
-        if initContext.useMPS && param.biasAfterScale {
+        if initContext.useMPS && param.biasAfterScale && param.input.tensorDim.cout() == 4 && param.output.tensorDim.cout() == 4 {
             let inputChannel = param.input.tensorDim[1]
             let outputChannel = param.output.tensorDim[1]
-            if (inputChannel == 1 || inputChannel > 4) && (outputChannel == 1 || outputChannel > 4) {
+            if (inputChannel > 4) && (outputChannel > 4) {
                 shouldUseMPS = true
             }
         }
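
Closing note: hasAddOp/hasReluOp were promoted to class funcs, presumably so the add/relu capability can be consulted as a type-level fact (including from the new class-level canAddUseMPS checks) rather than requiring a fully initialized instance. Since MetalConvParam, including the embedded addParam, is copied raw to the GPU via setBytes, the Swift-side ElementwiseAddMetalParam must match Common.metal field-for-field; a sketch of the expected layout (the real struct is defined elsewhere in the project and is an assumption here):

    // Hypothetical mirror of ElementwiseAddParam in Common.metal: Int32 fields and
    // fixed four-element tuples keep the memory layout identical to the Metal struct.
    struct ElementwiseAddParamSketch {
        var fast: Int32 = 0
        var axis: Int32 = 0
        var ylen: Int32 = 0
        var xdim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
        var xtrans: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
        var ydim: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
        var ytrans: (Int32, Int32, Int32, Int32) = (0, 0, 0, 0)
    }
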