From 44f50ecb91fc61d35ee1e83b0fe7f18fe6d55f86 Mon Sep 17 00:00:00 2001 From: NazgulLee Date: Fri, 14 Jun 2019 16:19:04 +0800 Subject: [PATCH] 1. fix add bias logic; 2. fix several typo (#1687) --- .../paddle-mobile-metallib/Common.metal | 10 ++ .../paddle-mobile-metallib/ConcatKernel.metal | 13 ++ .../ConvAddReluMetal.metal | 153 +++++++++++++++--- .../paddle-mobile-metallib/Elementwise.metal | 10 -- .../ElementwiseAddPreluKernel.metal | 10 -- .../Src/Common/MetalExtension.swift | 15 +- .../Src/Common/PaddleMobileUnitTest.swift | 2 +- .../paddle-mobile/Src/Framework/Loader.swift | 2 +- .../Src/Operators/ConvAddReluOp.swift | 8 +- .../paddle-mobile/Src/Operators/FeedOp.swift | 2 +- .../Kernels/ConvAddAddPreluKernel.swift | 2 +- .../Kernels/ConvAddBatchNormReluKernel.swift | 2 +- .../Src/Operators/Kernels/ConvAddKernel.swift | 4 +- .../Kernels/ConvAddPreluKernel.swift | 2 +- .../Operators/Kernels/ConvAddReluKernel.swift | 52 ++++-- .../Operators/Kernels/ConvBNReluKernel.swift | 2 +- .../Src/Operators/Kernels/ConvKernel.swift | 6 +- .../Operators/Kernels/ConvReluKernel.swift | 4 +- .../Kernels/ElementwiseAddKernel.swift | 46 +++--- .../Src/Operators/Kernels/ReluKernel.swift | 5 + .../Src/Operators/Kernels/ScaleOpKernel.swift | 4 +- 21 files changed, 259 insertions(+), 95 deletions(-) diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal index 418dcb4396..185370c519 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Common.metal @@ -107,6 +107,15 @@ inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) { } } +struct ElementwiseAddParam { + int32_t fast; + int32_t axis; + int32_t ylen; + int32_t xdim[4]; + int32_t xtrans[4]; + int32_t ydim[4]; + int32_t ytrans[4]; +}; struct MetalConvParam { short offsetX; @@ -122,4 +131,5 @@ struct MetalConvParam { ushort oC; ushort hasAddOp; ushort hasReluOp; + ElementwiseAddParam addParam; }; diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal index 671b912bb2..55362f44de 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConcatKernel.metal @@ -204,3 +204,16 @@ struct ConcatParam { #undef N #undef R #undef V + +#define V VY +#define R 4 +#define N 3 +#define P float +#include "ConcatKernel.inc.metal" +#undef P +#define P half +#include "ConcatKernel.inc.metal" +#undef P +#undef N +#undef R +#undef V diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal index de85897c10..d487e00fa3 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ConvAddReluMetal.metal @@ -17,6 +17,56 @@ using namespace metal; +half4 getBiasHalf(uint3 gid, constant ElementwiseAddParam &addParam, texture2d_array biasTexture) { + half4 output; + if (addParam.fast) { + output = biasTexture.read(gid.xy, gid.z); + } else { + int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; + int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; + int32_t xtrans[4] = {addParam.xtrans[0], addParam.xtrans[1], addParam.xtrans[2], addParam.xtrans[3]}; + int32_t ytrans[4] = {addParam.ytrans[0], 
addParam.ytrans[1], addParam.ytrans[2], addParam.ytrans[3]}; + int32_t yshift = 4 - addParam.ylen - addParam.axis; + for (int n = 0; n < 4; n++) { + x_xyzn[3] = n; + xyzn2abcd(addParam.xdim[3], x_xyzn, x_abcd); + invtrans(xtrans, x_abcd, t_abcd); + for (int k = addParam.axis; k < (addParam.axis + addParam.ylen); k++) { + y_abcd[yshift+k] = t_abcd[k]; + } + trans(ytrans, y_abcd, t_abcd); + abcd2xyzn(addParam.ydim[3], t_abcd, y_xyzn); + output[n] = biasTexture.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; + } + } + return output; +} + +float4 getBias(uint3 gid, constant ElementwiseAddParam &addParam, texture2d_array biasTexture) { + float4 output; + if (addParam.fast) { + output = float4(biasTexture.read(gid.xy, gid.z)); + } else { + int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4]; + int32_t y_abcd[4] = {0, 0, 0, 0}, y_xyzn[4]; + int32_t xtrans[4] = {addParam.xtrans[0], addParam.xtrans[1], addParam.xtrans[2], addParam.xtrans[3]}; + int32_t ytrans[4] = {addParam.ytrans[0], addParam.ytrans[1], addParam.ytrans[2], addParam.ytrans[3]}; + int32_t yshift = 4 - addParam.ylen - addParam.axis; + for (int n = 0; n < 4; n++) { + x_xyzn[3] = n; + xyzn2abcd(addParam.xdim[3], x_xyzn, x_abcd); + invtrans(xtrans, x_abcd, t_abcd); + for (int k = addParam.axis; k < (addParam.axis + addParam.ylen); k++) { + y_abcd[yshift+k] = t_abcd[k]; + } + trans(ytrans, y_abcd, t_abcd); + abcd2xyzn(addParam.ydim[3], t_abcd, y_xyzn); + output[n] = biasTexture.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]]; + } + } + return output; +} + #pragma mark - convAdd kernel void conv_add_relu_1x1(texture2d_array inTexture [[texture(0)]], texture2d_array biasTexture [[texture(1)]], @@ -39,7 +89,11 @@ kernel void conv_add_relu_1x1(texture2d_array inTexture [ uint input_arr_size = inTexture.get_array_size(); uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0); + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = getBias(gid, addParam, biasTexture); + } float4 input; for (uint i = 0; i < input_arr_size; ++i) { @@ -83,7 +137,11 @@ kernel void conv_add_relu_3x3(texture2d_array inTexture [ uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0); + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = getBias(gid, addParam, biasTexture); + } ushort dilation_x = param.dilationX; ushort dilation_y = param.dilationY; @@ -146,7 +204,11 @@ kernel void group_conv_add_relu_3x3(texture2d_array inTex const uint kernelHXW = 9; - float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0); + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = getBias(gid, addParam, biasTexture); + } ushort dilation_x = param.dilationX; ushort dilation_y = param.dilationY; @@ -205,7 +267,11 @@ kernel void conv_add_relu_5x1(texture2d_array inTexture [ uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - float4 output = param.hasAddOp == 1 ? 
biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0); + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = getBias(gid, addParam, biasTexture); + } ushort dilation_y = param.dilationY; float4 input[5]; @@ -262,7 +328,11 @@ kernel void conv_add_relu_1x5(texture2d_array inTexture [ uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0); + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = getBias(gid, addParam, biasTexture); + } ushort dilation_x = param.dilationX; float4 input[5]; @@ -313,7 +383,13 @@ kernel void depthwise_conv_add_relu_3x3(texture2d_array i constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); const uint kernelHXW = 9; uint weithTo = gid.z * kernelHXW * 4; - float4 output = param.hasAddOp == 1 ? biasTexture.sample(sample, float2(gid.xy), gid.z) : float4(0.0, 0.0, 0.0, 0.0); + + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = getBias(gid, addParam, biasTexture); + } + float4 inputs[9]; inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); @@ -358,7 +434,11 @@ kernel void conv_add_relu_1x1_half(texture2d_array inTextu uint input_arr_size = inTexture.get_array_size(); uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0); + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = float4(getBiasHalf(gid, addParam, biasTexture)); + } float4 input; for (uint i = 0; i < input_arr_size; ++i) { @@ -399,11 +479,15 @@ kernel void conv_add_relu_3x3_half(texture2d_array inTextu uint input_arr_size = inTexture.get_array_size(); uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - float4 output = param.hasAddOp == 1 ? 
float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0); - + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = float4(getBiasHalf(gid, addParam, biasTexture)); + } + ushort dilation_x = param.dilationX; ushort dilation_y = param.dilationY; - + half4 input[9]; for (uint i = 0; i < input_arr_size; ++i) { input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); @@ -418,13 +502,13 @@ kernel void conv_add_relu_3x3_half(texture2d_array inTextu for (int j = 0; j < 9; ++j) { half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; output.x += dot(float4(input[j]), float4(weight_x)); - + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; output.y += dot(float4(input[j]), float4(weight_y)); - + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; output.z += dot(float4(input[j]), float4(weight_z)); - + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; output.w += dot(float4(input[j]), float4(weight_w)); } @@ -452,7 +536,11 @@ kernel void group_conv_add_relu_3x3_half(texture2d_array i const uint kernelHXW = 9; - float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0); + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = float4(getBiasHalf(gid, addParam, biasTexture)); + } ushort dilation_x = param.dilationX; ushort dilation_y = param.dilationY; @@ -505,7 +593,13 @@ kernel void depthwise_conv_add_relu_3x3_half(texture2d_array= param.oC) { - return; + break; } half I[16]; for (int i = 0; i < 16; ++i) { @@ -644,13 +738,14 @@ kernel void depthwise_conv_add_relu_3x3_half_winograd(texture2d_array inTextu uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - float4 output = param.hasAddOp == 1 ? float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0); - + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = float4(getBiasHalf(gid, addParam, biasTexture)); + } + ushort dilation_y = param.dilationY; half4 input[5]; @@ -747,8 +846,12 @@ kernel void conv_add_relu_1x5_half(texture2d_array inTextu uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - float4 output = param.hasAddOp == 1 ? 
float4(biasTexture.sample(sample, float2(gid.xy), gid.z)) : float4(0.0, 0.0, 0.0, 0.0); - + float4 output = float4(0.0, 0.0, 0.0, 0.0); + if (param.hasAddOp) { + constant ElementwiseAddParam &addParam = param.addParam; + output = float4(getBiasHalf(gid, addParam, biasTexture)); + } + ushort dilation_x = param.dilationX; half4 input[5]; diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal index 40cad28df1..45559cb0e8 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/Elementwise.metal @@ -17,16 +17,6 @@ using namespace metal; -struct ElementwiseAddParam { - int32_t fast; - int32_t axis; - int32_t ylen; - int32_t xdim[4]; - int32_t xtrans[4]; - int32_t ydim[4]; - int32_t ytrans[4]; -}; - kernel void elementwise_add(texture2d_array inputX [[texture(0)]], texture2d_array inputY [[texture(1)]], texture2d_array outTexture [[texture(2)]], diff --git a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal index cca11e8086..c688674918 100644 --- a/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal +++ b/metal/paddle-mobile-metallib/paddle-mobile-metallib/ElementwiseAddPreluKernel.metal @@ -16,16 +16,6 @@ #include "Common.metal" using namespace metal; -struct ElementwiseAddParam { - int32_t fast; - int32_t axis; - int32_t ylen; - int32_t xdim[4]; - int32_t xtrans[4]; - int32_t ydim[4]; - int32_t ytrans[4]; -}; - #define P float #define PRELU_CHANNEL prelu_channel diff --git a/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift b/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift index c09669137c..a966086c8e 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Common/MetalExtension.swift @@ -287,7 +287,13 @@ extension MTLDevice { var rcount: Int = (ndim[0] * ndim[3] + 3) / 4 rcount = rcount * 4 * ndim[1] * ndim[2] var nvalue: [Float32] = .init(repeating: 0.0, count: rcount) - + var value32: [Float32]? + if value is [Float16] { + var value16 = value as! [Float16] + value32 = float16To32(input: &value16, count: value.count) + } else { + value32 = value as? 
[Float32] + } for i0 in 0..: Loaderable { } while (false) } else { fseek(file, MemoryLayout.size * tensorDescSize, SEEK_CUR) + nowIndex += MemoryLayout.size * tensorDescSize } - nowIndex += MemoryLayout.size * tensorDescSize /* 这里没有根据 Data Type 去判断, 而是从外部泛型直接指定了精度 diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift index 72b8f6e4ed..72be568a67 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/ConvAddReluOp.swift @@ -24,6 +24,11 @@ class ConvAddReluParam: OpParam { paddings = try ConvAddReluParam.getAttr(key: "paddings", attrs: opDesc.attrs) dilations = try ConvAddReluParam.getAttr(key: "dilations", attrs: opDesc.attrs) groups = try ConvAddReluParam.getAttr(key: "groups", attrs: opDesc.attrs) + do { + axis = try ConvAddReluParam.getAttr(key: "axis", attrs: opDesc.attrs) + } catch { + axis = -1 + } do { y = try ConvAddReluParam.inputY(inputs: opDesc.paraInputs, from: inScope) } catch { @@ -32,7 +37,7 @@ class ConvAddReluParam: OpParam { let device = input.metalTexture!.device y = Texture.init(device: device, inDim: yTensor.dim) let value: [P] = Array(UnsafeBufferPointer(start: yTensor.data.pointer, count: yTensor.dim.numel())) - y?.metalTexture = device.tensor2texture(value: value, dim: yTensor.dim.dims, transpose: [0, 2, 3, 1], inComputePrecision: GlobalConfig.shared.computePrecision) + y?.metalTexture = device.tensor2texture(value: value, dim: yTensor.dim.dims, transpose: [0, 1, 2, 3], inComputePrecision: GlobalConfig.shared.computePrecision) self.yTensor = yTensor } catch { } @@ -49,6 +54,7 @@ class ConvAddReluParam: OpParam { let paddings: [Int32] let dilations: [Int32] let groups: Int + let axis: Int var y: Texture? var yTensor: Tensor
<P>
? diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift index 0d9510d2b0..5022d31205 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/FeedOp.swift @@ -64,7 +64,7 @@ class FeedOp: Operator, FeedPa func delogOutput() { print(" \(type) output: ") print(para.output.metalTexture) - print(para.output.metalTexture.toTensor(dim: (n: para.output.padToFourDim[0], c: para.output.padToFourDim[3], h: para.output.padToFourDim[2], w: para.output.padToFourDim[1])).strideArray()) + print(para.output.toTensor().strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift index 6f7b093b49..7313837c12 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddAddPreluKernel.swift @@ -135,7 +135,7 @@ class ConvAddAddPreluKernel: Kernel, Computable { let iC = param.input.tensorDim[1]; let fC = param.filter.tensorDim[1]; let oC = param.output.tensorDim[1]; - let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0)) + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam()) // print("metal param: ") // print(inMetalParam) diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift index 82ccfa99fd..a10e1939df 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddBatchNormReluKernel.swift @@ -98,7 +98,7 @@ class ConvAddBatchNormReluKernel: Kernel, Computable, Test let iC = param.input.tensorDim[1]; let fC = param.filter.tensorDim[1]; let oC = param.output.tensorDim[1]; - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0)) + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam()) var invs: [P] = [] let varianceContents = 
param.variance.buffer.contents().assumingMemoryBound(to: P.self) diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift index bd270273a4..eefffd817a 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddKernel.swift @@ -16,11 +16,11 @@ import Foundation import MetalPerformanceShaders class ConvAddKernel: ConvAddReluKernel
<P>
{ - override func hasAddOp() -> Bool { + override class func hasAddOp() -> Bool { return true } - override func hasReluOp() -> Bool { + override class func hasReluOp() -> Bool { return false } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift index 6821992995..6c6f926162 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddPreluKernel.swift @@ -135,7 +135,7 @@ class ConvAddPreluKernel: Kernel, Computable { let iC = param.input.tensorDim[1]; let fC = param.filter.tensorDim[1]; let oC = param.output.tensorDim[1]; - let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0)) + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam()) // print("metal param: ") // print(inMetalParam) diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift index bfc481a877..4b742f94d5 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvAddReluKernel.swift @@ -29,6 +29,7 @@ public struct MetalConvParam { let oC: UInt16 let hasAddOp: UInt16 let hasReluOp: UInt16 + let addParam: ElementwiseAddMetalParam } @available(iOS 11.0, *) @@ -124,7 +125,7 @@ class ConvAddReluKernel: Kernel, Computable { if #available(iOS 11.0, *), (initContext.useMPS || initContext.useAggressiveOptimization) { let inputChannel = param.input.tensorDim[1] let outputChannel = param.output.tensorDim[1] - if (inputChannel == 1 || inputChannel > 4) && (outputChannel == 1 || outputChannel > 4) { + if inputChannel > 4 && outputChannel > 4 { shouldUseMPS = true } } @@ -135,6 +136,11 @@ class ConvAddReluKernel: Kernel, Computable { if !isDepthWise && param.groups > 1 { shouldUseMPS = false } + if type(of: self).hasAddOp() { + if !(type(of: self).canAddUseMPS(param: param)) { + shouldUseMPS = false + } + } if shouldUseMPS { super.init(device: device, inFunctionName: nil, initContext: initContext) setupWithMPS(device: device, param: param) @@ -195,11 +201,11 @@ class ConvAddReluKernel: Kernel, Computable { param.input.useMPS = true param.output.useMPS = true if #available(iOS 11.3, *) { - if param.y != nil { + if type(of: self).hasAddOp() && type(of: self).canMPSAddByElement(param: param) && !type(of: self).canMPSAddByChannel(param: param) { mpsAddOp = MPSCNNAdd(device: device) - if hasReluOp() { - mpsReluOp = MPSCNNNeuronReLU(device: device, a: 0.0) - } + } + if type(of: self).hasReluOp() { + mpsReluOp = MPSCNNNeuronReLU(device: device, a: 0.0) } } let neuronFilter: MPSCNNNeuron? = param.y != nil ? 
nil : (neuronFilterForMPSLayer(device: device) as? MPSCNNNeuron) @@ -217,7 +223,11 @@ class ConvAddReluKernel: Kernel, Computable { desc.strideInPixelsX = Int(param.stride[0]) desc.strideInPixelsY = Int(param.stride[1]) let _ = param.filter.convert(converter: MPSPointerConverter
<P>
.init()) - let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: param.yTensor) + var biasTerms: Tensor
<P>
? = nil + if type(of: self).hasAddOp() && type(of: self).canMPSAddByChannel(param: param) { + biasTerms = param.yTensor + } + let dataSource = ConvDataSource.init(inDesc: desc, inWeights: param.filter, inBiasTerms: biasTerms) let conv = MPSCNNConvolution.init(device: device, weights: dataSource) conv.offset = MPSOffset.init(x: offsetX, y: offsetY, z: 0) @@ -233,7 +243,11 @@ class ConvAddReluKernel: Kernel, Computable { let iC = param.input.tensorDim[1]; let fC = param.filter.tensorDim[1]; let oC = param.output.tensorDim[1]; - let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 1 : 0)) + var addParam = ElementwiseAddMetalParam() + if let inputY = param.y { + addParam = ElementwiseAddKernel
<P>
.metalParamFrom(inputX: param.output, inputY: inputY, axis: param.axis) + } + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(type(of: self).hasAddOp() ? 1 : 0), hasReluOp: UInt16(type(of: self).hasReluOp() ? 1 : 0), addParam: addParam) metalParam = inMetalParam if type(of: self).isWinoGrad(functionName: functionName) { @@ -304,7 +318,7 @@ class ConvAddReluKernel: Kernel, Computable { } open func neuronFilterForMPSLayer(device: MTLDevice) -> AnyObject? { - if hasReluOp() { + if type(of: self).hasReluOp() { if #available(iOS 10.0, *) { return MPSCNNNeuronReLU(device: device, a: 0) } @@ -312,11 +326,29 @@ class ConvAddReluKernel: Kernel, Computable { return nil } - open func hasAddOp() -> Bool { + open class func canAddUseMPS(param: ConvAddReluParam
<P>
) -> Bool { + return canMPSAddByChannel(param: param) || canMPSAddByElement(param: param) + } + + private class func canMPSAddByChannel(param: ConvAddReluParam
<P>
) -> Bool { + if let yTensor = param.yTensor, yTensor.dim.cout() == 1 { + return true + } + return false + } + + private class func canMPSAddByElement(param: ConvAddReluParam
<P>
) -> Bool { + if let y = param.y, y.dim.dims == param.input.dim.dims { + return true + } + return false + } + + open class func hasAddOp() -> Bool { return true } - open func hasReluOp() -> Bool { + open class func hasReluOp() -> Bool { return true } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift index 6d20bf6f9f..d70caca5f4 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvBNReluKernel.swift @@ -105,7 +105,7 @@ class ConvBNReluKernel: Kernel, Computable, Testable { let iC = param.input.tensorDim[1]; let fC = param.filter.tensorDim[1]; let oC = param.output.tensorDim[1]; - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0)) + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(0), hasReluOp: UInt16(0), addParam: ElementwiseAddMetalParam()) var invs: [P] = [] let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift index 11f8a2683c..19dc193ac4 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvKernel.swift @@ -66,7 +66,7 @@ class ConvKernel: Kernel, Computable { throw PaddleMobileError.predictError(message: " encode is nil") } encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setTexture(param.output.metalTexture, index: 2) encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) encoder.setBuffer(blankTensor?.buffer, offset: 0, index: 2) @@ -111,7 +111,7 @@ class ConvKernel: Kernel, Computable { let iC = param.input.tensorDim[1]; let fC = param.filter.tensorDim[1]; let oC = param.output.tensorDim[1]; - let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 1 : 0)) + let inMetalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1]), groups: UInt16(param.groups), iC: UInt16(iC), fC: UInt16(fC), oC: UInt16(oC), hasAddOp: UInt16(hasAddOp() ? 1 : 0), hasReluOp: UInt16(hasReluOp() ? 
1 : 0), addParam: ElementwiseAddMetalParam()) metalParam = inMetalParam if type(of: self).isWinoGrad(functionName: functionName) { @@ -130,7 +130,7 @@ class ConvKernel: Kernel, Computable { } else if param.filter.channel == 1 && param.filter.n == param.input.tensorDim[1] { if useAggressiveOptimization { let couldUseWinograd = param.filter.width == 3 && param.filter.height == 3 - && param.filter.n == 16 && param.stride[0] == 1 && param.stride[1] == 1 + && param.filter.n <= 16 && param.stride[0] == 1 && param.stride[1] == 1 && param.dilations[0] == 1 && param.dilations[1] == 1 if couldUseWinograd { return "depthwise_conv_add_relu_3x3_half_winograd" diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvReluKernel.swift index 0b73deb1b0..9937ca158b 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ConvReluKernel.swift @@ -16,11 +16,11 @@ import Foundation import MetalPerformanceShaders class ConvReluKernel: ConvAddReluKernel
<P>
{ - override func hasAddOp() -> Bool { + override class func hasAddOp() -> Bool { return false } - override func hasReluOp() -> Bool { + override class func hasReluOp() -> Bool { return true } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift index acddad5bb1..c1ee435e3a 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ElementwiseAddKernel.swift @@ -34,27 +34,8 @@ class ElementwiseAddKernel: Kernel, Computable { throw error } - metalParam = ElementwiseAddMetalParam.init() + metalParam = ElementwiseAddKernel.metalParamFrom(inputX: param.inputX, inputY: param.inputY, axis: param.axis) - let xdim: [Int32] = (0..<4).map { Int32(param.inputX.dim[$0]) } - let ydim: [Int32] = (0..<4).map { Int32(param.inputY.dim[$0]) } - let xtrans: [Int32] = (0..<4).map { Int32(param.inputX.transpose[$0]) } - let ytrans: [Int32] = (0..<4).map { Int32(param.inputY.transpose[$0]) } - - metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3]) - metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3]) - metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3]) - metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3]) - if param.axis == -1 { - metalParam.axis = 4 - Int32(param.inputY.tensorDim.cout()) - } else { - metalParam.axis = 4 - Int32(param.inputX.tensorDim.cout()) + Int32(param.axis) - } - metalParam.ylen = Int32(param.inputY.tensorDim.cout()) - if (param.inputX.dim == param.inputY.dim) && (param.inputX.transpose == param.inputY.transpose) { - // print("===> elementwise_add fast!!!") - metalParam.fast = 1 - } if GlobalConfig.shared.computePrecision == .Float32 { super.init(device: device, inFunctionName: "elementwise_add", initContext: initContext) } else if GlobalConfig.shared.computePrecision == .Float16 { @@ -75,4 +56,29 @@ class ElementwiseAddKernel: Kernel, Computable { encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) encoder.endEncoding() } + + static func metalParamFrom(inputX: Texture, inputY: Texture, axis: Int) -> ElementwiseAddMetalParam { + var metalParam = ElementwiseAddMetalParam.init() + + let xdim: [Int32] = (0..<4).map { Int32(inputX.dim[$0]) } + let ydim: [Int32] = (0..<4).map { Int32(inputY.dim[$0]) } + let xtrans: [Int32] = (0..<4).map { Int32(inputX.transpose[$0]) } + let ytrans: [Int32] = (0..<4).map { Int32(inputY.transpose[$0]) } + + metalParam.xdim = (xdim[0], xdim[1], xdim[2], xdim[3]) + metalParam.ydim = (ydim[0], ydim[1], ydim[2], ydim[3]) + metalParam.xtrans = (xtrans[0], xtrans[1], xtrans[2], xtrans[3]) + metalParam.ytrans = (ytrans[0], ytrans[1], ytrans[2], ytrans[3]) + if axis == -1 { + metalParam.axis = 4 - Int32(inputY.tensorDim.cout()) + } else { + metalParam.axis = 4 - Int32(inputX.tensorDim.cout()) + Int32(axis) + } + metalParam.ylen = Int32(inputY.tensorDim.cout()) + if (inputX.dim == inputY.dim) && (inputX.transpose == inputY.transpose) { + // print("===> elementwise_add fast!!!") + metalParam.fast = 1 + } + return metalParam + } } diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift index 0e2f4983cb..1f370db649 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ReluKernel.swift @@ -26,6 
+26,11 @@ class ReluKernel: Kernel, Computable{ } required init(device: MTLDevice, param: ReluParam
<P>
, initContext: InitContext) throws { + do { + try param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: GlobalConfig.shared.computePrecision) + } catch let error { + throw error + } if GlobalConfig.shared.computePrecision == .Float32 { super.init(device: device, inFunctionName: "relu", initContext: initContext) } else if GlobalConfig.shared.computePrecision == .Float16 { diff --git a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift index fc63c08ebc..483bedcb08 100644 --- a/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Src/Operators/Kernels/ScaleOpKernel.swift @@ -34,10 +34,10 @@ class ScaleOpKernel: Kernel, Computable{ } var shouldUseMPS = false - if initContext.useMPS && param.biasAfterScale { + if initContext.useMPS && param.biasAfterScale && param.input.tensorDim.cout() == 4 && param.output.tensorDim.cout() == 4 { let inputChannel = param.input.tensorDim[1] let outputChannel = param.output.tensorDim[1] - if (inputChannel == 1 || inputChannel > 4) && (outputChannel == 1 || outputChannel > 4) { + if (inputChannel > 4) && (outputChannel > 4) { shouldUseMPS = true } } -- GitLab