diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist index 8f61f4a88a7bcbe39bbb56e22ef203803776fdec..3ac3272f7fe5e82e15488422c949e3df19853f90 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist +++ b/metal/paddle-mobile-demo/paddle-mobile-demo.xcodeproj/xcuserdata/liuruilong.xcuserdatad/xcschemes/xcschememanagement.plist @@ -7,7 +7,7 @@ paddle-mobile-demo.xcscheme orderHint - 2 + 1 SuppressBuildableAutocreation diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift index 9bcb7c191fd1032d1ef98c0b49d6c483bca0b5df..322a6b9f67cb7df2a58679496e3c8a31d08844b5 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift @@ -33,7 +33,7 @@ class MobileNet_ssd_hand: Net{ return " \(res)" } - func fetchResult(paddleMobileRes: ResultHolder) -> [Float32]{ + func fetchResult(paddleMobileRes: ResultHolder) -> [Float32] { guard let interRes = paddleMobileRes.intermediateResults else { fatalError(" need have inter result ") @@ -46,12 +46,13 @@ class MobileNet_ssd_hand: Net{ guard let bboxs = interRes["BBoxes"], bboxs.count > 0, let bbox = bboxs[0] as? Texture else { fatalError() } - + var scoreFormatArr: [Float32] = score.metalTexture.realNHWC(dim: (n: score.originDim[0], h: score.originDim[1], w: score.originDim[2], c: score.originDim[3])) + var bboxArr = bbox.metalTexture.floatArray { (f) -> Float32 in return f } - + let nmsCompute = NMSCompute.init() nmsCompute.scoreThredshold = 0.01 nmsCompute.nmsTopK = 200 @@ -79,6 +80,7 @@ class MobileNet_ssd_hand: Net{ let modelDir: String + // let paramPointer: UnsafeMutableRawPointer // // let paramSize: Int diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal index 75d9dc618ddb9c8d8da1d33f87c1598d78c1edf0..edbb19c980102796a504252bf35ebc9a6d0513fb 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/PreProcessKernel.metal @@ -85,6 +85,17 @@ kernel void genet_preprocess(texture2d inTexture [[texture( outTexture.write(float4(inColor.z, inColor.y, inColor.x, 0.0f), gid); } +kernel void genet_preprocess_half(texture2d inTexture [[texture(0)]], texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) +{ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) { + return; + } + const auto means = half4(128.0f, 128.0f, 128.0f, 0.0f); + const half4 inColor = (inTexture.read(gid) * 255.0 - means) * 0.017; + outTexture.write(half4(inColor.z, inColor.y, inColor.x, 0.0f), gid); +} + kernel void scale(texture2d inTexture [[texture(0)]], texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height()) return; @@ -95,12 +106,13 @@ kernel void scale(texture2d inTexture [[texture(0)]], tex outTexture.write(input, gid); } - - - - - - - - +kernel void scale_half(texture2d inTexture [[texture(0)]], texture2d outTexture [[texture(1)]], uint2 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height()) return; + float w_stride = inTexture.get_width() / outTexture.get_width(); + float h_stride = inTexture.get_height() / outTexture.get_height(); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 input = inTexture.sample(sample, float2(gid.x * w_stride, gid.y * h_stride), 0); + outTexture.write(half4(input), gid); +} diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift index e2351d14e317ada506d1f15c59efda22c984cab8..a9fd4dd515ac76a59eeb98f71557f64e6e3f2058 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift @@ -79,7 +79,7 @@ class ViewController: UIViewController { return } do { - let max = 10 + let max = 50 let startDate = Date.init() for i in 0.. [Float32] { + func toTensor(dim: (n: Int, c: Int, h: Int, w: Int), texturePrecision: ComputePrecision = .Float16) -> [Float32] { // print("origin dim: \(dim)") print("texture: ") print(self) + var textureArray: [Float32] +// if texturePrecision == .Float16 - let textureArray = floatArray { (i : Float32) -> Float32 in - return i + if pixelFormat == .rgba32Float { + textureArray = floatArray { (i : Float32) -> Float32 in + return i + } + } else if pixelFormat == .rgba16Float { + + var textureFloat16Array = floatArray { (i : Float16) -> Float16 in + return i + } + textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count) + } else { + fatalError(" 目前还不支持其他类型 ") } + var output: [Float32] = [] for s in 0.. [Float32] { + func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int), texturePrecision: ComputePrecision = .Float16) -> [Float32] { // print("origin dim: \(dim)") // print("texture: ") // print(self) - let textureArray = floatArray { (i : Float32) -> Float32 in - return i + var textureArray: [Float32] + // if texturePrecision == .Float16 + if pixelFormat == .rgba32Float { + textureArray = floatArray { (i : Float32) -> Float32 in + return i + } + } else if pixelFormat == .rgba16Float { + var textureFloat16Array = floatArray { (i : Float16) -> Float16 in + return i + } + textureArray = float16To32(input: &textureFloat16Array, count: textureFloat16Array.count) + } else { + fatalError(" 目前还不支持其他类型 ") } + var output: [Float32] = [] - let numOfASlice = dim.h * dim.w * 4 for h in 0.. Self @@ -78,6 +79,28 @@ extension Float32: PrecisionType { } } +public func float32ToFloat16(input: UnsafeMutablePointer, output: UnsafeMutableRawPointer, count: Int) { + var float32Buffer = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 4) + var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2) + guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else { + fatalError(" float 32 to float 16 error ! ") + } +} + +public func float16To32(input: UnsafeMutablePointer, count: Int) -> [Float32] { + var output = Array.init(repeating: 0.0, count: count) + float16to32(input: input, output: &output, count: count) + return output +} + +public func float16to32(input: UnsafeMutablePointer, output: UnsafeMutablePointer, count: Int) { + var bufferFloat16 = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 2) + var bufferFloat32 = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 4) + if vImageConvert_Planar16FtoPlanarF(&bufferFloat16, &bufferFloat32, 0) != kvImageNoError { + fatalError(" convert float16 to float32 error") + } +} + // N - 0 C - 1 H - 2 W - 3 struct DataLayout { diff --git a/metal/paddle-mobile/paddle-mobile/Executor.swift b/metal/paddle-mobile/paddle-mobile/Executor.swift index 47c91de096ebaae1899a5ee6a0d4b85f5b1f99da..efdc0b2164de43158f4fd7477c587582e2098bcf 100644 --- a/metal/paddle-mobile/paddle-mobile/Executor.swift +++ b/metal/paddle-mobile/paddle-mobile/Executor.swift @@ -16,6 +16,8 @@ import Foundation let testTo = 54 +let computePrecision: ComputePrecision = .Float32 + public class ResultHolder { public let dim: [Int] public let resultArr: [P] @@ -66,7 +68,6 @@ public class Executor { let op = block.ops[i] do { let op = try OpCreator

.shared.creat(device: inDevice, opDesc: op, scope: inProgram.scope) -// op.inferShape() ops.append(op) } catch let error { throw error diff --git a/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift index eaa596e071b7628339be185a7e3599a370763041..1bf5cde92eba79dbd8be2ca8cbd17e0398c428d5 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift @@ -58,28 +58,26 @@ class BoxcoderOp: Operator, BoxcoderParam

} func delogOutput() { - print(" \(type) output: ") - let priorBoxOriginDim = para.priorBox.originDim - let priorBoxArray = para.priorBox.metalTexture.realNHWC(dim: (n: priorBoxOriginDim[0], h: priorBoxOriginDim[1], w: priorBoxOriginDim[2], c: priorBoxOriginDim[3])) - print(" prior box ") - print(priorBoxArray.strideArray()) - - let priorBoxVarOriginDim = para.priorBoxVar.originDim - let priorBoxVarArray = para.priorBoxVar.metalTexture.realNHWC(dim: (n: priorBoxVarOriginDim[0], h: priorBoxVarOriginDim[1], w: priorBoxVarOriginDim[2], c: priorBoxVarOriginDim[3])) - print(" prior box var ") - print(priorBoxVarArray.strideArray()) - - let targetBoxOriginDim = para.targetBox.originDim - let targetBoxArray = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxOriginDim[0], h: targetBoxOriginDim[1], w: targetBoxOriginDim[2], c: targetBoxOriginDim[3])) - print(" target box ") - print(targetBoxArray.strideArray()) +// let priorBoxOriginDim = para.priorBox.originDim +// let priorBoxArray: [Float32] = para.priorBox.metalTexture.realNHWC(dim: (n: priorBoxOriginDim[0], h: priorBoxOriginDim[1], w: priorBoxOriginDim[2], c: priorBoxOriginDim[3])) +// print(" prior box ") +// print(priorBoxArray.strideArray()) +// +// let priorBoxVarOriginDim = para.priorBoxVar.originDim +// let priorBoxVarArray: [Float32] = para.priorBoxVar.metalTexture.realNHWC(dim: (n: priorBoxVarOriginDim[0], h: priorBoxVarOriginDim[1], w: priorBoxVarOriginDim[2], c: priorBoxVarOriginDim[3])) +// print(" prior box var ") +// print(priorBoxVarArray.strideArray()) +// +// let targetBoxOriginDim = para.targetBox.originDim +// let targetBoxArray: [Float32] = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxOriginDim[0], h: targetBoxOriginDim[1], w: targetBoxOriginDim[2], c: targetBoxOriginDim[3])) +// print(" target box ") +// print(targetBoxArray.strideArray()) let originDim = para.output.originDim - let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3])) + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) print(outputArray.strideArray()) - } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift index 117fd8c39fadec55b88cc1aeaf3b91e5a0dd966d..aac56ef4f9ffd9711791c3bc7e2ca11702fc4e7b 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift @@ -66,9 +66,16 @@ class ConcatOp: Operator, ConcatParam

>, Run func delogOutput() { print(" \(type) output: ") let originDim = para.output.originDim - let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3])) - print(outputArray.strideArray()) - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + + if para.output.transpose == [0, 1, 2, 3] { + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) + print(outputArray.strideArray()) + } else if para.output.transpose == [0, 2, 3, 1] { + print(para.output.metalTexture.toTensor(dim: (n: originDim[0], c: originDim[1], h: originDim[2], w: originDim[3]), texturePrecision: computePrecision).strideArray()) + } else { + fatalError(" not implemet") + } + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift index 6f67014444e5ef82fe4cdc30f99bc371fef2d417..7bced214bd11bfef61eb405d59073f004e765e03 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddBatchNormReluOp.swift @@ -125,13 +125,6 @@ class ConvAddBatchNormReluOp: Operator P in - return p - } - // - writeToLibrary(fileName: "output_112x112x32_2", array: output) - print(" write done") - // let _: P? = para.output.metalTexture.logDesc(header: "conv add batchnorm relu output: ", stridable: false) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift index c42e5fa1d8a5de54c4ab4d251097eb876411a350..5e344014188061c3dbb411226b2655a3bc2659b8 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift @@ -46,9 +46,6 @@ class ConvAddParam: OpParam { class ConvAddOp: Operator, ConvAddParam

>, Runable, Creator, InferShaperable, Fusion{ typealias OpType = ConvAddOp

- - - static func fusionNode() -> Node { let beginNode = Node.init(inType: gConvType) _ = beginNode @@ -64,7 +61,6 @@ class ConvAddOp: Operator, ConvAddParam

>, return gConvAddType } - func inferShape() { let inDims = para.input.dim @@ -101,10 +97,8 @@ class ConvAddOp: Operator, ConvAddParam

>, print(para.stride) print("dilations: ") print(para.dilations) - - - print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift index 3c521a2210614550577369c603dbbdc5e2cb6692..be8c57d3ace01dabd652e0e80a43c5a053213e28 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift @@ -110,7 +110,7 @@ class ConvBNReluOp: Operator, ConvBNReluPa func delogOutput() { print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift index ba83de1bf892527737a3a1447352877919c8f18e..387fa420b68f8004a12af85ca398cf306f41a5c6 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift @@ -46,10 +46,10 @@ class ConvTransposeOp: Operator, ConvTr print(" \(type) output: ") let originDim = para.output.originDim if para.output.transpose == [0, 1, 2, 3] { - let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3])) + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) print(outputArray.strideArray()) } else if para.output.transpose == [0, 2, 3, 1] { - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) } else { print(" not implement") } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift index 639c22ce12c7a110cf58f3f9e7b9ee458d393260..36f477bc1cb48007b5b28bf27a7424940918025b 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift @@ -58,6 +58,6 @@ class DepthConvOp: Operator, ConvParam

>, Runa func delogOutput() { print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift index 16a42d5c7b24e7b3a26cab35f68decd226076876..0ea8a62c5c0bf30da200add2a96410136d2f40fb 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift @@ -65,6 +65,6 @@ class DwConvBNReluOp: Operator, ConvBNRelu func delogOutput() { print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift index 4812f051820385727e08ad79f40f7820bb3310f5..0f96b204d59f3d4a0dd0fae20340811855421c95 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift @@ -56,31 +56,30 @@ class ElementwiseAddOp: Operator, Elem // para.output.dim = para.input.dim } + func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { + do { + try kernel.compute(commandBuffer: buffer, param: para) + } catch let error { + throw error + } + } + func delogOutput() { - print(" \(type) inputX: ") - print(para.inputX.metalTexture.toTensor(dim: (n: para.inputX.tensorDim[0], c: para.inputX.tensorDim[1], h: para.inputX.tensorDim[2], w: para.inputX.tensorDim[3])).strideArray()) - print(" \(type) inputY: ") - print(para.inputY.metalTexture.toTensor(dim: (n: para.inputY.tensorDim[0], c: para.inputY.tensorDim[1], h: para.inputY.tensorDim[2], w: para.inputY.tensorDim[3])).strideArray()) +// print(" \(type) inputX: ") +// print(para.inputX.metalTexture.toTensor(dim: (n: para.inputX.tensorDim[0], c: para.inputX.tensorDim[1], h: para.inputX.tensorDim[2], w: para.inputX.tensorDim[3])).strideArray()) +// print(" \(type) inputY: ") +// print(para.inputY.metalTexture.toTensor(dim: (n: para.inputY.tensorDim[0], c: para.inputY.tensorDim[1], h: para.inputY.tensorDim[2], w: para.inputY.tensorDim[3])).strideArray()) print(" \(type) output: ") let originDim = para.output.originDim if para.output.transpose == [0, 1, 2, 3] { - let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3])) + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) print(outputArray.strideArray()) } else if para.output.transpose == [0, 2, 3, 1] { - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) } else { print(" not implement") } - - } - - func runImpl(device: MTLDevice, buffer: MTLCommandBuffer) throws { - do { - try kernel.compute(commandBuffer: buffer, param: para) - } catch let error { - throw error - } } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Base/Kernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Base/Kernel.swift index 530fb8a32b1aa97b6a61ed6f5f2d8a77f453a384..f58358761f820809685510fa4e9b5ff237567b3c 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Base/Kernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/Base/Kernel.swift @@ -19,68 +19,76 @@ public protocol TestParam { } public protocol Testable { - associatedtype TestParamType: TestParam - func test(commandBuffer: MTLCommandBuffer, param: TestParamType) - init(device: MTLDevice, testParam: TestParamType) + associatedtype TestParamType: TestParam + func test(commandBuffer: MTLCommandBuffer, param: TestParamType) + init(device: MTLDevice, testParam: TestParamType) } protocol Computable { - associatedtype ParamType: OpParam - func compute(commandBuffer: MTLCommandBuffer, param: ParamType) throws - init(device: MTLDevice, param: ParamType) + associatedtype ParamType: OpParam + func compute(commandBuffer: MTLCommandBuffer, param: ParamType) throws + init(device: MTLDevice, param: ParamType) } protocol KernelProtocol { - var pipline: MTLComputePipelineState { get set } - var functionName: String { get set } - + var pipline: MTLComputePipelineState { get set } + var functionName: String { get set } + } open class Kernel { - let pipline: MTLComputePipelineState - let functionName: String - public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = true) { - pipline = device.pipeLine(funcName: inFunctionName, inPaddleMobileLib: usePaddleMobileLib) - functionName = inFunctionName - } + let pipline: MTLComputePipelineState + let functionName: String + public init(device: MTLDevice, inFunctionName: String, usePaddleMobileLib: Bool = true) { + pipline = device.pipeLine(funcName: inFunctionName, inPaddleMobileLib: usePaddleMobileLib) + functionName = inFunctionName + } } open class CusomKernel: Kernel { - public struct Shape { - public let width: Int - public let height: Int - public let channel: Int - public init(inWidth: Int, inHeight: Int, inChannel: Int){ - width = inWidth - height = inHeight - channel = inChannel - } - } - public let outputTexture: MTLTexture - public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, usePaddleMobileLib: Bool = false) { - let textureDesc = MTLTextureDescriptor.init() - textureDesc.textureType = .type2D - textureDesc.width = outputDim.width - textureDesc.height = outputDim.height - textureDesc.depth = (outputDim.channel + 3) / 4 - textureDesc.pixelFormat = .rgba32Float - textureDesc.usage = [.shaderRead, .shaderWrite] - textureDesc.storageMode = .shared - outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error " - - super.init(device: device, inFunctionName: inFunctionName, usePaddleMobileLib: usePaddleMobileLib) + public struct Shape { + public let width: Int + public let height: Int + public let channel: Int + public init(inWidth: Int, inHeight: Int, inChannel: Int){ + width = inWidth + height = inHeight + channel = inChannel } + } + public let outputTexture: MTLTexture + public init(device: MTLDevice, inFunctionName: String, outputDim: Shape, usePaddleMobileLib: Bool = false) { + let textureDesc = MTLTextureDescriptor.init() + textureDesc.textureType = .type2D + textureDesc.width = outputDim.width + textureDesc.height = outputDim.height + textureDesc.depth = (outputDim.channel + 3) / 4 - public func compute(inputTexuture: MTLTexture, commandBuffer: MTLCommandBuffer) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - encoder.setTexture(inputTexuture, index: 0) - encoder.setTexture(outputTexture, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: outputTexture) - encoder.endEncoding() + if computePrecision == .Float16 { + textureDesc.pixelFormat = .rgba16Float + } else if computePrecision == .Float32 { + textureDesc.pixelFormat = .rgba32Float + } else { + fatalError() } + textureDesc.usage = [.shaderRead, .shaderWrite] + textureDesc.storageMode = .shared + outputTexture = device.makeTexture(descriptor: textureDesc) ?! " make texture error " + + super.init(device: device, inFunctionName: inFunctionName, usePaddleMobileLib: usePaddleMobileLib) + } + + public func compute(inputTexuture: MTLTexture, commandBuffer: MTLCommandBuffer) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + encoder.setTexture(inputTexuture, index: 0) + encoder.setTexture(outputTexture, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: outputTexture) + encoder.endEncoding() + } + } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift index 722ab6b64c953c1fef28082f75794d9e581251ef..939f5db5f192082470ea2ad8773db95af22ffed4 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BoxcoderKernel.swift @@ -18,22 +18,29 @@ struct BoxcoderMetalParam { } class BoxcoderKernel: Kernel, Computable{ - func compute(commandBuffer: MTLCommandBuffer, param: BoxcoderParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - encoder.setTexture(param.priorBox.metalTexture, index: 0) - encoder.setTexture(param.priorBoxVar.metalTexture, index: 1) - encoder.setTexture(param.targetBox.metalTexture, index: 2) - encoder.setTexture(param.output.metalTexture, index: 3) - var bmp = BoxcoderMetalParam.init() - encoder.setBytes(&bmp, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() + func compute(commandBuffer: MTLCommandBuffer, param: BoxcoderParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") } - - required init(device: MTLDevice, param: BoxcoderParam

) { - param.output.initTexture(device: device) - super.init(device: device, inFunctionName: "boxcoder") + encoder.setTexture(param.priorBox.metalTexture, index: 0) + encoder.setTexture(param.priorBoxVar.metalTexture, index: 1) + encoder.setTexture(param.targetBox.metalTexture, index: 2) + encoder.setTexture(param.output.metalTexture, index: 3) + var bmp = BoxcoderMetalParam.init() + encoder.setBytes(&bmp, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: BoxcoderParam

) { + param.output.initTexture(device: device, computePrecision: computePrecision) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "boxcoder") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "boxcoder_half") + } else { + fatalError() } + } + } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift index 60f1437e7fabf0ae088b41f37cc01e2981cbf236..644476ad9dbb471786611fe25a30ed9c4833edbd 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConcatKernel.swift @@ -121,8 +121,14 @@ class ConcatKernel: Kernel, Computable{ } required init(device: MTLDevice, param: ConcatParam

) { - param.output.initTexture(device: device, inTranspose: param.transpose) - super.init(device: device, inFunctionName: "concat") + param.output.initTexture(device: device, inTranspose: param.transpose, computePrecision: computePrecision) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "concat") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "concat_half") + } else { + fatalError() + } } required init(device: MTLDevice, testParam: ConcatTestParam) { diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift index eabadc9d44e7b98fccb0f87e73dd2ffd8da931d7..092207cfb7b9fda63cd6b5aa7082640bae515149 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift @@ -15,127 +15,155 @@ import Foundation struct ConvAddBatchNormReluTestParam: TestParam { - let inputTexture: MTLTexture - let outputTexture: MTLTexture - var metalParam: MetalConvParam - let filterBuffer: MTLBuffer - let biaseBuffer: MTLBuffer - let newScaleBuffer: MTLBuffer - let newBiaseBuffer: MTLBuffer - let filterSize: (width: Int, height: Int, channel: Int) - init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) { - inputTexture = inInputTexture - outputTexture = inOutputTexture - metalParam = inMetalParam - filterBuffer = inFilterBuffer - biaseBuffer = inBiaseBuffer - newScaleBuffer = inNewScaleBuffer - newBiaseBuffer = inNewBiaseBuffer - filterSize = inFilterSize - } + let inputTexture: MTLTexture + let outputTexture: MTLTexture + var metalParam: MetalConvParam + let filterBuffer: MTLBuffer + let biaseBuffer: MTLBuffer + let newScaleBuffer: MTLBuffer + let newBiaseBuffer: MTLBuffer + let filterSize: (width: Int, height: Int, channel: Int) + init(inInputTexture: MTLTexture, inOutputTexture: MTLTexture, inMetalParam: MetalConvParam, inFilterBuffer: MTLBuffer, inBiaseBuffer: MTLBuffer, inNewScaleBuffer: MTLBuffer, inNewBiaseBuffer: MTLBuffer, inFilterSize: (width: Int, height: Int, channel: Int)) { + inputTexture = inInputTexture + outputTexture = inOutputTexture + metalParam = inMetalParam + filterBuffer = inFilterBuffer + biaseBuffer = inBiaseBuffer + newScaleBuffer = inNewScaleBuffer + newBiaseBuffer = inNewBiaseBuffer + filterSize = inFilterSize + } } class ConvAddBatchNormReluKernel: Kernel, Computable, Testable { - required init(device: MTLDevice, testParam: ConvAddBatchNormReluTestParam) { - if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1") - } else if testParam.filterSize.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3") - } else { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3") - } + required init(device: MTLDevice, testParam: ConvAddBatchNormReluTestParam) { + if testParam.filterSize.width == 1 && testParam.filterSize.height == 1 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1") + } else if testParam.filterSize.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3") + } else { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3") } + } + + var metalParam: MetalConvParam! + + required init(device: MTLDevice, param: ConvAddBatchNormReluParam

) { - var metalParam: MetalConvParam! - - required init(device: MTLDevice, param: ConvAddBatchNormReluParam

) { - - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1]) - - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1") - } else if param.filter.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3") - } else { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3") - } - - param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) - param.y.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) - param.variance.initBuffer(device: device) - param.mean.initBuffer(device: device) - param.scale.initBuffer(device: device) - param.bias.initBuffer(device: device) - - - let offsetX = param.filter.width/2 - Int(param.paddings[0]) - let offsetY = param.filter.height/2 - Int(param.paddings[1]) - - print("offset x: \(offsetX)") - print("offset y: \(offsetY)") - - let offsetZ = 0.0 - metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) - - var invs: [P] = [] - let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) - - for i in 0...stride { - let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5) - invs.append(P(inv)) - } - - let newScale: UnsafeMutablePointer

= UnsafeMutablePointer

.allocate(capacity: param.scale.buffer.length) - let newBiase: UnsafeMutablePointer

= UnsafeMutablePointer

.allocate(capacity: param.bias.buffer.length) - - let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self) - let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self) - let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self) - for i in 0...stride { - newScale[i] = invs[i] * scaleContents[i] - newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i] - } - param.newBiase = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length) - param.newScale = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length) - - newScale.deinitialize(count: param.scale.buffer.length) - newScale.deallocate() - - newBiase.deinitialize(count: param.bias.buffer.length) - newBiase.deallocate() + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) + + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3") + } else { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3") } - func compute(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } + param.filter.initBuffer(device: device, precision: computePrecision) + + param.y.initBuffer(device: device, precision: computePrecision) + + param.variance.initBuffer(device: device, precision: .Float32) + param.mean.initBuffer(device: device, precision: .Float32) + param.scale.initBuffer(device: device, precision: .Float32) + param.bias.initBuffer(device: device, precision: .Float32) + + + let offsetX = param.filter.width/2 - Int(param.paddings[0]) + let offsetY = param.filter.height/2 - Int(param.paddings[1]) + + print("offset x: \(offsetX)") + print("offset y: \(offsetY)") + + let offsetZ = 0.0 + metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) + + var invs: [P] = [] + let varianceContents = param.variance.buffer.contents().assumingMemoryBound(to: P.self) + + for i in 0...stride { + let inv = 1.0/pow(Float32.init(varianceContents[i]) + param.epsilon, 0.5) + invs.append(P(inv)) + } + + let newScale: UnsafeMutablePointer

= UnsafeMutablePointer

.allocate(capacity: param.scale.buffer.length) + let newBiase: UnsafeMutablePointer

= UnsafeMutablePointer

.allocate(capacity: param.bias.buffer.length) + + let scaleContents = param.scale.buffer.contents().assumingMemoryBound(to: P.self) + let biaseContents = param.bias.buffer.contents().assumingMemoryBound(to: P.self) + let meanContents = param.mean.buffer.contents().assumingMemoryBound(to: P.self) + for i in 0...stride { + newScale[i] = invs[i] * scaleContents[i] + newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i] + } + +// var newScaleFP16: UnsafeMutableRawPointer +// +// float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleFP16, count: param.scale.buffer.length / MemoryLayout

.size) + + +// let newBiaseFloat16 = device.makeBuffer(length: <#T##Int#>, options: <#T##MTLResourceOptions#>) + + var newBiaseBuffer: MTLBuffer + var newScaleBuffer: MTLBuffer + + if computePrecision == .Float16 { + newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)! + newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)! + } else if computePrecision == .Float32 { - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) - encoder.setBuffer(param.y.buffer, offset: 0, index: 2) - encoder.setBuffer(param.newScale!, offset: 0, index: 3) - encoder.setBuffer(param.newBiase!, offset: 0, index: 4) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() + newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! + newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! + + float32ToFloat16(input: newBiase as! UnsafeMutablePointer, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout

.size) + + float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout

.size) + } else { + fatalError(" unsupport ") } - public func test(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluTestParam) { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - fatalError() - } - - encoder.setTexture(param.inputTexture, index: 0) - encoder.setTexture(param.outputTexture, index: 1) - var inMetalParam = param.metalParam - encoder.setBytes(&inMetalParam, length: MemoryLayout.size, index: 0) - encoder.setBuffer(param.filterBuffer, offset: 0, index: 1) - encoder.setBuffer(param.biaseBuffer, offset: 0, index: 2) - encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3) - encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4) - encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture) - encoder.endEncoding() + param.newBiase = newBiaseBuffer + param.newScale = newScaleBuffer + + newScale.deinitialize(count: param.scale.buffer.length) + newScale.deallocate() + + newBiase.deinitialize(count: param.bias.buffer.length) + newBiase.deallocate() + } + + func compute(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") } + + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filter.buffer, offset: 0, index: 1) + encoder.setBuffer(param.y.buffer, offset: 0, index: 2) + encoder.setBuffer(param.newScale!, offset: 0, index: 3) + encoder.setBuffer(param.newBiase!, offset: 0, index: 4) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + public func test(commandBuffer: MTLCommandBuffer, param: ConvAddBatchNormReluTestParam) { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + fatalError() + } + + encoder.setTexture(param.inputTexture, index: 0) + encoder.setTexture(param.outputTexture, index: 1) + var inMetalParam = param.metalParam + encoder.setBytes(&inMetalParam, length: MemoryLayout.size, index: 0) + encoder.setBuffer(param.filterBuffer, offset: 0, index: 1) + encoder.setBuffer(param.biaseBuffer, offset: 0, index: 2) + encoder.setBuffer(param.newScaleBuffer, offset: 0, index: 3) + encoder.setBuffer(param.newBiaseBuffer, offset: 0, index: 4) + encoder.dispatch(computePipline: pipline, outTexture: param.outputTexture) + encoder.endEncoding() + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift index 83dd4f996ab23a94824deb6194241d6a52ace487..ce1e0f6560e9911e862ead537089d37fdb4fe1c4 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift @@ -17,22 +17,35 @@ import Foundation class ConvAddKernel: Kernel, Computable { var metalParam: MetalConvParam! required init(device: MTLDevice, param: ConvAddParam

) { - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_1x1") - } else if param.filter.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_add_3x3") + + if computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_1x1_half") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half") + } else { + super.init(device: device, inFunctionName: "conv_add_3x3_half") + } + } else if computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_1x1") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_3x3") + } else { + super.init(device: device, inFunctionName: "conv_add_3x3") + } } else { - super.init(device: device, inFunctionName: "conv_add_3x3") + fatalError() } - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1]) + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) - param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) - param.y.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) + param.filter.initBuffer(device: device, precision: computePrecision) + param.y.initBuffer(device: device, precision: computePrecision) print("offset x: \(offsetX)") print("offset y: \(offsetY)") diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift index c5d3ffe6c944ab9019f5b80e66b4691057209529..350c81cece15a242e1c6b7bb91cf515a4eaf2335 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift @@ -51,21 +51,33 @@ class ConvBNReluKernel: Kernel, Computable, Testable { var metalParam: MetalConvParam! required init(device: MTLDevice, param: ConvBNReluParam

) { - - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1") - } else if param.filter.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3") + if computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3") + } else { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3") + } + } else if computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half") + } else { + super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half") + } } else { - super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3") + fatalError() } - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1]) - param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) - param.variance.initBuffer(device: device) - param.mean.initBuffer(device: device) - param.scale.initBuffer(device: device) - param.bias.initBuffer(device: device) + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) + param.filter.initBuffer(device: device, precision: computePrecision) + + param.variance.initBuffer(device: device, precision: .Float32) + param.mean.initBuffer(device: device, precision: .Float32) + param.scale.initBuffer(device: device, precision: .Float32) + param.bias.initBuffer(device: device, precision: .Float32) let offsetX = param.filter.width/2 - Int(param.paddings[0]) let offsetY = param.filter.height/2 - Int(param.paddings[1]) @@ -102,8 +114,26 @@ class ConvBNReluKernel: Kernel, Computable, Testable { newBiase[i] = biaseContents[i] - meanContents[i] * invs[i] * scaleContents[i] } - param.newBiase = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length) - param.newScale = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length) + var newBiaseBuffer: MTLBuffer + var newScaleBuffer: MTLBuffer + + if computePrecision == .Float16 { + newBiaseBuffer = device.makeBuffer(bytes: newBiase, length: param.bias.buffer.length)! + newScaleBuffer = device.makeBuffer(bytes: newScale, length: param.scale.buffer.length)! + } else if computePrecision == .Float32 { + + newBiaseBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! + newScaleBuffer = device.makeBuffer(length: param.bias.buffer.length / 2)! + + float32ToFloat16(input: newBiase as! UnsafeMutablePointer, output: newBiaseBuffer.contents(), count: param.bias.buffer.length / MemoryLayout

.size) + + float32ToFloat16(input: newScale as! UnsafeMutablePointer, output: newScaleBuffer.contents(), count: param.scale.buffer.length / MemoryLayout

.size) + } else { + fatalError(" unsupport ") + } + + param.newBiase = newBiaseBuffer + param.newScale = newScaleBuffer newScale.deinitialize(count: param.scale.buffer.length) newScale.deallocate() diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift index 680beba1ea711b389dd6117fc84f00b6079c9a60..e0485851fd610781f475eb43be1ce6fd4937a4ef 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift @@ -39,7 +39,7 @@ class ConvKernel: Kernel, Computable { let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0]) let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1]) let offsetZ = 0.0 - param.filter.initBuffer(device: device, precision: Tensor.BufferPrecision.Float32) + param.filter.initBuffer(device: device, precision: ComputePrecision.Float32) metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), paddedZ: UInt16(param.input.metalTexture.arrayLength * 4 - param.input.dim[3]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift index 5e8d92054bd2fa15af2d3e75860c0dc4d9b93e5c..b6db7231e83943dbce6f2cbe3266af9fbe508aef 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift @@ -28,7 +28,7 @@ class PoolKernel: Kernel, Computable{ required init(device: MTLDevice, param: PoolParam

) { super.init(device: device, inFunctionName: "pool") - param.output.initTexture(device: device, inTranspose: param.input.transpose) + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) } func compute(commandBuffer: MTLCommandBuffer, param: PoolParam

) throws { diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift index ad925eb174414ed8f48cc8dd5bf090bc2ed0aed2..1545a848dacb4f11a2a68df31f7ea49a23799a87 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift @@ -17,8 +17,8 @@ class PreluKernel: Kernel, Computable{ } else { super.init(device: device, inFunctionName: "prelu_other") } - param.alpha.initBuffer(device: device) - param.output.initTexture(device: device, inTranspose: param.input.transpose) + param.alpha.initBuffer(device: device, precision: computePrecision) + param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) } func compute(commandBuffer: MTLCommandBuffer, param: PreluParam

) throws { diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift index e2363e44d3a3d81b430f82303b2b1017ddfc5200..08a489ab2298c937f8878af94b557c2fa60d18d0 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift @@ -33,11 +33,16 @@ class PriorBoxKernel: Kernel, Computable{ var metalParam: PriorBoxMetalParam! required init(device: MTLDevice, param: PriorBoxParam

) { - super.init(device: device, inFunctionName: "prior_box") - param.output.initTexture(device: device, inTranspose: [2, 0, 1, 3]) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "prior_box") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "prior_box_half") + } else { + fatalError() + } - - param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3]) + param.output.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision) + param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision) let n = 1 let h = param.output.dim[1] @@ -79,7 +84,18 @@ class PriorBoxKernel: Kernel, Computable{ } } - param.newAspectRatios = outputAspectRatior + if computePrecision == .Float16 { + let buffer = device.makeBuffer(length: outputAspectRatior.count) + float32ToFloat16(input: &outputAspectRatior, output:(buffer?.contents())!, count: outputAspectRatior.count) + param.newAspectRatios = buffer + + } else if computePrecision == .Float32 { + let buffer = device.makeBuffer(bytes: outputAspectRatior, length: outputAspectRatior.count * MemoryLayout.size, options: []) + param.newAspectRatios = buffer + } else { + fatalError() + } + let aspectRatiosSize = uint(outputAspectRatior.count) let maxSizeSize: uint = uint(param.maxSizes.count) @@ -102,12 +118,13 @@ class PriorBoxKernel: Kernel, Computable{ encoder.setTexture(param.input.metalTexture, index: 0) encoder.setTexture(param.output.metalTexture, index: 1) encoder.setTexture(param.outputVariances.metalTexture, index: 2) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.setBytes(param.newAspectRatios!, length: MemoryLayout.size * param.newAspectRatios!.count, index: 1) + + encoder.setBuffer(param.newAspectRatios!, offset: 0, index: 0) + + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 1) + encoder.setBytes(param.variances, length: MemoryLayout.size * param.variances.count, index: 2) encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) encoder.endEncoding() } - - } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift index 96a1abb6df964ee24d74ca9979ca59512f4e4265..3916c07ce5e8d4f3179a8a3100563a77e68eb53b 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReshapeKernel.swift @@ -15,58 +15,65 @@ import Foundation struct ReshapeMetalParam { - var idim: (Int32, Int32, Int32, Int32) - var itrans: (Int32, Int32, Int32, Int32) - var odim: (Int32, Int32, Int32, Int32) - var otrans: (Int32, Int32, Int32, Int32) + var idim: (Int32, Int32, Int32, Int32) + var itrans: (Int32, Int32, Int32, Int32) + var odim: (Int32, Int32, Int32, Int32) + var otrans: (Int32, Int32, Int32, Int32) } struct ReshapeTestParam: TestParam { - let inputTexture: MTLTexture - let outputTexture: MTLTexture - let param: ReshapeMetalParam + let inputTexture: MTLTexture + let outputTexture: MTLTexture + let param: ReshapeMetalParam } class ReshapeKernel: Kernel, Computable{ - required init(device: MTLDevice, param: ReshapeParam

) { - param.output.initTexture(device: device) - super.init(device: device, inFunctionName: "reshape") + required init(device: MTLDevice, param: ReshapeParam

) { + param.output.initTexture(device: device, computePrecision: computePrecision) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "reshape") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "reshape_half") + } else { + fatalError() } - - required init(device: MTLDevice, testParam: ReshapeTestParam) { - super.init(device: device, inFunctionName: "reshape") - } - - func compute(commandBuffer: MTLCommandBuffer, param: ReshapeParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") - } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - let id: [Int32] = (0..<4).map { Int32(param.input.dim[$0]) } - let it: [Int32] = param.input.transpose.map { Int32($0) } - let od: [Int32] = (0..<4).map { Int32(param.output.dim[$0]) } - let ot: [Int32] = param.output.transpose.map { Int32($0) } - var rmp = ReshapeMetalParam.init( - idim: (id[0], id[1], id[2], id[3]), - itrans: (it[0], it[1], it[2], it[3]), - odim: (od[0], od[1], od[2], od[3]), - otrans: (ot[0], ot[1], ot[2], ot[3]) - ) - encoder.setBytes(&rmp, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() + } + + required init(device: MTLDevice, testParam: ReshapeTestParam) { + super.init(device: device, inFunctionName: "reshape") + } + + func compute(commandBuffer: MTLCommandBuffer, param: ReshapeParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") } - func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - fatalError() - } - encoder.setTexture(testParam.inputTexture, index: 0) - encoder.setTexture(testParam.outputTexture, index: 1) - var pm: ReshapeMetalParam = testParam.param - encoder.setBytes(&pm, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture) - encoder.endEncoding() + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + let id: [Int32] = (0..<4).map { Int32(param.input.dim[$0]) } + let it: [Int32] = param.input.transpose.map { Int32($0) } + let od: [Int32] = (0..<4).map { Int32(param.output.dim[$0]) } + let ot: [Int32] = param.output.transpose.map { Int32($0) } + var rmp = ReshapeMetalParam.init( + idim: (id[0], id[1], id[2], id[3]), + itrans: (it[0], it[1], it[2], it[3]), + odim: (od[0], od[1], od[2], od[3]), + otrans: (ot[0], ot[1], ot[2], ot[3]) + ) + encoder.setBytes(&rmp, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + func test(commandBuffer: MTLCommandBuffer, testParam: ReshapeTestParam) { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + fatalError() } + encoder.setTexture(testParam.inputTexture, index: 0) + encoder.setTexture(testParam.outputTexture, index: 1) + var pm: ReshapeMetalParam = testParam.param + encoder.setBytes(&pm, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: testParam.outputTexture) + encoder.endEncoding() + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift index 5d2d5b1c7af5d9822394d2e7de9b251085c035dc..6f6d0af477f62d7f438b8b6a38c825c2eb95163f 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/SoftmaxKernel.swift @@ -38,7 +38,13 @@ class SoftmaxKernel: Kernel, Computable{ } required init(device: MTLDevice, param: SoftmaxParam

) { - param.output.initTexture(device: device) - super.init(device: device, inFunctionName: "softmax") + param.output.initTexture(device: device, computePrecision: computePrecision) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "softmax") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "softmax_half") + } else { + fatalError() + } } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift index 33e1219b4d0fff972d8db3d16fc7ce1477841351..6594b3474f0abb04364246830f79302f487af499 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/TransposeKernel.swift @@ -41,33 +41,27 @@ struct TransposeTestParam: TestParam { } class TransposeKernel: Kernel, Computable, Testable { - var metalParam: TransposeMetalParam! - func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() - } required init(device: MTLDevice, param: TransposeParam

) { - param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3]) - super.init(device: device, inFunctionName: "transpose") + param.output.initTexture(device: device, inTranspose: [0, 1, 2, 3], computePrecision: computePrecision) + if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "transpose_half") + } else if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "transpose") + } else { + fatalError() + } var invT: [Int] = [0, 1, 2, 3] for (i, v) in param.input.transpose.enumerated() { invT[v] = i } var axis: [Int] = [0, 1, 2, 3] -// var doNothing = false -// if param.axis.count == param.input.transpose.count { -// doNothing = param.axis == param.input.transpose.map { Int32($0) } -// } + // var doNothing = false + // if param.axis.count == param.input.transpose.count { + // doNothing = param.axis == param.input.transpose.map { Int32($0) } + // } for i in 0..: Kernel, Computable, Testable { } metalParam = tmp } + required init(device: MTLDevice, testParam: TransposeTestParam) { - super.init(device: device, inFunctionName: "transpose") - fatalError() + if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "transpose_half") + } else if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "transpose") + } else { + fatalError() + } + } + + var metalParam: TransposeMetalParam! + func compute(commandBuffer: MTLCommandBuffer, param: TransposeParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") + } + + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBytes(&metalParam, length: MemoryLayout.size, index: 0) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() } + public func test(commandBuffer: MTLCommandBuffer, param: TransposeTestParam) { guard let encoder = commandBuffer.makeComputeCommandEncoder() else { diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal index 9a177488861c42740f3a0343b3cb41bb0b969137..7abc17ec6e7a204af4d74b28d40e2a4c69dddc4b 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/BoxCoder.metal @@ -34,7 +34,6 @@ kernel void boxcoder(texture2d_array priorBox [[texture(0)] float tw = exp(pv.z * t.z) * pw; float th = exp(pv.w * t.w) * ph; - float4 r; r.x = tx - tw / 2; r.y = ty - th / 2; @@ -43,3 +42,31 @@ kernel void boxcoder(texture2d_array priorBox [[texture(0)] output.write(r, gid.xy, gid.z); } + +kernel void boxcoder_half(texture2d_array priorBox [[texture(0)]], + texture2d_array priorBoxVar [[texture(1)]], + texture2d_array targetBox [[texture(2)]], + texture2d_array output[[texture(3)]], + uint3 gid [[thread_position_in_grid]]) { + half4 t = targetBox.read(gid.xy, gid.z); + half4 p = priorBox.read(gid.xy, gid.z); + half4 pv = priorBoxVar.read(gid.xy, gid.z); + + float px = (float(p.x) + float(p.z)) / 2; + float py = (float(p.y) + float(p.w)) / 2; + float pw = float(p.z) - float(p.x); + float ph = float(p.w) - float(p.y); + + float tx = float(pv.x) * float(t.x) * pw + px; + float ty = float(pv.y) * float(t.y) * ph + py; + float tw = exp(float(pv.z) * float(t.z)) * pw; + float th = exp(float(pv.w) * float(t.w)) * ph; + + float4 r; + r.x = tx - tw / 2; + r.y = ty - th / 2; + r.z = tx + tw / 2; + r.w = ty + th / 2; + + output.write(half4(r), gid.xy, gid.z); +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal index c5c4ffc5c995500503411148db31b2acfa3459b6..d37be42be64f8fdd7325fd62a68e646737b6dedf 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Common.metal @@ -57,3 +57,14 @@ inline void invtrans(int32_t trans[4], int32_t ipos[4], int32_t opos[4]) { opos[trans[i]] = ipos[i]; } } + + +struct MetalConvParam { + short offsetX; + short offsetY; + short offsetZ; + ushort strideX; + ushort strideY; + ushort dilationX; + ushort dilationY; +}; diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Concat.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Concat.metal index 09c0e8dadab759bbdf514f347eff3eb005bfac2f..92d80c315e0d5ca19711b4a2165c89077979d49d 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Concat.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Concat.metal @@ -69,3 +69,48 @@ kernel void concat(texture2d_array in0 [[texture(0)]], } out.write(r, gid.xy, gid.z); } + +kernel void concat_half(texture2d_array in0 [[texture(0)]], + texture2d_array in1 [[texture(1)]], + texture2d_array in2 [[texture(2)]], + texture2d_array in3 [[texture(3)]], + texture2d_array in4 [[texture(4)]], + texture2d_array in5 [[texture(5)]], + texture2d_array inx [[texture(6)]], + texture2d_array out [[texture(7)]], + constant ConcatParam & pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + ConcatParam cp = pm; + int xyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, abcd[4], oxyzn[4]; + half4 r; + for (int i = 0; i < 4; i++) { + xyzn[3] = i; + xyzn2abcd(cp.odim[3], xyzn, abcd); + int k = abcd[cp.axis] - cp.offset; + int j = 0; + if (k < 0) { + r[i] = inx.read(gid.xy, gid.z)[i]; + } else { + for (; j < 6; j++) { + if (k < cp.vdim[j]) { + break; + } + k -= cp.vdim[j]; + } + int ta = cp.odim[cp.axis]; + abcd[cp.axis] = k; + cp.odim[cp.axis] = cp.vdim[j]; + abcd2xyzn(cp.odim[3], abcd, oxyzn); + cp.odim[cp.axis] = ta; + switch (j) { + case 0: r[i] = in0.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; + case 1: r[i] = in1.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; + case 2: r[i] = in2.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; + case 3: r[i] = in3.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; + case 4: r[i] = in4.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; + case 5: r[i] = in5.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2])[oxyzn[3]]; break; + } + } + } + out.write(r, gid.xy, gid.z); +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..ffa66212b16bb6c6180910cae2d0c34f8659c556 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddBNReluKernel.metal @@ -0,0 +1,308 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include "Common.metal" +using namespace metal; + + +kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + half4 output = half4(0.0); + + half4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + + output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + half4 output = half4(0.0); + + half4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); + outTexture.write(output, gid.xy, gid.z); +} + + +kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + half4 output = half4(0.0); + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); + outTexture.write(output, gid.xy, gid.z); +} + + +/*---------------------------------------------*/ + + + +kernel void conv_add_batch_norm_relu_1x1(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + float4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + float4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal new file mode 100644 index 0000000000000000000000000000000000000000..9244b2ec4631015ffd192567f734bee4cc1c7c85 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal @@ -0,0 +1,306 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include "Common.metal" + +using namespace metal; + +#pragma mark - convAdd +kernel void conv_add_1x1(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); +} + +kernel void conv_add_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + ushort dilation_x = param.dilationX; + ushort dilation_y = param.dilationY; + + float4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); + for (int j = 0; j < 9; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); +} + +kernel void depthwise_conv_add_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float *weights [[buffer(1)]], + const device float4 *biase [[buffer(2)]], + const device float4 *new_scale [[buffer(3)]], + const device float4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + float4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + float4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = output + biase[gid.z]; + outTexture.write(output, gid.xy, gid.z); +} + + +#pragma mark - half + +kernel void conv_add_1x1_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + output = output + float4(biase[gid.z]); + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void conv_add_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + const device half4 *new_scale [[buffer(3)]], + const device half4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + ushort dilation_x = param.dilationX; + ushort dilation_y = param.dilationY; + + half4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); + for (int j = 0; j < 9; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(float4(input[j]), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(float4(input[j]), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(float4(input[j]), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(float4(input[j]), float4(weight_w)); + } + } + output = output + float4(biase[gid.z]); + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void depthwise_conv_add_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + const device half4 *new_scale [[buffer(3)]], + const device half4 *new_biase [[buffer(4)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]); + output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]); + output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]); + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = output + float4(biase[gid.z]); + outTexture.write(half4(output), gid.xy, gid.z); +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvBNReluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvBNReluKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..4b97b7829a1fba27704fe7b60a03b2672f4f5953 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvBNReluKernel.metal @@ -0,0 +1,297 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include "Common.metal" + +using namespace metal; + +#pragma mark - conv bn relu +kernel void conv_batch_norm_relu_1x1(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *new_scale [[buffer(2)]], + const device float4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(input, weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(input, weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(input, weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(input, weight_w); + } + output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void conv_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float4 *weights [[buffer(1)]], + const device float4 *new_scale [[buffer(2)]], + const device float4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + float4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device float *weights [[buffer(1)]], + const device float4 *new_scale [[buffer(2)]], + const device float4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + float4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + float4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); + outTexture.write(output, gid.xy, gid.z); +} + +#pragma mark - half +kernel void conv_batch_norm_relu_1x1_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *new_scale [[buffer(2)]], + const device half4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(float4(input), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(float4(input), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(float4(input), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(float4(input), float4(weight_w)); + } + output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void conv_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *new_scale [[buffer(2)]], + const device half4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(float4(input[j]), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(float4(input[j]), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(float4(input[j]), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(float4(input[j]), float4(weight_w)); + } + } + output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void depthwise_conv_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half *weights [[buffer(1)]], + const device half4 *new_scale [[buffer(2)]], + const device half4 *new_biase [[buffer(3)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; + output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; + output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; + output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; + } + output = fmax(output * float4(new_scale[gid.z]) + float4(new_biase[gid.z]), 0.0); + outTexture.write(half4(output), gid.xy, gid.z); +} + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal index a17366cbe0e67b4cd38cafe4e09909f537d269bf..c8a2da5407742cb71cb16d118baa2b446bb30334 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal @@ -13,349 +13,9 @@ limitations under the License. */ #include +#include "Common.metal" using namespace metal; -struct MetalConvParam { - short offsetX; - short offsetY; - short offsetZ; - ushort strideX; - ushort strideY; - ushort dilationX; - ushort dilationY; -}; - -kernel void conv_add_batch_norm_relu_1x1_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - half4 output = half4(0.0); - - half4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - - output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); - outTexture.write(output, gid.xy, gid.z); -} - -kernel void conv_add_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - half4 output = half4(0.0); - - half4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); - for (int j = 0; j < 9; ++j) { - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } - } - output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); - outTexture.write(output, gid.xy, gid.z); -} - -kernel void conv_add_1x1_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half4 *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - half4 output = half4(0.0); - - half4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - output = output + biase[gid.z]; - outTexture.write(output, gid.xy, gid.z); -} - -kernel void depthwise_conv_add_batch_norm_relu_3x3_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device half *weights [[buffer(1)]], - const device half4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - half4 output = half4(0.0); - half4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - half4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } - output = half4(fmax((float4(output) + float4(biase[gid.z])) * new_scale[gid.z] + new_biase[gid.z], 0.0)); - outTexture.write(output, gid.xy, gid.z); -} - - -/*---------------------------------------------*/ - - - -kernel void conv_add_batch_norm_relu_1x1(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); -} - -kernel void conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); - for (int j = 0; j < 9; ++j) { - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } - } - output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); -} - -kernel void depthwise_conv_add_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - float4 output = float4(0.0); - float4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - float4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } - output = fmax((output + biase[gid.z]) * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); -} - // conv #pragma mark -- conv kernel void conv_3x3(texture2d_array inTexture [[texture(0)]], @@ -487,285 +147,4 @@ kernel void conv_1x1(texture2d_array inTexture [[texture( outTexture.write(output, gid.xy, gid.z); } -#pragma mark - convAdd -kernel void conv_add_1x1(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - output = output + biase[gid.z]; - outTexture.write(output, gid.xy, gid.z); -} - -kernel void conv_add_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - ushort dilation_x = param.dilationX; - ushort dilation_y = param.dilationY; - - float4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y - dilation_y), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y - dilation_y), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y + dilation_y), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y + dilation_y), i); - for (int j = 0; j < 9; ++j) { - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } - } - output = output + biase[gid.z]; - outTexture.write(output, gid.xy, gid.z); -} - -kernel void depthwise_conv_add_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float *weights [[buffer(1)]], - const device float4 *biase [[buffer(2)]], - const device float4 *new_scale [[buffer(3)]], - const device float4 *new_biase [[buffer(4)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - float4 output = float4(0.0); - float4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - float4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } - output = output + biase[gid.z]; - outTexture.write(output, gid.xy, gid.z); -} - -#pragma mark - conv bn relu -kernel void conv_batch_norm_relu_1x1(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *new_scale [[buffer(2)]], - const device float4 *new_biase [[buffer(3)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 1; - - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input; - for (uint i = 0; i < input_arr_size; ++i) { - input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; - output.x += dot(input, weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; - output.y += dot(input, weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; - output.z += dot(input, weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; - output.w += dot(input, weight_w); - } - output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); -} - -kernel void conv_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - const device float4 *new_scale [[buffer(2)]], - const device float4 *new_biase [[buffer(3)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - - ushort2 stride = ushort2(param.strideX, param.strideY); - const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint input_arr_size = inTexture.get_array_size(); - uint weithTo = gid.z * kernelHXW * input_arr_size * 4; - - float4 output = float4(0.0); - - float4 input[9]; - for (uint i = 0; i < input_arr_size; ++i) { - input[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); - input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); - input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); - input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); - input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); - input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); - input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); - input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); - input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); - for (int j = 0; j < 9; ++j) { - float4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.x += dot(input[j], weight_x); - - float4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.y += dot(input[j], weight_y); - - float4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.z += dot(input[j], weight_z); - - float4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; - output.w += dot(input[j], weight_w); - } - } - output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); -} - -kernel void depthwise_conv_batch_norm_relu_3x3(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvParam ¶m [[buffer(0)]], - const device float *weights [[buffer(1)]], - const device float4 *new_scale [[buffer(2)]], - const device float4 *new_biase [[buffer(3)]], - uint3 gid [[thread_position_in_grid]]) { - - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) { - return; - } - uint output_slice = gid.z; - ushort2 stride = ushort2(param.strideX, param.strideY); - ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); - constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); - const uint kernelHXW = 9; - uint weithTo = gid.z * kernelHXW * 4; - float4 output = float4(0.0); - float4 inputs[9]; - inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); - inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); - inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); - inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); - inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); - inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); - inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); - inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); - inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); - for (int j = 0; j < 9; ++j) { - float4 input = inputs[j]; - output.x += input.x * weights[weithTo + 0 * kernelHXW + j]; - output.y += input.y * weights[weithTo + 1 * kernelHXW + j]; - output.z += input.z * weights[weithTo + 2 * kernelHXW + j]; - output.w += input.w * weights[weithTo + 3 * kernelHXW + j]; - } - output = fmax(output * new_scale[gid.z] + new_biase[gid.z], 0.0); - outTexture.write(output, gid.xy, gid.z); -} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Kernels.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Kernels.metal index a45063b3c09873b7c41f7540d1d33df2b1559e54..8bce97f21efe159489e2bc65e598c3252b567b79 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Kernels.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Kernels.metal @@ -79,113 +79,6 @@ kernel void texture2d_to_2d_array_half(texture2d inTexture [ outTexture.write(input, gid.xy, 0); } -struct PoolParam { - int ksizeX; - int ksizeY; - int strideX; - int strideY; - int paddingX; - int paddingY; - int poolType; -}; - -kernel void pool(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant PoolParam &pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - int xmin = gid.x * pm.strideX - pm.paddingX; - int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width())); - xmin = max(xmin, 0); - int ymin = gid.y * pm.strideX - pm.paddingX; - int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height())); - ymin = max(ymin, 0); - - float4 r = 0; - if (pm.poolType == 0) { - r = inTexture.read(uint2(xmin, ymin), gid.z); - for (int x = xmin; x < xmax; x++) { - for (int y = ymin; y < ymax; y++) { - r = fmax(r, inTexture.read(uint2(x, y), gid.z)); - } - } - } else if (pm.poolType == 1) { - for (int x = xmin; x < xmax; x++) { - for (int y = ymin; y < ymax; y++) { - r += inTexture.read(uint2(x, y), gid.z); - } - } - r /= pm.ksizeX * pm.ksizeY; - } - outTexture.write(r, gid.xy, gid.z); -} - - -kernel void pool_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant PoolParam &pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - int xmin = gid.x * pm.strideX - pm.paddingX; - int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width())); - xmin = max(xmin, 0); - int ymin = gid.y * pm.strideX - pm.paddingX; - int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height())); - ymin = max(ymin, 0); - - half4 r = 0; - if (pm.poolType == 0) { - r = inTexture.read(uint2(xmin, ymin), gid.z); - for (int x = xmin; x < xmax; x++) { - for (int y = ymin; y < ymax; y++) { - r = fmax(r, inTexture.read(uint2(x, y), gid.z)); - } - } - } else if (pm.poolType == 1) { - for (int x = xmin; x < xmax; x++) { - for (int y = ymin; y < ymax; y++) { - r += inTexture.read(uint2(x, y), gid.z); - } - } - r /= pm.ksizeX * pm.ksizeY; - } - outTexture.write(r, gid.xy, gid.z); -} -struct TransposeParam { - int iC; - int oC; - int axis[4]; -}; -kernel void transpose(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant TransposeParam &pm [[buffer(0)]], - uint3 gid [[thread_position_in_grid]]) { - - if ((pm.axis[0] == 0) && (pm.axis[1] == 1) && (pm.axis[2] == 2) && (pm.axis[3] == 3)) { - // do nothing - float4 r = inTexture.read(gid.xy, gid.z); - outTexture.write(r, gid.xy, gid.z); - } else { - float4 r; - for (int n = 0; n < 4; n++) { - int ixyzn[] = {int(gid.x), int(gid.y), int(gid.z), n}; - int iabcd[4], oabcd[4], oxyzn[4]; - xyzn2abcd(pm.oC, ixyzn, iabcd); - oabcd[pm.axis[0]] = iabcd[0]; - oabcd[pm.axis[1]] = iabcd[1]; - oabcd[pm.axis[2]] = iabcd[2]; - oabcd[pm.axis[3]] = iabcd[3]; - abcd2xyzn(pm.iC, oabcd, oxyzn); - float4 rt = inTexture.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2]); - r[n] = rt[oxyzn[3]]; - } - outTexture.write(r, gid.xy, gid.z); - } -} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PoolKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PoolKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..1f2f7240db2ba716090001ed539bddb87dff5117 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PoolKernel.metal @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include "Common.metal" +using namespace metal; + +struct PoolParam { + int ksizeX; + int ksizeY; + int strideX; + int strideY; + int paddingX; + int paddingY; + int poolType; +}; + +kernel void pool(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant PoolParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + int xmin = gid.x * pm.strideX - pm.paddingX; + int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width())); + xmin = max(xmin, 0); + int ymin = gid.y * pm.strideX - pm.paddingX; + int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height())); + ymin = max(ymin, 0); + + float4 r = 0; + if (pm.poolType == 0) { + r = inTexture.read(uint2(xmin, ymin), gid.z); + for (int x = xmin; x < xmax; x++) { + for (int y = ymin; y < ymax; y++) { + r = fmax(r, inTexture.read(uint2(x, y), gid.z)); + } + } + } else if (pm.poolType == 1) { + for (int x = xmin; x < xmax; x++) { + for (int y = ymin; y < ymax; y++) { + r += inTexture.read(uint2(x, y), gid.z); + } + } + r /= pm.ksizeX * pm.ksizeY; + } + outTexture.write(r, gid.xy, gid.z); +} + +kernel void pool_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant PoolParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + int xmin = gid.x * pm.strideX - pm.paddingX; + int xmax = min(xmin + pm.ksizeX, int(inTexture.get_width())); + xmin = max(xmin, 0); + int ymin = gid.y * pm.strideX - pm.paddingX; + int ymax = min(ymin + pm.ksizeX, int(inTexture.get_height())); + ymin = max(ymin, 0); + + half4 r = 0; + if (pm.poolType == 0) { + r = inTexture.read(uint2(xmin, ymin), gid.z); + for (int x = xmin; x < xmax; x++) { + for (int y = ymin; y < ymax; y++) { + r = fmax(r, inTexture.read(uint2(x, y), gid.z)); + } + } + } else if (pm.poolType == 1) { + for (int x = xmin; x < xmax; x++) { + for (int y = ymin; y < ymax; y++) { + r += inTexture.read(uint2(x, y), gid.z); + } + } + r /= pm.ksizeX * pm.ksizeY; + } + outTexture.write(r, gid.xy, gid.z); +} diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal index 1c5b08ee7eeaaa4fd2a8b5064a6af66c77596120..bd14a146100d0a0723f73ab5fd1f95d1f8e39c97 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal @@ -15,8 +15,6 @@ #include using namespace metal; - - kernel void prelu_channel(texture2d_array inTexture [[texture(0)]], texture2d_array outTexture [[texture(1)]], const device float4 *alpha [[buffer(0)]], @@ -82,3 +80,4 @@ kernel void prelu_other(texture2d_array inTexture [[textu output.w = input.w > 0 ? input.w : (alpha_value * input.w); outTexture.write(output, gid.xy, gid.z); } + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal index 6083c6b514a3d8a0918d585a950d915e69a045fe..4107f0e30831bdb4c7ac3c020c0dc8e3a52a94cb 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PriorBoxKernel.metal @@ -35,8 +35,8 @@ struct PriorBoxMetalParam { kernel void prior_box(texture2d_array inTexture [[texture(0)]], texture2d_array outBoxTexture [[texture(1)]], texture2d_array varianceTexture [[texture(2)]], - constant PriorBoxMetalParam ¶m [[buffer(0)]], - const device float *aspect_ratios [[buffer(1)]], + const device float *aspect_ratios [[buffer(0)]], + constant PriorBoxMetalParam ¶m [[buffer(1)]], const device float4 *variances [[buffer(2)]], uint3 gid [[thread_position_in_grid]]) { if (gid.x >= outBoxTexture.get_width() || @@ -96,3 +96,68 @@ kernel void prior_box(texture2d_array inTexture [[texture(0 } } + +kernel void prior_box_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outBoxTexture [[texture(1)]], + texture2d_array varianceTexture [[texture(2)]], + constant PriorBoxMetalParam ¶m [[buffer(0)]], + const device half *aspect_ratios [[buffer(1)]], + const device float4 *variances [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outBoxTexture.get_width() || + gid.y >= outBoxTexture.get_height() || + gid.z >= outBoxTexture.get_array_size()) return; + + float center_x = (gid.x + param.offset) * param.stepWidth; + float center_y = (gid.y + param.offset) * param.stepHeight; + + float box_width, box_height; + + if (gid.z < param.aspecRatiosSize) { + half ar = aspect_ratios[gid.z]; + box_width = param.minSize * sqrt(ar) / 2; + box_height = param.minSize / sqrt(ar) / 2; + float4 box; + box.x = (center_x - box_width) / param.imageWidth; + box.y = (center_y - box_height) / param.imageHeight; + box.z = (center_x + box_width) / param.imageWidth; + box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = fmin(fmax(box, 0.0), 1.0); + } else { + res = box; + } + + outBoxTexture.write(half4(res), gid.xy, gid.z); + } else if (gid.z >= param.aspecRatiosSize) { + if (param.maxSizeSize > 0) { + box_width = box_height = sqrt(param.minSize * param.maxSize) / 2; + float4 max_box; + max_box.x = (center_x - box_width) / param.imageWidth; + max_box.y = (center_y - box_height) / param.imageHeight; + max_box.z = (center_x + box_width) / param.imageWidth; + max_box.w = (center_y + box_height) / param.imageHeight; + + float4 res; + if (param.clip) { + res = min(max(max_box, 0.0), 1.0); + } else { + res = max_box; + } + outBoxTexture.write(half4(max_box), gid.xy, gid.z); + } + } + + float4 variance = variances[0]; + if (gid.z < param.numPriors) { + float4 variances_output; + variances_output.x = variance.x; + variances_output.y = variance.y; + variances_output.z = variance.z; + variances_output.w = variance.w; + varianceTexture.write(half4(variances_output), gid.xy, gid.z); + } +} + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.metal index 533f30156d5c1820bb14051d587e9c576c85ea72..399287da71feb11b4e19167ced4f7fe4acdbf42a 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ReshapeKernel.metal @@ -102,14 +102,36 @@ kernel void reshape(texture2d_array inTexture [[texture(0)] outTexture.write(r, gid.xy, gid.z); } -kernel void reshape_half(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - uint3 gid [[thread_position_in_grid]]) { - if (gid.x >= outTexture.get_width() || - gid.y >= outTexture.get_height() || - gid.z >= outTexture.get_array_size()) return; - half4 r = inTexture.read(uint2(0, 0), gid.x); - outTexture.write(r, gid.xy, gid.z); +kernel void reshape_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant ReshapeParam &rp [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + + int oxyzn[4] = {int(gid.x), int(gid.y), int(gid.z), 0}, oabcd[4], ixyzn[4], iabcd[4]; + ReshapeParam lrp = rp; + int oC = lrp.odim[lrp.otrans[3]]; + int iC = lrp.idim[lrp.itrans[3]]; + int count = lrp.odim[0] * lrp.odim[1] * lrp.odim[2] * lrp.odim[3]; + half4 r; + for (int n = 0; n < 4; n++) { + oxyzn[3] = n; + xyzn2abcd(oC, oxyzn, oabcd); + int tabcd[4]; + invtrans(lrp.otrans, oabcd, tabcd); + int index = abcd2index(lrp.odim, tabcd); + if (index < count) { + index2abcd(lrp.idim, index, tabcd); + trans(lrp.itrans, tabcd, iabcd); + abcd2xyzn(iC, iabcd, ixyzn); + r[n] = inTexture.read(uint2(ixyzn[0], ixyzn[1]), ixyzn[2])[ixyzn[3]]; + } else { + r[n] = 0; + } + } + outTexture.write(r, gid.xy, gid.z); } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.metal index ce70c9e6652f5e0be73bebba2f55877837b0b4a7..3442ba17ceee08d68b1f84642e00c56c5f73a4a2 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Softmax.metal @@ -57,25 +57,44 @@ kernel void softmax(texture2d_array inTexture [[texture(0)] rr = exp(rr - maxv) / sum; outTexture.write(rr, gid.xy, gid.z); } -// -//kernel void softmax_half(texture2d_array inTexture [[texture(0)]], -// texture2d_array outTexture [[texture(1)]], -// uint3 gid [[thread_position_in_grid]]) { -// if (gid.x >= outTexture.get_width() || -// gid.y >= outTexture.get_height() || -// gid.z >= outTexture.get_array_size()) return; -// int zsize = inTexture.get_array_size(); -// half maxv = inTexture.read(uint2(0, 0), 0)[0]; -// for (int z = 0; z < zsize; z++) { -// half4 r = inTexture.read(uint2(0, 0), z); -// maxv = max(maxv, max(max(r[0], r[1]), max(r[2], r[3]))); -// } -// float sum = 0; -// for (int z = 0; z < zsize; z++) { -// half4 r = inTexture.read(uint2(0, 0), z); -// sum += exp(r[0] - maxv) + exp(r[1] - maxv) + exp(r[2] - maxv) + exp(r[3] - maxv); -// } -// half4 rr = inTexture.read(gid.xy, gid.z); -// rr = exp(rr - maxv) / sum; -// outTexture.write(rr, gid.xy, gid.z); -//} + +kernel void softmax_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant SoftmaxParam &sp [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + // int zsize = inTexture.get_array_size(); + half maxv = inTexture.read(gid.xy, 0)[0]; + int group = sp.K / 4; + int remain = sp.K % 4; + for (int z = 0; z < group; z++) { + half4 r = inTexture.read(gid.xy, z); + maxv = max(maxv, max(r[0], max(r[1], max(r[2], r[3])))); + } + if (remain > 0) { + half4 r = inTexture.read(gid.xy, group); + for (int i = 0; i < remain; i++) { + maxv = max(maxv, r[i]); + } + } + float4 rsum = {0, 0, 0, 0}; + for (int z = 0; z < group; z++) { + half4 r = inTexture.read(gid.xy, z); + rsum += exp(float4(r) - float4(maxv)); + } + + float sum = rsum[0] + rsum[1] + rsum[2] + rsum[3]; + if (remain > 0) { + half4 r = inTexture.read(gid.xy, group); + for (int i = 0; i < remain; i++) { + sum += exp(float(r[i]) - float(maxv)); + } + } + + half4 rr = inTexture.read(gid.xy, gid.z); + rr = half4(exp(float4(rr) - float(maxv)) / sum); + outTexture.write(rr, gid.xy, gid.z); +} + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.metal new file mode 100644 index 0000000000000000000000000000000000000000..4801f0315496ea65ff3ff51174e4e8086909aae8 --- /dev/null +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/TransposeKernel.metal @@ -0,0 +1,80 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. */ + +#include +#include "Common.metal" +using namespace metal; + +struct TransposeParam { + int iC; + int oC; + int axis[4]; +}; + +kernel void transpose(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant TransposeParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + + if ((pm.axis[0] == 0) && (pm.axis[1] == 1) && (pm.axis[2] == 2) && (pm.axis[3] == 3)) { + // do nothing + float4 r = inTexture.read(gid.xy, gid.z); + outTexture.write(r, gid.xy, gid.z); + } else { + float4 r; + for (int n = 0; n < 4; n++) { + int ixyzn[] = {int(gid.x), int(gid.y), int(gid.z), n}; + int iabcd[4], oabcd[4], oxyzn[4]; + xyzn2abcd(pm.oC, ixyzn, iabcd); + oabcd[pm.axis[0]] = iabcd[0]; + oabcd[pm.axis[1]] = iabcd[1]; + oabcd[pm.axis[2]] = iabcd[2]; + oabcd[pm.axis[3]] = iabcd[3]; + abcd2xyzn(pm.iC, oabcd, oxyzn); + float4 rt = inTexture.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2]); + r[n] = rt[oxyzn[3]]; + } + outTexture.write(r, gid.xy, gid.z); + } +} + +kernel void transpose_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant TransposeParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + + + if ((pm.axis[0] == 0) && (pm.axis[1] == 1) && (pm.axis[2] == 2) && (pm.axis[3] == 3)) { + // do nothing + half4 r = inTexture.read(gid.xy, gid.z); + outTexture.write(r, gid.xy, gid.z); + } else { + half4 r; + for (int n = 0; n < 4; n++) { + int ixyzn[] = {int(gid.x), int(gid.y), int(gid.z), n}; + int iabcd[4], oabcd[4], oxyzn[4]; + xyzn2abcd(pm.oC, ixyzn, iabcd); + oabcd[pm.axis[0]] = iabcd[0]; + oabcd[pm.axis[1]] = iabcd[1]; + oabcd[pm.axis[2]] = iabcd[2]; + oabcd[pm.axis[3]] = iabcd[3]; + abcd2xyzn(pm.iC, oabcd, oxyzn); + half4 rt = inTexture.read(uint2(oxyzn[0], oxyzn[1]), oxyzn[2]); + r[n] = rt[oxyzn[3]]; + } + outTexture.write(r, gid.xy, gid.z); + } +} + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift index 6f42f2aa9f8d0515946ace625ed16c5040fd3099..d3d31cfcd43a81f6068238d45f3442af0fc7795f 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift @@ -60,7 +60,7 @@ class PoolOp: Operator, PoolParam

>, Runable, func delogOutput() { print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) // print("pool2d delog") diff --git a/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift index 10b5816d7b4528572cdc6b84d53b73499dde93b4..c7e049e3c1b21d9747acca8812abfff8c25d6d98 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift @@ -51,13 +51,13 @@ class PreluOp: Operator, PreluParam

>, Runabl func delogOutput() { print(" \(type) input: ") - print(para.input.metalTexture.toTensor(dim: (n: para.input.originDim[0], c: para.input.originDim[1], h: para.input.originDim[2], w: para.input.originDim[3])).strideArray()) + print(para.input.metalTexture.toTensor(dim: (n: para.input.originDim[0], c: para.input.originDim[1], h: para.input.originDim[2], w: para.input.originDim[3]), texturePrecision: computePrecision).strideArray()) print(" \(type) Alpha: ") let _: Float32? = para.alpha.buffer.logDesc(header: " alpha: ", stridable: false) print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) } // print("softmax delog") diff --git a/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift index 7e82fdec37fb7bd66181fde3af01aedbaf87a023..6999043ab75532e6517398c5bbfe6b893b49cf57 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/PriorBoxOp.swift @@ -39,7 +39,7 @@ class PriorBoxParam: OpParam { let minSizes: [Float32] let maxSizes: [Float32] let aspectRatios: [Float32] - var newAspectRatios: [Float32]? + var newAspectRatios: MTLBuffer? let variances: [Float32] let flip: Bool let clip: Bool @@ -69,15 +69,19 @@ class PriorBoxOp: Operator, PriorBoxParam

} func delogOutput() { - print(" \(type) output: ") // output - let outputArray = para.output.metalTexture.floatArray { (o: Float32) -> Float32 in - return o + print(" \(type) output: ") + let originDim = para.output.originDim + if para.output.transpose == [0, 1, 2, 3] { + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) + print(outputArray.strideArray()) + } else if para.output.transpose == [0, 2, 3, 1] { + print(para.output.metalTexture.toTensor(dim: (n: originDim[0], c: originDim[1], h: originDim[2], w: originDim[3]), texturePrecision: computePrecision).strideArray()) + } else { + print(" not implement") } - print(outputArray) - // writeToLibrary(fileName: "box_out", array: outputArray) // output variance diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift index c9f054c88af44ac3f5dd453b4696c7988d01fa8f..0325f860e078cf639c08e279970a105e3f562a32 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift @@ -46,7 +46,7 @@ class ReluOp: Operator, ReluParam

>, Runable, func delogOutput() { print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift index 373c448b16d3a597f28884ee2e70b29c152f5526..451b064ce19e0e1cb70700d046b6ab059e6df9e3 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift @@ -76,7 +76,7 @@ class ReshapeOp: Operator, ReshapeParam

>, let originDim = para.output.originDim - let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3])) + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) print(outputArray.strideArray()) } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift index 0eddca3eb9ce21982954c28e5484f94eb5cacbe1..af776450d7f15d031a6af546d13bc1c0374249b7 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift @@ -52,9 +52,9 @@ class SoftmaxOp: Operator, SoftmaxParam

>, func delogOutput() { print("softmax delog") - + let originDim = para.output.originDim - let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3])) + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) print(outputArray.strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift index 3f05a3db3e0de7abaa208ba8b0700688fe349d4e..dc658793a34f96fb0b1fa88b122a1c8639d50058 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/TransposeOp.swift @@ -49,7 +49,7 @@ class TransposeOp: Operator, TransposeParam func delogOutput() { print(" \(type) output: ") let originDim = para.output.tensorDim - let outputArray = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3])) + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) print(outputArray.strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift index e7953e83490e9f3d2bca8b035193dcb0406feaa7..a318180b2b57b162715f0088fdfd88767506ad2a 100644 --- a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift +++ b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift @@ -12,7 +12,6 @@ See the License for the specific language governing permissions and limitations under the License. */ -import Accelerate import Foundation protocol Tensorial: CustomStringConvertible, CustomDebugStringConvertible{ @@ -27,10 +26,11 @@ extension Tensorial { } } +public enum ComputePrecision { + case Float32, Float16 +} + class Tensor: Tensorial { - enum BufferPrecision { - case Float32, Float16 - } var data: Data var dim: Dim @@ -93,15 +93,9 @@ class Tensor: Tensorial { layout = to } - func float32ToFloat16(input: UnsafeMutablePointer, output: UnsafeMutableRawPointer, count: Int) { - var float32Buffer = vImage_Buffer(data: input, height: 1, width: UInt(count), rowBytes: count * 4) - var float16buffer = vImage_Buffer(data: output, height: 1, width: UInt(count), rowBytes: count * 2) - guard vImageConvert_PlanarFtoPlanar16F(&float32Buffer, &float16buffer, 0) == kvImageNoError else { - fatalError(" float 32 to float 16 error ! ") - } - } + - func initBuffer(device: MTLDevice, precision: BufferPrecision = .Float32) { + func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16) { guard let floatPointer = data.pointer as? UnsafeMutablePointer else { fatalError(" not support yet ") } diff --git a/metal/paddle-mobile/paddle-mobile/framework/Texture.swift b/metal/paddle-mobile/paddle-mobile/framework/Texture.swift index 9889171dba3cb8f18b20cac597a6fec505a92e73..42a381c50cd3a66f39d9e19e9cffc8de83ca48b2 100644 --- a/metal/paddle-mobile/paddle-mobile/framework/Texture.swift +++ b/metal/paddle-mobile/paddle-mobile/framework/Texture.swift @@ -46,7 +46,7 @@ public class Texture: Tensorial { public var metalTexture: MTLTexture! var transpose: [Int] = [0, 1, 2, 3] - func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3]) { + func initTexture(device: MTLDevice, inTranspose: [Int] = [0, 1, 2, 3], computePrecision: ComputePrecision = .Float16) { transpose = inTranspose let newDim = transpose.map { originDim[$0] } @@ -64,12 +64,10 @@ public class Texture: Tensorial { tmpTextureDes.depth = 1 tmpTextureDes.arrayLength = ((newDim[0]) * (newDim[3]) + 3) / 4 tmpTextureDes.textureType = .type2DArray - - if MemoryLayout

.size == 1 { - tmpTextureDes.pixelFormat = .rgba8Unorm - } else if MemoryLayout

.size == 2 { + + if computePrecision == .Float16 { tmpTextureDes.pixelFormat = .rgba16Float - } else if MemoryLayout

.size == 4 { + } else if computePrecision == .Float32 { tmpTextureDes.pixelFormat = .rgba32Float }