diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift index 640f51dafac5b684eda9cf002395ef3fe15e5d5f..e48b5a7906272d9a886652ed112602d52d628b30 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/Net/MobileNetSSD.swift @@ -68,6 +68,7 @@ class MobileNet_ssd_hand: Net{ let output: [Float32] = result.map { $0.floatValue } + return output } diff --git a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift index 5032ef7d68f406b9adfa2baa04f6e5e754c55ff4..8b3f1efc9d1df83c11f96a1d4b0795183d64a4d5 100644 --- a/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift +++ b/metal/paddle-mobile-demo/paddle-mobile-demo/ViewController.swift @@ -19,17 +19,17 @@ import MetalPerformanceShaders let threadSupport = [1] -let modelHelperMap: [SupportModel : Net] = [.mobilenet_ssd : MobileNet_ssd_hand.init(), .genet : Genet.init()] +let modelHelperMap: [SupportModel : Net] = [ .mobilenet : MobileNet.init(), .mobilenet_ssd : MobileNet_ssd_hand.init(), .genet : Genet.init()] //, .genet : Genet.init() //let modelHelperMap: [SupportModel : Net] = [.mobilenet : MobileNet.init(), .mobilenet_ssd : MobileNet_ssd_hand.init()] enum SupportModel: String{ - // case mobilenet = "mobilenet" + case mobilenet = "mobilenet" case mobilenet_ssd = "mobilenetssd" case genet = "genet" static func supportedModels() -> [SupportModel] { - //.mobilenet, - return [.mobilenet_ssd ,.genet] + // + return [.mobilenet, .mobilenet_ssd ,.genet] } } diff --git a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj index 55dfe91ce0a700ff7c47cdff55b461908f1657ad..b2c24db200e1440ba4830d78a37100b72983d3e4 100644 --- a/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj +++ 
b/metal/paddle-mobile/paddle-mobile.xcodeproj/project.pbxproj @@ -41,7 +41,6 @@ FC0E2DBE20EE460D009C1FAC /* BatchNormKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */; }; FC0E2DC020EE461F009C1FAC /* ElementwiseAddKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */; }; FC1B16B320EC9A4F00678B91 /* Kernels.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC1B16B220EC9A4F00678B91 /* Kernels.metal */; }; - FC1B186620ECF1C600678B91 /* ResizeKernel.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC1B186520ECF1C600678B91 /* ResizeKernel.swift */; }; FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */; }; FC4CB74920F0B954007C0C6D /* ConvKernel.metal in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74820F0B954007C0C6D /* ConvKernel.metal */; }; FC4CB74B20F12C30007C0C6D /* ProgramOptimize.swift in Sources */ = {isa = PBXBuildFile; fileRef = FC4CB74A20F12C30007C0C6D /* ProgramOptimize.swift */; }; @@ -133,7 +132,6 @@ FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = BatchNormKernel.swift; sourceTree = ""; }; FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ElementwiseAddKernel.swift; sourceTree = ""; }; FC1B16B220EC9A4F00678B91 /* Kernels.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = Kernels.metal; sourceTree = ""; }; - FC1B186520ECF1C600678B91 /* ResizeKernel.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = ResizeKernel.swift; sourceTree = ""; }; FC27990D21341016000B6BAD /* BoxCoder.metal */ = {isa = PBXFileReference; fileEncoding = 4; lastKnownFileType = sourcecode.metal; path = BoxCoder.metal; sourceTree = ""; 
}; FC3602CB2108819F00FACB58 /* PaddleMobileUnitTest.swift */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.swift; path = PaddleMobileUnitTest.swift; sourceTree = ""; }; FC4CB74820F0B954007C0C6D /* ConvKernel.metal */ = {isa = PBXFileReference; lastKnownFileType = sourcecode.metal; path = ConvKernel.metal; sourceTree = ""; }; @@ -326,7 +324,6 @@ FCEB6837212F00B100D2448E /* metal */, FCDDC6C7212FA3CA00E5EF74 /* ConvTransposeKernel.swift */, FC0E2DBB20EE45FE009C1FAC /* ConvKernel.swift */, - FC1B186520ECF1C600678B91 /* ResizeKernel.swift */, FC0E2DB920EE3B8D009C1FAC /* ReluKernel.swift */, FC0E2DBD20EE460D009C1FAC /* BatchNormKernel.swift */, FC0E2DBF20EE461F009C1FAC /* ElementwiseAddKernel.swift */, @@ -506,7 +503,6 @@ FC039BBB20E11CC20081E9F8 /* ProgramDesc.swift in Sources */, FC9D037920E229E4000F735A /* OpParam.swift in Sources */, FC3602CC2108819F00FACB58 /* PaddleMobileUnitTest.swift in Sources */, - FC1B186620ECF1C600678B91 /* ResizeKernel.swift in Sources */, FCF2D73820E64E70007AC5F5 /* Kernel.swift in Sources */, FCDDC6CC212FDFDB00E5EF74 /* ReluKernel.metal in Sources */, FC0226562138F33800F395E2 /* TransposeKernel.metal in Sources */, diff --git a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift index 2b7636ce00ba2ec56aabd7e732039973ee8efdb3..95ba4b0c596e75e4735f4167a50a5f43c2543bcf 100644 --- a/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift +++ b/metal/paddle-mobile/paddle-mobile/Common/MetalExtension.swift @@ -354,7 +354,7 @@ public extension MTLTexture { } // n c h w - dim - func toTensor(dim: (n: Int, c: Int, h: Int, w: Int), texturePrecision: ComputePrecision = .Float16) -> [Float32] { + func toTensor(dim: (n: Int, c: Int, h: Int, w: Int)) -> [Float32] { // print("origin dim: \(dim)") print("texture: ") print(self) @@ -392,7 +392,7 @@ public extension MTLTexture { return output } - func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int), texturePrecision: 
ComputePrecision = .Float16) -> [Float32] { + func realNHWC(dim: (n: Int, h: Int, w: Int, c: Int)) -> [Float32] { // print("origin dim: \(dim)") // print("texture: ") // print(self) diff --git a/metal/paddle-mobile/paddle-mobile/Executor.swift b/metal/paddle-mobile/paddle-mobile/Executor.swift index 757c13404a80b695946d0daa6121b67d0be46d0e..d28525b187e5c61bcf709defd338ba036b3ea111 100644 --- a/metal/paddle-mobile/paddle-mobile/Executor.swift +++ b/metal/paddle-mobile/paddle-mobile/Executor.swift @@ -14,7 +14,7 @@ import Foundation -let testTo = 61 +let testTo = 161 var isTest = false @@ -128,18 +128,18 @@ public class Executor { // print(stridableInput) // let _: Flo? = input.logDesc(header: "input: ", stridable: true) - for i in 0.. if except > 0 { resultHolder = ResultHolder

.init(inDim: [], inResult: [], inElapsedTime: afterDate.timeIntervalSince(beforeDate), inIntermediateResults: outputTextures) diff --git a/metal/paddle-mobile/paddle-mobile/Loader.swift b/metal/paddle-mobile/paddle-mobile/Loader.swift index 4b5f91f9c771cd5d9031a548cdc1803fc417bbcc..d9df4cf24c8105642b15bf60d8af6112aefc6f98 100644 --- a/metal/paddle-mobile/paddle-mobile/Loader.swift +++ b/metal/paddle-mobile/paddle-mobile/Loader.swift @@ -159,7 +159,7 @@ public class Loader { } catch let error { throw error } - tensor.convert(to: DataLayout.NHWC()) +// tensor.convert(to: DataLayout.NHWC()) // tensor.initBuffer(device: device) scope[varDesc.name] = tensor } else { diff --git a/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift index 193a271ccfc3ea3a68429f227394552c9f609f6f..e36a1cbb4f19602ebbe05ad572e543e9bcae5d14 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/BoxcoderOp.swift @@ -75,12 +75,12 @@ class BoxcoderOp: Operator, BoxcoderParam

// print(targetBoxArray.strideArray()) let targetBoxOriginDim = para.targetBox.originDim - let targetBoxArray = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxOriginDim[0], h: targetBoxOriginDim[1], w: targetBoxOriginDim[2], c: targetBoxOriginDim[3]), texturePrecision: computePrecision) + let targetBoxArray = para.targetBox.metalTexture.realNHWC(dim: (n: targetBoxOriginDim[0], h: targetBoxOriginDim[1], w: targetBoxOriginDim[2], c: targetBoxOriginDim[3])) print(" target box ") print(targetBoxArray.strideArray()) let originDim = para.output.originDim - let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3])) print(" output ") print(outputArray.strideArray()) } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift index 67e564dddd0aadcb059767380e7946040e8cd284..42e7903937e3cff9b3bdac94b60e36edc8c6dfaa 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConcatOp.swift @@ -67,10 +67,10 @@ class ConcatOp: Operator, ConcatParam

>, Run print(" \(type) output: ") let originDim = para.output.originDim if para.output.transpose == [0, 1, 2, 3] { - let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3])) print(outputArray.strideArray()) } else if para.output.transpose == [0, 2, 3, 1] { - print(para.output.metalTexture.toTensor(dim: (n: originDim[0], c: originDim[1], h: originDim[2], w: originDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: originDim[0], c: originDim[1], h: originDim[2], w: originDim[3])).strideArray()) } else { fatalError(" not implemet") } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift index b4220c87a648299bb212f910832c6d9aadebe2d5..d15ec685a545c298cb1f25deab1cdf2122378fde 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvAddOp.swift @@ -94,13 +94,15 @@ class ConvAddOp: Operator, ConvAddParam

>, func delogOutput() { + + print("op \(type): ") + print(" \(type) output: ") print(" padding: ") print(para.paddings) print("stride: ") print(para.stride) print("dilations: ") print(para.dilations) - print(" \(type) output: ") print(" para input dim: ") print(para.input.dim) print(" para filter dim: ") @@ -111,6 +113,14 @@ class ConvAddOp: Operator, ConvAddParam

>, let biase: [Float32] = para.y.buffer.array() print(biase) - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) + print(" - filter - ") + let array: [Float32] = para.filter.buffer.array() + print(array) + + print(" - y - ") + let yArray: [Float32] = para.y.buffer.array() + print(yArray) + + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift index be8c57d3ace01dabd652e0e80a43c5a053213e28..3c521a2210614550577369c603dbbdc5e2cb6692 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvBNReluOp.swift @@ -110,7 +110,7 @@ class ConvBNReluOp: Operator, ConvBNReluPa func delogOutput() { print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift index 387fa420b68f8004a12af85ca398cf306f41a5c6..e149e0ac15842fb79471d0fe7f5eb6a25f8fccfd 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ConvTransposeOp.swift @@ -43,13 +43,15 @@ class ConvTransposeOp: Operator, ConvTr } func delogOutput() { + 
print(" \(type) output: ") let originDim = para.output.originDim if para.output.transpose == [0, 1, 2, 3] { - let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3])) print(outputArray.strideArray()) } else if para.output.transpose == [0, 2, 3, 1] { - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) + let output = para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])) + print(output.strideArray()) } else { print(" not implement") } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift index 36f477bc1cb48007b5b28bf27a7424940918025b..639c22ce12c7a110cf58f3f9e7b9ee458d393260 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/DepthwiseConvOp.swift @@ -58,6 +58,6 @@ class DepthConvOp: Operator, ConvParam

>, Runa func delogOutput() { print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift index 0ea8a62c5c0bf30da200add2a96410136d2f40fb..16a42d5c7b24e7b3a26cab35f68decd226076876 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/DwConvBNReluOp.swift @@ -65,6 +65,6 @@ class DwConvBNReluOp: Operator, ConvBNRelu func delogOutput() { print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift index 0f96b204d59f3d4a0dd0fae20340811855421c95..889fef56c58ca57c795fa42a9c25cde3090a1b14 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ElementwiseAddOp.swift @@ -71,12 +71,15 @@ class ElementwiseAddOp: Operator, Elem // print(para.inputY.metalTexture.toTensor(dim: (n: para.inputY.tensorDim[0], c: para.inputY.tensorDim[1], h: para.inputY.tensorDim[2], w: para.inputY.tensorDim[3])).strideArray()) print(" \(type) 
output: ") + + print(para.inputY) + let originDim = para.output.originDim if para.output.transpose == [0, 1, 2, 3] { - let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3])) print(outputArray.strideArray()) } else if para.output.transpose == [0, 2, 3, 1] { - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) } else { print(" not implement") } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift index b6075a807d1637c554587ea99724d6ff1f38e7e6..93560582166c53b6de308e53b1cda431151ad741 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/FeedOp.swift @@ -61,7 +61,7 @@ class FeedOp: Operator, FeedParam< func delogOutput() { print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift index 
bae452dec331957ceda5a6f503802352f63a6dbe..bf0a2959fa9627f5cbe994dcd183f9ed7f04456c 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/BatchNormKernel.swift @@ -15,53 +15,60 @@ import Foundation class BatchNormKernel: Kernel, Computable { - var newScale: MTLBuffer - var newBias: MTLBuffer + var newScale: MTLBuffer + var newBias: MTLBuffer + + required init(device: MTLDevice, param: BatchNormParam

) { + guard let newScale = device.makeBuffer(length: param.inputScale.buffer.length) else { + fatalError() + } + + guard let newBias = device.makeBuffer(length: param.inputBias.buffer.length) else { + fatalError() + } + self.newScale = newScale + self.newBias = newBias + + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "batchnorm") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "batchnorm_half") + } else { + fatalError() + } + + let varianceBuffer : MTLBuffer = param.inputVariance.buffer - required init(device: MTLDevice, param: BatchNormParam

) { - guard let newScale = device.makeBuffer(length: param.inputScale.buffer.length) else { - fatalError() - } - guard let newBias = device.makeBuffer(length: param.inputBias.buffer.length) else { - fatalError() - } - self.newScale = newScale - self.newBias = newBias - - super.init(device: device, inFunctionName: "batchnorm") - - let varianceBuffer : MTLBuffer = param.inputVariance.buffer - - var invStd: [Float32] = Array(repeating: 0, count: varianceBuffer.length) - let varianceContents = varianceBuffer.contents().assumingMemoryBound(to: P.self) - for i in 0..<(varianceBuffer.length / MemoryLayout

.stride) { - invStd[i] = 1 / (Float32(varianceContents[i]) + param.epsilon).squareRoot() - } - - let newScaleContents = newScale.contents().assumingMemoryBound(to: P.self) - let newBiasContents = newBias.contents().assumingMemoryBound(to: P.self) - let scale : MTLBuffer = param.inputScale.buffer - let scaleContents = scale.contents().assumingMemoryBound(to: P.self) - let bias : MTLBuffer = param.inputBias.buffer - let biasContents = bias.contents().assumingMemoryBound(to: P.self) - let meanContents = param.inputMean.buffer.contents().assumingMemoryBound(to: P.self) - - for i in 0..<(newScale.length / MemoryLayout

.stride) { - newScaleContents[i] = P(invStd[i] * Float32(scaleContents[i])) - newBiasContents[i] = P(Float32(biasContents[i]) - Float32(meanContents[i]) * invStd[i] * Float32(scaleContents[i])) - } + var invStd: [Float32] = Array(repeating: 0, count: varianceBuffer.length) + let varianceContents = varianceBuffer.contents().assumingMemoryBound(to: P.self) + for i in 0..<(varianceBuffer.length / MemoryLayout

.stride) { + invStd[i] = 1 / (Float32(varianceContents[i]) + param.epsilon).squareRoot() } - func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encoder is nil") - } - print("BatchNorm compute") - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.setBuffer(newScale, offset: 0, index: 0) - encoder.setBuffer(newBias, offset: 0, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() + let newScaleContents = newScale.contents().assumingMemoryBound(to: P.self) + let newBiasContents = newBias.contents().assumingMemoryBound(to: P.self) + let scale : MTLBuffer = param.inputScale.buffer + let scaleContents = scale.contents().assumingMemoryBound(to: P.self) + let bias : MTLBuffer = param.inputBias.buffer + let biasContents = bias.contents().assumingMemoryBound(to: P.self) + let meanContents = param.inputMean.buffer.contents().assumingMemoryBound(to: P.self) + + for i in 0..<(newScale.length / MemoryLayout

.stride) { + newScaleContents[i] = P(invStd[i] * Float32(scaleContents[i])) + newBiasContents[i] = P(Float32(biasContents[i]) - Float32(meanContents[i]) * invStd[i] * Float32(scaleContents[i])) + } + } + + func compute(commandBuffer: MTLCommandBuffer, param: BatchNormParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encoder is nil") } + print("BatchNorm compute") + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.setBuffer(newScale, offset: 0, index: 0) + encoder.setBuffer(newBias, offset: 0, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift index bcdcc732b08e5f578bf1b35eb754cf221217144c..909ff47e819e8cc4d1c37d6e8499a86fd9a5b03c 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddBatchNormReluKernel.swift @@ -49,26 +49,39 @@ class ConvAddBatchNormReluKernel: Kernel, Computable, Testable var metalParam: MetalConvParam! required init(device: MTLDevice, param: ConvAddBatchNormReluParam

) { - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) - - if param.filter.width == 1 && param.filter.height == 1 { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1") - } else if param.filter.channel == 1 { - super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3") - } else { - super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3") - } - param.filter.initBuffer(device: device, precision: computePrecision) - param.y.initBuffer(device: device, precision: computePrecision) - param.variance.initBuffer(device: device, precision: .Float32) param.mean.initBuffer(device: device, precision: .Float32) param.scale.initBuffer(device: device, precision: .Float32) param.bias.initBuffer(device: device, precision: .Float32) + if computePrecision == .Float32 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3") + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3") + } else { + fatalError(" unsupport ") + } + } else if computePrecision == .Float16 { + if param.filter.width == 1 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_1x1_half") + } else if param.filter.channel == 1 { + super.init(device: device, inFunctionName: "depthwise_conv_add_batch_norm_relu_3x3_half") + } else if param.filter.width == 3 && param.filter.height == 3 { + super.init(device: device, inFunctionName: "conv_add_batch_norm_relu_3x3_half") + } else { + fatalError(" unsupport ") + } + } else { + fatalError() + } + + let offsetX = param.filter.width/2 - Int(param.paddings[0]) let offsetY = param.filter.height/2 - Int(param.paddings[1]) diff 
--git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift index f2ad60fcfc8ec77e40a21a130026fcb8fa290621..d2b54d9be60694871b4d4e3ba4bd3852bc94e875 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvAddKernel.swift @@ -17,14 +17,23 @@ import Foundation class ConvAddKernel: Kernel, Computable { var metalParam: MetalConvParam! required init(device: MTLDevice, param: ConvAddParam

) { - + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) + param.filter.initBuffer(device: device, precision: computePrecision) + param.y.initBuffer(device: device, precision: computePrecision) + if computePrecision == .Float16 { if param.filter.width == 1 && param.filter.height == 1 { super.init(device: device, inFunctionName: "conv_add_1x1_half") } else if param.filter.channel == 1 { super.init(device: device, inFunctionName: "depthwise_conv_add_3x3_half") - } else { + } else if param.filter.width == 3 && param.filter.height == 3 { super.init(device: device, inFunctionName: "conv_add_3x3_half") + } else if param.filter.width == 1 && param.filter.height == 5 { + super.init(device: device, inFunctionName: "conv_add_5x1_half") + } else if param.filter.width == 5 && param.filter.height == 1 { + super.init(device: device, inFunctionName: "conv_add_1x5_half") + } else { + fatalError(" unsupport yet ") } } else if computePrecision == .Float32 { if param.filter.width == 1 && param.filter.height == 1 { @@ -35,22 +44,21 @@ class ConvAddKernel: Kernel, Computable { super.init(device: device, inFunctionName: "conv_add_5x1") } else if param.filter.width == 5 && param.filter.height == 1 { super.init(device: device, inFunctionName: "conv_add_1x5") - } else { + } else if param.filter.width == 3 && param.filter.height == 3 { super.init(device: device, inFunctionName: "conv_add_3x3") + } else { + fatalError(" unsupport yet ") } } else { fatalError() } + + let offsetY = (Int(param.dilations[1]) * (param.filter.height - 1) + 1)/2 - Int(param.paddings[1]) let offsetX = (Int(param.dilations[0]) * (param.filter.width - 1) + 1)/2 - Int(param.paddings[0]) - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) - - param.filter.initBuffer(device: device, precision: computePrecision) - param.y.initBuffer(device: device, precision: computePrecision) - print(" function: \(functionName)") 
print("offset x: \(offsetX)") print("offset y: \(offsetY)") diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift index 4b978e24d2b968d4ee2ee8443d74ce54f502be8b..bbf4eeaaf41e321f2c7c469011f1a243ef035107 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvBNReluKernel.swift @@ -49,35 +49,41 @@ class ConvBNReluKernel: Kernel, Computable, Testable { } var metalParam: MetalConvParam! - + required init(device: MTLDevice, param: ConvBNReluParam

) { + + param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) + param.filter.initBuffer(device: device, precision: computePrecision) + param.variance.initBuffer(device: device, precision: .Float32) + param.mean.initBuffer(device: device, precision: .Float32) + param.scale.initBuffer(device: device, precision: .Float32) + param.bias.initBuffer(device: device, precision: .Float32) + if computePrecision == .Float32 { if param.filter.width == 1 && param.filter.height == 1 { super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1") } else if param.filter.channel == 1 { super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3") - } else { + } else if param.filter.width == 3 && param.filter.height == 3 { super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3") + } else { + fatalError(" unsupport ") } } else if computePrecision == .Float16 { if param.filter.width == 1 && param.filter.height == 1 { super.init(device: device, inFunctionName: "conv_batch_norm_relu_1x1_half") } else if param.filter.channel == 1 { super.init(device: device, inFunctionName: "depthwise_conv_batch_norm_relu_3x3_half") - } else { + } else if param.filter.width == 3 && param.filter.height == 3 { super.init(device: device, inFunctionName: "conv_batch_norm_relu_3x3_half") + } else { + fatalError(" unsupport ") } } else { fatalError() } - param.output.initTexture(device: device, inTranspose: [0, 2, 3, 1], computePrecision: computePrecision) - param.filter.initBuffer(device: device, precision: computePrecision) - - param.variance.initBuffer(device: device, precision: .Float32) - param.mean.initBuffer(device: device, precision: .Float32) - param.scale.initBuffer(device: device, precision: .Float32) - param.bias.initBuffer(device: device, precision: .Float32) + let offsetX = param.filter.width/2 - Int(param.paddings[0]) let offsetY = param.filter.height/2 - Int(param.paddings[1]) diff --git 
a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift index 6c10ba8d18f3c0e386769c11867e8bb361f49b21..345136a503d8eda6ad23f85ef01eb53fa539d453 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvKernel.swift @@ -27,18 +27,20 @@ public struct MetalConvParam { class ConvKernel: Kernel, Computable { var metalParam: MetalConvParam! required init(device: MTLDevice, param: ConvParam

) { + param.filter.initBuffer(device: device, precision: ComputePrecision.Float32) if param.filter.width == 1 && param.filter.height == 1 { super.init(device: device, inFunctionName: "conv_1x1") } else if param.filter.channel == 1 { super.init(device: device, inFunctionName: "depthwise_conv_3x3") - } else { + } else if param.filter.width == 3 && param.filter.height == 3 { super.init(device: device, inFunctionName: "conv_3x3") + } else { + fatalError(" unsupport ") } - + let offsetX = param.filter.dim[2]/2 - Int(param.paddings[0]) let offsetY = param.filter.dim[1]/2 - Int(param.paddings[1]) let offsetZ = 0.0 - param.filter.initBuffer(device: device, precision: ComputePrecision.Float32) metalParam = MetalConvParam.init(offsetX: Int16(offsetX), offsetY: Int16(offsetY), offsetZ: Int16(offsetZ), strideX: UInt16(param.stride[0]), strideY: UInt16(param.stride[1]), dilationX: UInt16(param.dilations[0]), dilationY: UInt16(param.dilations[1])) } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift index 9354972d71c776392e9b7bf621e3e4de7233ffe8..435776c850854f2fc4259e8a2089299da825f463 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ConvTransposeKernel.swift @@ -31,7 +31,27 @@ struct MetalConvTransposeParam { class ConvTransposeKernel: Kernel, Computable{ var metalParam: MetalConvTransposeParam! required init(device: MTLDevice, param: ConvTransposeParam

) {
-        super.init(device: device, inFunctionName: "conv_transpose")
+        param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision)
+        param.filter.initBuffer(device: device, precision: computePrecision, convertToNHWC: false, withTranspose: true)
+
+        // The only shaders wired up here are the `conv_transpose2x2_stride2` pair;
+        // choose the float or half entry point from `computePrecision`.
+        let functionName: String
+        if computePrecision == .Float32 {
+            functionName = "conv_transpose2x2_stride2"
+        } else if computePrecision == .Float16 {
+            functionName = "conv_transpose2x2_stride2_half"
+        } else {
+            fatalError()
+        }
+        // NOTE(review): this check used to be `stride == [2, 2] && stride == [2, 2]`; the repeated
+        // operand was presumably meant to validate the 2x2 kernel size as well -- confirm and add it.
+        guard param.stride == [2, 2] else {
+            fatalError(" -- conv transpose unsupported yet -- ")
+        }
+        super.init(device: device, inFunctionName: functionName)
+
+        // Kernel size, stride, padding and dilation are packed into `metalParam` below.
         let kernelWidth = UInt16(param.filter.width)
         let kernelHeight = UInt16(param.filter.height)
 
@@ -43,9 +63,7 @@ class ConvTransposeKernel: Kernel, Computable{
         let dilationY = UInt16(param.dilations[1])
 
         metalParam = MetalConvTransposeParam.init(kernelW: kernelWidth, kernelH: kernelHeight, strideX: strideX, strideY: strideY, paddingX: paddingX, paddingY: paddingY, dilationX: dilationX, dilationY: dilationY)
-
-        param.output.initTexture(device: device, inTranspose: param.input.transpose)
-        param.filter.initBuffer(device: device)
+
     }
 
     func compute(commandBuffer: MTLCommandBuffer, param: ConvTransposeParam

) throws { diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift index e8e9dffc4562f0fc411e81e25f245b09ce292b0b..a4c88016b3a442d8d6937214755db9e33e9cb28f 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ElementwiseAddKernel.swift @@ -15,6 +15,7 @@ import Foundation struct ElementwiseAddMetalParam { + var unsafe_one_dim: Int32 = 0 var fast: Int32 = 0 var axis: Int32 = 0 var yoff: Int32 = 0 @@ -26,8 +27,14 @@ struct ElementwiseAddMetalParam { class ElementwiseAddKernel: Kernel, Computable { required init(device: MTLDevice, param: ElementwiseAddParam

) { - super.init(device: device, inFunctionName: "elementwise_add") param.output.initTexture(device: device, inTranspose: param.inputX.transpose, computePrecision: computePrecision) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "elementwise_add") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "elementwise_add_half") + } else { + fatalError() + } } func compute(commandBuffer: MTLCommandBuffer, param: ElementwiseAddParam

) throws {
@@ -59,6 +66,12 @@ class ElementwiseAddKernel: Kernel, Computable {
             emp.fast = 1
         }
 
+        // TODO: special case -- when inputY's `tensorDim.cout()` is 1, raise the flag that makes the
+        // shader read Y at texel (0, 0) of the matching array slice instead of mapping indices per element.
+        if param.inputY.tensorDim.cout() == 1 {
+            emp.unsafe_one_dim = 1
+        }
+
         encoder.setBytes(&emp, length: MemoryLayout<ElementwiseAddMetalParam>.size, index: 0)
         encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture)
         encoder.endEncoding()
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift
index b6db7231e83943dbce6f2cbe3266af9fbe508aef..59754e57c112ebf5e300bd0ef98465e08744e4bf 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PoolKernel.swift
@@ -27,8 +27,14 @@ struct PoolMetalParam {
 class PoolKernel: Kernel, Computable{
     required init(device: MTLDevice, param: PoolParam

) { - super.init(device: device, inFunctionName: "pool") param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "pool") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "pool_half") + } else { + fatalError() + } } func compute(commandBuffer: MTLCommandBuffer, param: PoolParam

) throws { diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift index 1545a848dacb4f11a2a68df31f7ea49a23799a87..32505faf0faeab97586352d60535d05e86e59631 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PreluKernel.swift @@ -10,15 +10,27 @@ import Foundation class PreluKernel: Kernel, Computable{ required init(device: MTLDevice, param: PreluParam

) { - if param.mode == "channel" { - super.init(device: device, inFunctionName: "prelu_channel") - } else if param.mode == "element" { - super.init(device: device, inFunctionName: "prelu_element") - } else { - super.init(device: device, inFunctionName: "prelu_other") - } param.alpha.initBuffer(device: device, precision: computePrecision) param.output.initTexture(device: device, inTranspose: param.input.transpose, computePrecision: computePrecision) + if computePrecision == .Float32 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "prelu_channel") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "prelu_element") + } else { + super.init(device: device, inFunctionName: "prelu_other") + } + } else if computePrecision == .Float16 { + if param.mode == "channel" { + super.init(device: device, inFunctionName: "prelu_channel_half") + } else if param.mode == "element" { + super.init(device: device, inFunctionName: "prelu_element_half") + } else { + super.init(device: device, inFunctionName: "prelu_other_half") + } + } else { + fatalError() + } } func compute(commandBuffer: MTLCommandBuffer, param: PreluParam

) throws { diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift index ece3e3915dd130c48ef717f29cd95a13eca52d0c..07239c32882f0f1492b181e9f57cad206f628ea1 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/PriorBoxKernel.swift @@ -33,6 +33,10 @@ class PriorBoxKernel: Kernel, Computable{ var metalParam: PriorBoxMetalParam! required init(device: MTLDevice, param: PriorBoxParam

) { + + param.output.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision) + param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision) + if computePrecision == .Float32 { super.init(device: device, inFunctionName: "prior_box") } else if computePrecision == .Float16 { @@ -41,9 +45,6 @@ class PriorBoxKernel: Kernel, Computable{ fatalError() } - param.output.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision) - param.outputVariances.initTexture(device: device, inTranspose: [2, 0, 1, 3], computePrecision: computePrecision) - let n = 1 let h = param.output.dim[1] let w = param.output.dim[2] diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift index 3c669cf4d965f7842070c4d38427f6d1d7440db5..18f279e9f3c5226d6eea5b5e6f0a42502173071e 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ReluKernel.swift @@ -15,17 +15,23 @@ import Foundation class ReluKernel: Kernel, Computable{ - func compute(commandBuffer: MTLCommandBuffer, param: ReluParam

) throws { - guard let encoder = commandBuffer.makeComputeCommandEncoder() else { - throw PaddleMobileError.predictError(message: " encode is nil") - } - encoder.setTexture(param.input.metalTexture, index: 0) - encoder.setTexture(param.output.metalTexture, index: 1) - encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) - encoder.endEncoding() + func compute(commandBuffer: MTLCommandBuffer, param: ReluParam

) throws { + guard let encoder = commandBuffer.makeComputeCommandEncoder() else { + throw PaddleMobileError.predictError(message: " encode is nil") } - - required init(device: MTLDevice, param: ReluParam

) { - super.init(device: device, inFunctionName: "relu") + encoder.setTexture(param.input.metalTexture, index: 0) + encoder.setTexture(param.output.metalTexture, index: 1) + encoder.dispatch(computePipline: pipline, outTexture: param.output.metalTexture) + encoder.endEncoding() + } + + required init(device: MTLDevice, param: ReluParam

) { + if computePrecision == .Float32 { + super.init(device: device, inFunctionName: "relu") + } else if computePrecision == .Float16 { + super.init(device: device, inFunctionName: "relu_half") + } else { + fatalError() } + } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ResizeKernel.swift b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ResizeKernel.swift deleted file mode 100644 index d2795111ad1f43c759b95aa52ed34085a4ac147a..0000000000000000000000000000000000000000 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/ResizeKernel.swift +++ /dev/null @@ -1,62 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - - Licensed under the Apache License, Version 2.0 (the "License"); - you may not use this file except in compliance with the License. - You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - - Unless required by applicable law or agreed to in writing, software - distributed under the License is distributed on an "AS IS" BASIS, - WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - See the License for the specific language governing permissions and - limitations under the License. 
*/ -// -//import Foundation -//import MetalPerformanceShaders -// -// -//struct ResizeParam: OpParam{ -// typealias OutputType = <#type#> -// -// typealias ParamPrecisionType = <#type#> -// -// let input: MTLTexture -// let output: MTLTexture -// let expectDim: Dim -//} -// -//struct OutputDim { -// let width: UInt16 -// let height: UInt16 -// let strideX: UInt16 -// let strideY: UInt16 -//} -// -//class ResizeKernel: Kernel, Computable{ -// var lanczos: MPSImageLanczosScale -// required init(device: MTLDevice, param: ResizeParam) { -// lanczos = MPSImageLanczosScale.init(device: device) -// super.init(device: device, inFunctionName: "resize") -// } -// func compute(commandBuffer: MTLCommandBuffer, param: ResizeParam) throws { -//// guard let encoder = commandBuffer.makeComputeCommandEncoder() else { -//// throw PaddleMobileError.predictError(message: " encode is nil") -//// } -// lanczos.encode(commandBuffer: commandBuffer, sourceTexture: param.input, destinationTexture: param.output) -// -//// encoder.setTexture(param.input, index: 0) -//// encoder.setTexture(param.output, index: 1) -//// let strideX = param.input.width/param.expectDim[2] -//// let strideY = param.input.height/param.expectDim[1] -//// var outputDim = OutputDim.init(width: UInt16(param.expectDim[1]), height: UInt16(param.expectDim[2]), strideX: UInt16(strideX), strideY: UInt16(strideY)) -//// encoder.setBytes(&outputDim, length: MemoryLayout.size, index: 0) -//// encoder.dispatch(computePipline: pipline, outTexture: param.output) -//// encoder.endEncoding() -// } -// -// -// -// -//} - diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal index 3cb71ca21b425a76a6395351531ea7e81edcbbf8..0159dcf3e008ed4fe5bfd083674084fb1fb9e978 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal +++ 
b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvAddMetal.metal @@ -429,7 +429,122 @@ kernel void depthwise_conv_add_3x3_half(texture2d_array in } +kernel void conv_add_5x1_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + ushort dilation_y = param.dilationY; + half4 input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 2 * dilation_y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - dilation_y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + dilation_y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 2 * dilation_y), i); + + for (int j = 0; j < 5; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += 
dot(input[j], weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(float4(input[j]), float4(weight_w)); + } + } + output = output + float4(biase[gid.z]); + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void conv_add_1x5_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + const device half4 *biase [[buffer(2)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + + const uint kernelHXW = 5; + + uint input_arr_size = inTexture.get_array_size(); + + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + ushort dilation_x = param.dilationX; + half4 input[5]; + + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = inTexture.sample(sample, float2(posInInput.x - 2 * dilation_x, posInInput.y), i); + + input[1] = inTexture.sample(sample, float2(posInInput.x - dilation_x, posInInput.y), i); + + input[2] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + + input[3] = inTexture.sample(sample, float2(posInInput.x + dilation_x, posInInput.y), i); + + input[4] = inTexture.sample(sample, float2(posInInput.x + 2 * dilation_x, posInInput.y), i); + + for (int j = 0; j < 5; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(input[j], weight_x); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(input[j], weight_y); + 
+ half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(input[j], weight_z); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(input[j], weight_w); + } + } + output = output + float4(biase[gid.z]); + outTexture.write(half4(output), gid.xy, gid.z); +} kernel void test_conv_add_3x3(texture2d_array inTexture [[texture(0)]], @@ -502,3 +617,6 @@ kernel void test_conv_add_3x3(texture2d_array inTexture [ // output = output + biase[gid.z]; outTexture.write(output, gid.xy, gid.z); } + + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal index 35c2c56cace5975c5fe166b007bb695eba163325..c07515c13da54c7f8bf698f976e47f7cda6de32b 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvKernel.metal @@ -148,4 +148,133 @@ kernel void conv_1x1(texture2d_array inTexture [[texture( } +kernel void conv_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + const ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input[9]; + for (uint i = 0; i < input_arr_size; ++i) { + input[0] = 
inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), i); + input[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), i); + input[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), i); + input[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), i); + input[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + input[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), i); + input[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), i); + input[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), i); + input[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), i); + for (int j = 0; j < 9; ++j) { + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.x += dot(float4(input[j]), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.y += dot(float4(input[j]), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.z += dot(float4(input[j]), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + j * input_arr_size + i]; + output.w += dot(float4(input[j]), float4(weight_w)); + } + } + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void depthwise_conv_3x3_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + uint output_slice = gid.z; + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + ushort2(param.offsetX, 
param.offsetY); + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 9; + uint weithTo = gid.z * kernelHXW * 4; + float4 output = float4(0.0); + half4 inputs[9]; + inputs[0] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y - 1), output_slice); + inputs[1] = inTexture.sample(sample, float2(posInInput.x, posInInput.y - 1), output_slice); + inputs[2] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y - 1), output_slice); + inputs[3] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y), output_slice); + inputs[4] = inTexture.sample(sample, float2(posInInput.x, posInInput.y), output_slice); + inputs[5] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y), output_slice); + inputs[6] = inTexture.sample(sample, float2(posInInput.x - 1, posInInput.y + 1), output_slice); + inputs[7] = inTexture.sample(sample, float2(posInInput.x, posInInput.y + 1), output_slice); + inputs[8] = inTexture.sample(sample, float2(posInInput.x + 1, posInInput.y + 1), output_slice); + for (int j = 0; j < 9; ++j) { + half4 input = inputs[j]; + output.x += float(input.x) * float(weights[weithTo + 0 * kernelHXW + j]); + output.y += float(input.y) * float(weights[weithTo + 1 * kernelHXW + j]); + output.z += float(input.z) * float(weights[weithTo + 2 * kernelHXW + j]); + output.w += float(input.w) * float(weights[weithTo + 3 * kernelHXW + j]); + } + outTexture.write(half4(output), gid.xy, gid.z); +} + +kernel void conv_1x1_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + ushort2 stride = ushort2(param.strideX, param.strideY); + ushort2 posInInput = ushort2(gid.xy) * stride + 
ushort2(param.offsetX, param.offsetY); + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + const uint kernelHXW = 1; + + uint input_arr_size = inTexture.get_array_size(); + uint weithTo = gid.z * kernelHXW * input_arr_size * 4; + + float4 output = float4(0.0); + + half4 input; + for (uint i = 0; i < input_arr_size; ++i) { + input = inTexture.sample(sample, float2(posInInput.x, posInInput.y), i); + half4 weight_x = weights[weithTo + 0 * kernelHXW * input_arr_size + i]; + output.x += dot(float4(input), float4(weight_x)); + + half4 weight_y = weights[weithTo + 1 * kernelHXW * input_arr_size + i]; + output.y += dot(float4(input), float4(weight_y)); + + half4 weight_z = weights[weithTo + 2 * kernelHXW * input_arr_size + i]; + output.z += dot(float4(input), float4(weight_z)); + + half4 weight_w = weights[weithTo + 3 * kernelHXW * input_arr_size + i]; + output.w += dot(float4(input), float4(weight_w)); + } + outTexture.write(half4(output), gid.xy, gid.z); +} + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal index 5c5a499fceae9061dba30307e2c4ecd3b0e89164..baf3f31157a472412bb08ccb3c803f5ec9e25d9c 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/ConvTransposeKernel.metal @@ -29,11 +29,11 @@ struct MetalConvTransposeParam{ ushort dilationY; }; -kernel void conv_transpose(texture2d_array inTexture [[texture(0)]], - texture2d_array outTexture [[texture(1)]], - constant MetalConvTransposeParam ¶m [[buffer(0)]], - const device float4 *weights [[buffer(1)]], - uint3 gid [[thread_position_in_grid]]){ +kernel void conv_transpose2x2_stride2(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvTransposeParam ¶m [[buffer(0)]], + const device float4 
*weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { if (gid.x >= outTexture.get_width() || gid.y >= outTexture.get_height() || gid.z >= outTexture.get_array_size()) { @@ -41,48 +41,134 @@ kernel void conv_transpose(texture2d_array inTexture [[te } int input_array_size = inTexture.get_array_size(); - - uint kernel_one_output_slice = input_array_size * param.kernelW * param.kernelH; - - uint kernel_stride_z = gid.z * 4 * (kernel_one_output_slice); + int kernel_index_x = gid.x % 2; + int kernel_index_y = gid.y % 2; + int kernel_index = kernel_index_y * 2 + kernel_index_x; + int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size); + int input_x = gid.x / 2; + int input_y = gid.y / 2; constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 output = float4(0.0); + for (int i = 0; i < input_array_size; ++i) { + + float4 input = inTexture.sample(sample, float2(input_x, input_y), i); + + float4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i]; + float4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i]; + float4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i]; + float4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + i]; + + output.x += dot(input, kernel_slice0); + + output.y += dot(input, kernel_slice1); + + output.z += dot(input, kernel_slice2); + + output.w += dot(input, kernel_slice3); + } + + outTexture.write(output, gid.xy, gid.z); +} + +kernel void conv_transpose2x2_stride2_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + constant MetalConvTransposeParam ¶m [[buffer(0)]], + const device half4 *weights [[buffer(1)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } - float4 output; + int input_array_size = inTexture.get_array_size(); + int kernel_index_x = 
gid.x % 2; + int kernel_index_y = gid.y % 2; + int kernel_index = kernel_index_y * 2 + kernel_index_x; + int kernel_to = gid.z * input_array_size * 4 * 4 + (kernel_index * input_array_size); + int input_x = gid.x / 2; + int input_y = gid.y / 2; - for (int w = 0; w < param.kernelW; ++w) { - int input_x = (gid.x - w * param.dilationX + param.paddingX) / param.strideX; - if (input_x < 0 || input_x >= int(inTexture.get_width())) { - continue; - } + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + float4 output = float4(0.0); + for (int i = 0; i < input_array_size; ++i) { - for (int h = 0; h < param.kernelH; ++h) { - int input_y = (gid.y - h * param.dilationY + param.paddingY) / param.strideY; - if (input_y < 0 || input_y >= int(inTexture.get_height())) { - continue; - } - - uint kernel_index = (w * param.kernelH + h) * inTexture.get_array_size(); - - for (int slice = 0; slice < input_array_size; ++slice) { - - float4 input; - float4 kernel_slice = weights[kernel_stride_z + 0 * kernel_one_output_slice + kernel_index + slice]; - float4 kernel_slice1 = weights[kernel_stride_z + 1 * kernel_one_output_slice + kernel_index + slice]; - - float4 kernel_slice2 = weights[kernel_stride_z + 2 * kernel_one_output_slice + kernel_index + slice]; - - float4 kernel_slice3 = weights[kernel_stride_z + 3 * kernel_one_output_slice + kernel_index + slice]; - - input = inTexture.sample(sample, float2(input_x, input_x), slice); - output.x += dot(input, kernel_slice); - output.x += dot(input, kernel_slice1); - output.x += dot(input, kernel_slice2); - output.x += dot(input, kernel_slice3); - } - } + half4 input = inTexture.sample(sample, float2(input_x, input_y), i); + + half4 kernel_slice0 = weights[kernel_to + input_array_size * 4 * 0 + i]; + half4 kernel_slice1 = weights[kernel_to + input_array_size * 4 * 1 + i]; + half4 kernel_slice2 = weights[kernel_to + input_array_size * 4 * 2 + i]; + half4 kernel_slice3 = weights[kernel_to + input_array_size * 4 * 3 + 
i]; + + output.x += dot(float4(input), float4(kernel_slice0)); + + output.y += dot(float4(input), float4(kernel_slice1)); + + output.z += dot(float4(input), float4(kernel_slice2)); + + output.w += dot(float4(input), float4(kernel_slice3)); } - outTexture.write(output, gid.xy, gid.z); + outTexture.write(half4(output), gid.xy, gid.z); } +//kernel void conv_transpose(texture2d_array inTexture [[texture(0)]], +// texture2d_array outTexture [[texture(1)]], +// constant MetalConvTransposeParam ¶m [[buffer(0)]], +// const device float4 *weights [[buffer(1)]], +// uint3 gid [[thread_position_in_grid]]){ +// if (gid.x >= outTexture.get_width() || +// gid.y >= outTexture.get_height() || +// gid.z >= outTexture.get_array_size()) { +// return; +// } +// +// int input_array_size = inTexture.get_array_size(); +// +// uint kernel_one_output_slice = input_array_size * param.kernelW * param.kernelH; +// +// uint kernel_stride_z = gid.z * 4 * (kernel_one_output_slice); +// +// constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); +// +// float4 output; +// +// for (int w = 0; w < param.kernelW; ++w) { +// int top = gid.x - w * param.dilationX + param.paddingX; +// int input_x = top / param.strideX; +// if (top < 0 || input_x >= int(inTexture.get_width())) { +// continue; +// } +// +// for (int h = 0; h < param.kernelH; ++h) { +// int top_y = gid.y - h * param.dilationY + param.paddingY; +// int input_y = top_y / param.strideY; +// if (top_y < 0 || input_y >= int(inTexture.get_height())) { +// continue; +// } +// +// uint kernel_index = (w * param.kernelH + h) * inTexture.get_array_size(); +// +// for (int slice = 0; slice < input_array_size; ++slice) { +// +// float4 input; +// float4 kernel_slice = weights[kernel_stride_z + 0 * kernel_one_output_slice + kernel_index + slice]; +// float4 kernel_slice1 = weights[kernel_stride_z + 1 * kernel_one_output_slice + kernel_index + slice]; +// +// float4 kernel_slice2 = weights[kernel_stride_z + 2 * 
kernel_one_output_slice + kernel_index + slice]; +// +// float4 kernel_slice3 = weights[kernel_stride_z + 3 * kernel_one_output_slice + kernel_index + slice]; +// +// input = inTexture.sample(sample, float2(input_x, input_y), slice); +// output.x += dot(input, kernel_slice); +// output.y += dot(input, kernel_slice1); +// output.z += dot(input, kernel_slice2); +// output.w += dot(input, kernel_slice3); +// } +// } +// } +// +// outTexture.write(output, gid.xy, gid.z); +//} +// diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal index 5ad53a8ef94adde2de6034727b49415ba2cd17c7..cc662786de91a0c7d62aca4b0124ee68990d797e 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal +++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/Elementwise.metal @@ -18,6 +18,7 @@ using namespace metal; struct ElementwiseAddParam { + int32_t unsafe_one_dim; int32_t fast; int32_t axis; int32_t yoff; @@ -36,7 +37,10 @@ kernel void elementwise_add(texture2d_array inputX [[textur gid.y >= outTexture.get_height() || gid.z >= outTexture.get_array_size()) return; float4 rx, ry; - if (pm.fast == 1) { + if (pm.unsafe_one_dim == 1) { + rx = inputX.read(gid.xy, gid.z); + ry = inputY.read(uint2(0, 0), gid.z); + } else if (pm.fast == 1) { rx = inputX.read(gid.xy, gid.z); ry = inputY.read(gid.xy, gid.z); } else { @@ -59,3 +63,39 @@ kernel void elementwise_add(texture2d_array inputX [[textur float4 r = rx + ry; outTexture.write(r, gid.xy, gid.z); } + +kernel void elementwise_add_half(texture2d_array inputX [[texture(0)]], + texture2d_array inputY [[texture(1)]], + texture2d_array outTexture [[texture(2)]], + constant ElementwiseAddParam &pm [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]) { + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) return; + half4 rx, ry; + if 
(pm.unsafe_one_dim == 1) {
+    // Y contributes a single texel per array slice: always read it at (0, 0).
+    rx = inputX.read(gid.xy, gid.z);
+    ry = inputY.read(uint2(0, 0), gid.z);
+  } else if (pm.fast == 1) {
+    // X and Y are read at the same coordinates.
+    rx = inputX.read(gid.xy, gid.z);
+    ry = inputY.read(gid.xy, gid.z);
+  } else {
+    rx = inputX.read(gid.xy, gid.z);
+    int32_t x_xyzn[4] = {int32_t(gid.x), int32_t(gid.y), int32_t(gid.z), 0}, x_abcd[4], t_abcd[4];
+    int32_t y_abcd[4] = {1, 1, 1, 1}, y_xyzn[4];
+    int32_t xtrans[4] = {pm.xtrans[0], pm.xtrans[1], pm.xtrans[2], pm.xtrans[3]};
+    int32_t ytrans[4] = {pm.ytrans[0], pm.ytrans[1], pm.ytrans[2], pm.ytrans[3]};
+    // General path: for each of the four components of this texel, work out
+    // which element of inputY it pairs with.
+    for (int n = 0; n < 4; n++) {
+      // Select component n of the X texel. Without this x_xyzn[3] stays 0 on every pass,
+      // so all four components of ry would be loaded from the same Y element.
+      x_xyzn[3] = n;
+      xyzn2abcd(pm.xdim[3], x_xyzn, x_abcd);
+      invtrans(xtrans, x_abcd, t_abcd);
+      for (int k = pm.axis; k < (4 - pm.yoff); k++) {
+        y_abcd[k+pm.yoff] = t_abcd[k];
+      }
+      trans(ytrans, y_abcd, t_abcd);
+      abcd2xyzn(pm.ydim[3], t_abcd, y_xyzn);
+      // y_xyzn[0..2] address the texel, y_xyzn[3] picks the component inside it.
+      ry[n] = inputY.read(uint2(y_xyzn[0], y_xyzn[1]), y_xyzn[2])[y_xyzn[3]];
+    }
+  }
+  half4 r = rx + ry;
+  outTexture.write(r, gid.xy, gid.z);
+}
diff --git a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal
index bd14a146100d0a0723f73ab5fd1f95d1f8e39c97..597804137743dd253d05d91a5008f558dcaf42e7 100644
--- a/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal
+++ b/metal/paddle-mobile/paddle-mobile/Operators/Kernels/metal/PreluKernel.metal
@@ -81,3 +81,71 @@ kernel void prelu_other(texture2d_array inTexture [[textu
   outTexture.write(output, gid.xy, gid.z);
 }
 
+
+kernel void prelu_channel_half(texture2d_array inTexture [[texture(0)]],
+                               texture2d_array outTexture [[texture(1)]],
+                               const device half4 *alpha [[buffer(0)]],
+                               uint3 gid [[thread_position_in_grid]]){
+  if (gid.x >= outTexture.get_width() ||
+      gid.y >= outTexture.get_height() ||
+      gid.z >= outTexture.get_array_size()) {
+    return;
+  }
+
+  constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero);
+  half4 input = inTexture.sample(sample, float2(gid.x,
gid.y), gid.z); + half4 alpha_value = alpha[gid.z]; + half4 output; + output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); + output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); + output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); + output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void prelu_element_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device half4 *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + + int alpha_to = (gid.y * inTexture.get_width() + gid.x) * inTexture.get_array_size(); + half4 alpha_value = alpha[alpha_to + gid.z]; + + half4 output; + output.x = input.x > 0 ? input.x : (alpha_value.x * input.x); + output.y = input.y > 0 ? input.y : (alpha_value.y * input.y); + output.z = input.z > 0 ? input.z : (alpha_value.z * input.z); + output.w = input.w > 0 ? input.w : (alpha_value.w * input.w); + outTexture.write(output, gid.xy, gid.z); +} + +kernel void prelu_other_half(texture2d_array inTexture [[texture(0)]], + texture2d_array outTexture [[texture(1)]], + const device half *alpha [[buffer(0)]], + uint3 gid [[thread_position_in_grid]]){ + if (gid.x >= outTexture.get_width() || + gid.y >= outTexture.get_height() || + gid.z >= outTexture.get_array_size()) { + return; + } + + constexpr sampler sample(coord::pixel, filter::nearest, address::clamp_to_zero); + half4 input = inTexture.sample(sample, float2(gid.x, gid.y), gid.z); + half alpha_value = alpha[0]; + half4 output; + output.x = input.x > 0 ? input.x : (alpha_value * input.x); + output.y = input.y > 0 ? 
input.y : (alpha_value * input.y); + output.z = input.z > 0 ? input.z : (alpha_value * input.z); + output.w = input.w > 0 ? input.w : (alpha_value * input.w); + outTexture.write(output, gid.xy, gid.z); +} + + diff --git a/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift index d3d31cfcd43a81f6068238d45f3442af0fc7795f..6f42f2aa9f8d0515946ace625ed16c5040fd3099 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/PoolOp.swift @@ -60,7 +60,7 @@ class PoolOp: Operator, PoolParam

>, Runable, func delogOutput() { print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) // print("pool2d delog") diff --git a/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift index c7e049e3c1b21d9747acca8812abfff8c25d6d98..10b5816d7b4528572cdc6b84d53b73499dde93b4 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/PreluOp.swift @@ -51,13 +51,13 @@ class PreluOp: Operator, PreluParam

>, Runabl func delogOutput() { print(" \(type) input: ") - print(para.input.metalTexture.toTensor(dim: (n: para.input.originDim[0], c: para.input.originDim[1], h: para.input.originDim[2], w: para.input.originDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.input.metalTexture.toTensor(dim: (n: para.input.originDim[0], c: para.input.originDim[1], h: para.input.originDim[2], w: para.input.originDim[3])).strideArray()) print(" \(type) Alpha: ") let _: Float32? = para.alpha.buffer.logDesc(header: " alpha: ", stridable: false) print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.originDim[0], c: para.output.originDim[1], h: para.output.originDim[2], w: para.output.originDim[3])).strideArray()) } // print("softmax delog") diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift index 0325f860e078cf639c08e279970a105e3f562a32..c9f054c88af44ac3f5dd453b4696c7988d01fa8f 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ReluOp.swift @@ -46,7 +46,7 @@ class ReluOp: Operator, ReluParam

>, Runable, func delogOutput() { print(" \(type) output: ") - print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3]), texturePrecision: computePrecision).strideArray()) + print(para.output.metalTexture.toTensor(dim: (n: para.output.tensorDim[0], c: para.output.tensorDim[1], h: para.output.tensorDim[2], w: para.output.tensorDim[3])).strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift index 451b064ce19e0e1cb70700d046b6ab059e6df9e3..448896dd330b283d283b4eb881777ee633186c1b 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/ReshapeOp.swift @@ -76,7 +76,7 @@ class ReshapeOp: Operator, ReshapeParam

>, let originDim = para.output.originDim - let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3])) print(outputArray.strideArray()) } diff --git a/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift b/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift index af776450d7f15d031a6af546d13bc1c0374249b7..71c7918d4e12733a6289017c020e5dea3ac4c76f 100644 --- a/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift +++ b/metal/paddle-mobile/paddle-mobile/Operators/SoftmaxOp.swift @@ -54,7 +54,7 @@ class SoftmaxOp: Operator, SoftmaxParam

>, print("softmax delog") let originDim = para.output.originDim - let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3]), texturePrecision: computePrecision) + let outputArray: [Float32] = para.output.metalTexture.realNHWC(dim: (n: originDim[0], h: originDim[1], w: originDim[2], c: originDim[3])) print(outputArray.strideArray()) } } diff --git a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift index a318180b2b57b162715f0088fdfd88767506ad2a..c5ee1414521e7eb92011d4f4b608ad326b005531 100644 --- a/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift +++ b/metal/paddle-mobile/paddle-mobile/framework/Tensor.swift @@ -95,7 +95,28 @@ class Tensor: Tensorial { - func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16) { + func initBuffer(device: MTLDevice, precision: ComputePrecision = .Float16, convertToNHWC: Bool = true, withTranspose: Bool = false) { + if convertToNHWC { +// print(layout) + convert(to: DataLayout.NHWC()) + } + + if withTranspose { + let transposePointer = UnsafeMutablePointer

.allocate(capacity: numel()) + let n = dim[0] + let hwc = numel()/n + for j in 0.. else { fatalError(" not support yet ") } @@ -139,6 +160,8 @@ class Tensor: Tensorial { for j in 0..: Tensorial { float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count) } + convertedPointer.deinitialize(count: count) + convertedPointer.deallocate() + } + } else { + let C = dim[3] + let cSlices = (C + 3) / 4 + let paddedC = cSlices * 4 + let count = paddedC * dim[0] * dim[1] * dim[2] + if C == paddedC { + buffer = device.makeBuffer(length: count * precisionSize) + switch precision { + case .Float32: + buffer?.contents().copyMemory(from: data.pointer, byteCount: count * MemoryLayout

.stride) + case .Float16: + float32ToFloat16(input: floatPointer, output: buffer.contents(), count: count) + } + } else if C == 1 { + fatalError(" not support ") + } else { + buffer = device.makeBuffer(length: count * precisionSize) + let convertedPointer = UnsafeMutablePointer.allocate(capacity: count) + var tmpPointer = floatPointer + var dstPtr = convertedPointer + for _ in 0...stride) + case .Float16: + float32ToFloat16(input: convertedPointer, output: buffer.contents(), count: count) + } convertedPointer.deinitialize(count: count) convertedPointer.deallocate() }